From 804b6d77e6a3f91c9f6bd3aaa0abd37bb0955924 Mon Sep 17 00:00:00 2001 From: Pal Kerecsenyi Date: Thu, 25 Sep 2025 13:29:18 +0100 Subject: [PATCH 01/10] WIP: docs: start writing upgrade guide --- docs/upgrading.rst | 133 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 133 insertions(+) create mode 100644 docs/upgrading.rst diff --git a/docs/upgrading.rst b/docs/upgrading.rst new file mode 100644 index 00000000..e84d85b0 --- /dev/null +++ b/docs/upgrading.rst @@ -0,0 +1,133 @@ +.. + This file is part of Invenio. + Copyright (C) 2025 CERN. + + Invenio is free software; you can redistribute it and/or modify it + under the terms of the MIT License; see LICENSE file for more details. + +Upgrading +========= + +====== +v4.0.0 +====== + +This version consists of a major refactor of the module and a full rename to ``invenio-vcs`` (from ``invenio-github``). +The new version has now been made generic and can support any VCS provider by implementing the relevant abstract classes. + +Contrib implementations are provided for GitHub and GitLab. +GitHub is supported with the exact same set of features as before, meaning this module can continue to be used for the original +purpose of ``invenio-github`` with just some migrations and configuration changes required. + +Please follow this guide if: + +- you are **not** using InvenioRDM; or +- you would like to try out ``invenio-vcs`` before InvenioRDM v14 is released. + + - This is not officially supported but should work for the most part. + +RDM-specific instructions can instead be found in the `InvenioRDM upgrade guide `_. + +-------------------------- +1. Update the dependencies +-------------------------- + +In your ``Pipfile`` (or any similar file you are using to manage dependencies), change the name and version of the ``invenio-vcs`` packages. +Additionally, you will need to ensure some other dependencies are up to date for compatibility with the new changes. + +.. code-block:: toml + + [packages] + # ... 
+ invenio-vcs = ">=4.0.0,<5.0.0" + invenio-rdm-records = "TODO" + invenio-app-rdm = "TODO" + invenio-oauthclient = "TODO" + +.. note:: + + ``invenio-vcs`` is no longer packaged by default with InvenioRDM, as was the case with ``invenio-github``. + You must declare it as an explicit dependency on the instance level. + +Next, run the install operation and make sure the old module is no longer installed. +Having both installed simultaneously will lead to numerous conflicts, especially with Alembic migrations. + +.. code-block:: bash + + invenio-cli install + pip uninstall invenio-github + +---------------------------------- +2. Perform the database migrations +---------------------------------- + +Depending on the size of your instance, the migrations can be performed either automatically by running an Alembic script, or manually by +carefully following the instructions in this guide. + +If your instance meets one of these criteria, please use the manual method to avoid database stability issues: + +- An ``oauthclient_remoteaccount`` table with more than 50k rows +- A ``github_repositories`` table with more than 100k rows + +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +2a. Automated Alembic script +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Run the upgrade command: + +.. code-block:: bash + + pipenv run invenio alembic upgrade + +^^^^^^^^^^^^^^^^^ +2b. Manual method +^^^^^^^^^^^^^^^^^ + +.. SQL auto-generated from migration file using `alembic upgrade --sql` + +In an SQL shell (e.g. ``psql`` for PostgreSQL), execute the following: + +.. 
code-block:: sql + + BEGIN; + ALTER TABLE github_repositories RENAME TO vcs_repositories; + ALTER TABLE vcs_repositories ALTER COLUMN github_id TYPE VARCHAR(255); + ALTER TABLE vcs_repositories ALTER COLUMN github_id SET NOT NULL; + ALTER TABLE vcs_repositories RENAME github_id TO provider_id; + ALTER TABLE vcs_repositories ALTER COLUMN hook TYPE VARCHAR(255); + ALTER TABLE vcs_repositories ALTER COLUMN hook DROP NOT NULL; + ALTER TABLE vcs_repositories ADD COLUMN provider VARCHAR(255) DEFAULT 'github' NOT NULL; + ALTER TABLE vcs_repositories ADD COLUMN default_branch VARCHAR(255) DEFAULT 'master' NOT NULL; + ALTER TABLE vcs_repositories ADD COLUMN description VARCHAR(10000); + ALTER TABLE vcs_repositories ADD COLUMN html_url VARCHAR(10000); + ALTER TABLE vcs_repositories ADD COLUMN license_spdx VARCHAR(255); + ALTER TABLE vcs_repositories RENAME user_id TO enabled_by_id; + DROP INDEX ix_github_repositories_name; + DROP INDEX ix_github_repositories_github_id; + ALTER TABLE vcs_repositories ADD CONSTRAINT uq_vcs_repositories_provider_provider_id UNIQUE (provider, provider_id); + ALTER TABLE vcs_repositories ADD CONSTRAINT uq_vcs_repositories_provider_name UNIQUE (provider, name); + COMMIT; + +TODO: script to migrate ``oauthclient_remoteaccount`` JSON data. + +.. 
code-block:: sql + + BEGIN; + ALTER TABLE vcs_repositories ALTER COLUMN html_url SET NOT NULL; + ALTER TABLE github_releases RENAME TO vcs_releases; + ALTER TABLE vcs_releases ALTER COLUMN release_id TYPE VARCHAR(255); + ALTER TABLE vcs_releases ALTER COLUMN release_id SET NOT NULL; + ALTER TABLE vcs_releases RENAME release_id TO provider_id; + ALTER TABLE vcs_releases ADD COLUMN provider VARCHAR(255) DEFAULT 'github' NOT NULL; + ALTER TABLE vcs_releases ALTER COLUMN errors TYPE JSONB USING errors::text::jsonb; + ALTER TABLE vcs_releases DROP CONSTRAINT uq_github_releases_release_id; + ALTER TABLE vcs_releases ADD CONSTRAINT uq_vcs_releases_provider_id_provider UNIQUE (provider_id, provider); + ALTER TABLE vcs_releases ADD CONSTRAINT uq_vcs_releases_provider_id_provider_tag UNIQUE (provider_id, provider, tag); + CREATE TABLE vcs_repository_users ( + repository_id UUID NOT NULL, + user_id INTEGER NOT NULL, + PRIMARY KEY (repository_id, user_id), + CONSTRAINT fk_vcs_repository_users_repository_id_vcs_repositories FOREIGN KEY(repository_id) REFERENCES vcs_repositories (id), + CONSTRAINT fk_vcs_repository_users_user_id_accounts_user FOREIGN KEY(user_id) REFERENCES accounts_user (id) + ); + COMMIT; From 729de68c20ab4df65ba6201a1fb088ca88e77dd6 Mon Sep 17 00:00:00 2001 From: Pal Kerecsenyi Date: Fri, 10 Oct 2025 12:21:25 +0100 Subject: [PATCH 02/10] WIP: docs: usage docs --- docs/api.rst | 5 +- docs/usage.rst | 112 +++++++++++++++++++++++++++++++++++++++++- invenio_vcs/config.py | 64 ++++++++++++++++++++++++ 3 files changed, 177 insertions(+), 4 deletions(-) create mode 100644 invenio_vcs/config.py diff --git a/docs/api.rst b/docs/api.rst index 8bdd0d1c..5a5114b0 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -25,6 +25,5 @@ API Docs ======== -invenio_github --------------- - +.. 
automodule:: invenio_vcs.ext + :members: diff --git a/docs/usage.rst b/docs/usage.rst index e8ebe1ac..de52ecb1 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -25,4 +25,114 @@ Usage ===== -.. automodule:: invenio_github +Invenio-VCS allows you to flexibly configure multiple VCS providers and allow users to sync their repositories. +These can then be published as records (in InvenioRDM) or any other customisable action can be performed when a release is made. + +Currently, the following VCS providers are officially supported in this module: + +* GitHub +* GitLab + +However, you can add support for any other provider (including non-Git ones) by implementing an abstract class. + +This guide explains how to configure this module in general, while the corresponding guides for each provider go into more detail about the specific steps required. + +=========== +Quick start +=========== + +1. Choose which provider you'd like to use. We'll use GitHub in this example. In ``invenio.cfg`` create the provider factory corresponding to your provider: + + .. code-block:: python + + _vcs_github = GitHubProviderFactory( + base_url="https://github.com", + webhook_receiver_url="https://example.com/api/hooks/receivers/github/events/?access_token={token}", + ) + + These are the only two required arguments, and they're the same for all providers. For more details, see :ref:`provider-arguments`. + + +2. Configure the OAuth client. Each provider provides its own configuration for ``invenio-oauthclient`` through which users can authenticate to get the necessary access token for syncing their repositories. + This should be added to any OAuth clients you may already have configured: + + .. code-block:: python + + OAUTHCLIENT_REMOTE_APPS = { + "github": _vcs_github.remote_config, + } + + OAUTHCLIENT_REST_REMOTE_APPS = { + "github": _vcs_github.remote_config, + } + + If you used a custom ``id`` when constructing the provider factory, this ID must correspond to that. 
The default ID for the GitHub provider is ``github``. + +3. Register an OAuth application with the provider. For GitHub, this can be done through the `Developer Settings `_. Please refer to the provider documentation for more details. + + Usually, you'll be asked for a redirect URL. By default, this will be of the form: + + .. code-block:: + + https://example.com/oauth/authorized/github/ + + where ``github`` corresponds to the ID of your provider factory. + + Once your app is registered, you'll be given a Client ID and Secret. You need to specify these to ``invenio-oauthclient``: + + .. code-block:: python + + GITHUB_APP_CREDENTIALS = { + "consumer_key": "your_client_id", + "consumer_secret": "your_client_secret", + } + + .. note:: + + The name of this config variable is specified by the ``credentials_key`` constructor argument of the provider factory. The '``GITHUB_``' is *not* derived from the ID, so you'll need to manually override this argument if you're using multiple instances of the same provider. + +4. Register the provider. By adding provider factories to this list, you can enable each of them as a repository syncing method. + + .. code-block:: python + + VCS_PROVIDERS = [_vcs_github] + + You can add multiple of the same type of provider here. For example, you could have both public GitHub.com and a self-hosted GitHub Enterprise instance. The only requirement is to use a different ID and ``credentials_key`` for each provider factory. + + .. caution:: + + Once repositories have been enabled from a given provider, removing it from this list is a dangerous operation. It's an unsupported behaviour that could cause unexpected errors and inconsistencies. + +.. _provider-arguments: + +================== +Provider arguments +================== + +When constructing the provider factory, there are some common arguments that can be configured: + +* ``base_url`` (**required**): the URL of the VCS instance, for example ``https://github.com``. 
This can correspond to either the public officially hosted instance or a self-hosted one. + +* ``webhook_receiver_url`` (**required**): the endpoint on your Invenio server that will handle webhook events from the VCS provider. This will almost always follow the pattern shown below, but can be customised depending on your use of ``invenio-webhooks``. The ``{token}`` variable can be placed anywhere in the URL and is used to validate the authenticity of the webhook call. + + .. code-block:: + + https://example.com/api/hooks/receivers/github/events/?access_token={token} + +* ``id``: uniquely identifies the provider within your instance. This value is used across database models and URLs to relate data to the provider. Once it has been used, the ID must not be changed. Each provider comes with a default ID (e.g. ``github``) but this should be changed if multiple instances of the same provider are being used. + +* ``name``: the displayed name of the provider, e.g. ``GitHub``. You can, for example, set this to an institution-specific name to make it clear to users that it's not referring to the public instance. + +* ``description``: a short text explaining the role of the provider in the instance. Shown in the user's OAuth settings page. + +* ``credentials_key``: the name of the config variable specifying the OAuth Client ID and Secret for ``invenio-oauthclient``. + +* ``config``: a dictionary of custom provider-specific configuration options. + +============= +Configuration +============= + +.. automodule:: invenio_vcs.config + :members: + :exclude-members: get_provider_by_id, get_provider_list diff --git a/invenio_vcs/config.py b/invenio_vcs/config.py new file mode 100644 index 00000000..86ca5b94 --- /dev/null +++ b/invenio_vcs/config.py @@ -0,0 +1,64 @@ +# -*- coding: utf-8 -*- +# This file is part of Invenio. +# Copyright (C) 2025 CERN.
+# +# Invenio is free software; you can redistribute it and/or modify it +# under the terms of the MIT License; see LICENSE file for more details. + +"""You can use these options to configure the Invenio-VCS module. + +Other than ``VCS_PROVIDERS``, they are all optional and configured with reasonable defaults. +""" + +from typing import TYPE_CHECKING + +from flask import current_app + +if TYPE_CHECKING: + from invenio_vcs.providers import RepositoryServiceProviderFactory + +VCS_PROVIDERS = [] +"""The list of RepositoryServiceProviderFactory instances. + +These will be visible to the user in their settings and they will be able to sync repositories +from all of them. Multiple instances of different providers as well as of the same provider +can be combined in this list, but each provider must have a unique ``id`` and ``credentials_key``. +""" + +VCS_RELEASE_CLASS = "invenio_vcs.service:VCSRelease" +"""VCSRelease class to be used for release handling.""" + +VCS_TEMPLATE_INDEX = "invenio_vcs/settings/index.html" +"""Repositories list template.""" + +VCS_TEMPLATE_VIEW = "invenio_vcs/settings/view.html" +"""Repository detail view template.""" + +VCS_ERROR_HANDLERS = None +"""Definition of the way specific exceptions are handled.""" + +VCS_MAX_CONTRIBUTORS_NUMBER = 30 +"""Max number of contributors of a release to be retrieved from the VCS provider.""" + +VCS_CITATION_FILE = None +"""Citation file name.""" + +VCS_CITATION_METADATA_SCHEMA = None +"""Citation metadata schema.""" + +VCS_ZIPBALL_TIMEOUT = 300 +"""Timeout for the zipball download, in seconds.""" + + +def get_provider_list(app=current_app) -> list["RepositoryServiceProviderFactory"]: + """Get a list of configured VCS provider factories.""" + return app.config["VCS_PROVIDERS"] + + +def get_provider_by_id(id: str) -> "RepositoryServiceProviderFactory": + """Get a specific VCS provider by its registered ID.""" + providers = get_provider_list() + for provider in providers: + if id == provider.id: + return provider + raise
Exception(f"VCS provider with ID {id} not registered") From 550527283ce2366e445bdaaba74cd2eac8b6f016 Mon Sep 17 00:00:00 2001 From: Pal Kerecsenyi Date: Mon, 13 Oct 2025 13:35:19 +0100 Subject: [PATCH 03/10] WIP: docs: contrib provider implementation docs --- docs/contrib/github.rst | 41 ++++++++++++++++++++++++++ docs/contrib/gitlab.rst | 64 +++++++++++++++++++++++++++++++++++++++++ docs/index.rst | 12 ++++++++ 3 files changed, 117 insertions(+) create mode 100644 docs/contrib/github.rst create mode 100644 docs/contrib/gitlab.rst diff --git a/docs/contrib/github.rst b/docs/contrib/github.rst new file mode 100644 index 00000000..8f2fa15b --- /dev/null +++ b/docs/contrib/github.rst @@ -0,0 +1,41 @@ +.. + This file is part of Invenio. + Copyright (C) 2025 CERN. + + Invenio is free software; you can redistribute it and/or modify it + under the terms of the MIT License; see LICENSE file for more details. + +GitHub +====== + +To register GitHub as a VCS provider: + +* instantiate ``invenio_vcs.contrib.github.GitHubProviderFactory`` +* add it into the ``VCS_PROVIDERS`` list +* add it into the ``OAUTHCLIENT_REMOTE_APPS`` and ``OAUTHCLIENT_REST_REMOTE_APPS`` dictionaries under the same key as the ID (``github`` by default). +* configure ``GITHUB_APP_CREDENTIALS`` with your OAuth credentials generated by GitHub. + +For more details and a full example, please see `the Usage Guide <../usage>`_. + +================== +OAuth registration +================== + +To register a GitHub VCS provider, you need to `create an OAuth app `_ in your Developer Settings. + +You can enter any details for the app, but the authorization callback URL must be of the form: + +.. code-block:: + + https://example.com/oauth/authorized/github/ + +where ``example.com`` is the hostname of your instance and ``github`` is your configured provider ID. 
+ +===================== +Special config values +===================== + +These optional values can be passed as keys of the ``config`` dictionary in the ``GitHubProviderFactory`` constructor. + +* ``shared_secret``: signing secret for the webhook payload. See the `GitHub documentation `_ for more details. Currently, one instance-wide secret is used for all webhooks. In addition, an internal per-repository access token is automatically attached to the webhook URL to validate access. +* ``insecure_ssl``: a boolean to indicate whether GitHub should accept self-signed or otherwise insecure SSL/TLS certificates when attempting to deliver the webhook. diff --git a/docs/contrib/gitlab.rst b/docs/contrib/gitlab.rst new file mode 100644 index 00000000..c85fe6a6 --- /dev/null +++ b/docs/contrib/gitlab.rst @@ -0,0 +1,64 @@ +.. + This file is part of Invenio. + Copyright (C) 2025 CERN. + + Invenio is free software; you can redistribute it and/or modify it + under the terms of the MIT License; see LICENSE file for more details. + +GitLab +====== + +To register GitLab as a VCS provider: + +* instantiate ``invenio_vcs.contrib.gitlab.GitLabProviderFactory`` +* add it into the ``VCS_PROVIDERS`` list +* add it into the ``OAUTHCLIENT_REMOTE_APPS`` and ``OAUTHCLIENT_REST_REMOTE_APPS`` dictionaries under the same key as the ID (``gitlab`` by default). +* configure ``GITLAB_APP_CREDENTIALS`` with your OAuth credentials generated by GitLab. + +For more details and a full example, please see `the Usage Guide <../usage>`_. + +============= +Compatibility +============= + +The integration has been tested to work with the following versions of GitLab: + +* 18.x +* 17.x +* 16.x + +Support for anything but the latest major version is not guaranteed. +Please refer also to the `GitLab release policy `_. + +Any modified GitLab instances running custom code are also not guaranteed to be compatible.
+You can, however, override the ``invenio_vcs.contrib.gitlab.GitLabProviderFactory`` and ``invenio_vcs.contrib.gitlab.GitLabProvider`` classes to add support for any non-standard API behaviour. + +================== +OAuth registration +================== + +To register a GitLab VCS provider, you need to `create a new Application `_ in your User Settings. +For a self-hosted GitLab instance, navigate to ``https://my-gitlab-instance.com/-/user_settings/applications``. + +Configure the app with the following settings: + +* You can use any name. +* Use a redirect URI of the form + + .. code-block:: + + https://example.com/oauth/authorized/gitlab/ + + where ``example.com`` is the hostname of your instance and ``gitlab`` is your configured provider ID. + +* Ensure 'Confidential' is checked. +* Select the ``api`` scope. Unfortunately, webhook management doesn't currently have a more narrow scope so this highly-general scope must be selected. + +===================== +Special config values +===================== + +These optional values can be passed as keys of the ``config`` dictionary in the ``GitLabProviderFactory`` constructor. + +* ``shared_secret``: signing secret for the webhook payload. See the `GitHub documentation `_ for more details. Currently, one instance-wide secret is used for all webhooks. In addition, an internal per-repository access token is automatically attached to the webhook URL to validate access. +* ``insecure_ssl``: a boolean to indicate whether GitHub should accept self-signed or otherwise insecure SSl/TLS certificates when attempting to deliver the webhook. diff --git a/docs/index.rst b/docs/index.rst index 2cdfd71e..da383404 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -37,6 +37,18 @@ Invenio-GitHub. usage +Provider Guides +--------------- + +These docs provide configuration instructions specific to each VCS provider. + +.. 
toctree:: + + contrib/github + contrib/gitlab + + + API Reference ------------- From 1a80a19b176a656a2240626b9dfb9a11e09a0469 Mon Sep 17 00:00:00 2001 From: Pal Kerecsenyi Date: Mon, 13 Oct 2025 13:35:19 +0100 Subject: [PATCH 04/10] WIP: docs: contrib provider implementation docs --- docs/contrib/gitlab.rst | 3 +-- docs/index.rst | 12 ++++++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/docs/contrib/gitlab.rst b/docs/contrib/gitlab.rst index c85fe6a6..5ac9c47b 100644 --- a/docs/contrib/gitlab.rst +++ b/docs/contrib/gitlab.rst @@ -60,5 +60,4 @@ Special config values These optional values can be passed as keys of the ``config`` dictionary in the ``GitLabProviderFactory`` constructor. -* ``shared_secret``: signing secret for the webhook payload. See the `GitHub documentation `_ for more details. Currently, one instance-wide secret is used for all webhooks. In addition, an internal per-repository access token is automatically attached to the webhook URL to validate access. -* ``insecure_ssl``: a boolean to indicate whether GitHub should accept self-signed or otherwise insecure SSl/TLS certificates when attempting to deliver the webhook. +* ``shared_validation_token``: Validation secret token for the webhook payload. See the `GitLab documentation `_ for more details. Currently, one instance-wide token is used for all webhooks. In addition, an internal per-repository access token is automatically attached to the webhook URL to validate access. diff --git a/docs/index.rst b/docs/index.rst index da383404..edf5985e 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -49,6 +49,18 @@ These docs provide configuration instructions specific to each VCS provider. +Provider Guides +--------------- + +These docs provide configuration instructions specific to each VCS provider. + +.. 
toctree:: + + contrib/github + contrib/gitlab + + + API Reference ------------- From 3a466949627c6895c45d0e21ea1d72c92d2b7785 Mon Sep 17 00:00:00 2001 From: Pal Kerecsenyi Date: Wed, 15 Oct 2025 15:08:10 +0200 Subject: [PATCH 05/10] WIP: docs: complete upgrade guide --- docs/upgrading.rst | 361 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 331 insertions(+), 30 deletions(-) diff --git a/docs/upgrading.rst b/docs/upgrading.rst index e84d85b0..8703f66f 100644 --- a/docs/upgrading.rst +++ b/docs/upgrading.rst @@ -89,40 +89,339 @@ In an SQL shell (e.g. ``psql`` for PostgreSQL), execute the following: .. code-block:: sql - BEGIN; - ALTER TABLE github_repositories RENAME TO vcs_repositories; - ALTER TABLE vcs_repositories ALTER COLUMN github_id TYPE VARCHAR(255); - ALTER TABLE vcs_repositories ALTER COLUMN github_id SET NOT NULL; - ALTER TABLE vcs_repositories RENAME github_id TO provider_id; - ALTER TABLE vcs_repositories ALTER COLUMN hook TYPE VARCHAR(255); - ALTER TABLE vcs_repositories ALTER COLUMN hook DROP NOT NULL; - ALTER TABLE vcs_repositories ADD COLUMN provider VARCHAR(255) DEFAULT 'github' NOT NULL; - ALTER TABLE vcs_repositories ADD COLUMN default_branch VARCHAR(255) DEFAULT 'master' NOT NULL; - ALTER TABLE vcs_repositories ADD COLUMN description VARCHAR(10000); - ALTER TABLE vcs_repositories ADD COLUMN html_url VARCHAR(10000); - ALTER TABLE vcs_repositories ADD COLUMN license_spdx VARCHAR(255); - ALTER TABLE vcs_repositories RENAME user_id TO enabled_by_id; - DROP INDEX ix_github_repositories_name; - DROP INDEX ix_github_repositories_github_id; - ALTER TABLE vcs_repositories ADD CONSTRAINT uq_vcs_repositories_provider_provider_id UNIQUE (provider, provider_id); - ALTER TABLE vcs_repositories ADD CONSTRAINT uq_vcs_repositories_provider_name UNIQUE (provider, name); - COMMIT; - -TODO: script to migrate ``oauthclient_remoteaccount`` JSON data. 
+ BEGIN; + CREATE TABLE vcs_repositories ( + id uuid NOT NULL, + provider_id character varying(255) NOT NULL, + name character varying(255) NOT NULL, + hook character varying(255), + enabled_by_id integer, + created timestamp without time zone NOT NULL, + updated timestamp without time zone NOT NULL, + provider character varying(255) DEFAULT 'github'::character varying NOT NULL, + default_branch character varying(255) DEFAULT 'master'::character varying NOT NULL, + description character varying(10000), + html_url character varying(10000) NOT NULL, + license_spdx character varying(255) + ); + ALTER TABLE ONLY vcs_repositories ADD CONSTRAINT pk_vcs_repositories PRIMARY KEY (id); + ALTER TABLE ONLY vcs_repositories ADD CONSTRAINT uq_vcs_repositories_provider_name UNIQUE (provider, name); + ALTER TABLE ONLY vcs_repositories ADD CONSTRAINT uq_vcs_repositories_provider_provider_id UNIQUE (provider, provider_id); + ALTER TABLE ONLY vcs_repositories ADD CONSTRAINT fk_vcs_repositories_enabled_by_id_accounts_user FOREIGN KEY (enabled_by_id) REFERENCES accounts_user(id); + + CREATE TABLE vcs_releases ( + id uuid NOT NULL, + provider_id character varying(255) NOT NULL, + tag character varying(255), + errors jsonb, + repository_id uuid, + event_id uuid, + record_id uuid, + status character(1) NOT NULL, + created timestamp without time zone NOT NULL, + updated timestamp without time zone NOT NULL, + provider character varying(255) DEFAULT 'github'::character varying NOT NULL + ); + ALTER TABLE ONLY vcs_releases ADD CONSTRAINT pk_vcs_releases PRIMARY KEY (id); + ALTER TABLE ONLY vcs_releases ADD CONSTRAINT uq_vcs_releases_provider_id_provider UNIQUE (provider_id, provider); + ALTER TABLE ONLY vcs_releases ADD CONSTRAINT uq_vcs_releases_provider_id_provider_tag UNIQUE (provider_id, provider, tag); + CREATE INDEX ix_vcs_releases_record_id ON vcs_releases USING btree (record_id); + ALTER TABLE ONLY vcs_releases ADD CONSTRAINT fk_vcs_releases_event_id_webhooks_events FOREIGN KEY 
(event_id) REFERENCES webhooks_events(id); + ALTER TABLE ONLY vcs_releases ADD CONSTRAINT fk_vcs_releases_repository_id_vcs_repositories FOREIGN KEY (repository_id) REFERENCES vcs_repositories(id); + COMMIT; + +Next, you must perform some manual data migrations: + +* The ``oauthclient_remoteaccount`` table stores the user's *entire* GitHub repository list as a dictionary within the ``extra_data`` column. + Before the upgrade, the format is as follows: + + .. code-block:: json + + { + "last_sync":"2025-10-15T12:30:01.027133+00:00", + "repos": { + "123": { + "id": "123", + "full_name": "org/repo", + "description": "An example repository", + "default_branch": "main" + } + } + } + + In the new format, we no longer store repos in this JSON column. This is an inefficient approach that systems with hundreds of thousands + of repos have outgrown. + Previously, only *activated* repos were stored in the ``github_repositories`` table. + Now, *all* repos are stored directly as rows of the ``vcs_repositories`` table. + Whether or not they're activated is indicated by the presence of non-null values for the ``hook`` and ``enabled_by_id`` columns. + + You must perform this migration, leaving only the ``"last_sync"`` value in the ``extra_data`` JSON column. + Not all columns of the ``vcs_repositories`` table need to be filled during the migration. + The following columns can be left blank and will be filled during the first sync after the migration: + + * ``description`` + * ``license_spdx`` + + The value for ``provider`` defaults to ``github``. + +* The ``github_repositories`` table needs to be copied over to the ``vcs_repositories`` table, taking into account the changed columns. + This should be merged with the new repos created by copying data from ``oauthclient_remoteaccount`` as shown above. + +* The ``github_releases`` table needs to be copied over to the ``vcs_releases`` table. 
+ This *cannot* be done automatically during a sync and needs to be done manually during the migration. + +You can use this script as a starting point for automating these changes, but it may need some customisation depending on the size +and uptime requirements of your instance. + +.. raw:: html + +
+ Example script + +This script is non-destructive and atomic and should work for the majority of use cases, but may need slight customisation. +You can set the database connection string via the ``UPGRADE_DB`` environment variable. + +.. code-block:: python + + import os + import uuid + from datetime import datetime, timezone + + import sqlalchemy as sa + from sqlalchemy.dialects import postgresql + from sqlalchemy.ext.mutable import MutableDict + from sqlalchemy.orm import Session + from sqlalchemy_utils import JSONType, UUIDType + from tqdm import tqdm + + engine = sa.create_engine(os.getenv("UPGRADE_DB"), echo=False) + + # Lightweight models for all of the tables (incl old and new versions) + remote_account_table = sa.table( + "oauthclient_remoteaccount", + sa.Column("id", sa.Integer, primary_key=True), + sa.Column("user_id", sa.Integer, sa.ForeignKey("account_user.id")), + sa.Column("client_id", sa.String(255)), + # We may have changed this if we merge https://github.com/inveniosoftware/invenio-oauthclient/pull/360 + # but we're only reading this column so it shouldn't make a difference. 
+ sa.Column("extra_data", MutableDict.as_mutable(JSONType)), + ) + github_repositories_table = sa.table( + "github_repositories", + sa.Column("id", UUIDType, primary_key=True), + sa.Column("github_id", sa.String(255), nullable=True), + sa.Column("name", sa.String(255), nullable=False), + sa.Column("hook", sa.Integer, nullable=True), + sa.Column("user_id", sa.Integer, sa.ForeignKey("account_user.id"), nullable=True), + sa.Column("created", sa.DateTime, nullable=False), + sa.Column("updated", sa.DateTime, nullable=False), + ) + vcs_repositories_table = sa.table( + "vcs_repositories", + sa.Column("id", UUIDType, primary_key=True), + sa.Column("provider_id", sa.String(255), nullable=True), + sa.Column("provider", sa.String(255), nullable=True), + sa.Column("description", sa.String(10000), nullable=True), + sa.Column("html_url", sa.String(10000), nullable=False), + sa.Column("license_spdx", sa.String(255), nullable=True), + sa.Column("default_branch", sa.String(255), nullable=False), + sa.Column("name", sa.String(255), nullable=False), + sa.Column("hook", sa.String(255), nullable=True), + sa.Column( + "enabled_by_id", sa.Integer, sa.ForeignKey("account_user.id"), nullable=True + ), + sa.Column("created", sa.DateTime, nullable=False), + sa.Column("updated", sa.DateTime, nullable=False), + ) + github_releases_table = sa.table( + "github_releases", + sa.Column("id", UUIDType, primary_key=True), + sa.Column("release_id", sa.Integer, primary_key=True), + sa.Column("tag", sa.String(255), nullable=True), + sa.Column("errors", MutableDict.as_mutable(JSONType), nullable=True), + sa.Column( + "repository_id", + UUIDType, + sa.ForeignKey("github_repositories.id"), + nullable=True, + ), + sa.Column("event_id", UUIDType, sa.ForeignKey("webhooks_events.id"), nullable=True), + sa.Column("record_id", UUIDType, nullable=True), + sa.Column("status", sa.CHAR(1), nullable=False), + sa.Column("created", sa.DateTime, nullable=False), + sa.Column("updated", sa.DateTime, nullable=False), + ) + 
vcs_releases_table = sa.table( + "vcs_releases", + sa.Column("id", UUIDType, primary_key=True), + sa.Column("provider_id", sa.String(255), nullable=True), + sa.Column("provider", sa.String(255), nullable=True), + sa.Column("tag", sa.String(255), nullable=True), + sa.Column( + "errors", + MutableDict.as_mutable( + sa.JSON() + .with_variant(postgresql.JSONB(), "postgresql") + .with_variant(JSONType(), "sqlite") + .with_variant(JSONType(), "mysql") + ), + nullable=True, + ), + sa.Column( + "repository_id", + UUIDType, + sa.ForeignKey("vcs_repositories.id"), + nullable=True, + ), + sa.Column("event_id", UUIDType, sa.ForeignKey("webhooks_events.id"), nullable=True), + sa.Column("record_id", UUIDType, nullable=True), + sa.Column("status", sa.CHAR(1), nullable=False), + sa.Column("created", sa.DateTime, nullable=False), + sa.Column("updated", sa.DateTime, nullable=False), + ) + + with Session(engine) as session: + + # First, we move the JSON repos from oauthclient_remoteaccount to the new vcs_repositories table + + # We don't know the client ID as this is a config variable. + # So to find the RemoteAccounts that correspond to GitHub, we need to check for the existence + # of the `repos` key in the `extra_data` JSON. We cannot make this very efficient sadly, because + # (a) in Postgres we are using JSON not JSONB so there is no efficient JSON querying and (b) the + # instance might be using MySQL/SQLite where we store it as `TEXT`. 
+ + remote_accounts = session.execute(sa.select(remote_account_table)) + for remote_account in tqdm(remote_accounts.mappings(), desc="remote_account"): + if "repos" not in remote_account["extra_data"]: + continue + + repos = remote_account["extra_data"]["repos"] + + for id, github_repo in repos.items(): + # `id` (the dict key) is a string because JSON keys must be strings + + # We might have already created it for another user + matching_db_repo_id = session.scalar( + sa.select(vcs_repositories_table).filter_by(provider_id=id) + ) + + if matching_db_repo_id is None: + # We are now storing _all_ repositories (even non-enabled ones) in the DB. + # The repo-user association will be created on the first sync after this migration, we need to download + # the list of users with access to the repo from the GitHub API. + session.execute( + vcs_repositories_table.insert().values( + id=uuid.uuid4(), + provider_id=id, + provider="github", + description=github_repo["description"], + name=github_repo["full_name"], + default_branch=github_repo["default_branch"], + # So far we have only supported github.com so we can safely assume the URL + html_url=f'https://github.com/{github_repo["full_name"]}', + # We have never stored this, it is queried at runtime right now. When the first + # sync happens after this migration, we will download all the license IDs from the VCS. 
+ license_spdx=None, + # This repo wasn't enabled + hook=None, + enabled_by_id=None, + created=datetime.now(tz=timezone.utc), + updated=datetime.now(tz=timezone.utc), + ) + ) + else: + session.execute( + vcs_repositories_table.update() + .filter_by(id=matching_db_repo_id) + .values( + description=github_repo["description"], + name=github_repo["full_name"], + default_branch=github_repo["default_branch"], + html_url=f'https://github.com/{github_repo["full_name"]}', + updated=datetime.now(tz=timezone.utc), + ) + ) + + # Remove `repos` from the existing `extra_data`, leaving only the last sync timestamp + session.execute( + remote_account_table.update() + .filter_by(id=remote_account["id"]) + .values(extra_data={"last_sync": remote_account["extra_data"]["last_sync"]}) + ) + + # Next, we move over any old rows from github_repositories that weren't attached to any user (for whatever reason). + old_db_repos = session.execute(sa.select(github_repositories_table)) + for old_db_repo in tqdm(old_db_repos.mappings(), desc="repos"): + matching_new_repo_id = session.scalar( + sa.select( + vcs_repositories_table.c.id, + ).filter_by(provider_id=str(old_db_repo["github_id"])) + ) + + if matching_new_repo_id is None: + # We only have very limited metadata available at this point. + # The first sync job after this migration will fill in the rest. 
+ session.execute( + vcs_repositories_table.insert().values( + id=old_db_repo["id"], + provider_id=str(old_db_repo["github_id"]), + provider="github", + name=old_db_repo["name"], + default_branch="main", + html_url=f"https://github.com/{old_db_repo["name"]}", + license_spdx=None, + hook=old_db_repo["hook"], + enabled_by_id=old_db_repo["user_id"], + created=old_db_repo["created"], + updated=datetime.now(tz=timezone.utc), + ) + ) + else: + session.execute( + vcs_repositories_table.update() + .filter_by(id=matching_new_repo_id) + .values( + id=old_db_repo["id"], + hook=str(old_db_repo["hook"]), + enabled_by_id=old_db_repo["user_id"], + created=old_db_repo["created"], + ) + ) + + # Finally, we copy over the releases + old_db_releases = session.execute(sa.select(github_releases_table)) + for old_db_release in tqdm(old_db_releases.mappings(), desc="releases"): + # Since we've created all the repos, we know due to referential integrity that this release's repo ID corresponds + # to a valid and existent repo. + + session.execute( + vcs_releases_table.insert().values( + id=old_db_release["id"], + provider_id=str(old_db_release["release_id"]), + provider="github", + tag=old_db_release["tag"], + errors=old_db_release["errors"], + repository_id=old_db_release["repository_id"], + event_id=old_db_release["event_id"], + record_id=old_db_release["record_id"], + status=old_db_release["status"], + created=old_db_release["created"], + updated=datetime.now(tz=timezone.utc), + ) + ) + + session.commit() + +.. raw:: html + +
+ +Finally, once you are certain that all data has been copied over to the new tables, you can finish the migration +by running the following SQL commands: .. code-block:: sql BEGIN; - ALTER TABLE vcs_repositories ALTER COLUMN html_url SET NOT NULL; - ALTER TABLE github_releases RENAME TO vcs_releases; - ALTER TABLE vcs_releases ALTER COLUMN release_id TYPE VARCHAR(255); - ALTER TABLE vcs_releases ALTER COLUMN release_id SET NOT NULL; - ALTER TABLE vcs_releases RENAME release_id TO provider_id; - ALTER TABLE vcs_releases ADD COLUMN provider VARCHAR(255) DEFAULT 'github' NOT NULL; - ALTER TABLE vcs_releases ALTER COLUMN errors TYPE JSONB USING errors::text::jsonb; - ALTER TABLE vcs_releases DROP CONSTRAINT uq_github_releases_release_id; - ALTER TABLE vcs_releases ADD CONSTRAINT uq_vcs_releases_provider_id_provider UNIQUE (provider_id, provider); - ALTER TABLE vcs_releases ADD CONSTRAINT uq_vcs_releases_provider_id_provider_tag UNIQUE (provider_id, provider, tag); CREATE TABLE vcs_repository_users ( repository_id UUID NOT NULL, user_id INTEGER NOT NULL, @@ -130,4 +429,6 @@ TODO: script to migrate ``oauthclient_remoteaccount`` JSON data. 
CONSTRAINT fk_vcs_repository_users_repository_id_vcs_repositories FOREIGN KEY(repository_id) REFERENCES vcs_repositories (id), CONSTRAINT fk_vcs_repository_users_user_id_accounts_user FOREIGN KEY(user_id) REFERENCES accounts_user (id) ); + DROP TABLE github_repositories; + DROP TABLE github_releases; COMMIT; From 1da63e9752082e1104eb20b65d2d2bbf40420244 Mon Sep 17 00:00:00 2001 From: Pal Kerecsenyi Date: Wed, 15 Oct 2025 15:33:22 +0200 Subject: [PATCH 06/10] WIP: chore: license --- .editorconfig | 23 +++-------------------- .travis.yml | 23 +++-------------------- AUTHORS.rst | 23 +++-------------------- CHANGES.rst | 23 +++-------------------- MANIFEST.in | 23 +++-------------------- README.rst | 24 ++++-------------------- docs/conf.py | 24 +++--------------------- requirements-devel.txt | 23 +++-------------------- setup.cfg | 23 +++-------------------- setup.py | 25 ++++--------------------- 10 files changed, 32 insertions(+), 202 deletions(-) diff --git a/.editorconfig b/.editorconfig index 32f4e1e1..fb10ef01 100644 --- a/.editorconfig +++ b/.editorconfig @@ -1,26 +1,9 @@ # -*- coding: utf-8 -*- -# # This file is part of Invenio. -# Copyright (C) 2016 CERN. -# -# Invenio is free software; you can redistribute it -# and/or modify it under the terms of the GNU General Public License as -# published by the Free Software Foundation; either version 2 of the -# License, or (at your option) any later version. -# -# Invenio is distributed in the hope that it will be -# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with Invenio; if not, write to the -# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, -# MA 02111-1307, USA. +# Copyright (C) 2025 CERN. 
# -# In applying this license, CERN does not -# waive the privileges and immunities granted to it by virtue of its status -# as an Intergovernmental Organization or submit itself to any jurisdiction. +# Invenio is free software; you can redistribute it and/or modify it +# under the terms of the MIT License; see LICENSE file for more details. root = true diff --git a/.travis.yml b/.travis.yml index ed1ed703..264d2f55 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,26 +1,9 @@ # -*- coding: utf-8 -*- -# # This file is part of Invenio. -# Copyright (C) 2016 CERN. -# -# Invenio is free software; you can redistribute it -# and/or modify it under the terms of the GNU General Public License as -# published by the Free Software Foundation; either version 2 of the -# License, or (at your option) any later version. -# -# Invenio is distributed in the hope that it will be -# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with Invenio; if not, write to the -# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, -# MA 02111-1307, USA. +# Copyright (C) 2025 CERN. # -# In applying this license, CERN does not -# waive the privileges and immunities granted to it by virtue of its status -# as an Intergovernmental Organization or submit itself to any jurisdiction. +# Invenio is free software; you can redistribute it and/or modify it +# under the terms of the MIT License; see LICENSE file for more details. notifications: diff --git a/AUTHORS.rst b/AUTHORS.rst index 60529a07..e3dd1992 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -1,26 +1,9 @@ .. This file is part of Invenio. - Copyright (C) 2016 CERN. 
- - Invenio is free software; you can redistribute it - and/or modify it under the terms of the GNU General Public License as - published by the Free Software Foundation; either version 2 of the - License, or (at your option) any later version. - - Invenio is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with Invenio; if not, write to the - Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, - MA 02111-1307, USA. - - In applying this license, CERN does not - waive the privileges and immunities granted to it by virtue of its status - as an Intergovernmental Organization or submit itself to any jurisdiction. + Copyright (C) 2025 CERN. + Invenio is free software; you can redistribute it and/or modify it + under the terms of the MIT License; see LICENSE file for more details. Authors ======= diff --git a/CHANGES.rst b/CHANGES.rst index 18c8819e..1f6caed1 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,27 +1,10 @@ .. This file is part of Invenio. - Copyright (C) 2016-2024 CERN. + Copyright (C) 2016-2025 CERN. Copyright (C) 2024-2025 Graz University of Technology. - Invenio is free software; you can redistribute it - and/or modify it under the terms of the GNU General Public License as - published by the Free Software Foundation; either version 2 of the - License, or (at your option) any later version. - - Invenio is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. 
- - You should have received a copy of the GNU General Public License - along with Invenio; if not, write to the - Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, - MA 02111-1307, USA. - - In applying this license, CERN does not - waive the privileges and immunities granted to it by virtue of its status - as an Intergovernmental Organization or submit itself to any jurisdiction. - + Invenio is free software; you can redistribute it and/or modify it + under the terms of the MIT License; see LICENSE file for more details. Changes ======= diff --git a/MANIFEST.in b/MANIFEST.in index d699aee6..d65e9b13 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,26 +1,9 @@ # -*- coding: utf-8 -*- -# # This file is part of Invenio. -# Copyright (C) 2023 CERN. -# -# Invenio is free software; you can redistribute it -# and/or modify it under the terms of the GNU General Public License as -# published by the Free Software Foundation; either version 2 of the -# License, or (at your option) any later version. -# -# Invenio is distributed in the hope that it will be -# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with Invenio; if not, write to the -# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, -# MA 02111-1307, USA. +# Copyright (C) 2025 CERN. # -# In applying this license, CERN does not -# waive the privileges and immunities granted to it by virtue of its status -# as an Intergovernmental Organization or submit itself to any jurisdiction. +# Invenio is free software; you can redistribute it and/or modify it +# under the terms of the MIT License; see LICENSE file for more details. exclude .dockerignore diff --git a/README.rst b/README.rst index f993df5a..1509ae49 100644 --- a/README.rst +++ b/README.rst @@ -1,25 +1,9 @@ .. 
This file is part of Invenio. - Copyright (C) 2023 CERN. - - Invenio is free software; you can redistribute it - and/or modify it under the terms of the GNU General Public License as - published by the Free Software Foundation; either version 2 of the - License, or (at your option) any later version. - - Invenio is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with Invenio; if not, write to the - Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, - MA 02111-1307, USA. - - In applying this license, CERN does not - waive the privileges and immunities granted to it by virtue of its status - as an Intergovernmental Organization or submit itself to any jurisdiction. + Copyright (C) 2025 CERN. + + Invenio is free software; you can redistribute it and/or modify it + under the terms of the MIT License; see LICENSE file for more details. ================ Invenio-GitHub diff --git a/docs/conf.py b/docs/conf.py index 75c64ae5..c279174e 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,27 +1,9 @@ # -*- coding: utf-8 -*- -# # This file is part of Invenio. -# Copyright (C) 2016 CERN. -# Copyright (C) 2023 Graz University of Technology. -# -# Invenio is free software; you can redistribute it -# and/or modify it under the terms of the GNU General Public License as -# published by the Free Software Foundation; either version 2 of the -# License, or (at your option) any later version. -# -# Invenio is distributed in the hope that it will be -# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. 
-# -# You should have received a copy of the GNU General Public License -# along with Invenio; if not, write to the -# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, -# MA 02111-1307, USA. +# Copyright (C) 2025 CERN. # -# In applying this license, CERN does not -# waive the privileges and immunities granted to it by virtue of its status -# as an Intergovernmental Organization or submit itself to any jurisdiction. +# Invenio is free software; you can redistribute it and/or modify it +# under the terms of the MIT License; see LICENSE file for more details. """Sphinx configuration.""" diff --git a/requirements-devel.txt b/requirements-devel.txt index 58f6e0db..702b0ae9 100644 --- a/requirements-devel.txt +++ b/requirements-devel.txt @@ -1,23 +1,6 @@ # -*- coding: utf-8 -*- -# # This file is part of Invenio. -# Copyright (C) 2016 CERN. -# -# Invenio is free software; you can redistribute it -# and/or modify it under the terms of the GNU General Public License as -# published by the Free Software Foundation; either version 2 of the -# License, or (at your option) any later version. -# -# Invenio is distributed in the hope that it will be -# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with Invenio; if not, write to the -# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, -# MA 02111-1307, USA. +# Copyright (C) 2025 CERN. # -# In applying this license, CERN does not -# waive the privileges and immunities granted to it by virtue of its status -# as an Intergovernmental Organization or submit itself to any jurisdiction. +# Invenio is free software; you can redistribute it and/or modify it +# under the terms of the MIT License; see LICENSE file for more details. 
diff --git a/setup.cfg b/setup.cfg index d51fba95..2450b8ba 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,27 +1,10 @@ # -*- coding: utf-8 -*- -# # This file is part of Invenio. -# Copyright (C) 2023-2025 CERN. +# Copyright (C) 2025 CERN. # Copyright (C) 2023-2025 Graz University of Technology. # -# Invenio is free software; you can redistribute it -# and/or modify it under the terms of the GNU General Public License as -# published by the Free Software Foundation; either version 2 of the -# License, or (at your option) any later version. -# -# Invenio is distributed in the hope that it will be -# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with Invenio; if not, write to the -# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, -# MA 02111-1307, USA. -# -# In applying this license, CERN does not -# waive the privileges and immunities granted to it by virtue of its status -# as an Intergovernmental Organization or submit itself to any jurisdiction. +# Invenio is free software; you can redistribute it and/or modify it +# under the terms of the MIT License; see LICENSE file for more details. [metadata] name = invenio-github diff --git a/setup.py b/setup.py index 2b4d4968..b3053b2d 100644 --- a/setup.py +++ b/setup.py @@ -1,27 +1,10 @@ # -*- coding: utf-8 -*- -# # This file is part of Invenio. -# Copyright (C) 2023 CERN. -# Copyright (C) 2023 Graz University of Technology. -# -# Invenio is free software; you can redistribute it -# and/or modify it under the terms of the GNU General Public License as -# published by the Free Software Foundation; either version 2 of the -# License, or (at your option) any later version. 
-# -# Invenio is distributed in the hope that it will be -# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with Invenio; if not, write to the -# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, -# MA 02111-1307, USA. +# Copyright (C) 2025 CERN. +# Copyright (C) 2023-2025 Graz University of Technology. # -# In applying this license, CERN does not -# waive the privileges and immunities granted to it by virtue of its status -# as an Intergovernmental Organization or submit itself to any jurisdiction. +# Invenio is free software; you can redistribute it and/or modify it +# under the terms of the MIT License; see LICENSE file for more details. """Invenio module that adds GitHub integration to the platform.""" From 4141cf182798398b8378209429b1671ca95165ae Mon Sep 17 00:00:00 2001 From: Pal Kerecsenyi Date: Wed, 15 Oct 2025 15:08:10 +0200 Subject: [PATCH 07/10] WIP: docs: complete upgrade guide --- docs/upgrading.rst | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docs/upgrading.rst b/docs/upgrading.rst index 8703f66f..5b1dac20 100644 --- a/docs/upgrading.rst +++ b/docs/upgrading.rst @@ -432,3 +432,15 @@ by running the following SQL commands: DROP TABLE github_repositories; DROP TABLE github_releases; COMMIT; + +Mark the relevant migration as having been manually performed: + +.. code-block:: bash + + invenio alembic stamp invenio_github@1754318294 + +.. note:: + + The Alembic branch name ``invenio_github`` is unchanged despite all the other renamed references. + Changing the name of an Alembic branch is not supported and would introduce too many bugs to make it + worthwhile. 
From ec28eb903318fde5a6b450e0e4282f2b76de5b1b Mon Sep 17 00:00:00 2001 From: Pal Kerecsenyi Date: Wed, 22 Oct 2025 14:25:24 +0200 Subject: [PATCH 08/10] WIP: docs: renamed enabled_by_id -> enabled_by_user_id --- docs/upgrading.rst | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/upgrading.rst b/docs/upgrading.rst index 5b1dac20..f269806e 100644 --- a/docs/upgrading.rst +++ b/docs/upgrading.rst @@ -95,7 +95,7 @@ In an SQL shell (e.g. ``psql`` for PostgreSQL), execute the following: provider_id character varying(255) NOT NULL, name character varying(255) NOT NULL, hook character varying(255), - enabled_by_id integer, + enabled_by_user_id integer, created timestamp without time zone NOT NULL, updated timestamp without time zone NOT NULL, provider character varying(255) DEFAULT 'github'::character varying NOT NULL, @@ -107,7 +107,7 @@ In an SQL shell (e.g. ``psql`` for PostgreSQL), execute the following: ALTER TABLE ONLY vcs_repositories ADD CONSTRAINT pk_vcs_repositories PRIMARY KEY (id); ALTER TABLE ONLY vcs_repositories ADD CONSTRAINT uq_vcs_repositories_provider_name UNIQUE (provider, name); ALTER TABLE ONLY vcs_repositories ADD CONSTRAINT uq_vcs_repositories_provider_provider_id UNIQUE (provider, provider_id); - ALTER TABLE ONLY vcs_repositories ADD CONSTRAINT fk_vcs_repositories_enabled_by_id_accounts_user FOREIGN KEY (enabled_by_id) REFERENCES accounts_user(id); + ALTER TABLE ONLY vcs_repositories ADD CONSTRAINT fk_vcs_repositories_enabled_by_user_id_accounts_user FOREIGN KEY (enabled_by_user_id) REFERENCES accounts_user(id); CREATE TABLE vcs_releases ( id uuid NOT NULL, @@ -153,7 +153,7 @@ Next, you must perform some manual data migrations: of repos have outgrown. Previously, only *activated* repos were stored in the ``github_repositories`` table. Now, *all* repos are stored directly as rows of the ``vcs_repositories`` table. 
- Whether or not they're activated is indicated by the presence of non-null values for the ``hook`` and ``enabled_by_id`` columns. + Whether or not they're activated is indicated by the presence of non-null values for the ``hook`` and ``enabled_by_user_id`` columns. You must perform this migration, leaving only the ``"last_sync"`` value in the ``extra_data`` JSON column. Not all columns of the ``vcs_repositories`` table need to be filled during the migration. @@ -228,7 +228,7 @@ You can set the database connection string via the ``UPGRADE_DB`` environment va sa.Column("name", sa.String(255), nullable=False), sa.Column("hook", sa.String(255), nullable=True), sa.Column( - "enabled_by_id", sa.Integer, sa.ForeignKey("account_user.id"), nullable=True + "enabled_by_user_id", sa.Integer, sa.ForeignKey("account_user.id"), nullable=True ), sa.Column("created", sa.DateTime, nullable=False), sa.Column("updated", sa.DateTime, nullable=False), @@ -324,7 +324,7 @@ You can set the database connection string via the ``UPGRADE_DB`` environment va license_spdx=None, # This repo wasn't enabled hook=None, - enabled_by_id=None, + enabled_by_user_id=None, created=datetime.now(tz=timezone.utc), updated=datetime.now(tz=timezone.utc), ) @@ -371,7 +371,7 @@ You can set the database connection string via the ``UPGRADE_DB`` environment va html_url=f"https://github.com/{old_db_repo["name"]}", license_spdx=None, hook=old_db_repo["hook"], - enabled_by_id=old_db_repo["user_id"], + enabled_by_user_id=old_db_repo["user_id"], created=old_db_repo["created"], updated=datetime.now(tz=timezone.utc), ) @@ -383,7 +383,7 @@ You can set the database connection string via the ``UPGRADE_DB`` environment va .values( id=old_db_repo["id"], hook=str(old_db_repo["hook"]), - enabled_by_id=old_db_repo["user_id"], + enabled_by_user_id=old_db_repo["user_id"], created=old_db_repo["created"], ) ) From 662f56f658ed200c403ce7b50aab9f5eb0976d6e Mon Sep 17 00:00:00 2001 From: Pal Kerecsenyi Date: Thu, 23 Oct 2025 11:28:16 
+0200 Subject: [PATCH 09/10] WIP: docs: remove html_url --- docs/upgrading.rst | 6 ------ 1 file changed, 6 deletions(-) diff --git a/docs/upgrading.rst b/docs/upgrading.rst index f269806e..fe5e5735 100644 --- a/docs/upgrading.rst +++ b/docs/upgrading.rst @@ -101,7 +101,6 @@ In an SQL shell (e.g. ``psql`` for PostgreSQL), execute the following: provider character varying(255) DEFAULT 'github'::character varying NOT NULL, default_branch character varying(255) DEFAULT 'master'::character varying NOT NULL, description character varying(10000), - html_url character varying(10000) NOT NULL, license_spdx character varying(255) ); ALTER TABLE ONLY vcs_repositories ADD CONSTRAINT pk_vcs_repositories PRIMARY KEY (id); @@ -222,7 +221,6 @@ You can set the database connection string via the ``UPGRADE_DB`` environment va sa.Column("provider_id", sa.String(255), nullable=True), sa.Column("provider", sa.String(255), nullable=True), sa.Column("description", sa.String(10000), nullable=True), - sa.Column("html_url", sa.String(10000), nullable=False), sa.Column("license_spdx", sa.String(255), nullable=True), sa.Column("default_branch", sa.String(255), nullable=False), sa.Column("name", sa.String(255), nullable=False), @@ -317,8 +315,6 @@ You can set the database connection string via the ``UPGRADE_DB`` environment va description=github_repo["description"], name=github_repo["full_name"], default_branch=github_repo["default_branch"], - # So far we have only supported github.com so we can safely assume the URL - html_url=f'https://github.com/{github_repo["full_name"]}', # We have never stored this, it is queried at runtime right now. When the first # sync happens after this migration, we will download all the license IDs from the VCS. 
license_spdx=None, @@ -337,7 +333,6 @@ You can set the database connection string via the ``UPGRADE_DB`` environment va description=github_repo["description"], name=github_repo["full_name"], default_branch=github_repo["default_branch"], - html_url=f'https://github.com/{github_repo["full_name"]}', updated=datetime.now(tz=timezone.utc), ) ) @@ -368,7 +363,6 @@ You can set the database connection string via the ``UPGRADE_DB`` environment va provider="github", name=old_db_repo["name"], default_branch="main", - html_url=f"https://github.com/{old_db_repo["name"]}", license_spdx=None, hook=old_db_repo["hook"], enabled_by_user_id=old_db_repo["user_id"], From e5545e5dd920bb43d94027b8b550d3101fed36b5 Mon Sep 17 00:00:00 2001 From: Pal Kerecsenyi Date: Thu, 23 Oct 2025 16:19:00 +0200 Subject: [PATCH 10/10] WIP: docs: provisionally remove upgrade script --- docs/upgrading.rst | 223 --------------------------------------------- 1 file changed, 223 deletions(-) diff --git a/docs/upgrading.rst b/docs/upgrading.rst index fe5e5735..a9b2d6ad 100644 --- a/docs/upgrading.rst +++ b/docs/upgrading.rst @@ -182,229 +182,6 @@ You can set the database connection string via the ``UPGRADE_DB`` environment va .. 
code-block:: python - import os - import uuid - from datetime import datetime, timezone - - import sqlalchemy as sa - from sqlalchemy.dialects import postgresql - from sqlalchemy.ext.mutable import MutableDict - from sqlalchemy.orm import Session - from sqlalchemy_utils import JSONType, UUIDType - from tqdm import tqdm - - engine = sa.create_engine(os.getenv("UPGRADE_DB"), echo=False) - - # Lightweight models for all of the tables (incl old and new versions) - remote_account_table = sa.table( - "oauthclient_remoteaccount", - sa.Column("id", sa.Integer, primary_key=True), - sa.Column("user_id", sa.Integer, sa.ForeignKey("account_user.id")), - sa.Column("client_id", sa.String(255)), - # We may have changed this if we merge https://github.com/inveniosoftware/invenio-oauthclient/pull/360 - # but we're only reading this column so it shouldn't make a difference. - sa.Column("extra_data", MutableDict.as_mutable(JSONType)), - ) - github_repositories_table = sa.table( - "github_repositories", - sa.Column("id", UUIDType, primary_key=True), - sa.Column("github_id", sa.String(255), nullable=True), - sa.Column("name", sa.String(255), nullable=False), - sa.Column("hook", sa.Integer, nullable=True), - sa.Column("user_id", sa.Integer, sa.ForeignKey("account_user.id"), nullable=True), - sa.Column("created", sa.DateTime, nullable=False), - sa.Column("updated", sa.DateTime, nullable=False), - ) - vcs_repositories_table = sa.table( - "vcs_repositories", - sa.Column("id", UUIDType, primary_key=True), - sa.Column("provider_id", sa.String(255), nullable=True), - sa.Column("provider", sa.String(255), nullable=True), - sa.Column("description", sa.String(10000), nullable=True), - sa.Column("license_spdx", sa.String(255), nullable=True), - sa.Column("default_branch", sa.String(255), nullable=False), - sa.Column("name", sa.String(255), nullable=False), - sa.Column("hook", sa.String(255), nullable=True), - sa.Column( - "enabled_by_user_id", sa.Integer, sa.ForeignKey("account_user.id"), 
nullable=True - ), - sa.Column("created", sa.DateTime, nullable=False), - sa.Column("updated", sa.DateTime, nullable=False), - ) - github_releases_table = sa.table( - "github_releases", - sa.Column("id", UUIDType, primary_key=True), - sa.Column("release_id", sa.Integer, primary_key=True), - sa.Column("tag", sa.String(255), nullable=True), - sa.Column("errors", MutableDict.as_mutable(JSONType), nullable=True), - sa.Column( - "repository_id", - UUIDType, - sa.ForeignKey("github_repositories.id"), - nullable=True, - ), - sa.Column("event_id", UUIDType, sa.ForeignKey("webhooks_events.id"), nullable=True), - sa.Column("record_id", UUIDType, nullable=True), - sa.Column("status", sa.CHAR(1), nullable=False), - sa.Column("created", sa.DateTime, nullable=False), - sa.Column("updated", sa.DateTime, nullable=False), - ) - vcs_releases_table = sa.table( - "vcs_repositories", - sa.Column("id", UUIDType, primary_key=True), - sa.Column("provider_id", sa.String(255), nullable=True), - sa.Column("provider", sa.String(255), nullable=True), - sa.Column("tag", sa.String(255), nullable=True), - sa.Column( - "errors", - MutableDict.as_mutable( - sa.JSON() - .with_variant(postgresql.JSONB(), "postgresql") - .with_variant(JSONType(), "sqlite") - .with_variant(JSONType(), "mysql") - ), - nullable=True, - ), - sa.Column( - "repository_id", - UUIDType, - sa.ForeignKey("vcs_repositories.id"), - nullable=True, - ), - sa.Column("event_id", UUIDType, sa.ForeignKey("webhooks_events.id"), nullable=True), - sa.Column("record_id", UUIDType, nullable=True), - sa.Column("status", sa.CHAR(1), nullable=False), - sa.Column("created", sa.DateTime, nullable=False), - sa.Column("updated", sa.DateTime, nullable=False), - ) - - with Session(engine) as session: - - # First, we move the JSON repos from oauthclient_remoteaccount to the new vcs_repositories table - - # We don't know the client ID as this is a config variable. 
- # So to find the RemoteAccounts that correspond to GitHub, we need to check for the existence - # of the `repos` key in the `extra_data` JSON. We cannot make this very efficient sadly, because - # (a) in Postgres we are using JSON not JSONB so there is no efficient JSON querying and (b) the - # instance might be using MySQL/SQLite where we store it as `TEXT`. - - remote_accounts = session.execute(sa.select(remote_account_table)) - for remote_account in tqdm(remote_accounts.mappings(), desc="remote_account"): - if "repos" not in remote_account["extra_data"]: - continue - - repos = remote_account["extra_data"]["repos"] - - for id, github_repo in repos.items(): - # `id` (the dict key) is a string because JSON keys must be strings - - # We might have already created it for another user - matching_db_repo_id = session.scalar( - sa.select(vcs_repositories_table).filter_by(provider_id=id) - ) - - if matching_db_repo_id is None: - # We are now storing _all_ repositories (even non-enabled ones) in the DB. - # The repo-user association will be created on the first sync after this migration, we need to download - # the list of users with access to the repo from the GitHub API. - session.execute( - vcs_repositories_table.insert().values( - id=uuid.uuid4(), - provider_id=id, - provider="github", - description=github_repo["description"], - name=github_repo["full_name"], - default_branch=github_repo["default_branch"], - # We have never stored this, it is queried at runtime right now. When the first - # sync happens after this migration, we will download all the license IDs from the VCS. 
- license_spdx=None, - # This repo wasn't enabled - hook=None, - enabled_by_user_id=None, - created=datetime.now(tz=timezone.utc), - updated=datetime.now(tz=timezone.utc), - ) - ) - else: - session.execute( - vcs_repositories_table.update() - .filter_by(id=matching_db_repo_id) - .values( - description=github_repo["description"], - name=github_repo["full_name"], - default_branch=github_repo["default_branch"], - updated=datetime.now(tz=timezone.utc), - ) - ) - - # Remove `repos` from the existing `extra_data`, leaving only the last sync timestamp - session.execute( - remote_account_table.update() - .filter_by(id=remote_account["id"]) - .values(extra_data={"last_sync": remote_account["extra_data"]["last_sync"]}) - ) - - # Next, we move over any old rows from github_repositories that weren't attached to any user (for whatever reason). - old_db_repos = session.execute(sa.select(github_repositories_table)) - for old_db_repo in tqdm(old_db_repos.mappings(), desc="repos"): - matching_new_repo_id = session.scalar( - sa.select( - vcs_repositories_table.c.id, - ).filter_by(provider_id=str(old_db_repo["github_id"])) - ) - - if matching_new_repo_id is None: - # We only have very limited metadata available at this point. - # The first sync job after this migration will fill in the rest. 
- session.execute( - vcs_repositories_table.insert().values( - id=old_db_repo["id"], - provider_id=str(old_db_repo["github_id"]), - provider="github", - name=old_db_repo["name"], - default_branch="main", - license_spdx=None, - hook=old_db_repo["hook"], - enabled_by_user_id=old_db_repo["user_id"], - created=old_db_repo["created"], - updated=datetime.now(tz=timezone.utc), - ) - ) - else: - session.execute( - vcs_repositories_table.update() - .filter_by(id=matching_new_repo_id) - .values( - id=old_db_repo["id"], - hook=str(old_db_repo["hook"]), - enabled_by_user_id=old_db_repo["user_id"], - created=old_db_repo["created"], - ) - ) - - # Finally, we copy over the releases - old_db_releases = session.execute(sa.select(github_releases_table)) - for old_db_release in tqdm(old_db_releases.mappings(), desc="releases"): - # Since we've created all the repos, we know due to referential integrity that this release's repo ID corresponds - # to a valid and existent repo. - - session.execute( - vcs_releases_table.insert().values( - id=old_db_release["id"], - provider_id=str(old_db_release["release_id"]), - provider="github", - tag=old_db_release["tag"], - errors=old_db_release["errors"], - repository_id=old_db_release["repository_id"], - event_id=old_db_release["event_id"], - record_id=old_db_release["record_id"], - status=old_db_release["status"], - created=old_db_release["created"], - updated=datetime.now(tz=timezone.utc), - ) - ) - - session.commit() .. raw:: html