Skip to content

Commit 449f41d

Browse files
committed
WIP: feat(vcs): new data model
* Updated the data model to accommodate the new generic approach to VCS integration. This involves renaming the `github_...` tables to `vcs_...`, adding a new column to the relevant tables to identify which provider the records relate to, and more.
* Added an Alembic migration, which includes moving the repository data from `oauthclient_remoteaccount` to the `vcs_repositories` table. This is a complex and long-running operation. It will be supplemented by a manual migration guide for instances like Zenodo, where a full DB lock lasting several minutes is not feasible. The docs will clarify when to use the automated migration and when to use the manual one.
* Added a repo-user many-to-many mapping table. Because repos are no longer stored in the Remote Accounts table, we need a different way of associating users with the repos they have access to. This table is synced using code that will be included in other PRs.
* This PR contains only the data model changes themselves and not the associated functional changes needed to do anything useful.
* This commit on its own is UNRELEASABLE. We will merge multiple commits related to the VCS upgrade into the `vcs-staging` branch and then merge them all into `master` once we have a fully release-ready prototype. At that point, we will create a squash commit.
1 parent a841b14 commit 449f41d

2 files changed

Lines changed: 678 additions & 0 deletions

File tree

Lines changed: 331 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,331 @@
1+
#
2+
# This file is part of Invenio.
3+
# Copyright (C) 2025 CERN.
4+
#
5+
# Invenio is free software; you can redistribute it and/or modify it
6+
# under the terms of the MIT License; see LICENSE file for more details.
7+
8+
"""Switch to generic git services"""
9+
10+
import uuid
11+
from datetime import datetime, timezone
12+
13+
import sqlalchemy as sa
14+
from alembic import op
15+
from sqlalchemy.ext.mutable import MutableDict
16+
from sqlalchemy_utils import JSONType, UUIDType
17+
18+
# revision identifiers, used by Alembic.
revision = "1754318294"
down_revision = "b0eaee37b545"
# An Alembic branch cannot be renamed, so the branch label `invenio-github`
# has to be kept even though the module is now called `invenio-vcs`.
# NOTE(review): no label is declared on this revision itself; presumably the
# label is inherited from an earlier revision in this chain -- confirm.
branch_labels = ()
depends_on = None
26+
27+
28+
def _upgrade_repositories_schema():
    """Rename ``github_repositories`` to ``vcs_repositories`` and reshape it.

    Only schema changes happen here; ``html_url`` is created nullable because
    existing rows have no value for it until the data migration has run.
    """
    op.rename_table("github_repositories", "vcs_repositories")
    op.alter_column(
        "vcs_repositories",
        "github_id",
        new_column_name="provider_id",
        type_=sa.String(length=255),
        nullable=False,
        existing_type=sa.Integer(),
        existing_nullable=True,
    )
    op.alter_column(
        "vcs_repositories",
        "hook",
        type_=sa.String(length=255),
        nullable=True,
        existing_type=sa.Integer(),
        existing_nullable=True,
    )
    op.add_column(
        "vcs_repositories",
        # We use the provider name "github" by default as this is what we're
        # already using across the codebase.
        sa.Column("provider", sa.String(255), nullable=False, server_default="github"),
    )
    op.add_column(
        "vcs_repositories",
        sa.Column(
            "default_branch", sa.String(255), nullable=False, server_default="master"
        ),
    )
    op.add_column(
        "vcs_repositories", sa.Column("description", sa.String(10000), nullable=True)
    )
    op.add_column(
        "vcs_repositories",
        # Nullable for now; tightened to NOT NULL at the end of `upgrade()`.
        sa.Column("html_url", sa.String(10000), nullable=True),
    )
    op.add_column(
        "vcs_repositories", sa.Column("license_spdx", sa.String(255), nullable=True)
    )
    op.alter_column(
        "vcs_repositories",
        "user_id",
        new_column_name="enabled_by_id",
        # Passing the existing definition lets backends that must restate the
        # full column on rename (e.g. MySQL) emit valid DDL.
        existing_type=sa.Integer(),
        existing_nullable=True,
    )
    # The indexes keep their old names after the table rename. `table_name`
    # is required by backends whose DROP INDEX syntax needs the owning table.
    op.drop_index("ix_github_repositories_name", table_name="vcs_repositories")
    op.drop_index("ix_github_repositories_github_id", table_name="vcs_repositories")

    op.create_unique_constraint(
        constraint_name=op.f("uq_vcs_repositories_provider_provider_id"),
        table_name="vcs_repositories",
        columns=["provider", "provider_id"],
    )
    op.create_unique_constraint(
        constraint_name=op.f("uq_vcs_repositories_provider_name"),
        table_name="vcs_repositories",
        columns=["provider", "name"],
    )


def _migrate_repositories_from_remote_accounts(connection):
    """Copy repository metadata from OAuth ``extra_data`` into ``vcs_repositories``.

    The SQLAlchemy models for ``RemoteAccount`` and ``Repository`` are recreated
    here in a lightweight form. The real models cannot be imported because
    (a) they depend on the full Invenio app being initialised and (b) the
    migration must work with the schema exactly as it stands at this point in
    the migration chain, not as the model file looks at a later commit. Only
    the columns needed for the migration are declared.

    :param connection: The bind returned by ``op.get_bind()``.
    """
    remote_account_table = sa.table(
        "oauthclient_remoteaccount",
        sa.Column("id", sa.Integer, primary_key=True),
        sa.Column("user_id", sa.Integer, sa.ForeignKey("accounts_user.id")),
        sa.Column("client_id", sa.String(255)),
        sa.Column("extra_data", MutableDict.as_mutable(JSONType)),
    )
    vcs_repositories_table = sa.table(
        "vcs_repositories",
        sa.Column("id", UUIDType, primary_key=True),
        sa.Column("provider_id", sa.String(255), nullable=True),
        sa.Column("provider", sa.String(255), nullable=True),
        sa.Column("description", sa.String(10000), nullable=True),
        sa.Column("html_url", sa.String(10000), nullable=False),
        sa.Column("license_spdx", sa.String(255), nullable=True),
        sa.Column("default_branch", sa.String(255), nullable=False),
        sa.Column("name", sa.String(255), nullable=False),
        sa.Column("hook", sa.String(255), nullable=True),
        sa.Column(
            "enabled_by_id",
            sa.Integer,
            sa.ForeignKey("accounts_user.id"),
            nullable=True,
        ),
        sa.Column("created", sa.DateTime, nullable=False),
        sa.Column("updated", sa.DateTime, nullable=False),
    )

    # One timestamp for the whole run so `created` and `updated` agree on
    # every row written by this migration.
    migrated_at = datetime.now(tz=timezone.utc)

    # We don't know the client ID as this is a config variable. So to find the
    # RemoteAccounts that correspond to GitHub, we check for the existence of
    # the `repos` key in the `extra_data` JSON. This cannot be made efficient:
    # (a) in Postgres the column is JSON rather than JSONB, and (b) the
    # instance might be using MySQL/SQLite where it is stored as `TEXT`.
    remote_accounts = connection.execute(sa.select(remote_account_table))
    for remote_account in remote_accounts.mappings():
        # `extra_data` can be NULL for accounts that never stored anything.
        extra_data = remote_account["extra_data"] or {}
        if "repos" not in extra_data:
            continue

        # The dict key is the GitHub repository ID. It is a string because
        # JSON keys must be strings, which matches the new `provider_id` type.
        for provider_id, github_repo in extra_data["repos"].items():
            # So far only github.com has been supported, so the URL can be
            # derived safely from the full name.
            html_url = f'https://github.com/{github_repo["full_name"]}'
            description = github_repo.get("description")
            # Fall back to the column's server default if the key is absent.
            default_branch = github_repo.get("default_branch") or "master"

            existing_id = connection.scalar(
                sa.select(vcs_repositories_table.c.id).where(
                    vcs_repositories_table.c.provider_id == provider_id
                )
            )

            if existing_id is None:
                # We are now storing _all_ repositories (even non-enabled
                # ones) in the DB. The repo-user association is created on the
                # first sync after this migration, since the list of users
                # with access has to be downloaded from the GitHub API.
                connection.execute(
                    vcs_repositories_table.insert().values(
                        id=uuid.uuid4(),
                        provider_id=provider_id,
                        provider="github",
                        description=description,
                        name=github_repo["full_name"],
                        default_branch=default_branch,
                        html_url=html_url,
                        # Never stored before; it is queried at runtime. The
                        # first sync after this migration fills it in.
                        license_spdx=None,
                        # This repo wasn't enabled.
                        hook=None,
                        enabled_by_id=None,
                        created=migrated_at,
                        updated=migrated_at,
                    )
                )
            else:
                connection.execute(
                    vcs_repositories_table.update()
                    .where(vcs_repositories_table.c.id == existing_id)
                    .values(
                        description=description,
                        name=github_repo["full_name"],
                        default_branch=default_branch,
                        html_url=html_url,
                        updated=migrated_at,
                    )
                )

        # Remove `repos` from the existing `extra_data`, leaving only the last
        # sync timestamp (when there is one).
        remaining_extra_data = {}
        if "last_sync" in extra_data:
            remaining_extra_data["last_sync"] = extra_data["last_sync"]
        connection.execute(
            remote_account_table.update()
            .where(remote_account_table.c.id == remote_account["id"])
            .values(extra_data=remaining_extra_data)
        )

    # Rows that already existed in the repositories table but were not listed
    # in any remote account still have no `html_url`. Derive it from `name`
    # (which holds the full name) so the NOT NULL constraint can be applied.
    connection.execute(
        vcs_repositories_table.update()
        .where(vcs_repositories_table.c.html_url.is_(None))
        .values(
            html_url=sa.literal("https://github.com/") + vcs_repositories_table.c.name
        )
    )


def _upgrade_releases_schema():
    """Rename ``github_releases`` to ``vcs_releases`` and reshape its columns."""
    op.rename_table("github_releases", "vcs_releases")
    op.alter_column(
        "vcs_releases",
        "release_id",
        new_column_name="provider_id",
        type_=sa.String(length=255),
        nullable=False,
        existing_type=sa.Integer(),
        existing_nullable=True,
    )
    op.add_column(
        "vcs_releases",
        sa.Column("provider", sa.String(255), nullable=False, server_default="github"),
    )
    if op.get_context().dialect.name == "postgresql":
        # Import explicitly rather than relying on the submodule having been
        # loaded as a side effect of engine creation.
        from sqlalchemy.dialects import postgresql

        op.alter_column(
            "vcs_releases",
            "errors",
            type_=postgresql.JSONB,
            postgresql_using="errors::text::jsonb",
        )

    op.drop_constraint(
        op.f("uq_github_releases_release_id"), table_name="vcs_releases", type_="unique"
    )
    # A given provider cannot have duplicate release IDs.
    op.create_unique_constraint(
        constraint_name=op.f("uq_vcs_releases_provider_id_provider"),
        table_name="vcs_releases",
        columns=["provider_id", "provider"],
    )
    # Uniqueness over (provider_id, provider, tag).
    op.create_unique_constraint(
        constraint_name=op.f("uq_vcs_releases_provider_id_provider_tag"),
        table_name="vcs_releases",
        columns=["provider_id", "provider", "tag"],
    )


def _create_repository_users_table():
    """Create the repository/user many-to-many mapping table."""
    op.create_table(
        "vcs_repository_users",
        sa.Column("repository_id", UUIDType(), primary_key=True),
        sa.Column("user_id", sa.Integer(), primary_key=True),
        sa.ForeignKeyConstraint(
            ["repository_id"],
            ["vcs_repositories.id"],
            name=op.f("fk_vcs_repository_users_repository_id_vcs_repositories"),
        ),
        sa.ForeignKeyConstraint(
            ["user_id"],
            ["accounts_user.id"],
            name=op.f("fk_vcs_repository_users_user_id_accounts_user"),
        ),
    )


def upgrade():
    """Upgrade database.

    Renames the ``github_*`` tables to ``vcs_*``, adds the provider-related
    columns, moves repository metadata out of the OAuth remote accounts'
    ``extra_data`` into ``vcs_repositories``, and creates the repository/user
    mapping table.
    """
    _upgrade_repositories_schema()

    # This is the recommended way to run SQLAlchemy operations in a migration,
    # see https://alembic.sqlalchemy.org/en/latest/ops.html#alembic.operations.Operations.execute
    _migrate_repositories_from_remote_accounts(op.get_bind())

    # `html_url` was created nullable so the column could be added to existing
    # rows; the data migration above has populated it, so tighten it now.
    op.alter_column(
        "vcs_repositories",
        "html_url",
        nullable=False,
        existing_type=sa.String(10000),
        existing_nullable=True,
    )

    _upgrade_releases_schema()
    _create_repository_users_table()
    # ### end Alembic commands ###
253+
254+
255+
def downgrade():
    """Downgrade database.

    Currently, the downgrade can only be performed **without data**. The
    tables are transformed but data will not be successfully migrated: the
    upgrade has a large amount of custom logic for moving data into the new
    format, and this is not replicated/reversed here.
    """
    op.alter_column(
        "vcs_repositories",
        "enabled_by_id",
        new_column_name="user_id",
        # Passing the existing definition lets backends that must restate the
        # full column on rename (e.g. MySQL) emit valid DDL.
        existing_type=sa.Integer(),
        existing_nullable=True,
    )
    op.drop_table("vcs_repository_users")

    # Drop the composite unique constraints explicitly instead of relying on
    # them disappearing together with the `provider` column: not every backend
    # removes a multi-column constraint when one of its columns is dropped.
    op.drop_constraint(
        op.f("uq_vcs_repositories_provider_provider_id"),
        table_name="vcs_repositories",
        type_="unique",
    )
    op.drop_constraint(
        op.f("uq_vcs_repositories_provider_name"),
        table_name="vcs_repositories",
        type_="unique",
    )

    op.rename_table("vcs_repositories", "github_repositories")
    op.alter_column(
        "github_repositories",
        "provider_id",
        new_column_name="github_id",
        type_=sa.Integer(),
        nullable=True,
        existing_type=sa.String(length=255),
        existing_nullable=False,
        postgresql_using="provider_id::integer",
    )
    op.alter_column(
        "github_repositories",
        "hook",
        type_=sa.Integer(),
        nullable=True,
        existing_type=sa.String(length=255),
        existing_nullable=True,
        postgresql_using="hook::integer",
    )
    op.drop_column("github_repositories", "provider")
    op.drop_column("github_repositories", "description")
    op.drop_column("github_repositories", "html_url")
    op.drop_column("github_repositories", "license_spdx")
    op.drop_column("github_repositories", "default_branch")
    op.create_index(
        op.f("ix_github_repositories_github_id"),
        "github_repositories",
        ["github_id"],
        unique=True,
    )
    op.create_index(
        op.f("ix_github_repositories_name"),
        "github_repositories",
        ["name"],
        unique=True,
    )

    # Same reasoning as for the repositories table: remove the composite
    # constraints before the `provider` column they reference is dropped.
    op.drop_constraint(
        op.f("uq_vcs_releases_provider_id_provider"),
        table_name="vcs_releases",
        type_="unique",
    )
    op.drop_constraint(
        op.f("uq_vcs_releases_provider_id_provider_tag"),
        table_name="vcs_releases",
        type_="unique",
    )

    op.rename_table("vcs_releases", "github_releases")
    op.alter_column(
        "github_releases",
        "provider_id",
        new_column_name="release_id",
        type_=sa.Integer(),
        nullable=True,
        existing_type=sa.String(length=255),
        existing_nullable=False,
        postgresql_using="provider_id::integer",
    )
    op.drop_column("github_releases", "provider")
    if op.get_context().dialect.name == "postgresql":
        # Import explicitly rather than relying on the submodule having been
        # loaded as a side effect of engine creation.
        from sqlalchemy.dialects import postgresql

        op.alter_column(
            "github_releases",
            "errors",
            type_=postgresql.JSON,
            postgresql_using="errors::text::json",
        )
    op.create_unique_constraint(
        op.f("uq_github_releases_release_id"),
        table_name="github_releases",
        columns=["release_id"],
    )
    # ### end Alembic commands ###

0 commit comments

Comments
 (0)