From 5726da88730882e78e1f92c32cff3712b3a08d47 Mon Sep 17 00:00:00 2001 From: Uchechukwu Orji Date: Wed, 6 May 2026 11:00:43 +0100 Subject: [PATCH] fix retention rules to always keep latest version of two recent months --- backend/src/cms_backend/db/title.py | 24 ++--- backend/tests/mill/processors/test_title.py | 97 +++++++++++++++++++-- 2 files changed, 103 insertions(+), 18 deletions(-) diff --git a/backend/src/cms_backend/db/title.py b/backend/src/cms_backend/db/title.py index 28776cc..205f25e 100644 --- a/backend/src/cms_backend/db/title.py +++ b/backend/src/cms_backend/db/title.py @@ -1,11 +1,11 @@ import datetime from collections import defaultdict from pathlib import Path +from typing import cast from uuid import UUID from psycopg.errors import UniqueViolation -from sqlalchemy import Date, select -from sqlalchemy import cast as sql_cast +from sqlalchemy import select from sqlalchemy.exc import IntegrityError from sqlalchemy.orm import Session as OrmSession from sqlalchemy.orm import selectinload @@ -362,6 +362,7 @@ def apply_retention_rules(session: OrmSession, title: Title): """ now = getnow() + thirty_days_ago = (now - datetime.timedelta(days=30)).date() books_by_flavour: dict[str, list[Book]] = defaultdict(list) for book in session.scalars( @@ -369,7 +370,6 @@ def apply_retention_rules(session: OrmSession, title: Title): Book.title_id == title.id, Book.has_error.is_(False), Book.date.is_not(None), - sql_cast(Book.date, Date) <= (now - datetime.timedelta(days=30)).date(), Book.location_kind == "prod", Book.needs_file_operation.is_(False), ) @@ -382,22 +382,22 @@ def apply_retention_rules(session: OrmSession, title: Title): # Group books by period (without the suffix) books_by_period: dict[str, list[Book]] = defaultdict(list) for book in books: - if not book.date: - continue - books_by_period[book.date[:PERIOD_LENGTH]].append(book) + books_by_period[cast(str, book.date)[:PERIOD_LENGTH]].append(book) - # Keep last version from each of the 2 most recent 
periods sorted_periods = sorted(books_by_period.keys(), reverse=True) + # Keep latest version from each of the 2 most recent periods + books_to_keep: set[UUID] = set() + for period in sorted_periods[:2]: sorted_books_by_period = sort_books_by_filename_period( books_by_period[period] ) - # Mark all but the most recent one for deletion - books_to_delete.extend(sorted_books_by_period[1:]) + books_to_keep.add(sorted_books_by_period[0].id) - # Mark the remainder of the books to be deleted. - for period in sorted_periods[2:]: - books_to_delete.extend(books_by_period[period]) + for book in books: + book_date = datetime.date.fromisoformat(cast(str, book.date)) + if book_date <= thirty_days_ago and book.id not in books_to_keep: + books_to_delete.append(book) deletion_date = now + Context.book_deletion_delay diff --git a/backend/tests/mill/processors/test_title.py b/backend/tests/mill/processors/test_title.py index ed9dc22..980ec98 100644 --- a/backend/tests/mill/processors/test_title.py +++ b/backend/tests/mill/processors/test_title.py @@ -184,16 +184,14 @@ def test_apply_retention_rules_keeps_last_version_of_two_most_recent_months( dbsession.flush() - # Keep 2024-04 books since their Date are still less than 30 days + # Only the two most recent months (including the current one) are retained + # Keep all 2024-04 books since their dates are still less than 30 days old assert book4a.location_kind == "prod" assert book4b.location_kind == "prod" - # Should keep only the latest from the two most recent months: - # - 2024-03c - # - 2024-02b + # Keep book 2024-03c since it is the latest of the second most recent month assert book3c.location_kind == "prod" - assert book2b.location_kind == "prod" - # All others should be marked for deletion + assert book2b.location_kind == "to_delete" assert book2a.location_kind == "to_delete" assert book3a.location_kind == "to_delete" assert book3b.location_kind == "to_delete" @@ -279,3 +277,90 @@ def
test_apply_retention_rules_handles_different_flavours_separately( assert book_nopic_feb.location_kind == "prod" assert book_maxi_feb.location_kind == "prod" assert book_maxi_jan.location_kind == "prod" + + +def test_apply_retention_rules_all_older_than_30_days( + dbsession: OrmSession, + create_title: Callable[..., Title], + create_book: Callable[..., Book], + create_book_location: Callable[..., BookLocation], +): + """ + Retention rules should keep only the most recent book of the two most + recent months when all are > 30 days old. + """ + title = create_title(name="test_wiki_en_all") + now = getnow() + + # Month 1: 2024-01-01 + book_jan = create_book( + name="test_wiki", + date="2024-01-01", + flavour="nopic", + created_at=now, + ) + book_jan.location_kind = "prod" + book_jan.title = title + create_book_location(book=book_jan, filename="test_wiki_2024-01.zim") + + # Month 2: 2024-02-01 and 2024-02-15 + book_feb1 = create_book( + name="test_wiki", + date="2024-02-01", + flavour="nopic", + created_at=now, + ) + book_feb1.location_kind = "prod" + book_feb1.title = title + create_book_location(book=book_feb1, filename="test_wiki_2024-02.zim") + + book_feb2 = create_book( + name="test_wiki", + date="2024-02-15", + flavour="nopic", + created_at=now, + ) + book_feb2.location_kind = "prod" + book_feb2.title = title + create_book_location(book=book_feb2, filename="test_wiki_2024-02a.zim") + + # Month 3: 2024-03-01 and 2024-03-15 + book_mar1 = create_book( + name="test_wiki", + date="2024-03-01", + flavour="nopic", + created_at=now, + ) + book_mar1.location_kind = "prod" + book_mar1.title = title + create_book_location(book=book_mar1, filename="test_wiki_2024-03.zim") + + book_mar2 = create_book( + name="test_wiki", + date="2024-03-15", + flavour="nopic", + created_at=now, + ) + book_mar2.location_kind = "prod" + book_mar2.title = title + create_book_location(book=book_mar2, filename="test_wiki_2024-03a.zim") + + dbsession.flush() + + # Set 'now' to 2024-06-01 so all 
books are older than 30 days + with patch( + "cms_backend.db.title.getnow", + return_value=datetime.datetime(2024, 6, 1), + ): + apply_retention_rules(dbsession, title) + + dbsession.flush() + + # The most recent book of the two most recent months should be kept + assert book_mar2.location_kind == "prod" + assert book_feb2.location_kind == "prod" + + # All others should be deleted + assert book_mar1.location_kind == "to_delete" + assert book_feb1.location_kind == "to_delete" + assert book_jan.location_kind == "to_delete"