From ee0d80454222248f8c5929849b35df0280c52e18 Mon Sep 17 00:00:00 2001 From: Rafael JP Damaceno Date: Fri, 1 May 2026 22:30:32 -0300 Subject: [PATCH 01/31] chore: remover models.py do app article --- article/models.py | 143 ---------------------------------------------- 1 file changed, 143 deletions(-) delete mode 100644 article/models.py diff --git a/article/models.py b/article/models.py deleted file mode 100644 index 80d2a97..0000000 --- a/article/models.py +++ /dev/null @@ -1,143 +0,0 @@ -from django.db import models -from django.utils.translation import gettext_lazy as _ - -from core.models import CommonControlField -from collection.models import Collection - - -class Article(CommonControlField): - collection = models.ForeignKey( - Collection, - verbose_name=_('Collection'), - on_delete=models.CASCADE, - blank=False, - null=False, - db_index=True, - ) - - scielo_issn = models.CharField( - verbose_name=_('SciELO ISSN'), - max_length=9, - blank=False, - null=False, - db_index=True, - ) - - pid_v2 = models.CharField( - verbose_name=_('PID V2'), - max_length=23, - blank=False, - null=False, - db_index=True, - ) - - pid_v3 = models.CharField( - verbose_name=_('PID V3'), - max_length=23, - blank=True, - null=True, - db_index=True, - ) - - pid_generic = models.CharField( - verbose_name=_('PID Generic'), - max_length=50, - blank=True, - null=True, - db_index=True, - ) - - files = models.JSONField( - verbose_name=_('Files'), - null=True, - blank=True, - default=dict, - ) - - default_lang = models.CharField( - verbose_name=_('Default Language'), - max_length=2, - blank=False, - null=False, - ) - - text_langs = models.JSONField( - verbose_name=_('Text Languages'), - null=True, - blank=True, - default=dict, - ) - - processing_date = models.CharField( - verbose_name=_('Processing Date'), - max_length=32, - null=False, - blank=False, - ) - - publication_date = models.CharField( - verbose_name=_('Publication Date'), - max_length=32, - null=False, - blank=False, - ) - - 
publication_year = models.CharField( - verbose_name=_('Publication Year'), - max_length=4, - null=False, - blank=False, - db_index=True, - ) - - def __str__(self): - return f'{self.collection.acron3} - {self.scielo_issn} - {self.pid_v2 or self.pid_v3 or self.pid_generic}' - - @classmethod - def metadata(cls, collection=None): - qs = cls.objects.select_related('collection').only( - 'collection__acron3', - 'default_lang', - 'files', - 'pid_v2', - 'pid_v3', - 'pid_generic', - 'processing_date', - 'publication_date', - 'publication_year', - 'scielo_issn', - 'text_langs', - ) - - if collection: - qs = qs.filter(collection=collection) - - for a in qs.iterator(): - yield { - 'collection': a.collection.acron3, - 'default_lang': a.default_lang, - 'files': a.files, - 'pid_v2': a.pid_v2, - 'pid_v3': a.pid_v3, - 'pid_generic': a.pid_generic, - 'processing_date': a.processing_date, - 'publication_date': a.publication_date, - 'publication_year': a.publication_year, - 'scielo_issn': a.scielo_issn, - 'text_langs': a.text_langs, - } - - class Meta: - verbose_name = _('Article') - verbose_name_plural = _('Articles') - unique_together = ( - 'collection', - 'scielo_issn', - 'pid_v2', - 'pid_v3', - 'pid_generic', - ) - indexes = [ - models.Index(fields=['collection', 'scielo_issn'], name='collection_scielo_issn_idx'), - ] - From a342af06fb15fdd3d5346bc6448f90e6f5f06165 Mon Sep 17 00:00:00 2001 From: Rafael JP Damaceno Date: Fri, 1 May 2026 22:30:41 -0300 Subject: [PATCH 02/31] chore: remover migrations, tasks, utils, views, admin e hooks do app article --- article/__init__.py | 0 article/admin.py | 3 - article/apps.py | 6 - article/management/__init__.py | 0 article/management/commands/__init__.py | 0 .../commands/load_articles_by_year.py | 80 ------ article/migrations/0001_initial.py | 137 --------- ..._unique_together_article_files_and_more.py | 42 --- ...0003_article_collection_scielo_issn_idx.py | 21 -- article/migrations/__init__.py | 0 article/tasks.py | 259 ------------------ 
article/tests.py | 3 - article/utils.py | 204 -------------- article/views.py | 3 - article/wagtail_hooks.py | 39 --- 15 files changed, 797 deletions(-) delete mode 100644 article/__init__.py delete mode 100644 article/admin.py delete mode 100644 article/apps.py delete mode 100644 article/management/__init__.py delete mode 100644 article/management/commands/__init__.py delete mode 100644 article/management/commands/load_articles_by_year.py delete mode 100644 article/migrations/0001_initial.py delete mode 100644 article/migrations/0002_alter_article_unique_together_article_files_and_more.py delete mode 100644 article/migrations/0003_article_collection_scielo_issn_idx.py delete mode 100644 article/migrations/__init__.py delete mode 100644 article/tasks.py delete mode 100644 article/tests.py delete mode 100644 article/utils.py delete mode 100644 article/views.py delete mode 100644 article/wagtail_hooks.py diff --git a/article/__init__.py b/article/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/article/admin.py b/article/admin.py deleted file mode 100644 index 8c38f3f..0000000 --- a/article/admin.py +++ /dev/null @@ -1,3 +0,0 @@ -from django.contrib import admin - -# Register your models here. 
diff --git a/article/apps.py b/article/apps.py deleted file mode 100644 index 8c0e2c9..0000000 --- a/article/apps.py +++ /dev/null @@ -1,6 +0,0 @@ -from django.apps import AppConfig - - -class ArticleConfig(AppConfig): - default_auto_field = "django.db.models.BigAutoField" - name = "article" diff --git a/article/management/__init__.py b/article/management/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/article/management/commands/__init__.py b/article/management/commands/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/article/management/commands/load_articles_by_year.py b/article/management/commands/load_articles_by_year.py deleted file mode 100644 index 335598e..0000000 --- a/article/management/commands/load_articles_by_year.py +++ /dev/null @@ -1,80 +0,0 @@ -from django.core.management.base import BaseCommand - -from article.tasks import task_load_article_from_opac, task_load_article_from_article_meta - - -class Command(BaseCommand): - help = 'Generate task requests for loading article data from Article Meta for each year from 1900 to 2025' - - def add_arguments(self, parser): - parser.add_argument( - '--start-year', - type=int, - default=1990, - help='Start year (default: 1990)' - ) - parser.add_argument( - '--end-year', - type=int, - default=2025, - help='End year (default: 2025)' - ) - parser.add_argument( - '--collection', - type=str, - default='scl', - help='Collection code (default: scl)' - ) - parser.add_argument( - '--task', - choices=['load_article_from_opac', 'load_article_from_article_meta'], - default='load_article_from_opac', - help='Task to execute (default: load_article_from_opac)', - ) - - def handle(self, *args, **options): - start_year = options['start_year'] - end_year = options['end_year'] - collection = options['collection'] - - self.stdout.write( - self.style.SUCCESS( - f'Generating task requests from {start_year} to {end_year} for collection: {collection}' - ) - ) - - total_tasks = 0 - - for 
year in range(start_year, end_year + 1): - from_date = f'{year}-01-01' - until_date = f'{year}-12-31' - - self.stdout.write(f'Queuing task for year {year}...') - - # Queue the task for each year - if options['task'] == 'load_article_from_article_meta': - task_result = task_load_article_from_article_meta.delay( - from_date=from_date, - until_date=until_date, - collection=collection - ) - else: - task_result = task_load_article_from_opac.delay( - from_date=from_date, - until_date=until_date, - collection=collection - ) - - total_tasks += 1 - - self.stdout.write( - self.style.SUCCESS( - f'✓ Task queued for year {year}: {from_date} to {until_date} (Task ID: {task_result.id})' - ) - ) - - self.stdout.write( - self.style.SUCCESS( - f'\nCompleted! {total_tasks} tasks have been queued successfully.' - ) - ) diff --git a/article/migrations/0001_initial.py b/article/migrations/0001_initial.py deleted file mode 100644 index 816d61e..0000000 --- a/article/migrations/0001_initial.py +++ /dev/null @@ -1,137 +0,0 @@ -# Generated by Django 5.0.7 on 2025-02-07 17:50 - -import django.db.models.deletion -from django.conf import settings -from django.db import migrations, models - - -class Migration(migrations.Migration): - initial = True - - dependencies = [ - ("collection", "0001_initial"), - migrations.swappable_dependency(settings.AUTH_USER_MODEL), - ] - - operations = [ - migrations.CreateModel( - name="Article", - fields=[ - ( - "id", - models.BigAutoField( - auto_created=True, - primary_key=True, - serialize=False, - verbose_name="ID", - ), - ), - ( - "created", - models.DateTimeField( - auto_now_add=True, verbose_name="Creation date" - ), - ), - ( - "updated", - models.DateTimeField( - auto_now=True, verbose_name="Last update date" - ), - ), - ( - "scielo_issn", - models.CharField( - db_index=True, max_length=9, verbose_name="SciELO ISSN" - ), - ), - ( - "pid_v2", - models.CharField( - db_index=True, max_length=23, verbose_name="PID V2" - ), - ), - ( - "pid_v3", - 
models.CharField( - blank=True, - db_index=True, - max_length=23, - null=True, - verbose_name="PID V3", - ), - ), - ( - "pdfs", - models.JSONField( - blank=True, - default=dict, - null=True, - verbose_name="Format with Language", - ), - ), - ( - "default_lang", - models.CharField(max_length=2, verbose_name="Default Language"), - ), - ( - "text_langs", - models.JSONField( - blank=True, - default=dict, - null=True, - verbose_name="Text Languages", - ), - ), - ( - "processing_date", - models.CharField(max_length=32, verbose_name="Processing Date"), - ), - ( - "publication_date", - models.CharField(max_length=32, verbose_name="Publication Date"), - ), - ( - "publication_year", - models.CharField( - db_index=True, max_length=4, verbose_name="Publication Year" - ), - ), - ( - "collection", - models.ForeignKey( - on_delete=django.db.models.deletion.CASCADE, - to="collection.collection", - verbose_name="Collection", - ), - ), - ( - "creator", - models.ForeignKey( - editable=False, - null=True, - on_delete=django.db.models.deletion.SET_NULL, - related_name="%(class)s_creator", - to=settings.AUTH_USER_MODEL, - verbose_name="Creator", - ), - ), - ( - "updated_by", - models.ForeignKey( - blank=True, - editable=False, - null=True, - on_delete=django.db.models.deletion.SET_NULL, - related_name="%(class)s_last_mod_user", - to=settings.AUTH_USER_MODEL, - verbose_name="Updater", - ), - ), - ], - options={ - "verbose_name": "Article", - "verbose_name_plural": "Articles", - "unique_together": {("collection", "scielo_issn", "pid_v2", "pid_v3")}, - }, - ), - ] diff --git a/article/migrations/0002_alter_article_unique_together_article_files_and_more.py b/article/migrations/0002_alter_article_unique_together_article_files_and_more.py deleted file mode 100644 index cee055c..0000000 --- a/article/migrations/0002_alter_article_unique_together_article_files_and_more.py +++ /dev/null @@ -1,42 +0,0 @@ -# Generated by Django 5.0.7 on 2025-04-01 01:09 - -from django.db import migrations, models 
- - -class Migration(migrations.Migration): - dependencies = [ - ("article", "0001_initial"), - ("collection", "0001_initial"), - ] - - operations = [ - migrations.AddField( - model_name="article", - name="files", - field=models.JSONField( - blank=True, default=dict, null=True, verbose_name="Files" - ), - ), - migrations.AddField( - model_name="article", - name="pid_generic", - field=models.CharField( - blank=True, - db_index=True, - max_length=50, - null=True, - verbose_name="PID Generic", - ), - ), - migrations.RemoveField( - model_name="article", - name="pdfs", - ), - migrations.AlterUniqueTogether( - name="article", - unique_together={ - ("collection", "scielo_issn", "pid_v2", "pid_v3", "pid_generic") - }, - ), - - ] diff --git a/article/migrations/0003_article_collection_scielo_issn_idx.py b/article/migrations/0003_article_collection_scielo_issn_idx.py deleted file mode 100644 index 753ac98..0000000 --- a/article/migrations/0003_article_collection_scielo_issn_idx.py +++ /dev/null @@ -1,21 +0,0 @@ -# Generated by Django 5.0.7 on 2025-06-12 17:16 - -from django.conf import settings -from django.db import migrations, models - - -class Migration(migrations.Migration): - dependencies = [ - ("article", "0002_alter_article_unique_together_article_files_and_more"), - ("collection", "0001_initial"), - migrations.swappable_dependency(settings.AUTH_USER_MODEL), - ] - - operations = [ - migrations.AddIndex( - model_name="article", - index=models.Index( - fields=["collection", "scielo_issn"], name="collection_scielo_issn_idx" - ), - ), - ] diff --git a/article/migrations/__init__.py b/article/migrations/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/article/tasks.py b/article/tasks.py deleted file mode 100644 index 3514fca..0000000 --- a/article/tasks.py +++ /dev/null @@ -1,259 +0,0 @@ -import logging - -from django.contrib.auth import get_user_model -from django.db.models import Q -from django.db import DataError -from django.utils.translation 
import gettext as _ - -from collection.models import Collection -from config import celery_app -from core.utils import date_utils -from core.utils.utils import _get_user - -from journal.models import Journal - -from tracker.models import ArticleEvent -from tracker.choices import ARTICLE_EVENT_TYPE_MULTIPLE_OBJS_RETURNED, ARTICLE_EVENT_TYPE_DATA_ERROR - -from . import models, utils - - -User = get_user_model() - -@celery_app.task(bind=True, name=_('Load article data from Article Meta'), timelimit=-1, queue='load') -def task_load_article_from_article_meta(self, from_date=None, until_date=None, days_to_go_back=None, collection=None, issn=None, force_update=True, user_id=None, username=None): - user = _get_user(self.request, username=username, user_id=user_id) - - from_date, until_date = date_utils.get_date_range_str(from_date, until_date, days_to_go_back) - logging.info(f'Loading articles from Article Meta. From: {from_date}, Until: {until_date}, Collection: {collection}, ISSN: {issn}.') - - offset = 0 - limit = 1000 - while True: - logging.info(f'{from_date}, {until_date}, {offset}, {limit}, {collection}, {issn}') - response = utils.fetch_article_meta_dict(from_date, until_date, offset=offset, limit=limit, collection=collection, issn=issn) - objects = response.get('objects') - if not objects: - break - - for obj in objects: - codes = obj.get('code_title') - - for issn_code in codes: - jou = Journal.objects.filter( - Q(issns__electronic_issn=issn_code) | - Q(issns__scielo_issn=issn_code) | - Q(issns__print_issn=issn_code) - ).first() - if not jou: - continue - - if not jou: - logging.info(f'Journal not found for ISSNs: {codes}') - continue - - col_obj = Collection.objects.get(acron3=obj.get('collection')) - if not col_obj: - logging.info(f'Collection not found: {obj.get("collection")}') - continue - - try: - article, created = models.Article.objects.get_or_create(collection=col_obj, scielo_issn=jou.scielo_issn, pid_v2=obj.get('code')) - if created or force_update: - 
article.files = obj.get('pdfs') or {} - article.processing_date = obj.get('processing_date') or '' - article.publication_date = obj.get('publication_date') or '' - article.publication_year = obj.get('publication_year') or '' - article.default_lang = obj.get('default_language') or '' - article.text_langs = obj.get('text_langs') or '' - - article.save() - logging.info(f'Article {"created" if created else "updated"}: {article}') - except models.Article.MultipleObjectsReturned as e: - logging.error(f'Error getting Article: {e}. Collection: {col_obj}, ISSN: {jou.scielo_issn}, PIDv2: {obj.get("code")}') - ArticleEvent.create( - event_type=ARTICLE_EVENT_TYPE_MULTIPLE_OBJS_RETURNED, - message=f'Error getting Article: {e}. Collection: {col_obj}, ISSN: {jou.scielo_issn}, PIDv2: {obj.get("code")}', - data=obj - ) - continue - except DataError as e: - logging.error(f'Error saving Article: {e}. Collection: {col_obj}, ISSN: {jou.scielo_issn}, PIDv2: {obj.get("code")}') - ArticleEvent.create( - event_type=ARTICLE_EVENT_TYPE_DATA_ERROR, - message=f'Error saving Article: {e}. Collection: {col_obj}, ISSN: {jou.scielo_issn}, PIDv2: {obj.get("code")}', - data=obj - ) - continue - - offset += limit - - return True - - -@celery_app.task(bind=True, name=_('Load article data from OPAC'), timelimit=-1, queue='load') -def task_load_article_from_opac(self, collection='scl', from_date=None, until_date=None, days_to_go_back=None, page=1, force_update=True, user_id=None, username=None): - user = _get_user(self.request, username=username, user_id=user_id) - - from_date, until_date = date_utils.get_date_range_str(from_date, until_date, days_to_go_back) - logging.info(f'Loading articles from OPAC. 
From: {from_date}, Until: {until_date}') - - while True: - response = utils.fetch_opac_dict(from_date, until_date, page=page) - - documents = response.get('documents') - - for doc_id, doc in documents.items(): - col_obj = Collection.objects.get(acron3=collection) - if not col_obj: - logging.error(f'Collection not found: {collection}') - continue - - journal = Journal.objects.get(collection=col_obj, acronym=doc.get('journal_acronym')) - if not journal: - logging.error(f'Journal not found: {doc.get("journal_acronym")}') - continue - - try: - article, created = models.Article.objects.get_or_create(collection=col_obj, scielo_issn=journal.scielo_issn, pid_v2=doc.get('pid_v2')) - - if created or force_update: - article.pid_v3 = doc.get('pid_v3') or '' - if not created: - article.pid_v2 = doc.get('pid_v2') or '' - article.publication_date = doc.get('publication_date') or article.publication_date or '' - article.default_lang = doc.get('default_language') or article.default_lang or '' - - try: - article.publication_year = article.publication_date[:4] - except IndexError: - article.publication_year = '' - - article.save() - logging.info(f'Article {"created" if created else "updated"}: {article}') - - except models.Article.MultipleObjectsReturned as e: - logging.error(f'Error getting Article: {e}. Collection: {col_obj}, Journal: {journal.scielo_issn}, PIDv2: {doc.get("pid_v2")}') - ArticleEvent.create( - event_type=ARTICLE_EVENT_TYPE_MULTIPLE_OBJS_RETURNED, - message=f'Error creating Article: {e}. Collection: {col_obj}, Journal: {journal.scielo_issn}, PIDv2: {doc.get("pid_v2")}', - data=doc - ) - continue - except DataError as e: - logging.error(f'Error saving Article: {e}. Collection: {col_obj}, Journal: {journal.scielo_issn}, PIDv2: {doc.get("pid_v2")}') - ArticleEvent.create( - event_type=ARTICLE_EVENT_TYPE_DATA_ERROR, - message=f'Error saving Article: {e}. 
Collection: {col_obj}, Journal: {journal.scielo_issn}, PIDv2: {doc.get("pid_v2")}', - data=doc - ) - continue - - page += 1 - if page > int(response.get('pages', 0)): - break - - return True - - -@celery_app.task(bind=True, name=_('Load preprint data from SciELO Preprints'), timelimit=-1, queue='load') -def task_load_preprints_from_preprints_api(self, from_date=None, until_date=None, days_to_go_back=None, force_update=True, user_id=None, username=None): - user = _get_user(self.request, username=username, user_id=user_id) - - from_date, until_date = date_utils.get_date_range_str(from_date, until_date, days_to_go_back) - logging.info(f'Loading preprints from SciELO Preprints. From: {from_date}, Until: {until_date}') - - col_obj = Collection.objects.get(acron3='preprints') - if not col_obj: - logging.error(f'Collection not found: preprints') - return False - - for record in utils.fetch_preprint_oai_pmh(from_date, until_date): - data = utils.extract_preprint_data(record) - - if not data.get('pid_generic'): - logging.error(f'Preprint ID not found in record: {record}') - continue - - try: - article, created = models.Article.objects.get_or_create(collection=col_obj, pid_generic=data['pid_generic']) - if created or force_update: - article.text_langs = data.get('text_langs') - article.default_lang = data.get('default_language') - article.publication_date = data.get('publication_date') - article.publication_year = data.get('publication_year') - - # Preprints do not have a scielo_issn yet - article.scielo_issn = '0000-0000' - - article.save() - logging.debug(f'Article {"created" if created else "updated"}: {article}') - except models.Article.MultipleObjectsReturned as e: - logging.error(f'Error creating Article: {e}. Collection: {col_obj}, PID: {data["pid_generic"]}') - ArticleEvent.create( - event_type=ARTICLE_EVENT_TYPE_MULTIPLE_OBJS_RETURNED, - message=f'Error creating Article: {e}. 
Collection: {col_obj}, PID: {data["pid_generic"]}', - data=data - ) - continue - except DataError as e: - logging.error(f'Error saving Article: {e}. Collection: {col_obj}, PID: {data["pid_generic"]}') - ArticleEvent.create( - event_type=ARTICLE_EVENT_TYPE_DATA_ERROR, - message=f'Error saving Article: {e}. Collection: {col_obj}, PID: {data["pid_generic"]}', - data=data - ) - continue - - -@celery_app.task(bind=True, name=_('Load dataset metadata from Dataverse'), timelimit=-1, queue='load') -def task_load_dataset_metadata_from_dataverse(self, from_date=None, until_date=None, days_to_go_back=None, force_update=True, user_id=None, username=None): - user = _get_user(self.request, username=username, user_id=user_id) - - from_date, until_date = date_utils.get_date_range_str(from_date, until_date, days_to_go_back) - logging.info(f'Loading dataset metadata from SciELO Data. From: {from_date}, Until: {until_date}') - - col_obj = Collection.objects.get(acron3='data') - if not col_obj: - logging.error(f'Collection not found: data') - return False - - for record in utils.fetch_dataverse_metadata(from_date, until_date): - dataset_doi = record.get('dataset_doi') - if not dataset_doi: - logging.error(f'Dataset DOI not found in record: {record}') - continue - - try: - dataset, created = models.Article.objects.get_or_create(collection=col_obj, pid_generic=dataset_doi) - if created or force_update: - dataset.publication_date = record.get('dataset_published') - - file_persistent_id = record.get('file_persistent_id') - file_id = record.get('file_id') - file_name = record.get('file_name') - file_url = record.get('file_url') - - if file_id: - dataset.files[file_id] = {'name': file_name, 'url': file_url, 'file_persisent_id': file_persistent_id} - - dataset.save() - logging.debug(f'Dataset {"created" if created else "updated"}: {dataset}') - except models.Article.MultipleObjectsReturned as e: - logging.error(f'Error creating Dataset: {e}. 
Collection: {col_obj}, PID: {dataset_doi}') - ArticleEvent.create( - event_type=ARTICLE_EVENT_TYPE_MULTIPLE_OBJS_RETURNED, - message=f'Error creating Dataset: {e}. Collection: {col_obj}, PID: {dataset_doi}', - data=record - ) - continue - except DataError as e: - logging.error(f'Error saving Dataset: {e}. Collection: {col_obj}, PID: {dataset_doi}') - ArticleEvent.create( - event_type=ARTICLE_EVENT_TYPE_DATA_ERROR, - message=f'Error saving Dataset: {e}. Collection: {col_obj}, PID: {dataset_doi}', - data=record - ) - continue - - return True diff --git a/article/tests.py b/article/tests.py deleted file mode 100644 index 7ce503c..0000000 --- a/article/tests.py +++ /dev/null @@ -1,3 +0,0 @@ -from django.test import TestCase - -# Create your tests here. diff --git a/article/utils.py b/article/utils.py deleted file mode 100644 index b9a094e..0000000 --- a/article/utils.py +++ /dev/null @@ -1,204 +0,0 @@ -import logging -import requests -import os - -from sickle import Sickle -from time import sleep - -from core.utils import standardizer - - -ARTICLEMETA_ENDPOINT = os.environ.get('ARTICLEMETA_COLLECT_URL', 'http://articlemeta.scielo.org/api/v1/article/counter_dict') -ARTICLEMETA_MAX_RETRIES = int(os.environ.get('ARTICLEMETA_MAX_RETRIES', 5)) -ARTICLEMETA_SLEEP_TIME = int(os.environ.get('ARTICLEMETA_SLEEP_TIME', 30)) - -OPAC_ENDPOINT = os.environ.get('OPAC_ENDPOINT', 'https://www.scielo.br/api/v1/counter_dict') -OPAC_MAX_RETRIES = int(os.environ.get('OPAC_MAX_RETRIES', 5)) -OPAC_SLEEP_TIME = int(os.environ.get('OPAC_SLEEP_TIME', 30)) - -OAI_PMH_PREPRINT_ENDPOINT = os.environ.get('OAI_PMH_PREPRINT_ENDPOINT', 'https://preprints.scielo.org/index.php/scielo/oai') -OAI_METADATA_PREFIX = os.environ.get('OAI_METADATA_PREFIX', 'oai_dc') -OAI_PMH_MAX_RETRIES = int(os.environ.get('OAI_PMH_MAX_RETRIES', 5)) - -DATAVERSE_ENDPOINT = os.environ.get('DATAVERSE_ENDPOINT', 'https://data.scielo.org/api') -DATAVERSE_ROOT_COLLECTION = os.environ.get('DATAVERSE_ROOT_COLLECTION', 'scielodata') 
-DATAVERSE_MAX_RETRIES = int(os.environ.get('DATAVERSE_MAX_RETRIES', 5)) -DATAVERSE_SLEEP_TIME = int(os.environ.get('DATAVERSE_SLEEP_TIME', 30)) - - -def fetch_article_meta_dict(from_date, until_date, offset=0, limit=1000, collection=None, issn=None): - for t in range(1, ARTICLEMETA_MAX_RETRIES + 1): - params = { - 'from': from_date, - 'until': until_date, - 'offset': offset, - 'limit': limit - } - - if collection: - params['collection'] = collection - - if issn: - params['issn'] = issn - - response = requests.get(ARTICLEMETA_ENDPOINT, params=params) - - try: - response.raise_for_status() - logging.info(response.url) - - except requests.exceptions.HTTPError: - logging.warning( - 'Failed to collect data from %s. Waiting %d seconds before retry %d of %d' % ( - response.url, - ARTICLEMETA_SLEEP_TIME, - t, - ARTICLEMETA_MAX_RETRIES - ) - ) - sleep(ARTICLEMETA_SLEEP_TIME) - - else: - return response.json() - - -def fetch_opac_dict(from_date, until_date, page=1): - for t in range(1, OPAC_MAX_RETRIES + 1): - params = { - 'begin_date': from_date, - 'end_date': until_date, - 'page': page - } - - response = requests.get(url=OPAC_ENDPOINT, params=params, verify=False) - - try: - response.raise_for_status() - logging.info(response.url) - - except requests.exceptions.HTTPError: - logging.warning('Não foi possível coletar dados de %s. 
Aguardando %d segundos para tentativa %d de %d' % (response.url, OPAC_SLEEP_TIME, t, OPAC_MAX_RETRIES)) - sleep(OPAC_SLEEP_TIME) - - else: - return response.json() - - -def fetch_preprint_oai_pmh(from_date, until_date): - oai_client = Sickle(endpoint=OAI_PMH_PREPRINT_ENDPOINT, max_retries=OAI_PMH_MAX_RETRIES, verify=False) - records = oai_client.ListRecords(**{ - 'metadataPrefix': OAI_METADATA_PREFIX, - 'from': from_date, - 'until': until_date, - }) - - for r in records: - yield r - - -def extract_preprint_data(record): - pid_generic = _extract_preprint_compatible_identifer(record.header.identifier) - text_langs = [standardizer.standardize_language_code(l) for l in record.metadata.get('language', [])] - publication_date = record.metadata.get('date', [''])[0] - default_language = text_langs[0] if text_langs else '' - publication_year = _extract_preprint_publication_year_from_date(publication_date) - - data = { - 'pid_generic': pid_generic, - 'text_langs': text_langs, - 'publication_date': publication_date, - 'default_language': default_language, - 'publication_year': publication_year - } - - return data - - -def _extract_preprint_compatible_identifer(pid_v2): - try: - # piv_v2 should be something like oai:ops.preprints.scielo.org:preprint/1195 - # we are using the last part of the string as the identifier - return pid_v2.split(':')[-1].split('/')[1] - except IndexError: - return '' - - -def _extract_preprint_publication_year_from_date(date_str): - try: - return date_str[:4] - except IndexError: - return '' - - -def fetch_dataverse_metadata(from_date=None, until_date=None): - def get_subdataverses(): - url = f"{DATAVERSE_ENDPOINT}/dataverses/{DATAVERSE_ROOT_COLLECTION}/contents" - try: - response = requests.get(url, timeout=DATAVERSE_SLEEP_TIME) - response.raise_for_status() - return response.json().get("data", []) - except requests.exceptions.RequestException as e: - logging.error(f"Error fetching subdataverses: {e}") - return [] - - def 
get_datasets(subdataverse_id): - url = f"{DATAVERSE_ENDPOINT}/dataverses/{subdataverse_id}/contents" - try: - response = requests.get(url, timeout=DATAVERSE_SLEEP_TIME) - response.raise_for_status() - return response.json().get("data", []) - except requests.exceptions.RequestException as e: - logging.error(f"Error fetching datasets for subdataverse {subdataverse_id}: {e}") - return [] - - def get_files(dataset_id): - url = f"{DATAVERSE_ENDPOINT}/datasets/{dataset_id}/versions/:latest/files" - try: - response = requests.get(url, timeout=DATAVERSE_SLEEP_TIME) - response.raise_for_status() - return response.json().get("data", []) - except requests.exceptions.RequestException as e: - logging.error(f"Error fetching files for dataset {dataset_id}: {e}") - return [] - - subdataverses = get_subdataverses() - - for subdataverse in subdataverses: - if subdataverse["type"] != "dataverse": - continue - - subdataverse_id = subdataverse["id"] - subdataverse_title = subdataverse["title"] - datasets = get_datasets(subdataverse_id) - - for dataset in datasets: - if dataset["type"] != "dataset": - continue - - dataset_id = dataset["id"] - doi = standardizer.standardize_doi(dataset.get("persistentUrl")) - if not doi: - logging.warning(f"Dataset {dataset_id} does not have a DOI.") - continue - - publication_date = dataset.get("publicationDate", None) - - if publication_date: - if (from_date and publication_date < from_date) or (until_date and publication_date > until_date): - continue - - files = get_files(dataset_id) - - for file in files: - file_persistent_id = file["dataFile"].get("persistentId", None) - file_persistent_id_stz = standardizer.standardize_pid_generic(file_persistent_id) if file_persistent_id else None - - yield { - "title": subdataverse_title, - "dataset_doi": doi, - "dataset_published": publication_date, - "file_id": file["dataFile"]["id"], - "file_name": file["label"], - "file_url": f"{DATAVERSE_ENDPOINT}/access/datafile/{file['dataFile']['id']}", - 
"file_persistent_id": file_persistent_id_stz, - } diff --git a/article/views.py b/article/views.py deleted file mode 100644 index 91ea44a..0000000 --- a/article/views.py +++ /dev/null @@ -1,3 +0,0 @@ -from django.shortcuts import render - -# Create your views here. diff --git a/article/wagtail_hooks.py b/article/wagtail_hooks.py deleted file mode 100644 index 4cf55bd..0000000 --- a/article/wagtail_hooks.py +++ /dev/null @@ -1,39 +0,0 @@ -from django.utils.translation import gettext_lazy as _ -from wagtail.snippets.views.snippets import SnippetViewSet -from wagtail.snippets.models import register_snippet - -from config.menu import get_menu_order - -from .models import Article - - -class ArticleSnippetViewSet(SnippetViewSet): - model = Article - icon = "folder-open-inverse" - menu_name = "article" - menu_label = _("Article") - menu_order = get_menu_order("article") - add_to_admin_menu = True - - list_display = ( - "collection", - "scielo_issn", - "pid_v2", - "pid_v3", - "pid_generic", - "files", - "publication_year", - ) - list_filter = ( - "collection", - "scielo_issn", - "publication_year", - ) - search_fields = ( - "scielo_issn", - "pid_v2", - "pid_v3", - "pid_generic", - ) - -register_snippet(ArticleSnippetViewSet) From f67d6dcf0efbf7a9b2647f3ff90ee5ee15b3e546 Mon Sep 17 00:00:00 2001 From: Rafael JP Damaceno Date: Fri, 1 May 2026 22:30:51 -0300 Subject: [PATCH 03/31] chore: remover models.py do app journal --- journal/models.py | 100 ---------------------------------------------- 1 file changed, 100 deletions(-) delete mode 100644 journal/models.py diff --git a/journal/models.py b/journal/models.py deleted file mode 100644 index 0d830e9..0000000 --- a/journal/models.py +++ /dev/null @@ -1,100 +0,0 @@ -from django.db import models -from django.utils.translation import gettext_lazy as _ - -from core.models import CommonControlField -from collection.models import Collection - - -class Journal(CommonControlField): - collection = models.ForeignKey( - Collection, - 
verbose_name=_('Collection'), - on_delete=models.CASCADE, - blank=False, - null=False, - db_index=True, - ) - - scielo_issn = models.CharField( - verbose_name=_('SciELO ISSN'), - max_length=9, - blank=False, - null=False, - db_index=True, - ) - - issns = models.JSONField( - verbose_name=_('ISSNs'), - null=True, - blank=True, - default=dict, - ) - - acronym = models.CharField( - verbose_name=_('Journal Acronym'), - max_length=32, - blank=True, - null=True, - default='', - ) - - title = models.CharField( - verbose_name=_('Journal Title'), - max_length=255, - blank=False, - null=False, - ) - - publisher_name = models.JSONField( - verbose_name=_('Publisher Name'), - blank=True, - null=True, - default=list, - ) - - subject_areas = models.JSONField( - verbose_name=_('Subject Areas (CAPES)'), - null=False, - blank=False, - default=list, - ) - - wos_subject_areas = models.JSONField( - verbose_name=_('Subject Areas (WoS)'), - null=False, - blank=False, - default=list, - ) - - def __str__(self): - return f'{self.collection.acron2} - {self.scielo_issn} - {self.acronym}' - - @classmethod - def metadata(cls, collection=None): - queryset = cls.objects.all() - if collection: - queryset = queryset.filter(collection=collection) - - for journal in queryset.only( - 'acronym', 'collection__acron3', 'issns', 'publisher_name', - 'scielo_issn', 'subject_areas', 'title', 'wos_subject_areas' - ): - yield { - 'acronym': journal.acronym, - 'collection': journal.collection.acron3, - 'issns': set([v for v in journal.issns.values() if v]), - 'publisher_name': journal.publisher_name, - 'scielo_issn': journal.scielo_issn, - 'subject_areas': journal.subject_areas, - 'title': journal.title, - 'wos_subject_areas': journal.wos_subject_areas, - } - - class Meta: - verbose_name = _('Journal') - verbose_name_plural = _('Journals') - unique_together = ( - 'collection', - 'scielo_issn', - 'acronym', - ) From b4b16ac6a17c4523b2c2019836ae400d89507b6c Mon Sep 17 00:00:00 2001 From: Rafael JP Damaceno Date: 
Fri, 1 May 2026 22:30:51 -0300 Subject: [PATCH 04/31] chore: remover migrations, tasks, utils, views, admin e hooks do app journal --- journal/__init__.py | 0 journal/admin.py | 3 - journal/apps.py | 6 - journal/migrations/0001_initial.py | 122 ------------------ .../0002_alter_journal_scielo_issn.py | 19 --- journal/migrations/__init__.py | 0 journal/tasks.py | 56 -------- journal/tests.py | 3 - journal/utils.py | 19 --- journal/views.py | 3 - journal/wagtail_hooks.py | 40 ------ 11 files changed, 271 deletions(-) delete mode 100644 journal/__init__.py delete mode 100644 journal/admin.py delete mode 100644 journal/apps.py delete mode 100644 journal/migrations/0001_initial.py delete mode 100644 journal/migrations/0002_alter_journal_scielo_issn.py delete mode 100644 journal/migrations/__init__.py delete mode 100644 journal/tasks.py delete mode 100644 journal/tests.py delete mode 100644 journal/utils.py delete mode 100644 journal/views.py delete mode 100644 journal/wagtail_hooks.py diff --git a/journal/__init__.py b/journal/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/journal/admin.py b/journal/admin.py deleted file mode 100644 index 8c38f3f..0000000 --- a/journal/admin.py +++ /dev/null @@ -1,3 +0,0 @@ -from django.contrib import admin - -# Register your models here. 
diff --git a/journal/apps.py b/journal/apps.py deleted file mode 100644 index e10a171..0000000 --- a/journal/apps.py +++ /dev/null @@ -1,6 +0,0 @@ -from django.apps import AppConfig - - -class JournalConfig(AppConfig): - default_auto_field = "django.db.models.BigAutoField" - name = "journal" diff --git a/journal/migrations/0001_initial.py b/journal/migrations/0001_initial.py deleted file mode 100644 index 7164bbc..0000000 --- a/journal/migrations/0001_initial.py +++ /dev/null @@ -1,122 +0,0 @@ -# Generated by Django 5.0.7 on 2025-02-07 17:50 - -import django.db.models.deletion -from django.conf import settings -from django.db import migrations, models - - -class Migration(migrations.Migration): - initial = True - - dependencies = [ - ("collection", "0001_initial"), - migrations.swappable_dependency(settings.AUTH_USER_MODEL), - ] - - operations = [ - migrations.CreateModel( - name="Journal", - fields=[ - ( - "id", - models.BigAutoField( - auto_created=True, - primary_key=True, - serialize=False, - verbose_name="ID", - ), - ), - ( - "created", - models.DateTimeField( - auto_now_add=True, verbose_name="Creation date" - ), - ), - ( - "updated", - models.DateTimeField( - auto_now=True, verbose_name="Last update date" - ), - ), - ( - "scielo_issn", - models.CharField(max_length=9, verbose_name="SciELO ISSN"), - ), - ( - "issns", - models.JSONField( - blank=True, default=dict, null=True, verbose_name="ISSNs" - ), - ), - ( - "acronym", - models.CharField( - blank=True, - default="", - max_length=32, - null=True, - verbose_name="Journal Acronym", - ), - ), - ( - "title", - models.CharField(max_length=255, verbose_name="Journal Title"), - ), - ( - "publisher_name", - models.JSONField( - blank=True, - default=list, - null=True, - verbose_name="Publisher Name", - ), - ), - ( - "subject_areas", - models.JSONField( - default=list, verbose_name="Subject Areas (CAPES)" - ), - ), - ( - "wos_subject_areas", - models.JSONField(default=list, verbose_name="Subject Areas (WoS)"), - ), - 
( - "collection", - models.ForeignKey( - on_delete=django.db.models.deletion.CASCADE, - to="collection.collection", - verbose_name="Collection", - ), - ), - ( - "creator", - models.ForeignKey( - editable=False, - null=True, - on_delete=django.db.models.deletion.SET_NULL, - related_name="%(class)s_creator", - to=settings.AUTH_USER_MODEL, - verbose_name="Creator", - ), - ), - ( - "updated_by", - models.ForeignKey( - blank=True, - editable=False, - null=True, - on_delete=django.db.models.deletion.SET_NULL, - related_name="%(class)s_last_mod_user", - to=settings.AUTH_USER_MODEL, - verbose_name="Updater", - ), - ), - ], - options={ - "verbose_name": "Journal", - "verbose_name_plural": "Journals", - "unique_together": {("collection", "scielo_issn", "acronym")}, - }, - ), - ] diff --git a/journal/migrations/0002_alter_journal_scielo_issn.py b/journal/migrations/0002_alter_journal_scielo_issn.py deleted file mode 100644 index 07cf94f..0000000 --- a/journal/migrations/0002_alter_journal_scielo_issn.py +++ /dev/null @@ -1,19 +0,0 @@ -# Generated by Django 5.0.7 on 2025-06-12 17:16 - -from django.db import migrations, models - - -class Migration(migrations.Migration): - dependencies = [ - ("journal", "0001_initial"), - ] - - operations = [ - migrations.AlterField( - model_name="journal", - name="scielo_issn", - field=models.CharField( - db_index=True, max_length=9, verbose_name="SciELO ISSN" - ), - ), - ] diff --git a/journal/migrations/__init__.py b/journal/migrations/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/journal/tasks.py b/journal/tasks.py deleted file mode 100644 index 71681cb..0000000 --- a/journal/tasks.py +++ /dev/null @@ -1,56 +0,0 @@ -import logging - -from django.contrib.auth import get_user_model -from django.db import IntegrityError -from django.utils import timezone -from django.utils.translation import gettext as _ - -from collection.models import Collection -from config import celery_app -from core.utils.utils import _get_user 
- -from . import models, utils - - -User = get_user_model() - - -@celery_app.task(bind=True, name=_('Load journal data from Article Meta'), queue='load') -def task_load_journal_data_from_article_meta(self, collections=[], force_update=True, user_id=None, username=None, mode='thrift'): - user = _get_user(user_id, username) - - for col in collections or Collection.acron3_list(): - for j in utils.fetch_article_meta_journals(collection=col, mode=mode): - collection = Collection.objects.get(acron3=j.collection_acronym) - if not collection: - logging.error(f'Collection {j.collection_acronym} does not exist') - continue - - try: - journal, created = models.Journal.objects.get_or_create(collection=collection, scielo_issn=j.scielo_issn) - except IntegrityError as e: - logging.error(f'Journal {j} has not been created due to error: {e}') - continue - - if created: - journal.creator = user - journal.created = timezone.now() - - if created or force_update: - journal.updated_by = user - journal.updated = timezone.now() - journal.issns = { - 'electronic_issn': j.electronic_issn or '', - 'print_issn': j.print_issn or '', - 'scielo_issn': j.scielo_issn - } - journal.acronym = j.acronym - journal.title = j.title - journal.publisher_name = j.publisher_name or '' - journal.subject_areas = j.subject_areas or [] - journal.wos_subject_areas = j.wos_subject_areas or [] - logging.info(f'Journal {"created" if created else "updated"}: {journal}') - - journal.save() - - return True diff --git a/journal/tests.py b/journal/tests.py deleted file mode 100644 index 7ce503c..0000000 --- a/journal/tests.py +++ /dev/null @@ -1,3 +0,0 @@ -from django.test import TestCase - -# Create your tests here. 
diff --git a/journal/utils.py b/journal/utils.py deleted file mode 100644 index 8a80521..0000000 --- a/journal/utils.py +++ /dev/null @@ -1,19 +0,0 @@ -from articlemeta.client import ThriftClient, RestfulClient - - -def fetch_article_meta_journals(collection='scl', mode='rest'): - """ - Fetches article metadata from journals. - - Returns - ------- - list - A list of article metadata. - """ - if mode == 'rest': - am = RestfulClient() - elif mode == 'thrift': - am = ThriftClient() - - for j in am.journals(collection=collection): - yield j diff --git a/journal/views.py b/journal/views.py deleted file mode 100644 index 91ea44a..0000000 --- a/journal/views.py +++ /dev/null @@ -1,3 +0,0 @@ -from django.shortcuts import render - -# Create your views here. diff --git a/journal/wagtail_hooks.py b/journal/wagtail_hooks.py deleted file mode 100644 index 725b370..0000000 --- a/journal/wagtail_hooks.py +++ /dev/null @@ -1,40 +0,0 @@ -from django.utils.translation import gettext_lazy as _ -from wagtail.snippets.views.snippets import SnippetViewSet -from wagtail.snippets.models import register_snippet - -from config.menu import get_menu_order - -from .models import Journal - - -class JournalSnippetViewSet(SnippetViewSet): - model = Journal - icon = "folder-open-inverse" - menu_name = "journal" - menu_label = _("Journal") - menu_order = get_menu_order('journal') - add_to_admin_menu = True - - list_display = ( - "collection", - "scielo_issn", - "acronym", - "title", - "issns", - "publisher_name", - "subject_areas", - "wos_subject_areas", - ) - list_filter = ( - "collection", - ) - search_fields = ( - "issns", - "acronym", - "publisher_name", - "subject_areas", - "wos_subject_areas", - ) - - -register_snippet(JournalSnippetViewSet) From 2ada2b0a12ef9daa4ce3d41d3693de9df81be81e Mon Sep 17 00:00:00 2001 From: Rafael JP Damaceno Date: Fri, 1 May 2026 22:31:03 -0300 Subject: [PATCH 05/31] =?UTF-8?q?chore:=20remover=20documenta=C3=A7=C3=A3o?= 
=?UTF-8?q?=20Sphinx=20n=C3=A3o=20utilizada=20(docs/)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .readthedocs.yml | 12 --------- docs/Makefile | 29 ---------------------- docs/__init__.py | 1 - docs/conf.py | 64 ------------------------------------------------ docs/howto.rst | 38 ---------------------------- docs/index.rst | 23 ----------------- docs/make.bat | 46 ---------------------------------- docs/users.rst | 15 ------------ 8 files changed, 228 deletions(-) delete mode 100644 .readthedocs.yml delete mode 100644 docs/Makefile delete mode 100644 docs/__init__.py delete mode 100644 docs/conf.py delete mode 100644 docs/howto.rst delete mode 100644 docs/index.rst delete mode 100644 docs/make.bat delete mode 100644 docs/users.rst diff --git a/.readthedocs.yml b/.readthedocs.yml deleted file mode 100644 index b4cf0c0..0000000 --- a/.readthedocs.yml +++ /dev/null @@ -1,12 +0,0 @@ -version: 2 - -sphinx: - configuration: docs/conf.py - -build: - image: testing - -python: - version: 3.9 - install: - - requirements: requirements/local.txt diff --git a/docs/Makefile b/docs/Makefile deleted file mode 100644 index 6957700..0000000 --- a/docs/Makefile +++ /dev/null @@ -1,29 +0,0 @@ -# Minimal makefile for Sphinx documentation -# - -# You can set these variables from the command line, and also -# from the environment for the first two. -SPHINXOPTS ?= -SPHINXBUILD ?= sphinx-build -SOURCEDIR = . -BUILDDIR = ./_build -APP = /app - -.PHONY: help livehtml apidocs Makefile - -# Put it first so that "make" without argument is like "make help". -help: - @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -c . - -# Build, watch and serve docs with live reload -livehtml: - sphinx-autobuild -b html --host 0.0.0.0 --port 9000 --watch $(APP) -c . 
$(SOURCEDIR) $(BUILDDIR)/html - -# Outputs rst files from django application code -apidocs: - sphinx-apidoc -o $(SOURCEDIR)/api $(APP) - -# Catch-all target: route all unknown targets to Sphinx using the new -# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). -%: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -c . diff --git a/docs/__init__.py b/docs/__init__.py deleted file mode 100644 index 8772c82..0000000 --- a/docs/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# Included so that Django's startproject comment runs against the docs directory diff --git a/docs/conf.py b/docs/conf.py deleted file mode 100644 index 51cd921..0000000 --- a/docs/conf.py +++ /dev/null @@ -1,64 +0,0 @@ -# Configuration file for the Sphinx documentation builder. -# -# This file only contains a selection of the most common options. For a full -# list see the documentation: -# https://www.sphinx-doc.org/en/master/usage/configuration.html - -# -- Path setup -------------------------------------------------------------- - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. 
- -import os -import sys - -import django - -if os.getenv("READTHEDOCS", default=False) == "True": - sys.path.insert(0, os.path.abspath("..")) - os.environ["DJANGO_READ_DOT_ENV_FILE"] = "True" - os.environ["USE_DOCKER"] = "no" -else: - sys.path.insert(0, os.path.abspath("/app")) -os.environ["DATABASE_URL"] = "sqlite:///readthedocs.db" -os.environ["CELERY_BROKER_URL"] = os.getenv("REDIS_URL", "redis://redis:6379") -os.environ.setdefault("DJANGO_SETTINGS_MODULE", "config.settings.local") -django.setup() - -# -- Project information ----------------------------------------------------- - -project = "SciELO Core" -copyright = """2022, SciELO""" -author = "SciELO" - - -# -- General configuration --------------------------------------------------- - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -extensions = [ - "sphinx.ext.autodoc", - "sphinx.ext.napoleon", -] - -# Add any paths that contain templates here, relative to this directory. -# templates_path = ["_templates"] - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This pattern also affects html_static_path and html_extra_path. -exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] - -# -- Options for HTML output ------------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -# -html_theme = "alabaster" - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". 
-# html_static_path = ["_static"] diff --git a/docs/howto.rst b/docs/howto.rst deleted file mode 100644 index 9fae300..0000000 --- a/docs/howto.rst +++ /dev/null @@ -1,38 +0,0 @@ -How To - Project Documentation -====================================================================== - -Get Started ----------------------------------------------------------------------- - -Documentation can be written as rst files in `core/docs`. - - -To build and serve docs, use the commands:: - - docker compose -f local.yml up docs - - - -Changes to files in `docs/_source` will be picked up and reloaded automatically. - -`Sphinx `_ is the tool used to build documentation. - -Docstrings to Documentation ----------------------------------------------------------------------- - -The sphinx extension `apidoc `_ is used to automatically document code using signatures and docstrings. - -Numpy or Google style docstrings will be picked up from project files and availble for documentation. See the `Napoleon `_ extension for details. - -For an in-use example, see the `page source <_sources/users.rst.txt>`_ for :ref:`users`. - -To compile all docstrings automatically into documentation source files, use the command: - :: - - make apidocs - - -This can be done in the docker container: - :: - - docker run --rm docs make apidocs diff --git a/docs/index.rst b/docs/index.rst deleted file mode 100644 index b6c6ded..0000000 --- a/docs/index.rst +++ /dev/null @@ -1,23 +0,0 @@ -.. SciELO Content Manager documentation master file, created by - sphinx-quickstart. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. - -Welcome to SciELO Core's documentation! -====================================================================== - -.. 
toctree:: - :maxdepth: 2 - :caption: Contents: - - howto - users - - - -Indices and tables -================== - -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` diff --git a/docs/make.bat b/docs/make.bat deleted file mode 100644 index 4f70eed..0000000 --- a/docs/make.bat +++ /dev/null @@ -1,46 +0,0 @@ -@ECHO OFF - -pushd %~dp0 - -REM Command file for Sphinx documentation - - -if "%SPHINXBUILD%" == "" ( - set SPHINXBUILD=sphinx-build -c . -) -set SOURCEDIR=_source -set BUILDDIR=_build -set APP=..\core - -if "%1" == "" goto help - -%SPHINXBUILD% >NUL 2>NUL -if errorlevel 9009 ( - echo. - echo.The 'sphinx-build' command was not found. Make sure you have Sphinx - echo.installed, then set the SPHINXBUILD environment variable to point - echo.to the full path of the 'sphinx-build' executable. Alternatively you - echo.may add the Sphinx directory to PATH. - echo. - echo.Install sphinx-autobuild for live serving. - echo.If you don't have Sphinx installed, grab it from - echo.http://sphinx-doc.org/ - exit /b 1 -) - -%SPHINXBUILD% -b %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% -goto end - -:livehtml -sphinx-autobuild -b html --open-browser -p 9000 --watch %APP% -c . %SOURCEDIR% %BUILDDIR%/html -GOTO :EOF - -:apidocs -sphinx-apidoc -o %SOURCEDIR%/api %APP% -GOTO :EOF - -:help -%SPHINXBUILD% -b help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% - -:end -popd diff --git a/docs/users.rst b/docs/users.rst deleted file mode 100644 index 21e08aa..0000000 --- a/docs/users.rst +++ /dev/null @@ -1,15 +0,0 @@ - .. _users: - -Users -====================================================================== - -Starting a new project, it’s highly recommended to set up a custom user model, -even if the default User model is sufficient for you. - -This model behaves identically to the default user model, -but you’ll be able to customize it in the future if the need arises. - -.. 
automodule:: core.users.models - :members: - :noindex: - From a9fbd227ad6f75b0a40cba9db3082166246e159d Mon Sep 17 00:00:00 2001 From: Rafael JP Damaceno Date: Fri, 1 May 2026 22:31:03 -0300 Subject: [PATCH 06/31] =?UTF-8?q?chore:=20remover=20arquivos=20de=20config?= =?UTF-8?q?ura=C3=A7=C3=A3o=20mortos=20(.pylintrc,=20COPYING,=20merge=5Fpr?= =?UTF-8?q?oduction=5Fdotenvs)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .pylintrc | 14 - COPYING | 674 -------------------------- merge_production_dotenvs_in_dotenv.py | 67 --- 3 files changed, 755 deletions(-) delete mode 100644 .pylintrc delete mode 100644 COPYING delete mode 100644 merge_production_dotenvs_in_dotenv.py diff --git a/.pylintrc b/.pylintrc deleted file mode 100644 index 55509fe..0000000 --- a/.pylintrc +++ /dev/null @@ -1,14 +0,0 @@ -[MASTER] -load-plugins=pylint_django, pylint_celery -django-settings-module=config.settings.base -[FORMAT] -max-line-length=120 - -[MESSAGES CONTROL] -disable=missing-docstring,invalid-name - -[DESIGN] -max-parents=13 - -[TYPECHECK] -generated-members=REQUEST,acl_users,aq_parent,"[a-zA-Z]+_set{1,2}",save,delete diff --git a/COPYING b/COPYING deleted file mode 100644 index 94a9ed0..0000000 --- a/COPYING +++ /dev/null @@ -1,674 +0,0 @@ - GNU GENERAL PUBLIC LICENSE - Version 3, 29 June 2007 - - Copyright (C) 2007 Free Software Foundation, Inc. - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - - Preamble - - The GNU General Public License is a free, copyleft license for -software and other kinds of works. - - The licenses for most software and other practical works are designed -to take away your freedom to share and change the works. By contrast, -the GNU General Public License is intended to guarantee your freedom to -share and change all versions of a program--to make sure it remains free -software for all its users. 
We, the Free Software Foundation, use the -GNU General Public License for most of our software; it applies also to -any other work released this way by its authors. You can apply it to -your programs, too. - - When we speak of free software, we are referring to freedom, not -price. Our General Public Licenses are designed to make sure that you -have the freedom to distribute copies of free software (and charge for -them if you wish), that you receive source code or can get it if you -want it, that you can change the software or use pieces of it in new -free programs, and that you know you can do these things. - - To protect your rights, we need to prevent others from denying you -these rights or asking you to surrender the rights. Therefore, you have -certain responsibilities if you distribute copies of the software, or if -you modify it: responsibilities to respect the freedom of others. - - For example, if you distribute copies of such a program, whether -gratis or for a fee, you must pass on to the recipients the same -freedoms that you received. You must make sure that they, too, receive -or can get the source code. And you must show them these terms so they -know their rights. - - Developers that use the GNU GPL protect your rights with two steps: -(1) assert copyright on the software, and (2) offer you this License -giving you legal permission to copy, distribute and/or modify it. - - For the developers' and authors' protection, the GPL clearly explains -that there is no warranty for this free software. For both users' and -authors' sake, the GPL requires that modified versions be marked as -changed, so that their problems will not be attributed erroneously to -authors of previous versions. - - Some devices are designed to deny users access to install or run -modified versions of the software inside them, although the manufacturer -can do so. This is fundamentally incompatible with the aim of -protecting users' freedom to change the software. 
The systematic -pattern of such abuse occurs in the area of products for individuals to -use, which is precisely where it is most unacceptable. Therefore, we -have designed this version of the GPL to prohibit the practice for those -products. If such problems arise substantially in other domains, we -stand ready to extend this provision to those domains in future versions -of the GPL, as needed to protect the freedom of users. - - Finally, every program is threatened constantly by software patents. -States should not allow patents to restrict development and use of -software on general-purpose computers, but in those that do, we wish to -avoid the special danger that patents applied to a free program could -make it effectively proprietary. To prevent this, the GPL assures that -patents cannot be used to render the program non-free. - - The precise terms and conditions for copying, distribution and -modification follow. - - TERMS AND CONDITIONS - - 0. Definitions. - - "This License" refers to version 3 of the GNU General Public License. - - "Copyright" also means copyright-like laws that apply to other kinds of -works, such as semiconductor masks. - - "The Program" refers to any copyrightable work licensed under this -License. Each licensee is addressed as "you". "Licensees" and -"recipients" may be individuals or organizations. - - To "modify" a work means to copy from or adapt all or part of the work -in a fashion requiring copyright permission, other than the making of an -exact copy. The resulting work is called a "modified version" of the -earlier work or a work "based on" the earlier work. - - A "covered work" means either the unmodified Program or a work based -on the Program. - - To "propagate" a work means to do anything with it that, without -permission, would make you directly or secondarily liable for -infringement under applicable copyright law, except executing it on a -computer or modifying a private copy. 
Propagation includes copying, -distribution (with or without modification), making available to the -public, and in some countries other activities as well. - - To "convey" a work means any kind of propagation that enables other -parties to make or receive copies. Mere interaction with a user through -a computer network, with no transfer of a copy, is not conveying. - - An interactive user interface displays "Appropriate Legal Notices" -to the extent that it includes a convenient and prominently visible -feature that (1) displays an appropriate copyright notice, and (2) -tells the user that there is no warranty for the work (except to the -extent that warranties are provided), that licensees may convey the -work under this License, and how to view a copy of this License. If -the interface presents a list of user commands or options, such as a -menu, a prominent item in the list meets this criterion. - - 1. Source Code. - - The "source code" for a work means the preferred form of the work -for making modifications to it. "Object code" means any non-source -form of a work. - - A "Standard Interface" means an interface that either is an official -standard defined by a recognized standards body, or, in the case of -interfaces specified for a particular programming language, one that -is widely used among developers working in that language. - - The "System Libraries" of an executable work include anything, other -than the work as a whole, that (a) is included in the normal form of -packaging a Major Component, but which is not part of that Major -Component, and (b) serves only to enable use of the work with that -Major Component, or to implement a Standard Interface for which an -implementation is available to the public in source code form. 
A -"Major Component", in this context, means a major essential component -(kernel, window system, and so on) of the specific operating system -(if any) on which the executable work runs, or a compiler used to -produce the work, or an object code interpreter used to run it. - - The "Corresponding Source" for a work in object code form means all -the source code needed to generate, install, and (for an executable -work) run the object code and to modify the work, including scripts to -control those activities. However, it does not include the work's -System Libraries, or general-purpose tools or generally available free -programs which are used unmodified in performing those activities but -which are not part of the work. For example, Corresponding Source -includes interface definition files associated with source files for -the work, and the source code for shared libraries and dynamically -linked subprograms that the work is specifically designed to require, -such as by intimate data communication or control flow between those -subprograms and other parts of the work. - - The Corresponding Source need not include anything that users -can regenerate automatically from other parts of the Corresponding -Source. - - The Corresponding Source for a work in source code form is that -same work. - - 2. Basic Permissions. - - All rights granted under this License are granted for the term of -copyright on the Program, and are irrevocable provided the stated -conditions are met. This License explicitly affirms your unlimited -permission to run the unmodified Program. The output from running a -covered work is covered by this License only if the output, given its -content, constitutes a covered work. This License acknowledges your -rights of fair use or other equivalent, as provided by copyright law. - - You may make, run and propagate covered works that you do not -convey, without conditions so long as your license otherwise remains -in force. 
You may convey covered works to others for the sole purpose -of having them make modifications exclusively for you, or provide you -with facilities for running those works, provided that you comply with -the terms of this License in conveying all material for which you do -not control copyright. Those thus making or running the covered works -for you must do so exclusively on your behalf, under your direction -and control, on terms that prohibit them from making any copies of -your copyrighted material outside their relationship with you. - - Conveying under any other circumstances is permitted solely under -the conditions stated below. Sublicensing is not allowed; section 10 -makes it unnecessary. - - 3. Protecting Users' Legal Rights From Anti-Circumvention Law. - - No covered work shall be deemed part of an effective technological -measure under any applicable law fulfilling obligations under article -11 of the WIPO copyright treaty adopted on 20 December 1996, or -similar laws prohibiting or restricting circumvention of such -measures. - - When you convey a covered work, you waive any legal power to forbid -circumvention of technological measures to the extent such circumvention -is effected by exercising rights under this License with respect to -the covered work, and you disclaim any intention to limit operation or -modification of the work as a means of enforcing, against the work's -users, your or third parties' legal rights to forbid circumvention of -technological measures. - - 4. Conveying Verbatim Copies. 
- - You may convey verbatim copies of the Program's source code as you -receive it, in any medium, provided that you conspicuously and -appropriately publish on each copy an appropriate copyright notice; -keep intact all notices stating that this License and any -non-permissive terms added in accord with section 7 apply to the code; -keep intact all notices of the absence of any warranty; and give all -recipients a copy of this License along with the Program. - - You may charge any price or no price for each copy that you convey, -and you may offer support or warranty protection for a fee. - - 5. Conveying Modified Source Versions. - - You may convey a work based on the Program, or the modifications to -produce it from the Program, in the form of source code under the -terms of section 4, provided that you also meet all of these conditions: - - a) The work must carry prominent notices stating that you modified - it, and giving a relevant date. - - b) The work must carry prominent notices stating that it is - released under this License and any conditions added under section - 7. This requirement modifies the requirement in section 4 to - "keep intact all notices". - - c) You must license the entire work, as a whole, under this - License to anyone who comes into possession of a copy. This - License will therefore apply, along with any applicable section 7 - additional terms, to the whole of the work, and all its parts, - regardless of how they are packaged. This License gives no - permission to license the work in any other way, but it does not - invalidate such permission if you have separately received it. - - d) If the work has interactive user interfaces, each must display - Appropriate Legal Notices; however, if the Program has interactive - interfaces that do not display Appropriate Legal Notices, your - work need not make them do so. 
- - A compilation of a covered work with other separate and independent -works, which are not by their nature extensions of the covered work, -and which are not combined with it such as to form a larger program, -in or on a volume of a storage or distribution medium, is called an -"aggregate" if the compilation and its resulting copyright are not -used to limit the access or legal rights of the compilation's users -beyond what the individual works permit. Inclusion of a covered work -in an aggregate does not cause this License to apply to the other -parts of the aggregate. - - 6. Conveying Non-Source Forms. - - You may convey a covered work in object code form under the terms -of sections 4 and 5, provided that you also convey the -machine-readable Corresponding Source under the terms of this License, -in one of these ways: - - a) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by the - Corresponding Source fixed on a durable physical medium - customarily used for software interchange. - - b) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by a - written offer, valid for at least three years and valid for as - long as you offer spare parts or customer support for that product - model, to give anyone who possesses the object code either (1) a - copy of the Corresponding Source for all the software in the - product that is covered by this License, on a durable physical - medium customarily used for software interchange, for a price no - more than your reasonable cost of physically performing this - conveying of source, or (2) access to copy the - Corresponding Source from a network server at no charge. - - c) Convey individual copies of the object code with a copy of the - written offer to provide the Corresponding Source. 
This - alternative is allowed only occasionally and noncommercially, and - only if you received the object code with such an offer, in accord - with subsection 6b. - - d) Convey the object code by offering access from a designated - place (gratis or for a charge), and offer equivalent access to the - Corresponding Source in the same way through the same place at no - further charge. You need not require recipients to copy the - Corresponding Source along with the object code. If the place to - copy the object code is a network server, the Corresponding Source - may be on a different server (operated by you or a third party) - that supports equivalent copying facilities, provided you maintain - clear directions next to the object code saying where to find the - Corresponding Source. Regardless of what server hosts the - Corresponding Source, you remain obligated to ensure that it is - available for as long as needed to satisfy these requirements. - - e) Convey the object code using peer-to-peer transmission, provided - you inform other peers where the object code and Corresponding - Source of the work are being offered to the general public at no - charge under subsection 6d. - - A separable portion of the object code, whose source code is excluded -from the Corresponding Source as a System Library, need not be -included in conveying the object code work. - - A "User Product" is either (1) a "consumer product", which means any -tangible personal property which is normally used for personal, family, -or household purposes, or (2) anything designed or sold for incorporation -into a dwelling. In determining whether a product is a consumer product, -doubtful cases shall be resolved in favor of coverage. 
For a particular -product received by a particular user, "normally used" refers to a -typical or common use of that class of product, regardless of the status -of the particular user or of the way in which the particular user -actually uses, or expects or is expected to use, the product. A product -is a consumer product regardless of whether the product has substantial -commercial, industrial or non-consumer uses, unless such uses represent -the only significant mode of use of the product. - - "Installation Information" for a User Product means any methods, -procedures, authorization keys, or other information required to install -and execute modified versions of a covered work in that User Product from -a modified version of its Corresponding Source. The information must -suffice to ensure that the continued functioning of the modified object -code is in no case prevented or interfered with solely because -modification has been made. - - If you convey an object code work under this section in, or with, or -specifically for use in, a User Product, and the conveying occurs as -part of a transaction in which the right of possession and use of the -User Product is transferred to the recipient in perpetuity or for a -fixed term (regardless of how the transaction is characterized), the -Corresponding Source conveyed under this section must be accompanied -by the Installation Information. But this requirement does not apply -if neither you nor any third party retains the ability to install -modified object code on the User Product (for example, the work has -been installed in ROM). - - The requirement to provide Installation Information does not include a -requirement to continue to provide support service, warranty, or updates -for a work that has been modified or installed by the recipient, or for -the User Product in which it has been modified or installed. 
Access to a -network may be denied when the modification itself materially and -adversely affects the operation of the network or violates the rules and -protocols for communication across the network. - - Corresponding Source conveyed, and Installation Information provided, -in accord with this section must be in a format that is publicly -documented (and with an implementation available to the public in -source code form), and must require no special password or key for -unpacking, reading or copying. - - 7. Additional Terms. - - "Additional permissions" are terms that supplement the terms of this -License by making exceptions from one or more of its conditions. -Additional permissions that are applicable to the entire Program shall -be treated as though they were included in this License, to the extent -that they are valid under applicable law. If additional permissions -apply only to part of the Program, that part may be used separately -under those permissions, but the entire Program remains governed by -this License without regard to the additional permissions. - - When you convey a copy of a covered work, you may at your option -remove any additional permissions from that copy, or from any part of -it. (Additional permissions may be written to require their own -removal in certain cases when you modify the work.) You may place -additional permissions on material, added by you to a covered work, -for which you have or can give appropriate copyright permission. 
- - Notwithstanding any other provision of this License, for material you -add to a covered work, you may (if authorized by the copyright holders of -that material) supplement the terms of this License with terms: - - a) Disclaiming warranty or limiting liability differently from the - terms of sections 15 and 16 of this License; or - - b) Requiring preservation of specified reasonable legal notices or - author attributions in that material or in the Appropriate Legal - Notices displayed by works containing it; or - - c) Prohibiting misrepresentation of the origin of that material, or - requiring that modified versions of such material be marked in - reasonable ways as different from the original version; or - - d) Limiting the use for publicity purposes of names of licensors or - authors of the material; or - - e) Declining to grant rights under trademark law for use of some - trade names, trademarks, or service marks; or - - f) Requiring indemnification of licensors and authors of that - material by anyone who conveys the material (or modified versions of - it) with contractual assumptions of liability to the recipient, for - any liability that these contractual assumptions directly impose on - those licensors and authors. - - All other non-permissive additional terms are considered "further -restrictions" within the meaning of section 10. If the Program as you -received it, or any part of it, contains a notice stating that it is -governed by this License along with a term that is a further -restriction, you may remove that term. If a license document contains -a further restriction but permits relicensing or conveying under this -License, you may add to a covered work material governed by the terms -of that license document, provided that the further restriction does -not survive such relicensing or conveying. 
- - If you add terms to a covered work in accord with this section, you -must place, in the relevant source files, a statement of the -additional terms that apply to those files, or a notice indicating -where to find the applicable terms. - - Additional terms, permissive or non-permissive, may be stated in the -form of a separately written license, or stated as exceptions; -the above requirements apply either way. - - 8. Termination. - - You may not propagate or modify a covered work except as expressly -provided under this License. Any attempt otherwise to propagate or -modify it is void, and will automatically terminate your rights under -this License (including any patent licenses granted under the third -paragraph of section 11). - - However, if you cease all violation of this License, then your -license from a particular copyright holder is reinstated (a) -provisionally, unless and until the copyright holder explicitly and -finally terminates your license, and (b) permanently, if the copyright -holder fails to notify you of the violation by some reasonable means -prior to 60 days after the cessation. - - Moreover, your license from a particular copyright holder is -reinstated permanently if the copyright holder notifies you of the -violation by some reasonable means, this is the first time you have -received notice of violation of this License (for any work) from that -copyright holder, and you cure the violation prior to 30 days after -your receipt of the notice. - - Termination of your rights under this section does not terminate the -licenses of parties who have received copies or rights from you under -this License. If your rights have been terminated and not permanently -reinstated, you do not qualify to receive new licenses for the same -material under section 10. - - 9. Acceptance Not Required for Having Copies. - - You are not required to accept this License in order to receive or -run a copy of the Program. 
Ancillary propagation of a covered work -occurring solely as a consequence of using peer-to-peer transmission -to receive a copy likewise does not require acceptance. However, -nothing other than this License grants you permission to propagate or -modify any covered work. These actions infringe copyright if you do -not accept this License. Therefore, by modifying or propagating a -covered work, you indicate your acceptance of this License to do so. - - 10. Automatic Licensing of Downstream Recipients. - - Each time you convey a covered work, the recipient automatically -receives a license from the original licensors, to run, modify and -propagate that work, subject to this License. You are not responsible -for enforcing compliance by third parties with this License. - - An "entity transaction" is a transaction transferring control of an -organization, or substantially all assets of one, or subdividing an -organization, or merging organizations. If propagation of a covered -work results from an entity transaction, each party to that -transaction who receives a copy of the work also receives whatever -licenses to the work the party's predecessor in interest had or could -give under the previous paragraph, plus a right to possession of the -Corresponding Source of the work from the predecessor in interest, if -the predecessor has it or can get it with reasonable efforts. - - You may not impose any further restrictions on the exercise of the -rights granted or affirmed under this License. For example, you may -not impose a license fee, royalty, or other charge for exercise of -rights granted under this License, and you may not initiate litigation -(including a cross-claim or counterclaim in a lawsuit) alleging that -any patent claim is infringed by making, using, selling, offering for -sale, or importing the Program or any portion of it. - - 11. Patents. 
- - A "contributor" is a copyright holder who authorizes use under this -License of the Program or a work on which the Program is based. The -work thus licensed is called the contributor's "contributor version". - - A contributor's "essential patent claims" are all patent claims -owned or controlled by the contributor, whether already acquired or -hereafter acquired, that would be infringed by some manner, permitted -by this License, of making, using, or selling its contributor version, -but do not include claims that would be infringed only as a -consequence of further modification of the contributor version. For -purposes of this definition, "control" includes the right to grant -patent sublicenses in a manner consistent with the requirements of -this License. - - Each contributor grants you a non-exclusive, worldwide, royalty-free -patent license under the contributor's essential patent claims, to -make, use, sell, offer for sale, import and otherwise run, modify and -propagate the contents of its contributor version. - - In the following three paragraphs, a "patent license" is any express -agreement or commitment, however denominated, not to enforce a patent -(such as an express permission to practice a patent or covenant not to -sue for patent infringement). To "grant" such a patent license to a -party means to make such an agreement or commitment not to enforce a -patent against the party. 
- - If you convey a covered work, knowingly relying on a patent license, -and the Corresponding Source of the work is not available for anyone -to copy, free of charge and under the terms of this License, through a -publicly available network server or other readily accessible means, -then you must either (1) cause the Corresponding Source to be so -available, or (2) arrange to deprive yourself of the benefit of the -patent license for this particular work, or (3) arrange, in a manner -consistent with the requirements of this License, to extend the patent -license to downstream recipients. "Knowingly relying" means you have -actual knowledge that, but for the patent license, your conveying the -covered work in a country, or your recipient's use of the covered work -in a country, would infringe one or more identifiable patents in that -country that you have reason to believe are valid. - - If, pursuant to or in connection with a single transaction or -arrangement, you convey, or propagate by procuring conveyance of, a -covered work, and grant a patent license to some of the parties -receiving the covered work authorizing them to use, propagate, modify -or convey a specific copy of the covered work, then the patent license -you grant is automatically extended to all recipients of the covered -work and works based on it. - - A patent license is "discriminatory" if it does not include within -the scope of its coverage, prohibits the exercise of, or is -conditioned on the non-exercise of one or more of the rights that are -specifically granted under this License. 
You may not convey a covered -work if you are a party to an arrangement with a third party that is -in the business of distributing software, under which you make payment -to the third party based on the extent of your activity of conveying -the work, and under which the third party grants, to any of the -parties who would receive the covered work from you, a discriminatory -patent license (a) in connection with copies of the covered work -conveyed by you (or copies made from those copies), or (b) primarily -for and in connection with specific products or compilations that -contain the covered work, unless you entered into that arrangement, -or that patent license was granted, prior to 28 March 2007. - - Nothing in this License shall be construed as excluding or limiting -any implied license or other defenses to infringement that may -otherwise be available to you under applicable patent law. - - 12. No Surrender of Others' Freedom. - - If conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot convey a -covered work so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you may -not convey it at all. For example, if you agree to terms that obligate you -to collect a royalty for further conveying from those to whom you convey -the Program, the only way you could satisfy both those terms and this -License would be to refrain entirely from conveying the Program. - - 13. Use with the GNU Affero General Public License. - - Notwithstanding any other provision of this License, you have -permission to link or combine any covered work with a work licensed -under version 3 of the GNU Affero General Public License into a single -combined work, and to convey the resulting work. 
The terms of this -License will continue to apply to the part which is the covered work, -but the special requirements of the GNU Affero General Public License, -section 13, concerning interaction through a network will apply to the -combination as such. - - 14. Revised Versions of this License. - - The Free Software Foundation may publish revised and/or new versions of -the GNU General Public License from time to time. Such new versions will -be similar in spirit to the present version, but may differ in detail to -address new problems or concerns. - - Each version is given a distinguishing version number. If the -Program specifies that a certain numbered version of the GNU General -Public License "or any later version" applies to it, you have the -option of following the terms and conditions either of that numbered -version or of any later version published by the Free Software -Foundation. If the Program does not specify a version number of the -GNU General Public License, you may choose any version ever published -by the Free Software Foundation. - - If the Program specifies that a proxy can decide which future -versions of the GNU General Public License can be used, that proxy's -public statement of acceptance of a version permanently authorizes you -to choose that version for the Program. - - Later license versions may give you additional or different -permissions. However, no additional obligations are imposed on any -author or copyright holder as a result of your choosing to follow a -later version. - - 15. Disclaimer of Warranty. - - THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY -APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT -HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY -OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, -THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM -IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF -ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - - 16. Limitation of Liability. - - IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING -WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS -THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY -GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE -USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF -DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD -PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), -EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF -SUCH DAMAGES. - - 17. Interpretation of Sections 15 and 16. - - If the disclaimer of warranty and limitation of liability provided -above cannot be given local legal effect according to their terms, -reviewing courts shall apply local law that most closely approximates -an absolute waiver of all civil liability in connection with the -Program, unless a warranty or assumption of liability accompanies a -copy of the Program in return for a fee. - - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Programs - - If you develop a new program, and you want it to be of the greatest -possible use to the public, the best way to achieve this is to make it -free software which everyone can redistribute and change under these terms. - - To do so, attach the following notices to the program. It is safest -to attach them to the start of each source file to most effectively -state the exclusion of warranty; and each file should have at least -the "copyright" line and a pointer to where the full notice is found. 
- - <one line to give the program's name and a brief idea of what it does.> - Copyright (C) <year> <name of author> - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <https://www.gnu.org/licenses/>. - -Also add information on how to contact you by electronic and paper mail. - - If the program does terminal interaction, make it output a short -notice like this when it starts in an interactive mode: - - <program> Copyright (C) <year> <name of author> - This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. - This is free software, and you are welcome to redistribute it - under certain conditions; type `show c' for details. - -The hypothetical commands `show w' and `show c' should show the appropriate -parts of the General Public License. Of course, your program's commands -might be different; for a GUI interface, you would use an "about box". - - You should also get your employer (if you work as a programmer) or school, -if any, to sign a "copyright disclaimer" for the program, if necessary. -For more information on this, and how to apply and follow the GNU GPL, see -<https://www.gnu.org/licenses/>. - - The GNU General Public License does not permit incorporating your program -into proprietary programs. If your program is a subroutine library, you -may consider it more useful to permit linking proprietary applications with -the library. If this is what you want to do, use the GNU Lesser General -Public License instead of this License. But first, please read -<https://www.gnu.org/licenses/why-not-lgpl.html>.
diff --git a/merge_production_dotenvs_in_dotenv.py b/merge_production_dotenvs_in_dotenv.py deleted file mode 100644 index d1170ef..0000000 --- a/merge_production_dotenvs_in_dotenv.py +++ /dev/null @@ -1,67 +0,0 @@ -import os -from pathlib import Path -from typing import Sequence - -import pytest - -ROOT_DIR_PATH = Path(__file__).parent.resolve() -PRODUCTION_DOTENVS_DIR_PATH = ROOT_DIR_PATH / ".envs" / ".production" -PRODUCTION_DOTENV_FILE_PATHS = [ - PRODUCTION_DOTENVS_DIR_PATH / ".django", - PRODUCTION_DOTENVS_DIR_PATH / ".postgres", -] -DOTENV_FILE_PATH = ROOT_DIR_PATH / ".env" - - -def merge( - output_file_path: str, merged_file_paths: Sequence[str], append_linesep: bool = True -) -> None: - with open(output_file_path, "w") as output_file: - for merged_file_path in merged_file_paths: - with open(merged_file_path, "r") as merged_file: - merged_file_content = merged_file.read() - output_file.write(merged_file_content) - if append_linesep: - output_file.write(os.linesep) - - -def main(): - merge(DOTENV_FILE_PATH, PRODUCTION_DOTENV_FILE_PATHS) - - -@pytest.mark.parametrize("merged_file_count", range(3)) -@pytest.mark.parametrize("append_linesep", [True, False]) -def test_merge(tmpdir_factory, merged_file_count: int, append_linesep: bool): - tmp_dir_path = Path(str(tmpdir_factory.getbasetemp())) - - output_file_path = tmp_dir_path / ".env" - - expected_output_file_content = "" - merged_file_paths = [] - for i in range(merged_file_count): - merged_file_ord = i + 1 - - merged_filename = ".service{}".format(merged_file_ord) - merged_file_path = tmp_dir_path / merged_filename - - merged_file_content = merged_filename * merged_file_ord - - with open(merged_file_path, "w+") as file: - file.write(merged_file_content) - - expected_output_file_content += merged_file_content - if append_linesep: - expected_output_file_content += os.linesep - - merged_file_paths.append(merged_file_path) - - merge(output_file_path, merged_file_paths, append_linesep) - - with 
open(output_file_path, "r") as output_file: - actual_output_file_content = output_file.read() - - assert actual_output_file_content == expected_output_file_content - - -if __name__ == "__main__": - main() From 50135405d71b4632b5c961c504fc066eca844302 Mon Sep 17 00:00:00 2001 From: Rafael JP Damaceno Date: Fri, 1 May 2026 22:31:03 -0300 Subject: [PATCH 07/31] chore: remover start-dev.sh (credenciais hardcoded, usar Docker) --- start-dev.sh | 25 ------------------------- 1 file changed, 25 deletions(-) delete mode 100644 start-dev.sh diff --git a/start-dev.sh b/start-dev.sh deleted file mode 100644 index 92d064a..0000000 --- a/start-dev.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -# Change this value to the local ethernet. -ethernet=wlp0s20f3 - -# Linux IP. -export IP=$(/sbin/ip -o -4 addr list $ethernet | awk '{print $4}' | cut -d/ -f1) - -# Mac OS IP. -#export IP=$(ifconfig $ethernet | grep inet | grep -v inet6 | awk '{print $2}') - -export DATABASE_URL=postgres://GVRFlLmcCNfGLhsFvSnCioYOPJPYpyfj:BQ4hSUL4rdj5WZLdR8ilDLRQMvCtzo0caMaXDO0olGsmycQjlcZlTVK9DepZR8kk@$IP:5432/scielo_core -export CELERY_BROKER_URL=redis://$IP:6379/0 -export USE_DOCKER=no -export IPYTHONDIR=/app/.ipython -export REDIS_URL=redis://$IP:6379/0 -export CELERY_FLOWER_USER=PhFRdLexbrsBvrrbSXxjcMMOcVOavCrZ -export CELERY_FLOWER_PASSWORD=QgScyefPrYhHgO6onW61u0nazc5xdBuP4sM7jMRrBBFuA2RjsFhZLp7xbVYZbrwR -export EMAIL_HOST=$IP -export SOLR_URL=http://$IP:8983/solr/ - - -docker stop scielo_core_local_django -# workon scms -python manage.py runserver_plus 0.0.0.0:8000 From 833ecfc76b6939ccc8e54aa39dddd0871546285a Mon Sep 17 00:00:00 2001 From: Rafael JP Damaceno Date: Fri, 1 May 2026 22:31:03 -0300 Subject: [PATCH 08/31] =?UTF-8?q?refactor:=20remover=20refer=C3=AAncias=20?= =?UTF-8?q?a=20article=20e=20journal=20em=20collection=20e=20core?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 9 ++++++++- collection/tasks.py | 4 ++-- 
collection/wagtail_hooks.py | 10 +--------- 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/.gitignore b/.gitignore index 6342047..dd2c92d 100644 --- a/.gitignore +++ b/.gitignore @@ -169,4 +169,11 @@ cython_debug/ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. -#.idea/ +.idea/ + +# Local agent/editor state +.continue/ +temp/ +.envs/.local/.django +start-dev.sh +opencode.json diff --git a/collection/tasks.py b/collection/tasks.py index 02fd0e7..19372de 100644 --- a/collection/tasks.py +++ b/collection/tasks.py @@ -1,14 +1,14 @@ from django.contrib.auth import get_user_model from django.utils.translation import gettext as _ -from core.utils.utils import _get_user +from core.utils.request_utils import _get_user from collection.models import Collection from config import celery_app User = get_user_model() -@celery_app.task(bind=True, name=_('Load collection data')) +@celery_app.task(bind=True, name=_('[Collection] Load Collection Data')) def task_load_collections(self, user_id=None, username=None): user = _get_user(self.request, username=username, user_id=user_id) Collection.load(user) diff --git a/collection/wagtail_hooks.py b/collection/wagtail_hooks.py index e7b7e97..52b31a8 100644 --- a/collection/wagtail_hooks.py +++ b/collection/wagtail_hooks.py @@ -1,8 +1,5 @@ from django.utils.translation import gettext as _ from wagtail.snippets.views.snippets import SnippetViewSet -from wagtail.snippets.models import register_snippet - -from config.menu import get_menu_order from .models import Collection @@ -10,10 +7,8 @@ class CollectionSnippetViewSet(SnippetViewSet): model = Collection icon = "folder-open-inverse" - menu_name = 'collection' menu_label = _("Collection") - menu_order = get_menu_order("collection") - add_to_admin_menu = True 
+ menu_order = 100 list_display = ( "main_name", @@ -57,6 +52,3 @@ class CollectionSnippetViewSet(SnippetViewSet): "updated_by", ) export_filename = "collections" - - -register_snippet(CollectionSnippetViewSet) From 4045b12350c78501a1e6ec79606a5e963b0acd68 Mon Sep 17 00:00:00 2001 From: Rafael JP Damaceno Date: Fri, 1 May 2026 22:51:02 -0300 Subject: [PATCH 09/31] fix: remover article e journal do INSTALLED_APPS --- config/settings/base.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/config/settings/base.py b/config/settings/base.py index 4e96ed4..9638274 100644 --- a/config/settings/base.py +++ b/config/settings/base.py @@ -114,10 +114,8 @@ "core.users", "core_settings", # Your stuff: custom apps go here - "article", "collection", "core", - "journal", "log_manager", "log_manager_config", "metrics", From 509408d728182854243ff8769b433fa0612a166f Mon Sep 17 00:00:00 2001 From: Rafael JP Damaceno Date: Fri, 1 May 2026 22:32:50 -0300 Subject: [PATCH 10/31] =?UTF-8?q?feat:=20adicionar=20modelo=20Document=20c?= =?UTF-8?q?om=20metadados=20unificados=20e=20migra=C3=A7=C3=A3o?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- document/migrations/0001_initial.py | 279 ++++++++++++++++++++++++++++ document/migrations/__init__.py | 1 + document/models.py | 258 +++++++++++++++++++++++++ 3 files changed, 538 insertions(+) create mode 100644 document/migrations/0001_initial.py create mode 100644 document/migrations/__init__.py create mode 100644 document/models.py diff --git a/document/migrations/0001_initial.py b/document/migrations/0001_initial.py new file mode 100644 index 0000000..bff11be --- /dev/null +++ b/document/migrations/0001_initial.py @@ -0,0 +1,279 @@ +# Generated by Django 5.0.7 on 2026-03-15 00:00 + +import django.db.models.deletion +from django.conf import settings +from django.db import migrations, models + + +class Migration(migrations.Migration): + initial = True + + dependencies = [ + ("collection", 
"0001_initial"), + ("source", "0001_initial"), + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ] + + operations = [ + migrations.CreateModel( + name="Document", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ( + "created", + models.DateTimeField( + auto_now_add=True, + verbose_name="Creation date", + ), + ), + ( + "updated", + models.DateTimeField( + auto_now=True, + verbose_name="Last update date", + ), + ), + ( + "document_type", + models.CharField( + choices=[ + ("article", "Article"), + ("preprint", "Preprint"), + ("dataset", "Dataset"), + ("book", "Book"), + ("chapter", "Chapter"), + ("other", "Other"), + ], + db_index=True, + max_length=32, + verbose_name="Document Type", + ), + ), + ( + "document_id", + models.CharField( + db_index=True, + max_length=255, + verbose_name="Document ID", + ), + ), + ( + "scielo_issn", + models.CharField( + blank=True, + db_index=True, + max_length=9, + null=True, + verbose_name="SciELO ISSN", + ), + ), + ( + "pid_v2", + models.CharField( + blank=True, + db_index=True, + max_length=23, + null=True, + verbose_name="PID V2", + ), + ), + ( + "pid_v3", + models.CharField( + blank=True, + db_index=True, + max_length=23, + null=True, + verbose_name="PID V3", + ), + ), + ( + "pid_generic", + models.CharField( + blank=True, + db_index=True, + max_length=255, + null=True, + verbose_name="PID Generic", + ), + ), + ( + "title", + models.CharField( + blank=True, + max_length=500, + null=True, + verbose_name="Document Title", + ), + ), + ( + "identifiers", + models.JSONField( + blank=True, + default=dict, + null=True, + verbose_name="Identifiers", + ), + ), + ( + "files", + models.JSONField( + blank=True, + default=dict, + null=True, + verbose_name="Files", + ), + ), + ( + "default_lang", + models.CharField( + blank=True, + max_length=8, + null=True, + verbose_name="Default Language", + ), + ), + ( + "text_langs", + models.JSONField( + 
blank=True, + default=list, + null=True, + verbose_name="Text Languages", + ), + ), + ( + "default_media_format", + models.CharField( + blank=True, + max_length=32, + null=True, + verbose_name="Default Media Format", + ), + ), + ( + "processing_date", + models.CharField( + blank=True, + max_length=32, + null=True, + verbose_name="Processing Date", + ), + ), + ( + "publication_date", + models.CharField( + blank=True, + max_length=32, + null=True, + verbose_name="Publication Date", + ), + ), + ( + "publication_year", + models.CharField( + blank=True, + db_index=True, + max_length=4, + null=True, + verbose_name="Publication Year", + ), + ), + ( + "extra_data", + models.JSONField( + blank=True, + default=dict, + null=True, + verbose_name="Extra Data", + ), + ), + ( + "collection", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + to="collection.collection", + verbose_name="Collection", + ), + ), + ( + "creator", + models.ForeignKey( + editable=False, + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="%(class)s_creator", + to=settings.AUTH_USER_MODEL, + verbose_name="Creator", + ), + ), + ( + "parent_document", + models.ForeignKey( + blank=True, + db_index=True, + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="child_documents", + to="document.document", + verbose_name="Parent Document", + ), + ), + ( + "source", + models.ForeignKey( + blank=True, + db_index=True, + null=True, + on_delete=django.db.models.deletion.CASCADE, + related_name="documents", + to="source.source", + verbose_name="Source", + ), + ), + ( + "updated_by", + models.ForeignKey( + blank=True, + editable=False, + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="%(class)s_last_mod_user", + to=settings.AUTH_USER_MODEL, + verbose_name="Updater", + ), + ), + ], + options={ + "verbose_name": "Document", + "verbose_name_plural": "Documents", + "unique_together": {("collection", "document_type", "document_id")}, 
+ "indexes": [ + models.Index( + fields=["collection", "document_type"], + name="document_collection_type_idx", + ), + models.Index( + fields=["collection", "scielo_issn"], + name="document_collection_issn_idx", + ), + models.Index( + fields=["collection", "pid_v2"], + name="document_collection_pidv2_idx", + ), + models.Index( + fields=["collection", "pid_generic"], + name="doc_coll_pidgen_idx", + ), + ], + }, + ), + ] diff --git a/document/migrations/__init__.py b/document/migrations/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/document/migrations/__init__.py @@ -0,0 +1 @@ + diff --git a/document/models.py b/document/models.py new file mode 100644 index 0000000..5197692 --- /dev/null +++ b/document/models.py @@ -0,0 +1,258 @@ +from django.db import models +from django.utils.translation import gettext_lazy as _ + +from collection.models import Collection +from core.models import CommonControlField +from source.models import Source + + +class Document(CommonControlField): + DOCUMENT_TYPE_ARTICLE = "article" + DOCUMENT_TYPE_PREPRINT = "preprint" + DOCUMENT_TYPE_DATASET = "dataset" + DOCUMENT_TYPE_BOOK = "book" + DOCUMENT_TYPE_CHAPTER = "chapter" + DOCUMENT_TYPE_OTHER = "other" + DOCUMENT_TYPE_CHOICES = ( + (DOCUMENT_TYPE_ARTICLE, _("Article")), + (DOCUMENT_TYPE_PREPRINT, _("Preprint")), + (DOCUMENT_TYPE_DATASET, _("Dataset")), + (DOCUMENT_TYPE_BOOK, _("Book")), + (DOCUMENT_TYPE_CHAPTER, _("Chapter")), + (DOCUMENT_TYPE_OTHER, _("Other")), + ) + + collection = models.ForeignKey( + Collection, + verbose_name=_("Collection"), + on_delete=models.CASCADE, + blank=False, + null=False, + db_index=True, + ) + + source = models.ForeignKey( + Source, + verbose_name=_("Source"), + on_delete=models.CASCADE, + related_name="documents", + blank=True, + null=True, + db_index=True, + ) + + parent_document = models.ForeignKey( + "self", + verbose_name=_("Parent Document"), + on_delete=models.SET_NULL, + related_name="child_documents", + blank=True, + 
null=True, + db_index=True, + ) + + document_type = models.CharField( + verbose_name=_("Document Type"), + max_length=32, + choices=DOCUMENT_TYPE_CHOICES, + blank=False, + null=False, + db_index=True, + ) + + document_id = models.CharField( + verbose_name=_("Document ID"), + max_length=255, + blank=False, + null=False, + db_index=True, + ) + + scielo_issn = models.CharField( + verbose_name=_("SciELO ISSN"), + max_length=9, + blank=True, + null=True, + db_index=True, + ) + + pid_v2 = models.CharField( + verbose_name=_("PID V2"), + max_length=23, + blank=True, + null=True, + db_index=True, + ) + + pid_v3 = models.CharField( + verbose_name=_("PID V3"), + max_length=23, + blank=True, + null=True, + db_index=True, + ) + + pid_generic = models.CharField( + verbose_name=_("PID Generic"), + max_length=255, + blank=True, + null=True, + db_index=True, + ) + + title = models.CharField( + verbose_name=_("Document Title"), + max_length=500, + blank=True, + null=True, + ) + + identifiers = models.JSONField( + verbose_name=_("Identifiers"), + null=True, + blank=True, + default=dict, + ) + + files = models.JSONField( + verbose_name=_("Files"), + null=True, + blank=True, + default=dict, + ) + + default_lang = models.CharField( + verbose_name=_("Default Language"), + max_length=8, + blank=True, + null=True, + ) + + text_langs = models.JSONField( + verbose_name=_("Text Languages"), + null=True, + blank=True, + default=list, + ) + + default_media_format = models.CharField( + verbose_name=_("Default Media Format"), + max_length=32, + blank=True, + null=True, + ) + + processing_date = models.CharField( + verbose_name=_("Processing Date"), + max_length=32, + blank=True, + null=True, + ) + + publication_date = models.CharField( + verbose_name=_("Publication Date"), + max_length=32, + blank=True, + null=True, + ) + + publication_year = models.CharField( + verbose_name=_("Publication Year"), + max_length=4, + blank=True, + null=True, + db_index=True, + ) + + extra_data = models.JSONField( + 
verbose_name=_("Extra Data"), + null=True, + blank=True, + default=dict, + ) + + def __str__(self): + return f"{self.collection.acron3} - {self.document_type} - {self.document_id}" + + @classmethod + def metadata(cls, collection=None): + queryset = cls.objects.select_related("collection", "source").only( + "collection__acron3", + "default_lang", + "default_media_format", + "document_id", + "document_type", + "extra_data", + "files", + "identifiers", + "parent_document__document_id", + "pid_generic", + "pid_v2", + "pid_v3", + "processing_date", + "publication_date", + "publication_year", + "scielo_issn", + "source__scielo_issn", + "source__source_id", + "source__source_type", + "text_langs", + "title", + ) + + if collection: + queryset = queryset.filter(collection=collection) + + for document in queryset.iterator(): + source = document.source + yield { + "collection": document.collection.acron3, + "default_lang": document.default_lang, + "default_media_format": document.default_media_format, + "document_id": document.document_id, + "document_type": document.document_type, + "extra_data": document.extra_data or {}, + "files": document.files or {}, + "identifiers": document.identifiers or {}, + "parent_document_id": ( + document.parent_document.document_id if document.parent_document else None + ), + "pid_generic": document.pid_generic, + "pid_v2": document.pid_v2, + "pid_v3": document.pid_v3, + "processing_date": document.processing_date, + "publication_date": document.publication_date, + "publication_year": document.publication_year, + "scielo_issn": document.scielo_issn or (source.scielo_issn if source else None), + "source_id": source.source_id if source else None, + "source_type": source.source_type if source else None, + "text_langs": document.text_langs or [], + "title": document.title, + } + + class Meta: + verbose_name = _("Document") + verbose_name_plural = _("Documents") + unique_together = ( + "collection", + "document_type", + "document_id", + ) + indexes 
= [ + models.Index( + fields=["collection", "document_type"], + name="document_collection_type_idx", + ), + models.Index( + fields=["collection", "scielo_issn"], + name="document_collection_issn_idx", + ), + models.Index( + fields=["collection", "pid_v2"], + name="document_collection_pidv2_idx", + ), + models.Index( + fields=["collection", "pid_generic"], + name="doc_coll_pidgen_idx", + ), + ] From f49779e83a1da617b6236ae69a0980305366a7f4 Mon Sep 17 00:00:00 2001 From: Rafael JP Damaceno Date: Fri, 1 May 2026 22:33:07 -0300 Subject: [PATCH 11/31] =?UTF-8?q?feat:=20adicionar=20servi=C3=A7os=20Docum?= =?UTF-8?q?ent=20para=20artigos=20(ArticleMeta=20+=20OPAC)=20e=20helpers?= =?UTF-8?q?=20comuns?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- document/services/articles.py | 166 ++++++++++++++++++++++++++++++++++ document/services/common.py | 58 ++++++++++++ 2 files changed, 224 insertions(+) create mode 100644 document/services/articles.py create mode 100644 document/services/common.py diff --git a/document/services/articles.py b/document/services/articles.py new file mode 100644 index 0000000..09244b3 --- /dev/null +++ b/document/services/articles.py @@ -0,0 +1,166 @@ +from document.models import Document + +from .common import build_document_id, compact_dict, get_existing_document, normalize_langs, normalize_year + + +def upsert_article_document_from_articlemeta( + payload, + collection, + source=None, + user=None, + force_update=True, +): + pid_v2 = payload.get("code") + document_id = build_document_id(pid_v2, payload.get("pid_v3"), payload.get("pid_generic")) + if not document_id: + return None + + document = get_existing_document( + collection, + Document.DOCUMENT_TYPE_ARTICLE, + document_id, + pid_v2, + ) + created = document is None + if created: + document = Document( + collection=collection, + document_type=Document.DOCUMENT_TYPE_ARTICLE, + document_id=document_id, + ) + if user: + document.creator = user + + if 
created or force_update: + document.source = source + document.parent_document = None + document.scielo_issn = source.scielo_issn if source else None + document.pid_v2 = pid_v2 or document.pid_v2 + document.pid_v3 = payload.get("pid_v3") or document.pid_v3 + document.pid_generic = payload.get("pid_generic") or document.pid_generic + document.title = payload.get("title") or document.title + document.identifiers = _merge_dicts( + document.identifiers, + _build_articlemeta_identifiers(payload, source), + ) + document.files = payload.get("pdfs") or document.files or {} + document.default_lang = payload.get("default_language") or document.default_lang + document.text_langs = normalize_langs(payload.get("text_langs")) + document.default_media_format = document.default_media_format + document.processing_date = payload.get("processing_date") or document.processing_date + document.publication_date = payload.get("publication_date") or document.publication_date + document.publication_year = normalize_year( + payload.get("publication_year"), + fallback_date=document.publication_date, + ) + document.extra_data = _merge_dicts( + document.extra_data, + compact_dict( + { + "provider": "articlemeta", + "issn_codes": payload.get("code_title"), + } + ), + ) + + if user: + document.updated_by = user + + document.save() + return document + + +def upsert_article_document_from_opac( + payload, + collection, + source=None, + user=None, + force_update=True, +): + pid_v2 = payload.get("pid_v2") + pid_v3 = payload.get("pid_v3") + document_id = build_document_id(pid_v2, pid_v3, payload.get("pid_generic")) + if not document_id: + return None + + document = get_existing_document( + collection, + Document.DOCUMENT_TYPE_ARTICLE, + document_id, + pid_v2, + pid_v3, + payload.get("pid_generic"), + ) + created = document is None + if created: + document = Document( + collection=collection, + document_type=Document.DOCUMENT_TYPE_ARTICLE, + document_id=document_id, + ) + if user: + document.creator = 
user + + if created or force_update: + document.source = source + document.parent_document = None + document.scielo_issn = source.scielo_issn if source else None + document.pid_v2 = pid_v2 or document.pid_v2 + document.pid_v3 = pid_v3 or document.pid_v3 + document.pid_generic = payload.get("pid_generic") or document.pid_generic + document.title = payload.get("title") or document.title + document.identifiers = _merge_dicts( + document.identifiers, + _build_opac_identifiers(payload, source), + ) + document.files = document.files or {} + document.default_lang = payload.get("default_language") or document.default_lang + document.text_langs = normalize_langs(payload.get("text_langs")) or document.text_langs or [] + document.default_media_format = document.default_media_format + document.processing_date = document.processing_date + document.publication_date = payload.get("publication_date") or document.publication_date + document.publication_year = normalize_year( + payload.get("publication_year"), + fallback_date=document.publication_date, + ) + document.extra_data = _merge_dicts( + document.extra_data, + compact_dict( + { + "provider": "opac", + "journal_acronym": payload.get("journal_acronym"), + } + ), + ) + + if user: + document.updated_by = user + + document.save() + return document + + +def _build_articlemeta_identifiers(payload, source): + return compact_dict( + { + "pid_v2": payload.get("code"), + "scielo_issn": source.scielo_issn if source else None, + } + ) + + +def _build_opac_identifiers(payload, source): + return compact_dict( + { + "pid_v2": payload.get("pid_v2"), + "pid_v3": payload.get("pid_v3"), + "scielo_issn": source.scielo_issn if source else None, + "journal_acronym": payload.get("journal_acronym"), + } + ) + + +def _merge_dicts(current, new_values): + merged = dict(current or {}) + merged.update(new_values or {}) + return merged diff --git a/document/services/common.py b/document/services/common.py new file mode 100644 index 0000000..91e103d --- 
/dev/null +++ b/document/services/common.py @@ -0,0 +1,58 @@ +from document.models import Document + + +def build_document_id(*values): + for value in values: + if value not in (None, ""): + return str(value) + return None + + +def get_existing_document(collection, document_type, *identifiers): + identifiers = [str(value) for value in identifiers if value not in (None, "")] + if not identifiers: + return None + + queryset = Document.objects.filter( + collection=collection, + document_type=document_type, + ) + + for field_name in ("document_id", "pid_v2", "pid_v3", "pid_generic"): + for identifier in identifiers: + document = queryset.filter(**{field_name: identifier}).first() + if document: + return document + + return None + + +def normalize_langs(value): + if not value: + return [] + + if isinstance(value, list): + return [item for item in value if item not in (None, "")] + + if isinstance(value, dict): + return [key for key, enabled in value.items() if enabled] + + return [value] + + +def normalize_year(value, fallback_date=None): + if value not in (None, ""): + return str(value)[:4] + + if fallback_date not in (None, ""): + return str(fallback_date)[:4] + + return None + + +def compact_dict(data): + return { + key: value + for key, value in data.items() + if value not in (None, "", [], {}, ()) + } From 760222c3856a410588ddca1bed17d3bdbe55990d Mon Sep 17 00:00:00 2001 From: Rafael JP Damaceno Date: Fri, 1 May 2026 22:33:07 -0300 Subject: [PATCH 12/31] =?UTF-8?q?feat:=20adicionar=20servi=C3=A7os=20Docum?= =?UTF-8?q?ent=20para=20books,=20datasets=20e=20preprints?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- document/services/__init__.py | 1 + document/services/books.py | 256 +++++++++++++++++++++++++++++++++ document/services/datasets.py | 69 +++++++++ document/services/preprints.py | 58 ++++++++ 4 files changed, 384 insertions(+) create mode 100644 document/services/__init__.py create mode 100644 
document/services/books.py create mode 100644 document/services/datasets.py create mode 100644 document/services/preprints.py diff --git a/document/services/__init__.py b/document/services/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/document/services/__init__.py @@ -0,0 +1 @@ + diff --git a/document/services/books.py b/document/services/books.py new file mode 100644 index 0000000..96d92e1 --- /dev/null +++ b/document/services/books.py @@ -0,0 +1,256 @@ +from document.models import Document + + +def build_book_pid_generic(book_id): + if book_id in (None, ""): + return None + return f"book:{book_id}" + + +def build_chapter_pid_generic(book_id, chapter_id): + if book_id in (None, "") or chapter_id in (None, ""): + return None + return f"book:{book_id}/chapter:{chapter_id}" + + +def enrich_part_payload(payload, monograph_payload): + if not monograph_payload: + return payload + + enriched = dict(payload) + enriched["monograph_title"] = monograph_payload.get("title") + enriched["monograph_language"] = monograph_payload.get("language") + enriched["monograph_publication_date"] = monograph_payload.get("publication_date") + enriched["monograph_year"] = monograph_payload.get("year") + enriched["monograph_publisher"] = monograph_payload.get("publisher") + enriched["monograph_isbn"] = monograph_payload.get("isbn") + enriched["monograph_eisbn"] = monograph_payload.get("eisbn") + enriched["monograph_doi_number"] = monograph_payload.get("doi_number") + enriched["monograph_creators"] = monograph_payload.get("creators") + return enriched + + +def upsert_monograph_document( + payload, + collection, + source=None, + user=None, + force_update=True, + source_url=None, + last_seq=None, +): + if payload.get("TYPE") != "Monograph": + return None + + book_id = str(payload.get("id")) + pid_generic = build_book_pid_generic(book_id) + document, created = Document.objects.get_or_create( + collection=collection, + document_type=Document.DOCUMENT_TYPE_BOOK, + 
document_id=pid_generic, + ) + + if created and user: + document.creator = user + + if created or force_update: + document.source = source + document.parent_document = None + document.scielo_issn = None + document.pid_v2 = None + document.pid_v3 = None + document.pid_generic = pid_generic + document.title = payload.get("title") or book_id + document.identifiers = _build_monograph_identifiers(payload) + document.files = {} + document.default_lang = payload.get("language") or None + document.text_langs = _unique_list(payload.get("language")) + document.default_media_format = None + document.processing_date = None + document.publication_date = payload.get("publication_date") or None + document.publication_year = _normalize_year(payload.get("year")) + document.extra_data = _build_monograph_extra_data( + payload, + source_url=source_url, + last_seq=last_seq, + ) + + if user: + document.updated_by = user + + document.save() + return document + + +def upsert_part_document( + payload, + collection, + source=None, + parent_document=None, + user=None, + force_update=True, + source_url=None, + last_seq=None, +): + if payload.get("TYPE") != "Part": + return None + + book_id = payload.get("monograph") + chapter_id = payload.get("id") + pid_generic = build_chapter_pid_generic(book_id, chapter_id) + document, created = Document.objects.get_or_create( + collection=collection, + document_type=Document.DOCUMENT_TYPE_CHAPTER, + document_id=pid_generic, + ) + + if created and user: + document.creator = user + + if created or force_update: + document.source = source + document.parent_document = parent_document + document.scielo_issn = None + document.pid_v2 = None + document.pid_v3 = None + document.pid_generic = pid_generic + document.title = payload.get("title") or str(chapter_id) + document.identifiers = _build_part_identifiers(payload) + document.files = {} + document.default_lang = ( + payload.get("text_language") + or payload.get("monograph_language") + or None + ) + 
document.text_langs = _unique_list( + payload.get("text_language") or payload.get("monograph_language") + ) + document.default_media_format = None + document.processing_date = None + document.publication_date = payload.get("monograph_publication_date") or None + document.publication_year = _normalize_year(payload.get("monograph_year")) + document.extra_data = _build_part_extra_data( + payload, + source_url=source_url, + last_seq=last_seq, + ) + + if user: + document.updated_by = user + + document.save() + return document + + +def delete_book_document(collection, book_id): + return Document.objects.filter( + collection=collection, + document_type=Document.DOCUMENT_TYPE_BOOK, + document_id=build_book_pid_generic(book_id), + ).delete() + + +def delete_document_by_raw_id(collection, raw_id): + return Document.objects.filter( + collection=collection, + extra_data__raw_id=str(raw_id), + ).delete() + + +def has_monograph_document_for_raw_id(collection, raw_id): + return Document.objects.filter( + collection=collection, + document_type=Document.DOCUMENT_TYPE_BOOK, + extra_data__raw_id=str(raw_id), + ).exists() + + +def get_monograph_document(collection, book_id): + return Document.objects.filter( + collection=collection, + document_type=Document.DOCUMENT_TYPE_BOOK, + document_id=build_book_pid_generic(book_id), + ).first() + + +def _build_monograph_identifiers(payload): + identifiers = { + "book_id": str(payload.get("id")) if payload.get("id") is not None else None, + "isbn": payload.get("isbn"), + "eisbn": payload.get("eisbn"), + "doi": payload.get("doi_number"), + } + return _compact_dict(identifiers) + + +def _build_part_identifiers(payload): + identifiers = { + "book_id": str(payload.get("monograph")) if payload.get("monograph") is not None else None, + "chapter_id": str(payload.get("id")) if payload.get("id") is not None else None, + "isbn": payload.get("monograph_isbn"), + "eisbn": payload.get("monograph_eisbn"), + "doi": payload.get("doi_number"), + "book_doi": 
payload.get("monograph_doi_number"), + } + return _compact_dict(identifiers) + + +def _build_monograph_extra_data(payload, source_url=None, last_seq=None): + extra_data = { + "raw_id": str(payload.get("id")) if payload.get("id") is not None else None, + "raw_type": payload.get("TYPE"), + "source_url": source_url, + "last_seq": last_seq, + "visible": payload.get("visible"), + "city": payload.get("city"), + "country": payload.get("country"), + "pages": payload.get("pages"), + "publisher": payload.get("publisher"), + "creators": payload.get("creators"), + "translated_titles": payload.get("translated_titles"), + "translated_synopses": payload.get("translated_synopses"), + "synopsis": payload.get("synopsis"), + } + return _compact_dict(extra_data) + + +def _build_part_extra_data(payload, source_url=None, last_seq=None): + extra_data = { + "raw_id": str(payload.get("id")) if payload.get("id") is not None else None, + "raw_type": payload.get("TYPE"), + "source_url": source_url, + "last_seq": last_seq, + "visible": payload.get("visible"), + "order": payload.get("order"), + "pages": payload.get("pages"), + "creators": payload.get("creators"), + "translated_titles": payload.get("translated_titles"), + "monograph_id": str(payload.get("monograph")) if payload.get("monograph") is not None else None, + "monograph_title": payload.get("monograph_title"), + "monograph_language": payload.get("monograph_language"), + "monograph_publication_date": payload.get("monograph_publication_date"), + "monograph_year": payload.get("monograph_year"), + "monograph_publisher": payload.get("monograph_publisher"), + "monograph_creators": payload.get("monograph_creators"), + } + return _compact_dict(extra_data) + + +def _unique_list(value): + if not value: + return [] + return [value] + + +def _normalize_year(value): + if value in (None, ""): + return None + return str(value)[:4] + + +def _compact_dict(data): + return { + key: value + for key, value in data.items() + if value not in (None, "", [], 
{}, ()) + } diff --git a/document/services/datasets.py b/document/services/datasets.py new file mode 100644 index 0000000..2496b20 --- /dev/null +++ b/document/services/datasets.py @@ -0,0 +1,69 @@ +from document.models import Document + +from .common import compact_dict, normalize_year + + +def upsert_dataset_document( + payload, + collection, + user=None, + force_update=True, +): + dataset_doi = payload.get("dataset_doi") + if not dataset_doi: + return None + + document, created = Document.objects.get_or_create( + collection=collection, + document_type=Document.DOCUMENT_TYPE_DATASET, + document_id=dataset_doi, + ) + + if created and user: + document.creator = user + + if created or force_update: + files = dict(document.files or {}) + file_id = payload.get("file_id") + if file_id: + files[str(file_id)] = compact_dict( + { + "name": payload.get("file_name"), + "url": payload.get("file_url"), + "file_persistent_id": payload.get("file_persistent_id"), + } + ) + + document.source = None + document.parent_document = None + document.scielo_issn = None + document.pid_v2 = None + document.pid_v3 = None + document.pid_generic = dataset_doi + document.title = payload.get("title") or document.title + document.identifiers = compact_dict( + { + "dataset_doi": dataset_doi, + } + ) + document.files = files + document.default_lang = document.default_lang + document.text_langs = document.text_langs or [] + document.default_media_format = document.default_media_format + document.processing_date = document.processing_date + document.publication_date = payload.get("dataset_published") or document.publication_date + document.publication_year = normalize_year( + None, + fallback_date=document.publication_date, + ) + document.extra_data = compact_dict( + { + "provider": "dataverse", + } + ) + + if user: + document.updated_by = user + + document.save() + return document diff --git a/document/services/preprints.py b/document/services/preprints.py new file mode 100644 index 
0000000..4be89f1 --- /dev/null +++ b/document/services/preprints.py @@ -0,0 +1,58 @@ +from document.models import Document + +from .common import compact_dict, normalize_langs, normalize_year + + +def upsert_preprint_document( + payload, + collection, + user=None, + force_update=True, +): + pid_generic = payload.get("pid_generic") + if not pid_generic: + return None + + document, created = Document.objects.get_or_create( + collection=collection, + document_type=Document.DOCUMENT_TYPE_PREPRINT, + document_id=pid_generic, + ) + + if created and user: + document.creator = user + + if created or force_update: + document.source = None + document.parent_document = None + document.scielo_issn = None + document.pid_v2 = None + document.pid_v3 = None + document.pid_generic = pid_generic + document.title = payload.get("title") or document.title + document.identifiers = compact_dict( + { + "pid_generic": pid_generic, + } + ) + document.files = document.files or {} + document.default_lang = payload.get("default_language") or document.default_lang + document.text_langs = normalize_langs(payload.get("text_langs")) + document.default_media_format = document.default_media_format + document.processing_date = document.processing_date + document.publication_date = payload.get("publication_date") or document.publication_date + document.publication_year = normalize_year( + payload.get("publication_year"), + fallback_date=document.publication_date, + ) + document.extra_data = compact_dict( + { + "provider": "preprints", + } + ) + + if user: + document.updated_by = user + + document.save() + return document From f1f43803953d9f014c3fc603fbf9cc077733bbf1 Mon Sep 17 00:00:00 2001 From: Rafael JP Damaceno Date: Fri, 1 May 2026 22:33:16 -0300 Subject: [PATCH 13/31] feat: adicionar modelo Source com tipos journal, book e server --- source/migrations/0001_initial.py | 210 ++++++++++++++++++ source/migrations/0002_source_access_type.py | 25 +++ source/migrations/0003_alter_source_title.py | 15 
++ source/migrations/__init__.py | 1 + source/models.py | 219 +++++++++++++++++++ 5 files changed, 470 insertions(+) create mode 100644 source/migrations/0001_initial.py create mode 100644 source/migrations/0002_source_access_type.py create mode 100644 source/migrations/0003_alter_source_title.py create mode 100644 source/migrations/__init__.py create mode 100644 source/models.py diff --git a/source/migrations/0001_initial.py b/source/migrations/0001_initial.py new file mode 100644 index 0000000..cc736e3 --- /dev/null +++ b/source/migrations/0001_initial.py @@ -0,0 +1,210 @@ +# Generated by Django 5.0.7 on 2026-03-15 00:00 + +import django.db.models.deletion +from django.conf import settings +from django.db import migrations, models + + +class Migration(migrations.Migration): + initial = True + + dependencies = [ + ("collection", "0001_initial"), + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ] + + operations = [ + migrations.CreateModel( + name="Source", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ( + "created", + models.DateTimeField( + auto_now_add=True, + verbose_name="Creation date", + ), + ), + ( + "updated", + models.DateTimeField( + auto_now=True, + verbose_name="Last update date", + ), + ), + ( + "source_type", + models.CharField( + choices=[ + ("journal", "Journal"), + ("book", "Book"), + ("preprint_server", "Preprint Server"), + ("data_repository", "Data Repository"), + ("other", "Other"), + ], + db_index=True, + max_length=32, + verbose_name="Source Type", + ), + ), + ( + "source_id", + models.CharField( + db_index=True, + max_length=255, + verbose_name="Source ID", + ), + ), + ( + "scielo_issn", + models.CharField( + blank=True, + db_index=True, + max_length=9, + null=True, + verbose_name="SciELO ISSN", + ), + ), + ( + "acronym", + models.CharField( + blank=True, + default="", + max_length=64, + null=True, + verbose_name="Source Acronym", + ), 
+ ), + ( + "title", + models.CharField( + max_length=255, + verbose_name="Source Title", + ), + ), + ( + "identifiers", + models.JSONField( + blank=True, + default=dict, + null=True, + verbose_name="Identifiers", + ), + ), + ( + "publisher_name", + models.JSONField( + blank=True, + default=list, + null=True, + verbose_name="Publisher Name", + ), + ), + ( + "subject_areas", + models.JSONField( + default=list, + verbose_name="Subject Areas (CAPES)", + ), + ), + ( + "wos_subject_areas", + models.JSONField( + default=list, + verbose_name="Subject Areas (WoS)", + ), + ), + ( + "default_lang", + models.CharField( + blank=True, + max_length=8, + null=True, + verbose_name="Default Language", + ), + ), + ( + "publication_date", + models.CharField( + blank=True, + max_length=32, + null=True, + verbose_name="Publication Date", + ), + ), + ( + "publication_year", + models.CharField( + blank=True, + db_index=True, + max_length=4, + null=True, + verbose_name="Publication Year", + ), + ), + ( + "extra_data", + models.JSONField( + blank=True, + default=dict, + null=True, + verbose_name="Extra Data", + ), + ), + ( + "collection", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + to="collection.collection", + verbose_name="Collection", + ), + ), + ( + "creator", + models.ForeignKey( + editable=False, + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="%(class)s_creator", + to=settings.AUTH_USER_MODEL, + verbose_name="Creator", + ), + ), + ( + "updated_by", + models.ForeignKey( + blank=True, + editable=False, + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="%(class)s_last_mod_user", + to=settings.AUTH_USER_MODEL, + verbose_name="Updater", + ), + ), + ], + options={ + "verbose_name": "Source", + "verbose_name_plural": "Sources", + "unique_together": {("collection", "source_type", "source_id")}, + "indexes": [ + models.Index( + fields=["collection", "source_type"], + name="source_collection_type_idx", + ), + 
models.Index( + fields=["collection", "scielo_issn"], + name="source_collection_issn_idx", + ), + ], + }, + ), + ] diff --git a/source/migrations/0002_source_access_type.py b/source/migrations/0002_source_access_type.py new file mode 100644 index 0000000..e148c15 --- /dev/null +++ b/source/migrations/0002_source_access_type.py @@ -0,0 +1,25 @@ +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("source", "0001_initial"), + ] + + operations = [ + migrations.AddField( + model_name="source", + name="access_type", + field=models.CharField( + blank=True, + choices=[ + ("open_access", "Open Access"), + ("commercial", "Commercial"), + ], + db_index=True, + max_length=32, + null=True, + verbose_name="Access Type", + ), + ), + ] diff --git a/source/migrations/0003_alter_source_title.py b/source/migrations/0003_alter_source_title.py new file mode 100644 index 0000000..354a82a --- /dev/null +++ b/source/migrations/0003_alter_source_title.py @@ -0,0 +1,15 @@ +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("source", "0002_source_access_type"), + ] + + operations = [ + migrations.AlterField( + model_name="source", + name="title", + field=models.CharField(max_length=500, verbose_name="Source Title"), + ), + ] diff --git a/source/migrations/__init__.py b/source/migrations/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/source/migrations/__init__.py @@ -0,0 +1 @@ + diff --git a/source/models.py b/source/models.py new file mode 100644 index 0000000..48d3e00 --- /dev/null +++ b/source/models.py @@ -0,0 +1,219 @@ +from django.db import models +from django.utils.translation import gettext_lazy as _ + +from collection.models import Collection +from core.models import CommonControlField + + +class Source(CommonControlField): + SOURCE_TYPE_JOURNAL = "journal" + SOURCE_TYPE_BOOK = "book" + SOURCE_TYPE_PREPRINT_SERVER = "preprint_server" + 
SOURCE_TYPE_DATA_REPOSITORY = "data_repository" + SOURCE_TYPE_OTHER = "other" + SOURCE_TYPE_CHOICES = ( + (SOURCE_TYPE_JOURNAL, _("Journal")), + (SOURCE_TYPE_BOOK, _("Book")), + (SOURCE_TYPE_PREPRINT_SERVER, _("Preprint Server")), + (SOURCE_TYPE_DATA_REPOSITORY, _("Data Repository")), + (SOURCE_TYPE_OTHER, _("Other")), + ) + + ACCESS_TYPE_OPEN_ACCESS = "open_access" + ACCESS_TYPE_COMMERCIAL = "commercial" + ACCESS_TYPE_CHOICES = ( + (ACCESS_TYPE_OPEN_ACCESS, _("Open Access")), + (ACCESS_TYPE_COMMERCIAL, _("Commercial")), + ) + + collection = models.ForeignKey( + Collection, + verbose_name=_("Collection"), + on_delete=models.CASCADE, + blank=False, + null=False, + db_index=True, + ) + + source_type = models.CharField( + verbose_name=_("Source Type"), + max_length=32, + choices=SOURCE_TYPE_CHOICES, + blank=False, + null=False, + db_index=True, + ) + + source_id = models.CharField( + verbose_name=_("Source ID"), + max_length=255, + blank=False, + null=False, + db_index=True, + ) + + scielo_issn = models.CharField( + verbose_name=_("SciELO ISSN"), + max_length=9, + blank=True, + null=True, + db_index=True, + ) + + acronym = models.CharField( + verbose_name=_("Source Acronym"), + max_length=64, + blank=True, + null=True, + default="", + ) + + title = models.CharField( + verbose_name=_("Source Title"), + max_length=500, + blank=False, + null=False, + ) + + identifiers = models.JSONField( + verbose_name=_("Identifiers"), + null=True, + blank=True, + default=dict, + ) + + publisher_name = models.JSONField( + verbose_name=_("Publisher Name"), + blank=True, + null=True, + default=list, + ) + + subject_areas = models.JSONField( + verbose_name=_("Subject Areas (CAPES)"), + null=False, + blank=False, + default=list, + ) + + wos_subject_areas = models.JSONField( + verbose_name=_("Subject Areas (WoS)"), + null=False, + blank=False, + default=list, + ) + + default_lang = models.CharField( + verbose_name=_("Default Language"), + max_length=8, + blank=True, + null=True, + ) + + 
publication_date = models.CharField( + verbose_name=_("Publication Date"), + max_length=32, + blank=True, + null=True, + ) + + publication_year = models.CharField( + verbose_name=_("Publication Year"), + max_length=4, + blank=True, + null=True, + db_index=True, + ) + + access_type = models.CharField( + verbose_name=_("Access Type"), + max_length=32, + choices=ACCESS_TYPE_CHOICES, + blank=True, + null=True, + db_index=True, + ) + + extra_data = models.JSONField( + verbose_name=_("Extra Data"), + null=True, + blank=True, + default=dict, + ) + + def __str__(self): + return f"{self.collection.acron3} - {self.source_type} - {self.source_id}" + + @staticmethod + def _extract_issns(identifiers): + if not isinstance(identifiers, dict): + return set() + + return { + value + for key, value in identifiers.items() + if value and "issn" in str(key).lower() + } + + @classmethod + def metadata(cls, collection=None): + queryset = cls.objects.select_related("collection").only( + "acronym", + "collection__acron3", + "default_lang", + "extra_data", + "identifiers", + "publication_date", + "publication_year", + "access_type", + "publisher_name", + "scielo_issn", + "source_id", + "source_type", + "subject_areas", + "title", + "wos_subject_areas", + ) + + if collection: + queryset = queryset.filter(collection=collection) + + for source in queryset.iterator(): + identifiers = source.identifiers or {} + yield { + "acronym": source.acronym, + "collection": source.collection.acron3, + "default_lang": source.default_lang, + "extra_data": source.extra_data or {}, + "identifiers": identifiers, + "issns": cls._extract_issns(identifiers), + "publication_date": source.publication_date, + "publication_year": source.publication_year, + "access_type": source.access_type, + "publisher_name": source.publisher_name or [], + "scielo_issn": source.scielo_issn, + "source_id": source.source_id, + "source_type": source.source_type, + "subject_areas": source.subject_areas or [], + "title": source.title, + 
"wos_subject_areas": source.wos_subject_areas or [], + } + + class Meta: + verbose_name = _("Source") + verbose_name_plural = _("Sources") + unique_together = ( + "collection", + "source_type", + "source_id", + ) + indexes = [ + models.Index( + fields=["collection", "source_type"], + name="source_collection_type_idx", + ), + models.Index( + fields=["collection", "scielo_issn"], + name="source_collection_issn_idx", + ), + ] From 685b8d07edc092ccbddd340f56563948a80ef1bb Mon Sep 17 00:00:00 2001 From: Rafael JP Damaceno Date: Fri, 1 May 2026 22:33:25 -0300 Subject: [PATCH 14/31] =?UTF-8?q?feat:=20adicionar=20servi=C3=A7os=20Sourc?= =?UTF-8?q?e,=20tasks,=20wagtail=20hooks=20e=20testes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- source/__init__.py | 1 + source/apps.py | 6 ++ source/services/__init__.py | 1 + source/services/books.py | 137 +++++++++++++++++++++++++++++++++ source/services/journals.py | 118 ++++++++++++++++++++++++++++ source/tasks.py | 148 ++++++++++++++++++++++++++++++++++++ source/tests.py | 133 ++++++++++++++++++++++++++++++++ source/wagtail_hooks.py | 32 ++++++++ 8 files changed, 576 insertions(+) create mode 100644 source/__init__.py create mode 100644 source/apps.py create mode 100644 source/services/__init__.py create mode 100644 source/services/books.py create mode 100644 source/services/journals.py create mode 100644 source/tasks.py create mode 100644 source/tests.py create mode 100644 source/wagtail_hooks.py diff --git a/source/__init__.py b/source/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/source/__init__.py @@ -0,0 +1 @@ + diff --git a/source/apps.py b/source/apps.py new file mode 100644 index 0000000..06d886d --- /dev/null +++ b/source/apps.py @@ -0,0 +1,6 @@ +from django.apps import AppConfig + + +class SourceConfig(AppConfig): + default_auto_field = "django.db.models.BigAutoField" + name = "source" diff --git a/source/services/__init__.py 
b/source/services/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/source/services/__init__.py @@ -0,0 +1 @@ + diff --git a/source/services/books.py b/source/services/books.py new file mode 100644 index 0000000..df9bd4d --- /dev/null +++ b/source/services/books.py @@ -0,0 +1,137 @@ +from collection.models import Collection +from source.models import Source + + +BOOKS_COLLECTION_ACRONYM = "books" + + +def get_books_collection(acronym=BOOKS_COLLECTION_ACRONYM): + return Collection.objects.get(acron3=acronym) + + +def upsert_monograph_source( + payload, + collection, + user=None, + force_update=True, + source_url=None, + last_seq=None, +): + if payload.get("TYPE") != "Monograph": + return None + + source, created = Source.objects.get_or_create( + collection=collection, + source_type=Source.SOURCE_TYPE_BOOK, + source_id=str(payload.get("id")), + ) + + if created and user: + source.creator = user + + if created or force_update: + source.scielo_issn = None + source.acronym = "" + source.title = payload.get("title") or str(payload.get("id")) + source.identifiers = _build_source_identifiers(payload) + source.publisher_name = _as_list(payload.get("publisher")) + source.subject_areas = [] + source.wos_subject_areas = [] + source.default_lang = payload.get("language") or None + source.publication_date = payload.get("publication_date") or None + source.publication_year = _normalize_year(payload.get("year")) + source.access_type = _normalize_access_type(payload.get("is_comercial")) + source.extra_data = _build_source_extra_data( + payload, + source_url=source_url, + last_seq=last_seq, + ) + + if user: + source.updated_by = user + + source.save() + return source + + +def delete_book_source(collection, book_id): + return Source.objects.filter( + collection=collection, + source_type=Source.SOURCE_TYPE_BOOK, + source_id=str(book_id), + ).delete() + + +def _build_source_identifiers(payload): + identifiers = { + "book_id": str(payload.get("id")) if 
payload.get("id") is not None else None, + "isbn": payload.get("isbn"), + "eisbn": payload.get("eisbn"), + "doi": payload.get("doi_number"), + } + return _compact_dict(identifiers) + + +def _build_source_extra_data(payload, source_url=None, last_seq=None): + extra_data = { + "raw_type": payload.get("TYPE"), + "source_url": source_url, + "last_seq": last_seq, + "visible": payload.get("visible"), + "city": payload.get("city"), + "country": payload.get("country"), + "pages": payload.get("pages"), + "collection_data": payload.get("collection"), + "creators": payload.get("creators"), + "is_comercial": payload.get("is_comercial"), + "use_licence": payload.get("use_licence"), + "price_reais": payload.get("price_reais"), + "price_dollar": payload.get("price_dollar"), + "shopping_info": payload.get("shopping_info"), + "serie": payload.get("serie"), + "format": payload.get("format"), + "translated_titles": payload.get("translated_titles"), + "translated_synopses": payload.get("translated_synopses"), + "synopsis": payload.get("synopsis"), + "primary_descriptor": payload.get("primary_descriptor"), + "translated_primary_descriptors": payload.get("translated_primary_descriptors"), + } + return _compact_dict(extra_data) + + +def _as_list(value): + if not value: + return [] + + if isinstance(value, list): + return value + + return [value] + + +def _normalize_year(value): + if value in (None, ""): + return None + return str(value)[:4] + + +def _normalize_access_type(value): + if value in (None, ""): + return None + + if isinstance(value, str): + normalized = value.strip().lower() + if normalized in {"true", "1", "yes", "y", "sim"}: + return Source.ACCESS_TYPE_COMMERCIAL + if normalized in {"false", "0", "no", "n", "nao", "não"}: + return Source.ACCESS_TYPE_OPEN_ACCESS + + return Source.ACCESS_TYPE_COMMERCIAL if bool(value) else Source.ACCESS_TYPE_OPEN_ACCESS + + +def _compact_dict(data): + return { + key: value + for key, value in data.items() + if value not in (None, "", [], {}, 
()) + } diff --git a/source/services/journals.py b/source/services/journals.py new file mode 100644 index 0000000..ac133f6 --- /dev/null +++ b/source/services/journals.py @@ -0,0 +1,118 @@ +from django.db.models import Q + +from collection.models import Collection +from source.models import Source + + +def get_collection(acronym): + return Collection.objects.filter(acron3=acronym).first() + + +def upsert_journal_source( + journal, + collection, + user=None, + force_update=True, + load_mode=None, +): + scielo_issn = _value(journal, "scielo_issn") + if not scielo_issn: + return None + + source, created = Source.objects.get_or_create( + collection=collection, + source_type=Source.SOURCE_TYPE_JOURNAL, + source_id=scielo_issn, + ) + + if created and user: + source.creator = user + + if created or force_update: + source.scielo_issn = scielo_issn + source.acronym = _value(journal, "acronym") or "" + source.title = _value(journal, "title") or scielo_issn + source.identifiers = _build_source_identifiers(journal) + source.publisher_name = _as_list(_value(journal, "publisher_name")) + source.subject_areas = _as_list(_value(journal, "subject_areas")) + source.wos_subject_areas = _as_list(_value(journal, "wos_subject_areas")) + source.default_lang = None + source.publication_date = None + source.publication_year = None + source.extra_data = _compact_dict( + { + "collection_acronym": _value(journal, "collection_acronym"), + "load_mode": load_mode, + } + ) + + if user: + source.updated_by = user + + source.save() + return source + + +def find_journal_source_by_issns(collection, issns): + for issn in filter(None, issns or []): + source = ( + Source.objects.filter( + collection=collection, + source_type=Source.SOURCE_TYPE_JOURNAL, + ) + .filter( + Q(scielo_issn=issn) + | Q(source_id=issn) + | Q(identifiers__electronic_issn=issn) + | Q(identifiers__print_issn=issn) + | Q(identifiers__scielo_issn=issn) + ) + .first() + ) + if source: + return source + return None + + +def 
find_journal_source_by_acronym(collection, acronym): + if not acronym: + return None + + return Source.objects.filter( + collection=collection, + source_type=Source.SOURCE_TYPE_JOURNAL, + acronym=acronym, + ).first() + + +def _build_source_identifiers(journal): + identifiers = { + "electronic_issn": _value(journal, "electronic_issn"), + "print_issn": _value(journal, "print_issn"), + "scielo_issn": _value(journal, "scielo_issn"), + } + return _compact_dict(identifiers) + + +def _as_list(value): + if not value: + return [] + + if isinstance(value, list): + return value + + return [value] + + +def _value(data, key, default=None): + if isinstance(data, dict): + return data.get(key, default) + return getattr(data, key, default) + + +def _compact_dict(data): + return { + key: value + for key, value in data.items() + if value not in (None, "", [], {}, ()) + } diff --git a/source/tasks.py b/source/tasks.py new file mode 100644 index 0000000..eb1633b --- /dev/null +++ b/source/tasks.py @@ -0,0 +1,148 @@ +import logging + +from django.utils.translation import gettext as _ +from django.conf import settings + +from collection.models import Collection +from config import celery_app +from core.collectors import articlemeta as articlemeta_collector +from core.collectors import scielo_books as scielo_books_collector +from core.utils.request_utils import _get_user +from source.services import books as books_service +from source.services import journals as journal_service + + +def load_sources_from_article_meta( + collections=None, + force_update=True, + user=None, + mode="thrift", +): + collection_codes = collections or Collection.acron3_list() + + for collection_code in collection_codes: + logging.info( + "Loading sources from Article Meta. 
Collection: %s, Mode: %s", + collection_code, + mode, + ) + + for journal in articlemeta_collector.iter_journals( + collection=collection_code, + mode=mode, + ): + collection = journal_service.get_collection(journal.collection_acronym) + if not collection: + logging.error( + "Collection %s does not exist", + journal.collection_acronym, + ) + continue + + source = journal_service.upsert_journal_source( + journal, + collection=collection, + user=user, + force_update=force_update, + load_mode=mode, + ) + logging.info( + "Source %s upserted for collection %s", + source.source_id if source else None, + collection.acron3, + ) + + return True + + +def load_sources_from_scielo_books( + collection="books", + db_name=settings.SCIELO_BOOKS_DB_NAME, + since=0, + limit=settings.SCIELO_BOOKS_LIMIT, + force_update=True, + headers=None, + base_url=None, + user=None, +): + collection_obj = books_service.get_books_collection(collection) + + logging.info( + "Loading sources from SciELO Books. Collection: %s, DB: %s, Since: %s, Limit: %s", + collection, + db_name, + since, + limit, + ) + + for item in scielo_books_collector.iter_change_documents( + base_url=base_url, + db_name=db_name, + since=since, + limit=limit, + headers=headers, + ): + change = item["change"] + + if item["deleted"]: + books_service.delete_book_source(collection_obj, change.get("id")) + continue + + payload = item["payload"] or {} + if payload.get("TYPE") != "Monograph": + continue + + books_service.upsert_monograph_source( + payload, + collection=collection_obj, + user=user, + force_update=force_update, + source_url=item.get("source_url"), + last_seq=change.get("seq"), + ) + + return True + + +@celery_app.task(bind=True, name=_("[Metadata] Sync Sources (Article Meta)"), queue="load") +def task_load_sources_from_article_meta( + self, + collections=None, + force_update=True, + user_id=None, + username=None, + mode="thrift", +): + user = _get_user(self.request, username=username, user_id=user_id) + return 
load_sources_from_article_meta( + collections=collections, + force_update=force_update, + user=user, + mode=mode, + ) + + +@celery_app.task(bind=True, name=_("[Metadata] Sync Sources (SciELO Books)"), queue="load") +def task_load_sources_from_scielo_books( + self, + collection="books", + db_name=settings.SCIELO_BOOKS_DB_NAME, + since=0, + limit=settings.SCIELO_BOOKS_LIMIT, + force_update=True, + headers=None, + base_url=None, + user_id=None, + username=None, +): + user = _get_user(self.request, username=username, user_id=user_id) + return load_sources_from_scielo_books( + collection=collection, + db_name=db_name, + since=since, + limit=limit, + force_update=force_update, + headers=headers, + base_url=base_url, + user=user, + ) diff --git a/source/tests.py b/source/tests.py new file mode 100644 index 0000000..a182f4e --- /dev/null +++ b/source/tests.py @@ -0,0 +1,133 @@ +from django.test import TestCase + +from collection.models import Collection + +from .models import Source +from .services import books as books_service +from .services import journals as journal_service + + +class SourceMetadataTests(TestCase): + def test_source_type_choices_include_scielo_non_journal_sources(self): + self.assertIn( + (Source.SOURCE_TYPE_PREPRINT_SERVER, "Preprint Server"), + [(value, str(label)) for value, label in Source.SOURCE_TYPE_CHOICES], + ) + self.assertIn( + (Source.SOURCE_TYPE_DATA_REPOSITORY, "Data Repository"), + [(value, str(label)) for value, label in Source.SOURCE_TYPE_CHOICES], + ) + + def test_metadata_exposes_generic_and_journal_fields(self): + collection = Collection.objects.create(acron3="scl", acron2="sc") + Source.objects.create( + collection=collection, + source_type=Source.SOURCE_TYPE_JOURNAL, + source_id="1234-5678", + scielo_issn="1234-5678", + acronym="testjou", + title="Test Journal", + identifiers={ + "electronic_issn": "1234-5678", + "print_issn": "8765-4321", + "doi": "10.1590/example", + }, + publisher_name=["SciELO"], + subject_areas=["Health 
Sciences"], + wos_subject_areas=["Medicine"], + default_lang="en", + publication_date="2024-01-15", + publication_year="2024", + extra_data={"country": "BR"}, + ) + + metadata = list(Source.metadata(collection=collection)) + + self.assertEqual(len(metadata), 1) + self.assertEqual(metadata[0]["source_type"], Source.SOURCE_TYPE_JOURNAL) + self.assertEqual(metadata[0]["source_id"], "1234-5678") + self.assertEqual(metadata[0]["scielo_issn"], "1234-5678") + self.assertEqual(metadata[0]["issns"], {"1234-5678", "8765-4321"}) + self.assertEqual(metadata[0]["title"], "Test Journal") + + def test_upsert_monograph_source_maps_scielo_books_payload(self): + collection = Collection.objects.create(acron3="books", acron2="bk") + + source = books_service.upsert_monograph_source( + { + "TYPE": "Monograph", + "id": "abcd1", + "title": "Sample Book", + "isbn": "9788578791889", + "eisbn": "9788578791880", + "doi_number": "10.1234/book", + "language": "pt", + "publication_date": "2024-05-20", + "year": "2024", + "publisher": "SciELO Books", + "is_comercial": False, + "visible": True, + }, + collection=collection, + ) + + self.assertEqual(source.source_type, Source.SOURCE_TYPE_BOOK) + self.assertEqual(source.source_id, "abcd1") + self.assertEqual(source.identifiers["isbn"], "9788578791889") + self.assertEqual(source.default_lang, "pt") + self.assertEqual(source.publication_year, "2024") + self.assertEqual(source.access_type, Source.ACCESS_TYPE_OPEN_ACCESS) + + def test_upsert_monograph_source_accepts_long_real_world_title(self): + collection = Collection.objects.create(acron3="books", acron2="bk") + title = ( + "O Estado da Arte sobre Refugiados, Deslocados Internos, " + "Deslocados Ambientais e Apatridas no Brasil: atualizacao do " + "Diretorio Nacional do ACNUR de teses, dissertacoes, trabalhos " + "de conclusao de curso de graduacao em Joao Pessoa (Paraiba) e " + "artigos (2007 a 2017)" + ) + + source = books_service.upsert_monograph_source( + { + "TYPE": "Monograph", + "id": "9zzts", 
+ "title": title, + }, + collection=collection, + ) + + self.assertEqual(source.title, title) + + def test_upsert_journal_source_maps_articlemeta_payload(self): + collection = Collection.objects.create(acron3="scl", acron2="sc") + + source = journal_service.upsert_journal_source( + { + "collection_acronym": "scl", + "scielo_issn": "1234-5678", + "electronic_issn": "1234-5678", + "print_issn": "8765-4321", + "acronym": "testjou", + "title": "Test Journal", + "publisher_name": "SciELO", + "subject_areas": ["Health Sciences"], + "wos_subject_areas": ["Medicine"], + }, + collection=collection, + load_mode="thrift", + ) + + self.assertEqual(source.source_type, Source.SOURCE_TYPE_JOURNAL) + self.assertEqual(source.source_id, "1234-5678") + self.assertEqual(source.identifiers["electronic_issn"], "1234-5678") + self.assertEqual(source.publisher_name, ["SciELO"]) + self.assertEqual(source.extra_data["load_mode"], "thrift") + self.assertEqual( + journal_service.find_journal_source_by_issns(collection, ["8765-4321"]).pk, + source.pk, + ) + self.assertEqual( + journal_service.find_journal_source_by_acronym(collection, "testjou").pk, + source.pk, + ) diff --git a/source/wagtail_hooks.py b/source/wagtail_hooks.py new file mode 100644 index 0000000..5ffad62 --- /dev/null +++ b/source/wagtail_hooks.py @@ -0,0 +1,32 @@ +from django.utils.translation import gettext_lazy as _ +from wagtail.snippets.views.snippets import SnippetViewSet + +from .models import Source + + +class SourceSnippetViewSet(SnippetViewSet): + model = Source + icon = "folder-open-inverse" + menu_label = _("Source") + menu_order = 200 + + list_display = ( + "collection", + "source_type", + "source_id", + "scielo_issn", + "acronym", + "title", + "publication_year", + ) + list_filter = ( + "collection", + "source_type", + "publication_year", + ) + search_fields = ( + "source_id", + "scielo_issn", + "acronym", + "title", + ) From c547c3f02b6a50ea99372b4e4a7fa5076cfeedbd Mon Sep 17 00:00:00 2001 From: Rafael JP 
Damaceno Date: Fri, 1 May 2026 22:33:34 -0300 Subject: [PATCH 15/31] =?UTF-8?q?feat:=20adicionar=20modelos=20Reports=20(?= =?UTF-8?q?semanal,=20mensal,=20anual)=20e=20migra=C3=A7=C3=B5es?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- reports/__init__.py | 0 reports/apps.py | 8 + reports/migrations/0001_initial.py | 140 ++++++++++++++++++ ...alter_monthlylogreport_options_and_more.py | 36 +++++ reports/migrations/__init__.py | 0 reports/models.py | 100 +++++++++++++ 6 files changed, 284 insertions(+) create mode 100644 reports/__init__.py create mode 100644 reports/apps.py create mode 100644 reports/migrations/0001_initial.py create mode 100644 reports/migrations/0002_alter_monthlylogreport_options_and_more.py create mode 100644 reports/migrations/__init__.py create mode 100644 reports/models.py diff --git a/reports/__init__.py b/reports/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/reports/apps.py b/reports/apps.py new file mode 100644 index 0000000..119ca26 --- /dev/null +++ b/reports/apps.py @@ -0,0 +1,8 @@ +from django.apps import AppConfig +from django.utils.translation import gettext_lazy as _ + + +class ReportsConfig(AppConfig): + default_auto_field = "django.db.models.BigAutoField" + name = "reports" + verbose_name = _("Reports") diff --git a/reports/migrations/0001_initial.py b/reports/migrations/0001_initial.py new file mode 100644 index 0000000..2a72923 --- /dev/null +++ b/reports/migrations/0001_initial.py @@ -0,0 +1,140 @@ +# Generated by Django 5.2.12 on 2026-05-01 15:50 + +import django.db.models.deletion +from django.db import migrations, models + + +class Migration(migrations.Migration): + initial = True + + dependencies = [ + ("collection", "0001_initial"), + ] + + operations = [ + migrations.CreateModel( + name="MonthlyLogReport", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + 
("total_files", models.IntegerField(default=0)), + ("created_files", models.IntegerField(default=0)), + ("validated_files", models.IntegerField(default=0)), + ("invalidated_files", models.IntegerField(default=0)), + ("errored_files", models.IntegerField(default=0)), + ("lines_parsed", models.IntegerField(default=0)), + ("valid_lines", models.IntegerField(default=0)), + ("discarded_lines", models.IntegerField(default=0)), + ("ip_local_count", models.IntegerField(default=0)), + ("ip_remote_count", models.IntegerField(default=0)), + ("ip_unknown_count", models.IntegerField(default=0)), + ("generated_at", models.DateTimeField(auto_now=True)), + ("year", models.IntegerField(verbose_name="Year")), + ("month", models.IntegerField(verbose_name="Month")), + ( + "collection", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + to="collection.collection", + verbose_name="Collection", + ), + ), + ], + options={ + "verbose_name": "Monthly Log Report", + "verbose_name_plural": "Monthly Log Reports", + "ordering": ["-year", "-month", "collection__acron3"], + "unique_together": {("collection", "year", "month")}, + }, + ), + migrations.CreateModel( + name="WeeklyLogReport", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("total_files", models.IntegerField(default=0)), + ("created_files", models.IntegerField(default=0)), + ("validated_files", models.IntegerField(default=0)), + ("invalidated_files", models.IntegerField(default=0)), + ("errored_files", models.IntegerField(default=0)), + ("lines_parsed", models.IntegerField(default=0)), + ("valid_lines", models.IntegerField(default=0)), + ("discarded_lines", models.IntegerField(default=0)), + ("ip_local_count", models.IntegerField(default=0)), + ("ip_remote_count", models.IntegerField(default=0)), + ("ip_unknown_count", models.IntegerField(default=0)), + ("generated_at", models.DateTimeField(auto_now=True)), + ("year", 
models.IntegerField(verbose_name="Year")), + ("week", models.IntegerField(verbose_name="ISO Week")), + ( + "collection", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + to="collection.collection", + verbose_name="Collection", + ), + ), + ], + options={ + "verbose_name": "Weekly Log Report", + "verbose_name_plural": "Weekly Log Reports", + "ordering": ["-year", "-week", "collection__acron3"], + "unique_together": {("collection", "year", "week")}, + }, + ), + migrations.CreateModel( + name="YearlyLogReport", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("total_files", models.IntegerField(default=0)), + ("created_files", models.IntegerField(default=0)), + ("validated_files", models.IntegerField(default=0)), + ("invalidated_files", models.IntegerField(default=0)), + ("errored_files", models.IntegerField(default=0)), + ("lines_parsed", models.IntegerField(default=0)), + ("valid_lines", models.IntegerField(default=0)), + ("discarded_lines", models.IntegerField(default=0)), + ("ip_local_count", models.IntegerField(default=0)), + ("ip_remote_count", models.IntegerField(default=0)), + ("ip_unknown_count", models.IntegerField(default=0)), + ("generated_at", models.DateTimeField(auto_now=True)), + ("year", models.IntegerField(verbose_name="Year")), + ( + "collection", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + to="collection.collection", + verbose_name="Collection", + ), + ), + ], + options={ + "verbose_name": "Yearly Log Report", + "verbose_name_plural": "Yearly Log Reports", + "ordering": ["-year", "collection__acron3"], + "unique_together": {("collection", "year")}, + }, + ), + ] diff --git a/reports/migrations/0002_alter_monthlylogreport_options_and_more.py b/reports/migrations/0002_alter_monthlylogreport_options_and_more.py new file mode 100644 index 0000000..659215c --- /dev/null +++ 
b/reports/migrations/0002_alter_monthlylogreport_options_and_more.py @@ -0,0 +1,36 @@ +# Generated by Django 5.2.12 on 2026-05-01 22:23 + +from django.db import migrations + + +class Migration(migrations.Migration): + dependencies = [ + ("reports", "0001_initial"), + ] + + operations = [ + migrations.AlterModelOptions( + name="monthlylogreport", + options={ + "ordering": ["collection__acron3", "year", "month"], + "verbose_name": "Monthly Log Report", + "verbose_name_plural": "Monthly Log Reports", + }, + ), + migrations.AlterModelOptions( + name="weeklylogreport", + options={ + "ordering": ["collection__acron3", "year", "week"], + "verbose_name": "Weekly Log Report", + "verbose_name_plural": "Weekly Log Reports", + }, + ), + migrations.AlterModelOptions( + name="yearlylogreport", + options={ + "ordering": ["collection__acron3", "year"], + "verbose_name": "Yearly Log Report", + "verbose_name_plural": "Yearly Log Reports", + }, + ), + ] diff --git a/reports/migrations/__init__.py b/reports/migrations/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/reports/models.py b/reports/models.py new file mode 100644 index 0000000..3af1ec8 --- /dev/null +++ b/reports/models.py @@ -0,0 +1,100 @@ +from django.db import models +from django.utils.translation import gettext_lazy as _ + +from collection.models import Collection + + +class AbstractLogReport(models.Model): + collection = models.ForeignKey( + Collection, + on_delete=models.CASCADE, + verbose_name=_("Collection"), + ) + total_files = models.IntegerField(default=0) + created_files = models.IntegerField(default=0) + validated_files = models.IntegerField(default=0) + invalidated_files = models.IntegerField(default=0) + errored_files = models.IntegerField(default=0) + lines_parsed = models.IntegerField(default=0) + valid_lines = models.IntegerField(default=0) + discarded_lines = models.IntegerField(default=0) + ip_local_count = models.IntegerField(default=0) + ip_remote_count = 
models.IntegerField(default=0) + ip_unknown_count = models.IntegerField(default=0) + generated_at = models.DateTimeField(auto_now=True) + + class Meta: + abstract = True + + @property + def pct_validated(self): + if not self.total_files: + return 0 + return round(self.validated_files / self.total_files * 100, 1) + pct_validated.fget.short_description = _("% Valid Files") + + @property + def pct_valid_lines(self): + if not self.lines_parsed: + return 0 + return round(self.valid_lines / self.lines_parsed * 100, 1) + pct_valid_lines.fget.short_description = _("% Valid Lines") + + @property + def pct_remote_ip(self): + total = self.ip_remote_count + self.ip_local_count + if not total: + return 0 + return round(self.ip_remote_count / total * 100, 1) + pct_remote_ip.fget.short_description = _("% Remote IP") + + def __str__(self): + return f"{self.collection.acron3} {self.period_label}" + + @property + def period_label(self): + raise NotImplementedError + + +class WeeklyLogReport(AbstractLogReport): + year = models.IntegerField(verbose_name=_("Year")) + week = models.IntegerField(verbose_name=_("ISO Week")) + + class Meta: + unique_together = [("collection", "year", "week")] + ordering = ["collection__acron3", "year", "week"] + verbose_name = _("Weekly Log Report") + verbose_name_plural = _("Weekly Log Reports") + + @property + def period_label(self): + return f"{self.year}-W{self.week:02d}" + + +class MonthlyLogReport(AbstractLogReport): + year = models.IntegerField(verbose_name=_("Year")) + month = models.IntegerField(verbose_name=_("Month")) + + class Meta: + unique_together = [("collection", "year", "month")] + ordering = ["collection__acron3", "year", "month"] + verbose_name = _("Monthly Log Report") + verbose_name_plural = _("Monthly Log Reports") + + @property + def period_label(self): + return f"{self.year}-{self.month:02d}" + + +class YearlyLogReport(AbstractLogReport): + year = models.IntegerField(verbose_name=_("Year")) + + class Meta: + unique_together = 
[("collection", "year")] + ordering = ["collection__acron3", "year"] + verbose_name = _("Yearly Log Report") + verbose_name_plural = _("Yearly Log Reports") + + @property + def period_label(self): + return str(self.year) From d4a32f2ad1f6c7cf91db9973525ca107c6d88db9 Mon Sep 17 00:00:00 2001 From: Rafael JP Damaceno Date: Fri, 1 May 2026 22:33:46 -0300 Subject: [PATCH 16/31] feat: adicionar tasks e wagtail hooks do Reports --- reports/tasks.py | 238 +++++++++++++++++++++++++++++++++++++++ reports/wagtail_hooks.py | 75 ++++++++++++ 2 files changed, 313 insertions(+) create mode 100644 reports/tasks.py create mode 100644 reports/wagtail_hooks.py diff --git a/reports/tasks.py b/reports/tasks.py new file mode 100644 index 0000000..69a53a1 --- /dev/null +++ b/reports/tasks.py @@ -0,0 +1,238 @@ +import logging +from collections import defaultdict + +from django.core.mail import send_mail +from django.conf import settings +from django.utils.translation import gettext as _ + +from config import celery_app +from core.utils import date_utils +from collection.models import Collection +from log_manager import choices +from log_manager.models import LogFile +from log_manager_config import models as lmc_models + +from reports.models import WeeklyLogReport, MonthlyLogReport, YearlyLogReport + + +def _extract_date_from_log_file(lf): + if lf.date: + return lf.date + + probably_date = (lf.validation or {}).get("probably_date") + if isinstance(probably_date, str) and probably_date: + return date_utils.get_date_obj(probably_date) + + try: + import re + match = re.search(r"(\d{4}-\d{2}-\d{2})", lf.path) + if match: + return date_utils.get_date_obj(match.group(1)) + except Exception: + pass + + return None + + +@celery_app.task(bind=True, name=_("[Reports] Populate All Reports")) +def task_populate_all_reports(self, year=None, collection_acron=None): + qs = LogFile.objects.select_related("collection") + if collection_acron: + qs = qs.filter(collection__acron3=collection_acron) + qs = 
qs.only( + "id", "collection_id", "date", "path", "status", "summary", "validation" + ) + + weekly = defaultdict(lambda: defaultdict(int)) + monthly = defaultdict(lambda: defaultdict(int)) + yearly = defaultdict(lambda: defaultdict(int)) + + for lf in qs.iterator(chunk_size=2000): + extracted_date = _extract_date_from_log_file(lf) + if not extracted_date: + continue + if year and extracted_date.year != int(year): + continue + + iso_year, iso_week, _ = extracted_date.isocalendar() + yr = extracted_date.year + mo = extracted_date.month + + for agg, key in [ + (weekly, (lf.collection_id, iso_year, iso_week)), + (monthly, (lf.collection_id, yr, mo)), + (yearly, (lf.collection_id, yr)), + ]: + r = agg[key] + r["total_files"] += 1 + st = lf.status + if st == "CRE": + r["created_files"] += 1 + elif st in ("QUE", "PAR", "PRO"): + r["validated_files"] += 1 + elif st == "INV": + r["invalidated_files"] += 1 + elif st == "ERR": + r["errored_files"] += 1 + + s = lf.summary or {} + lp = s.get("lines_parsed", 0) or 0 + vl = s.get("valid_lines", 0) or 0 + r["lines_parsed"] += lp + r["valid_lines"] += vl + r["discarded_lines"] += max(lp - vl, 0) + + ips = ( + (lf.validation or {}) + .get("content", {}) + .get("summary", {}) + .get("ips", {}) + ) + r["ip_local_count"] += ips.get("local", 0) or 0 + r["ip_remote_count"] += ips.get("remote", 0) or 0 + r["ip_unknown_count"] += ips.get("unknown", 0) or 0 + + w_count = _upsert_reports(WeeklyLogReport, weekly) + m_count = _upsert_reports(MonthlyLogReport, monthly) + y_count = _upsert_reports(YearlyLogReport, yearly) + + logging.info( + "Reports populated: %s weekly, %s monthly, %s yearly.", + w_count, m_count, y_count, + ) + return f"Weekly: {w_count}, Monthly: {m_count}, Yearly: {y_count}" + + +def _upsert_reports(model_class, data): + count = 0 + unique_fields = list(model_class._meta.unique_together[0]) + period_fields = unique_fields[1:] + for key, fields in data.items(): + coll_id = key[0] + period_values = key[1:] + lookup = 
{"collection_id": coll_id} + for idx, field_name in enumerate(period_fields): + lookup[field_name] = period_values[idx] + model_class.objects.update_or_create(defaults=fields, **lookup) + count += 1 + return count + + +@celery_app.task( + bind=True, + name=_("[Reports] Generate Log Report Summary (Manual)"), + queue="load", +) +def task_log_files_count_status_report( + self, + collections=None, + from_date=None, + until_date=None, + days_to_go_back=None, + user_id=None, + username=None, +): + from_date_str, until_date_str = date_utils.get_date_range_str( + from_date, until_date, days_to_go_back + ) + subject = _( + "Usage Log Report Summary " + f"({from_date_str} to {until_date_str})" + ) + + for collection_acron in (collections or Collection.acron3_list()): + try: + collection = Collection.objects.get(acron3=collection_acron) + except Collection.DoesNotExist: + logging.warning("Collection not found: %s", collection_acron) + continue + + message = _build_report_message( + collection, + from_date_str, + until_date_str, + ) + + if not message: + continue + + logging.info( + "Sending email to collection %s. 
def _build_report_message(collection, from_date_str, until_date_str):
    """Build the plain-text usage report e-mail body for one collection.

    The body describes the most recent ``MonthlyLogReport`` of the collection
    and, when an earlier month exists, that month plus the month-over-month
    difference in files and parsed lines.

    Args:
        collection: Collection whose monthly reports are summarised.
        from_date_str: Start of the period shown in the header.
        until_date_str: End of the period shown in the header.

    Returns:
        The message text, or ``""`` when the collection has no monthly report.
    """
    # Only the two newest rows are needed; one sliced query replaces the
    # exists()/first()/len()/[1] sequence, where len() loaded every row.
    recent = list(
        MonthlyLogReport.objects.filter(
            collection=collection,
        ).order_by("-year", "-month")[:2]
    )
    if not recent:
        return ""

    latest = recent[0]
    # Translate the template first and interpolate afterwards: gettext cannot
    # look up a string that was already formatted by an f-string.
    message = _(
        "Usage Log Report for {acron}\n"
        "Period: {start} to {end}\n\n"
    ).format(acron=collection.acron3, start=from_date_str, end=until_date_str)
    message += _("Latest month ({latest}):\n").format(latest=latest.period_label)
    message += (
        f"  Total files: {latest.total_files}\n"
        f"  Validated files: {latest.validated_files} ({latest.pct_validated}%)\n"
        f"  Invalidated files: {latest.invalidated_files}\n"
        f"  Errored files: {latest.errored_files}\n"
        f"  Lines parsed: {latest.lines_parsed}\n"
        f"  Valid lines: {latest.valid_lines} ({latest.pct_valid_lines}%)\n"
        f"  Discarded lines: {latest.discarded_lines}\n"
        f"  Remote IPs: {latest.ip_remote_count} ({latest.pct_remote_ip}%)\n"
        f"  Local IPs: {latest.ip_local_count}\n"
    )

    if len(recent) > 1:
        prev_month = recent[1]
        message += _("\nPrevious month ({prev}):\n").format(prev=prev_month.period_label)
        message += (
            f"  Total files: {prev_month.total_files}\n"
            f"  Validated files: {prev_month.validated_files} ({prev_month.pct_validated}%)\n"
            f"  Valid lines: {prev_month.valid_lines} ({prev_month.pct_valid_lines}%)\n"
            f"  Remote IPs: {prev_month.ip_remote_count} ({prev_month.pct_remote_ip}%)\n"
        )

        # The comparison is only meaningful against a real previous month;
        # the latest month is never compared with itself.
        if prev_month.total_files:
            file_diff = latest.total_files - prev_month.total_files
            line_diff = latest.lines_parsed - prev_month.lines_parsed
            message += _("\nMonth-over-month change:\n")
            message += f"  Files: {file_diff:+d}\n"
            message += f"  Lines: {line_diff:+d}\n"

    message += (
        "\n---\n"
        "This report is automatically generated by SciELO Usage.\n"
    )
    return message
class ReadOnlyPermissionPolicy(BasePermissionPolicy):
    """Permission policy that exposes a model as read-only.

    The write actions (``add``, ``change``, ``delete``) are denied to every
    user; any other action is allowed.
    """

    # Actions that no user is ever granted by this policy.
    _WRITE_ACTIONS = frozenset({"add", "change", "delete"})

    def user_has_permission(self, user, action):
        """Return ``False`` for write actions and ``True`` for anything else."""
        return action not in self._WRITE_ACTIONS

    def users_with_any_permission(self, actions):
        """Return the users holding at least one of ``actions``.

        Kept consistent with ``user_has_permission``: when every requested
        action is a write action nobody qualifies, so an empty queryset is
        returned; otherwise all active users are returned.
        """
        users = get_user_model().objects
        if all(action in self._WRITE_ACTIONS for action in actions):
            return users.none()
        return users.filter(is_active=True)
def fetch_article_counter_dict(
    from_date,
    until_date,
    offset=0,
    limit=1000,
    collection=None,
    issn=None,
):
    """Fetch one page of article data from the ArticleMeta collect endpoint.

    The request is attempted up to ``settings.ARTICLEMETA_MAX_RETRIES`` times,
    waiting ``settings.ARTICLEMETA_SLEEP_TIME`` seconds between attempts.

    Args:
        from_date: Value sent as the ``from`` query parameter.
        until_date: Value sent as the ``until`` query parameter.
        offset: Value sent as the ``offset`` query parameter.
        limit: Value sent as the ``limit`` query parameter.
        collection: Optional value sent as ``collection``.
        issn: Optional value sent as ``issn``.

    Returns:
        The decoded JSON payload, or ``{}`` when every attempt failed.
    """
    # The query string does not change between attempts; build it once.
    params = {
        "from": from_date,
        "until": until_date,
        "offset": offset,
        "limit": limit,
    }
    if collection:
        params["collection"] = collection
    if issn:
        params["issn"] = issn

    url = settings.ARTICLEMETA_COLLECT_URL
    max_retries = settings.ARTICLEMETA_MAX_RETRIES
    sleep_time = settings.ARTICLEMETA_SLEEP_TIME

    for attempt in range(1, max_retries + 1):
        try:
            # NOTE(review): no timeout is passed, so a stalled connection can
            # block this call indefinitely — consider adding a setting for it.
            response = requests.get(url, params=params)
            response.raise_for_status()
            payload = response.json()
        except (requests.exceptions.RequestException, ValueError) as exc:
            # RequestException also covers connection errors and timeouts,
            # which previously escaped the retry loop; ValueError covers an
            # undecodable JSON body.
            logging.warning(
                "Failed to collect data from %s with params %s "
                "(attempt %d of %d): %s",
                url,
                params,
                attempt,
                max_retries,
                exc,
            )
            # Sleeping after the final attempt would only delay the caller.
            if attempt < max_retries:
                logging.warning("Waiting %s seconds before retrying", sleep_time)
                sleep(sleep_time)
        else:
            logging.info(response.url)
            return payload

    return {}
def _request_json(url):
    """GET ``url`` and return its JSON body as a dict.

    Args:
        url: Address to request.

    Returns:
        The decoded JSON object, or ``{}`` when the request fails, the body is
        not valid JSON, or the decoded value is not a JSON object.
    """
    try:
        # NOTE(review): the timeout reuses DATAVERSE_SLEEP_TIME — confirm this
        # setting is really meant to double as the request timeout.
        response = requests.get(url, timeout=settings.DATAVERSE_SLEEP_TIME)
        response.raise_for_status()
        payload = response.json()
    except (requests.exceptions.RequestException, ValueError) as exc:
        # ValueError covers an undecodable body on requests versions whose
        # JSON error is not a RequestException subclass.
        logging.error("Error fetching %s: %s", url, exc)
        return {}

    # Callers immediately call .get("data", ...) on the result.
    if not isinstance(payload, dict):
        logging.error(
            "Error fetching %s: unexpected JSON payload of type %s",
            url,
            type(payload).__name__,
        )
        return {}
    return payload
def iter_dataset_metadata(from_date=None, until_date=None):
    """Yield one metadata dict per file of every dataset under the root collection.

    Walks the sub-dataverses returned by ``_get_subdataverses``, their
    datasets and each dataset's files. Datasets without a DOI are skipped
    with a warning. Datasets whose publication date lies outside
    ``from_date``/``until_date`` are skipped; datasets without a publication
    date are never filtered out.

    Args:
        from_date: Optional lower bound compared against ``publicationDate``.
        until_date: Optional upper bound compared against ``publicationDate``.

    Yields:
        Dicts with the keys ``title``, ``dataset_doi``, ``dataset_published``,
        ``file_id``, ``file_name``, ``file_url`` and ``file_persistent_id``.
    """
    for container in _get_subdataverses():
        if container.get("type") != "dataverse":
            continue

        container_id = container["id"]
        container_title = container["title"]

        for entry in _get_datasets(container_id):
            if entry.get("type") != "dataset":
                continue

            entry_id = entry["id"]
            doi = standardizer.standardize_doi(entry.get("persistentUrl"))
            if not doi:
                logging.warning("Dataset %s does not have a DOI.", entry_id)
                continue

            published = entry.get("publicationDate")
            if published and (
                (from_date and published < from_date)
                or (until_date and published > until_date)
            ):
                continue

            for item in _get_files(entry_id):
                raw_pid = item["dataFile"].get("persistentId")
                if raw_pid:
                    file_pid = standardizer.standardize_pid_generic(raw_pid)
                else:
                    file_pid = None

                record = {
                    "title": container_title,
                    "dataset_doi": doi,
                    "dataset_published": published,
                    "file_id": item["dataFile"]["id"],
                    "file_name": item["label"],
                    "file_url": f"{settings.DATAVERSE_ENDPOINT}/access/datafile/{item['dataFile']['id']}",
                    "file_persistent_id": file_pid,
                }
                yield record
def extract_record_data(record):
    """Extract the fields used to build a preprint document from an OAI record.

    Args:
        record: Object exposing ``header.identifier`` and a ``metadata``
            mapping whose ``language`` and ``date`` values are lists.

    Returns:
        A dict with ``pid_generic``, ``text_langs``, ``publication_date``,
        ``default_language`` and ``publication_year``. Missing values are
        returned as ``""`` (or ``[]`` for ``text_langs``).
    """
    metadata = record.metadata
    text_langs = [
        standardizer.standardize_language_code(language)
        for language in metadata.get("language") or []
    ]

    # A present-but-empty "date" list used to raise IndexError on [0].
    dates = metadata.get("date") or []
    publication_date = (dates[0] if dates else "") or ""

    return {
        "pid_generic": _extract_compatible_identifier(record.header.identifier),
        "text_langs": text_langs,
        "publication_date": publication_date,
        "default_language": text_langs[0] if text_langs else "",
        "publication_year": _extract_publication_year_from_date(publication_date),
    }


def _extract_compatible_identifier(identifier):
    """Return the segment after the first ``/`` in the last ``:``-separated part.

    Returns ``""`` when the identifier is not a string or has no such segment.
    """
    try:
        return identifier.split(":")[-1].split("/")[1]
    except (AttributeError, IndexError):
        return ""


def _extract_publication_year_from_date(date_str):
    """Return the first four characters of ``date_str``, or ``""`` if not a string."""
    # Slicing never raises IndexError, so the real failure mode is a
    # non-string value; guard against that instead.
    if not isinstance(date_str, str):
        return ""
    return date_str[:4]
def extract_last_seq(payload):
    """Return the sequence marker of a changes payload.

    Args:
        payload: Decoded changes response (or a single change row).

    Returns:
        ``payload["last_seq"]`` when present, otherwise ``payload["seq"]``;
        ``None`` when ``payload`` is not a dict or has neither key.
    """
    if not isinstance(payload, dict):
        return None

    last_seq = payload.get("last_seq")
    # Only a missing/empty marker falls back to "seq"; a legitimate value of
    # 0 is falsy and must not be discarded.
    if last_seq not in (None, ""):
        return last_seq
    return payload.get("seq")
def iter_change_documents(
    base_url=None,
    db_name=None,
    since=0,
    limit=None,
    headers=None,
):
    """Yield every change of the SciELO Books feed together with its document.

    Pages are requested with ``include_docs`` enabled. Changes without an
    ``id`` are skipped. For a deleted change no document is attached. For any
    other change the embedded ``doc`` is used when present, otherwise the
    document is fetched individually with ``fetch_document``.

    Args:
        base_url: Optional base URL; falls back to the configured one.
        db_name: Optional database name; falls back to the configured one.
        since: Sequence to start from (``0`` when falsy).
        limit: Optional page size; falls back to the configured one.
        headers: Optional HTTP headers forwarded to the requests.

    Yields:
        Dicts with the keys ``change``, ``deleted``, ``payload`` and
        ``source_url``.
    """
    db_name = db_name or settings.SCIELO_BOOKS_DB_NAME
    limit = limit or settings.SCIELO_BOOKS_LIMIT
    cursor = since or 0

    while True:
        page = fetch_changes_page(
            base_url=base_url,
            db_name=db_name,
            since=cursor,
            limit=limit,
            include_docs=True,
            headers=headers,
        )
        rows = extract_changes(page)
        if not rows:
            return

        for row in rows:
            doc_id = row.get("id")
            if not doc_id:
                continue

            if bool(row.get("deleted")):
                is_deleted, body, url = True, None, None
            else:
                is_deleted = False
                embedded = row.get("doc") or {}
                if embedded:
                    body = sanitize_raw_data(embedded)
                    url = f"{(base_url or settings.SCIELO_BOOKS_BASE_URL)}/{db_name}/{doc_id}"
                else:
                    # No embedded document: fetch it on its own.
                    body, url = fetch_document(
                        doc_id=doc_id,
                        base_url=base_url,
                        db_name=db_name,
                        headers=headers,
                    )

            yield {
                "change": row,
                "deleted": is_deleted,
                "payload": body,
                "source_url": url,
            }

        next_cursor = extract_last_seq(page)
        # Stop when the feed reports no progress, to avoid looping forever.
        if next_cursor is None or next_cursor == cursor:
            return
        cursor = next_cursor
def load_documents_from_article_meta(
    from_date=None,
    until_date=None,
    days_to_go_back=None,
    collection=None,
    issn=None,
    force_update=True,
    user=None,
):
    """Load article documents from ArticleMeta, page by page.

    Pages of 1000 records are requested until a page comes back without
    ``objects``. Each record is attached to its collection and journal source
    and upserted; records whose collection or source cannot be found are
    logged and skipped, and a ``DataError`` raised while saving is logged
    without aborting the run.

    Args:
        from_date: Optional start date, resolved by ``get_date_range_str``.
        until_date: Optional end date, resolved by ``get_date_range_str``.
        days_to_go_back: Optional number of days, resolved the same way.
        collection: Optional collection acronym, also used when a record has
            no ``collection`` of its own.
        issn: Optional ISSN forwarded to the collector.
        force_update: Forwarded to the article upsert service.
        user: User forwarded to the article upsert service.

    Returns:
        ``True`` once the last page has been processed.
    """
    from_date, until_date = date_utils.get_date_range_str(
        from_date,
        until_date,
        days_to_go_back,
    )
    logging.info(
        "Loading documents from Article Meta. From: %s, Until: %s, Collection: %s, ISSN: %s",
        from_date,
        until_date,
        collection,
        issn,
    )

    # Records of a run share very few acronyms; resolve each one once
    # instead of issuing one identical query per record. Misses are cached
    # too (as None).
    collections_by_acronym = {}

    offset = 0
    limit = 1000
    while True:
        response = articlemeta_collector.fetch_article_counter_dict(
            from_date,
            until_date,
            offset=offset,
            limit=limit,
            collection=collection,
            issn=issn,
        )
        objects = response.get("objects") or []
        if not objects:
            break

        for payload in objects:
            acronym = payload.get("collection") or collection
            if acronym not in collections_by_acronym:
                collections_by_acronym[acronym] = _get_collection(acronym)
            collection_obj = collections_by_acronym[acronym]
            if not collection_obj:
                logging.info(
                    "Collection not found for payload %s",
                    payload.get("code"),
                )
                continue

            source = journal_service.find_journal_source_by_issns(
                collection_obj,
                payload.get("code_title"),
            )
            if not source:
                logging.info(
                    "Source not found for collection %s and ISSNs %s",
                    collection_obj.acron3,
                    payload.get("code_title"),
                )
                continue

            try:
                article_service.upsert_article_document_from_articlemeta(
                    payload,
                    collection=collection_obj,
                    source=source,
                    user=user,
                    force_update=force_update,
                )
            except DataError as exc:
                logging.error(
                    "Error saving Document from Article Meta. "
                    "Collection: %s, Source: %s, PIDv2: %s. Error: %s",
                    collection_obj,
                    source.source_id,
                    payload.get("code"),
                    exc,
                )
                continue

        offset += limit

    return True
def load_documents_from_opac(
    collection="scl",
    from_date=None,
    until_date=None,
    days_to_go_back=None,
    page=1,
    force_update=True,
    user=None,
):
    """Load article documents from the OPAC endpoint, page by page.

    Starting at ``page``, every document of each response is attached to the
    journal source matching its ``journal_acronym`` and upserted. Documents
    without a matching source are logged and skipped, and a ``DataError``
    raised while saving is logged without aborting the run. Paging stops
    once the next page number exceeds the ``pages`` value of the response.

    Args:
        collection: Acronym of the collection receiving the documents.
        from_date: Optional start date, resolved by ``get_date_range_str``.
        until_date: Optional end date, resolved by ``get_date_range_str``.
        days_to_go_back: Optional number of days, resolved the same way.
        page: First page to request.
        force_update: Forwarded to the article upsert service.
        user: User forwarded to the article upsert service.

    Returns:
        ``False`` when the collection does not exist, ``True`` otherwise.
    """
    from_date, until_date = date_utils.get_date_range_str(
        from_date, until_date, days_to_go_back
    )
    logging.info(
        "Loading documents from OPAC. From: %s, Until: %s, Collection: %s",
        from_date,
        until_date,
        collection,
    )

    target_collection = _get_collection(collection)
    if not target_collection:
        logging.error("Collection not found: %s", collection)
        return False

    has_more_pages = True
    while has_more_pages:
        result = opac_collector.fetch_counter_dict(from_date, until_date, page=page)
        entries = result.get("documents") or {}

        for entry in entries.values():
            journal_source = journal_service.find_journal_source_by_acronym(
                target_collection,
                entry.get("journal_acronym"),
            )
            if not journal_source:
                logging.info(
                    "Source not found for collection %s and acronym %s",
                    target_collection.acron3,
                    entry.get("journal_acronym"),
                )
                continue

            try:
                article_service.upsert_article_document_from_opac(
                    entry,
                    collection=target_collection,
                    source=journal_source,
                    user=user,
                    force_update=force_update,
                )
            except DataError as error:
                logging.error(
                    "Error saving Document from OPAC. Collection: %s, Source: %s, PIDv2: %s. Error: %s",
                    target_collection,
                    journal_source.source_id,
                    entry.get("pid_v2"),
                    error,
                )

        page += 1
        has_more_pages = not (page > int(result.get("pages", 0)))

    return True
@celery_app.task(bind=True, name=_("[Metadata] Sync Documents (Dataverse)"), timelimit=-1, queue="load")
def task_load_dataset_metadata_into_documents(
    self,
    from_date=None,
    until_date=None,
    days_to_go_back=None,
    force_update=True,
    user_id=None,
    username=None,
):
    """Celery entry point that syncs Dataverse dataset metadata into documents.

    Resolves the acting user from the task request (or from ``username`` /
    ``user_id``) and delegates to ``load_dataset_metadata_from_dataverse``.

    Returns:
        Whatever ``load_dataset_metadata_from_dataverse`` returns.
    """
    # NOTE(review): Celery documents `time_limit` / `soft_time_limit`;
    # confirm that the `timelimit=-1` option above is actually honored.
    acting_user = _get_user(self.request, username=username, user_id=user_id)
    loader_options = {
        "from_date": from_date,
        "until_date": until_date,
        "days_to_go_back": days_to_go_back,
        "force_update": force_update,
        "user": acting_user,
    }
    return load_dataset_metadata_from_dataverse(**loader_options)
import celery_app + +from .common import _get_collection + + +def load_preprints_from_preprints_api( + from_date=None, + until_date=None, + days_to_go_back=None, + force_update=True, + user=None, +): + from_date, until_date = date_utils.get_date_range_str( + from_date, + until_date, + days_to_go_back, + ) + logging.info( + "Loading preprints into documents. From: %s, Until: %s", + from_date, + until_date, + ) + + collection_obj = _get_collection("preprints") + if not collection_obj: + logging.error("Collection not found: preprints") + return False + + for record in preprints_collector.iter_records(from_date, until_date): + payload = preprints_collector.extract_record_data(record) + + if not payload.get("pid_generic"): + logging.error("Preprint ID not found in record: %s", record) + continue + + try: + preprint_service.upsert_preprint_document( + payload, + collection=collection_obj, + user=user, + force_update=force_update, + ) + except DataError as exc: + logging.error( + "Error saving Preprint Document. Collection: %s, PID: %s. 
Error: %s", + collection_obj, + payload.get('pid_generic'), + exc + ) + continue + + return True + + +@celery_app.task(bind=True, name=_("[Metadata] Sync Documents (Preprints)"), timelimit=-1, queue="load") +def task_load_preprints_into_documents( + self, + from_date=None, + until_date=None, + days_to_go_back=None, + force_update=True, + user_id=None, + username=None, +): + user = _get_user(self.request, username=username, user_id=user_id) + return load_preprints_from_preprints_api( + from_date=from_date, + until_date=until_date, + days_to_go_back=days_to_go_back, + force_update=force_update, + user=user, + ) diff --git a/document/tasks/scielo_books.py b/document/tasks/scielo_books.py new file mode 100644 index 0000000..ddbd462 --- /dev/null +++ b/document/tasks/scielo_books.py @@ -0,0 +1,247 @@ +import logging + +from django.conf import settings +from django.utils.translation import gettext as _ + +from core.collectors import scielo_books as scielo_books_collector +from core.utils.request_utils import _get_user +from document.services import books as document_books_service +from source.services import books as source_books_service + +from config import celery_app + +from .common import get_latest_scielo_books_last_seq + + +def load_documents_from_scielo_books( + collection="books", + db_name=None, + since=0, + limit=None, + force_update=True, + headers=None, + base_url=None, + user=None, +): + db_name = db_name or settings.SCIELO_BOOKS_DB_NAME + limit = limit or settings.SCIELO_BOOKS_LIMIT + collection_obj = source_books_service.get_books_collection(collection) + monograph_cache = {} + + logging.info( + "Loading documents from SciELO Books. 
Collection: %s, DB: %s, Since: %s, Limit: %s", + collection, + db_name, + since, + limit, + ) + + for item in scielo_books_collector.iter_change_documents( + base_url=base_url, + db_name=db_name, + since=since, + limit=limit, + headers=headers, + ): + change = item["change"] + raw_id = change.get("id") + + if item["deleted"]: + delete_source = document_books_service.has_monograph_document_for_raw_id( + collection_obj, + raw_id, + ) + document_books_service.delete_document_by_raw_id(collection_obj, raw_id) + if delete_source: + source_books_service.delete_book_source(collection_obj, raw_id) + continue + + payload = item["payload"] or {} + source_url = item.get("source_url") + last_seq = change.get("seq") + + if payload.get("TYPE") == "Monograph": + source = source_books_service.upsert_monograph_source( + payload, + collection=collection_obj, + user=user, + force_update=force_update, + source_url=source_url, + last_seq=last_seq, + ) + document_books_service.upsert_monograph_document( + payload, + collection=collection_obj, + source=source, + user=user, + force_update=force_update, + source_url=source_url, + last_seq=last_seq, + ) + monograph_cache[str(payload.get("id"))] = payload + continue + + if payload.get("TYPE") != "Part": + continue + + monograph_payload = _get_monograph_payload( + payload, + monograph_cache=monograph_cache, + base_url=base_url, + db_name=db_name, + headers=headers, + ) + if not monograph_payload: + logging.warning( + "Skipping part %s because monograph %s could not be loaded.", + payload.get("id"), + payload.get("monograph"), + ) + continue + + source = source_books_service.upsert_monograph_source( + monograph_payload, + collection=collection_obj, + user=user, + force_update=force_update, + source_url=None, + last_seq=last_seq, + ) + parent_document = document_books_service.upsert_monograph_document( + monograph_payload, + collection=collection_obj, + source=source, + user=user, + force_update=force_update, + source_url=None, + 
last_seq=last_seq, + ) + enriched_payload = document_books_service.enrich_part_payload( + payload, + monograph_payload, + ) + document_books_service.upsert_part_document( + enriched_payload, + collection=collection_obj, + source=source, + parent_document=parent_document, + user=user, + force_update=force_update, + source_url=source_url, + last_seq=last_seq, + ) + + return True + + +def sync_documents_from_scielo_books( + collection="books", + db_name=None, + limit=None, + force_update=True, + headers=None, + base_url=None, + user=None, +): + db_name = db_name or settings.SCIELO_BOOKS_DB_NAME + limit = limit or settings.SCIELO_BOOKS_LIMIT + since = get_latest_scielo_books_last_seq(collection=collection) + logging.info( + "Syncing documents from SciELO Books incrementally. Collection: %s, Since: %s, Limit: %s", + collection, + since, + limit, + ) + return load_documents_from_scielo_books( + collection=collection, + db_name=db_name, + since=since, + limit=limit, + force_update=force_update, + headers=headers, + base_url=base_url, + user=user, + ) + + +@celery_app.task(bind=True, name=_("[Metadata] Sync Documents (SciELO Books - Manual)"), queue="load") +def task_load_documents_from_scielo_books( + self, + collection="books", + db_name=None, + since=0, + limit=None, + force_update=True, + headers=None, + base_url=None, + user_id=None, + username=None, +): + db_name = db_name or settings.SCIELO_BOOKS_DB_NAME + limit = limit or settings.SCIELO_BOOKS_LIMIT + user = _get_user(self.request, username=username, user_id=user_id) + return load_documents_from_scielo_books( + collection=collection, + db_name=db_name, + since=since, + limit=limit, + force_update=force_update, + headers=headers, + base_url=base_url, + user=user, + ) + + +@celery_app.task(bind=True, name=_("[Metadata] Sync Documents (SciELO Books - Incremental)"), queue="load") +def task_sync_documents_from_scielo_books( + self, + collection="books", + db_name=None, + limit=None, + force_update=True, + headers=None, 
+ base_url=None, + user_id=None, + username=None, +): + db_name = db_name or settings.SCIELO_BOOKS_DB_NAME + limit = limit or settings.SCIELO_BOOKS_LIMIT + user = _get_user(self.request, username=username, user_id=user_id) + return sync_documents_from_scielo_books( + collection=collection, + db_name=db_name, + limit=limit, + force_update=force_update, + headers=headers, + base_url=base_url, + user=user, + ) + + +def _get_monograph_payload(payload, monograph_cache, base_url=None, db_name=None, headers=None): + monograph_id = payload.get("monograph") + if not monograph_id: + return None + + monograph_key = str(monograph_id) + if monograph_key in monograph_cache: + return monograph_cache[monograph_key] + + try: + monograph_payload, _ = scielo_books_collector.fetch_document( + doc_id=monograph_id, + base_url=base_url, + db_name=db_name or settings.SCIELO_BOOKS_DB_NAME, + headers=headers, + ) + except Exception as exc: + logging.warning( + "Failed to fetch monograph %s for part %s: %s", + monograph_id, + payload.get("id"), + exc, + ) + return None + + monograph_cache[monograph_key] = monograph_payload + return monograph_payload From 0ccb0e9874944bd2680d4ceba0802e8380a97772 Mon Sep 17 00:00:00 2001 From: Rafael JP Damaceno Date: Fri, 1 May 2026 22:33:57 -0300 Subject: [PATCH 21/31] feat: adicionar document task pipeline, wagtail hooks e management command --- document/__init__.py | 1 + document/apps.py | 6 + document/management/__init__.py | 1 + document/management/commands/__init__.py | 1 + .../commands/load_articles_by_year.py | 80 ++++++ document/tasks/__init__.py | 28 ++ document/tasks/pipeline.py | 24 ++ document/tests.py | 255 ++++++++++++++++++ document/wagtail_hooks.py | 35 +++ 9 files changed, 431 insertions(+) create mode 100644 document/__init__.py create mode 100644 document/apps.py create mode 100644 document/management/__init__.py create mode 100644 document/management/commands/__init__.py create mode 100644 
document/management/commands/load_articles_by_year.py create mode 100644 document/tasks/__init__.py create mode 100644 document/tasks/pipeline.py create mode 100644 document/tests.py create mode 100644 document/wagtail_hooks.py diff --git a/document/__init__.py b/document/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/document/__init__.py @@ -0,0 +1 @@ + diff --git a/document/apps.py b/document/apps.py new file mode 100644 index 0000000..eb482d2 --- /dev/null +++ b/document/apps.py @@ -0,0 +1,6 @@ +from django.apps import AppConfig + + +class DocumentConfig(AppConfig): + default_auto_field = "django.db.models.BigAutoField" + name = "document" diff --git a/document/management/__init__.py b/document/management/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/document/management/__init__.py @@ -0,0 +1 @@ + diff --git a/document/management/commands/__init__.py b/document/management/commands/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/document/management/commands/__init__.py @@ -0,0 +1 @@ + diff --git a/document/management/commands/load_articles_by_year.py b/document/management/commands/load_articles_by_year.py new file mode 100644 index 0000000..a922456 --- /dev/null +++ b/document/management/commands/load_articles_by_year.py @@ -0,0 +1,80 @@ +from django.core.management.base import BaseCommand + +from document.tasks import task_load_documents_from_article_meta +from document.tasks import task_load_documents_from_opac + + +class Command(BaseCommand): + help = "Generate task requests for loading document data by year" + + def add_arguments(self, parser): + parser.add_argument( + "--start-year", + type=int, + default=1990, + help="Start year (default: 1990)", + ) + parser.add_argument( + "--end-year", + type=int, + default=2025, + help="End year (default: 2025)", + ) + parser.add_argument( + "--collection", + type=str, + default="scl", + help="Collection code (default: scl)", + ) + 
parser.add_argument( + "--task", + choices=["load_documents_from_opac", "load_documents_from_article_meta"], + default="load_documents_from_opac", + help="Task to execute (default: load_documents_from_opac)", + ) + + def handle(self, *args, **options): + start_year = options["start_year"] + end_year = options["end_year"] + collection = options["collection"] + + self.stdout.write( + self.style.SUCCESS( + f"Generating task requests from {start_year} to {end_year} for collection: {collection}" + ) + ) + + total_tasks = 0 + + for year in range(start_year, end_year + 1): + from_date = f"{year}-01-01" + until_date = f"{year}-12-31" + + self.stdout.write(f"Queuing task for year {year}...") + + if options["task"] == "load_documents_from_article_meta": + task_result = task_load_documents_from_article_meta.delay( + from_date=from_date, + until_date=until_date, + collection=collection, + ) + else: + task_result = task_load_documents_from_opac.delay( + from_date=from_date, + until_date=until_date, + collection=collection, + ) + + total_tasks += 1 + + self.stdout.write( + self.style.SUCCESS( + f"✓ Task queued for year {year}: {from_date} to {until_date} (Task ID: {task_result.id})" + ) + ) + + self.stdout.write( + self.style.SUCCESS( + f"\nCompleted! {total_tasks} tasks have been queued successfully." 
+ ) + ) diff --git a/document/tasks/__init__.py b/document/tasks/__init__.py new file mode 100644 index 0000000..95a0ba5 --- /dev/null +++ b/document/tasks/__init__.py @@ -0,0 +1,28 @@ +from .articlemeta import ( + load_documents_from_article_meta, + task_load_documents_from_article_meta, +) +from .common import ( + get_latest_scielo_books_last_seq, +) +from .dataverse import ( + load_dataset_metadata_from_dataverse, + task_load_dataset_metadata_into_documents, +) +from .opac import ( + load_documents_from_opac, + task_load_documents_from_opac, +) +from .pipeline import ( + task_daily_metadata_sync_pipeline, +) +from .preprints import ( + load_preprints_from_preprints_api, + task_load_preprints_into_documents, +) +from .scielo_books import ( + load_documents_from_scielo_books, + sync_documents_from_scielo_books, + task_load_documents_from_scielo_books, + task_sync_documents_from_scielo_books, +) diff --git a/document/tasks/pipeline.py b/document/tasks/pipeline.py new file mode 100644 index 0000000..97bef7c --- /dev/null +++ b/document/tasks/pipeline.py @@ -0,0 +1,24 @@ +import logging + +from celery import group +from django.utils.translation import gettext as _ + +from config import celery_app + +from .articlemeta import task_load_documents_from_article_meta +from .dataverse import task_load_dataset_metadata_into_documents +from .opac import task_load_documents_from_opac +from .preprints import task_load_preprints_into_documents +from .scielo_books import task_sync_documents_from_scielo_books + + +@celery_app.task(bind=True, name=_("[Metadata] Daily Sync Routine (Auto)"), queue="load") +def task_daily_metadata_sync_pipeline(self): + logging.info("Starting Daily Metadata Sync Pipeline") + group([ + task_load_documents_from_article_meta.s(), + task_load_documents_from_opac.s(), + task_load_preprints_into_documents.s(), + task_load_dataset_metadata_into_documents.s(), + task_sync_documents_from_scielo_books.s(), + ]).apply_async() diff --git a/document/tests.py 
b/document/tests.py new file mode 100644 index 0000000..14d9bcd --- /dev/null +++ b/document/tests.py @@ -0,0 +1,255 @@ +from django.test import TestCase +from unittest.mock import patch + +from collection.models import Collection +from document import tasks as document_tasks +from source.services import books as source_books_service +from source.models import Source + +from .models import Document +from .services import articles as article_service +from .services import books as books_service +from .services import datasets as dataset_service +from .services import preprints as preprint_service + + +class DocumentMetadataTests(TestCase): + def test_metadata_includes_source_context_and_legacy_identifiers(self): + collection = Collection.objects.create(acron3="scl", acron2="sc") + source = Source.objects.create( + collection=collection, + source_type=Source.SOURCE_TYPE_JOURNAL, + source_id="1234-5678", + scielo_issn="1234-5678", + title="Test Journal", + identifiers={"scielo_issn": "1234-5678"}, + ) + Document.objects.create( + collection=collection, + source=source, + document_type=Document.DOCUMENT_TYPE_ARTICLE, + document_id="S123456782024000100001", + scielo_issn="1234-5678", + pid_v2="S123456782024000100001", + pid_v3="abc123", + title="Test Article", + identifiers={"doi": "10.1590/example"}, + files={"pt": {"path": "/pdf/test.pdf"}}, + default_lang="en", + text_langs=["en", "pt"], + publication_date="2024-01-15", + publication_year="2024", + ) + + metadata = list(Document.metadata(collection=collection)) + + self.assertEqual(len(metadata), 1) + self.assertEqual(metadata[0]["document_type"], Document.DOCUMENT_TYPE_ARTICLE) + self.assertEqual(metadata[0]["document_id"], "S123456782024000100001") + self.assertEqual(metadata[0]["source_type"], Source.SOURCE_TYPE_JOURNAL) + self.assertEqual(metadata[0]["source_id"], "1234-5678") + self.assertEqual(metadata[0]["scielo_issn"], "1234-5678") + + def test_upsert_monograph_and_part_documents_from_books_payload(self): + 
collection = Collection.objects.create(acron3="books", acron2="bk") + monograph_payload = { + "TYPE": "Monograph", + "id": "abcd1", + "title": "Sample Book", + "isbn": "9788578791889", + "eisbn": "9788578791880", + "doi_number": "10.1234/book", + "language": "pt", + "publication_date": "2024-05-20", + "year": "2024", + "publisher": "SciELO Books", + } + part_payload = { + "TYPE": "Part", + "id": "18", + "monograph": "abcd1", + "title": "Chapter 18", + "text_language": "es", + "order": "18", + } + + source = source_books_service.upsert_monograph_source( + monograph_payload, + collection=collection, + ) + parent_document = books_service.upsert_monograph_document( + monograph_payload, + collection=collection, + source=source, + ) + chapter = books_service.upsert_part_document( + books_service.enrich_part_payload(part_payload, monograph_payload), + collection=collection, + source=source, + parent_document=parent_document, + ) + + self.assertEqual(parent_document.document_type, Document.DOCUMENT_TYPE_BOOK) + self.assertEqual(parent_document.document_id, "book:abcd1") + self.assertEqual(parent_document.pid_generic, "book:abcd1") + self.assertEqual(chapter.document_type, Document.DOCUMENT_TYPE_CHAPTER) + self.assertEqual(chapter.document_id, "book:abcd1/chapter:18") + self.assertEqual(chapter.parent_document, parent_document) + self.assertEqual(chapter.identifiers["book_id"], "abcd1") + self.assertEqual(chapter.default_lang, "es") + + def test_articlemeta_and_opac_upsert_same_document(self): + collection = Collection.objects.create(acron3="scl", acron2="sc") + source = Source.objects.create( + collection=collection, + source_type=Source.SOURCE_TYPE_JOURNAL, + source_id="1234-5678", + scielo_issn="1234-5678", + acronym="testjou", + title="Test Journal", + identifiers={"scielo_issn": "1234-5678"}, + ) + + first = article_service.upsert_article_document_from_articlemeta( + { + "code": "S123456782024000100001", + "title": "Article Title", + "pdfs": {"en": {"url": 
"/pdf/en.pdf"}}, + "processing_date": "2024-02-10", + "publication_date": "2024-01-15", + "publication_year": "2024", + "default_language": "en", + "text_langs": ["en", "pt"], + "code_title": ["1234-5678"], + }, + collection=collection, + source=source, + ) + second = article_service.upsert_article_document_from_opac( + { + "pid_v2": "S123456782024000100001", + "pid_v3": "S1234-56782024000100001", + "title": "Article Title", + "journal_acronym": "testjou", + "publication_date": "2024-01-15", + "default_language": "en", + "text_langs": ["en", "pt"], + }, + collection=collection, + source=source, + ) + + self.assertEqual(first.pk, second.pk) + self.assertEqual(Document.objects.count(), 1) + second.refresh_from_db() + self.assertEqual(second.pid_v3, "S1234-56782024000100001") + self.assertEqual(second.identifiers["journal_acronym"], "testjou") + + def test_upsert_preprint_document_maps_metadata(self): + collection = Collection.objects.create(acron3="preprints", acron2="pp") + + document = preprint_service.upsert_preprint_document( + { + "pid_generic": "preprint/123", + "title": "Preprint Title", + "text_langs": ["en", "pt"], + "default_language": "en", + "publication_date": "2024-01-20", + "publication_year": "2024", + }, + collection=collection, + ) + + self.assertEqual(document.document_type, Document.DOCUMENT_TYPE_PREPRINT) + self.assertEqual(document.document_id, "preprint/123") + self.assertEqual(document.pid_generic, "preprint/123") + self.assertEqual(document.default_lang, "en") + + def test_upsert_dataset_document_accumulates_files(self): + collection = Collection.objects.create(acron3="data", acron2="dt") + + dataset_service.upsert_dataset_document( + { + "title": "Dataset Title", + "dataset_doi": "10.1234/dataset", + "dataset_published": "2024-03-15", + "file_id": "1", + "file_name": "first.csv", + "file_url": "https://example.org/first.csv", + "file_persistent_id": "pid:first", + }, + collection=collection, + ) + document = 
dataset_service.upsert_dataset_document( + { + "title": "Dataset Title", + "dataset_doi": "10.1234/dataset", + "dataset_published": "2024-03-15", + "file_id": "2", + "file_name": "second.csv", + "file_url": "https://example.org/second.csv", + "file_persistent_id": "pid:second", + }, + collection=collection, + ) + + self.assertEqual(document.document_type, Document.DOCUMENT_TYPE_DATASET) + self.assertEqual(document.document_id, "10.1234/dataset") + self.assertEqual(set(document.files.keys()), {"1", "2"}) + + +class DocumentBooksSyncTests(TestCase): + def test_get_latest_scielo_books_last_seq_uses_documents_and_sources(self): + collection = Collection.objects.create(acron3="books", acron2="bk") + source = Source.objects.create( + collection=collection, + source_type=Source.SOURCE_TYPE_BOOK, + source_id="book-1", + title="Book 1", + extra_data={"last_seq": 120}, + ) + Document.objects.create( + collection=collection, + source=source, + document_type=Document.DOCUMENT_TYPE_BOOK, + document_id="book:book-1", + extra_data={"last_seq": "135"}, + ) + + self.assertEqual(document_tasks.get_latest_scielo_books_last_seq("books"), 135) + + def test_sync_documents_from_scielo_books_uses_computed_since(self): + collection = Collection.objects.create(acron3="books", acron2="bk") + source = Source.objects.create( + collection=collection, + source_type=Source.SOURCE_TYPE_BOOK, + source_id="book-1", + title="Book 1", + extra_data={"last_seq": 120}, + ) + Document.objects.create( + collection=collection, + source=source, + document_type=Document.DOCUMENT_TYPE_BOOK, + document_id="book:book-1", + extra_data={"last_seq": 135}, + ) + + with patch("document.tasks.scielo_books.load_documents_from_scielo_books", return_value=True) as mocked: + result = document_tasks.sync_documents_from_scielo_books( + collection="books", + db_name="scielobooks_1a", + limit=500, + ) + + self.assertTrue(result) + mocked.assert_called_once_with( + collection="books", + db_name="scielobooks_1a", + since=135, + 
limit=500, + force_update=True, + headers=None, + base_url=None, + user=None, + ) diff --git a/document/wagtail_hooks.py b/document/wagtail_hooks.py new file mode 100644 index 0000000..de291c9 --- /dev/null +++ b/document/wagtail_hooks.py @@ -0,0 +1,35 @@ +from django.utils.translation import gettext_lazy as _ +from wagtail.snippets.views.snippets import SnippetViewSet + +from .models import Document + + +class DocumentSnippetViewSet(SnippetViewSet): + model = Document + icon = "folder-open-inverse" + menu_label = _("Document") + menu_order = 300 + + list_display = ( + "collection", + "document_type", + "document_id", + "source", + "title", + "pid_v2", + "pid_v3", + "pid_generic", + "publication_year", + ) + list_filter = ( + "collection", + "document_type", + "publication_year", + ) + search_fields = ( + "document_id", + "title", + "pid_v2", + "pid_v3", + "pid_generic", + ) From 9c1e3820acfb8271e71be0b13750a079894ed997 Mon Sep 17 00:00:00 2001 From: Rafael JP Damaceno Date: Fri, 1 May 2026 22:34:06 -0300 Subject: [PATCH 22/31] feat: registrar novos apps em INSTALLED_APPS, menu e wagtail hooks --- config/menu.py | 15 +++----- config/settings/base.py | 82 +++++++++++++++++++++++++--------------- metrics/wagtail_hooks.py | 22 +++++++++++ 3 files changed, 80 insertions(+), 39 deletions(-) create mode 100644 metrics/wagtail_hooks.py diff --git a/config/menu.py b/config/menu.py index 13371c6..844ce0c 100644 --- a/config/menu.py +++ b/config/menu.py @@ -1,13 +1,10 @@ WAGTAIL_MENU_APPS_ORDER = { - "collection": 100, - "article": 200, - "journal": 300, - "resources": 400, - "log_manager": 500, - "log_manager_config": 600, - "metrics": 700, - "tasks": 800, - "unexpected-error": 900, + "metadata": 100, + "resources": 200, + "log_manager": 300, + "tracker": 400, + "metrics": 500, + "tasks": 600, } def get_menu_order(app_name): diff --git a/config/settings/base.py b/config/settings/base.py index 9638274..e4a99fa 100644 --- a/config/settings/base.py +++ 
b/config/settings/base.py @@ -5,7 +5,8 @@ from pathlib import Path import environ -from django.utils.translation import gettext_lazy as _ + +from config.collections import COLLECTION_ACRON3_SIZE_MAP # noqa: F401 ROOT_DIR = Path(__file__).resolve(strict=True).parent.parent.parent # core/ @@ -116,10 +117,13 @@ # Your stuff: custom apps go here "collection", "core", + "document", "log_manager", "log_manager_config", "metrics", + "reports", "resources", + "source", "tracker", ] @@ -402,36 +406,54 @@ SEARCH_PAGINATION_ITEMS_PER_PAGE = 10 -# Elasticsearch +# OpenSearch # ------------------------------------------------------------------------------ -ES_URL = env("ES_URL", default="http://192.168.0.33:9200/") -ES_INDEX_NAME = env("ES_INDEX_NAME", default="usage") -ES_API_KEY = env("ES_API_KEY", default="") -ES_BASIC_AUTH = env("ES_BASIC_AUTH", default=("elastic", "iHktg66E")) -ES_VERIFY_CERTS = env.bool("ES_VERIFY_CERTS", default=False) +OPENSEARCH_URL = env("OPENSEARCH_URL", default="http://localhost:9200/") +OPENSEARCH_INDEX_NAME = env("OPENSEARCH_INDEX_NAME", default="usage") +OPENSEARCH_API_KEY = env("OPENSEARCH_API_KEY", default="") +OPENSEARCH_BASIC_AUTH = env( + "OPENSEARCH_BASIC_AUTH", + default=("admin", "admin"), +) +OPENSEARCH_VERIFY_CERTS = env.bool( + "OPENSEARCH_VERIFY_CERTS", + default=False, +) + +# Collectors configuration +# ------------------------------------------------------------------------------ +# ArticleMeta +ARTICLEMETA_COLLECT_URL = env( + "ARTICLEMETA_COLLECT_URL", + default="http://articlemeta.scielo.org/api/v1/article/counter_dict", +) +ARTICLEMETA_MAX_RETRIES = env.int("ARTICLEMETA_MAX_RETRIES", default=5) +ARTICLEMETA_SLEEP_TIME = env.int("ARTICLEMETA_SLEEP_TIME", default=30) + +# Dataverse +DATAVERSE_ENDPOINT = env("DATAVERSE_ENDPOINT", default="https://data.scielo.org/api") +DATAVERSE_ROOT_COLLECTION = env("DATAVERSE_ROOT_COLLECTION", default="scielodata") +DATAVERSE_SLEEP_TIME = env.int("DATAVERSE_SLEEP_TIME", default=30) + +# OPAC 
+OPAC_ENDPOINT = env("OPAC_ENDPOINT", default="https://www.scielo.br/api/v1/counter_dict") +OPAC_MAX_RETRIES = env.int("OPAC_MAX_RETRIES", default=5) +OPAC_SLEEP_TIME = env.int("OPAC_SLEEP_TIME", default=30) + +# Preprints +OAI_PMH_PREPRINT_ENDPOINT = env( + "OAI_PMH_PREPRINT_ENDPOINT", + default="https://preprints.scielo.org/index.php/scielo/oai", +) +OAI_METADATA_PREFIX = env("OAI_METADATA_PREFIX", default="oai_dc") +OAI_PMH_MAX_RETRIES = env.int("OAI_PMH_MAX_RETRIES", default=5) + +# SciELO Books +SCIELO_BOOKS_BASE_URL = env("SCIELO_BOOKS_BASE_URL", default="http://localhost:5984") +SCIELO_BOOKS_TIMEOUT = env.int("SCIELO_BOOKS_TIMEOUT", default=60) +SCIELO_BOOKS_DB_NAME = env("SCIELO_BOOKS_DB_NAME", default="scielobooks_1a") +SCIELO_BOOKS_LIMIT = env.int("SCIELO_BOOKS_LIMIT", default=1000) # Collection size categories # ------------------------------------------------------------------------------ -EXTRA_LARGE_COLLECTIONS = env.list("EXTRA_LARGE_COLLECTIONS", default=["scl"]) -LARGE_COLLECTIONS = env.list("LARGE_COLLECTIONS", default=["chl", "col", "mex"]) -MEDIUM_COLLECTIONS = env.list("MEDIUM_COLLECTIONS", default=["cri", "esp", "psi", "prt", "ven"]) -SMALL_COLLECTIONS = env.list("SMALL_COLLECTIONS", default=["arg", "bol", "cub", "data", "ecu", "per", "preprints", "pry", "rve", "spa", "sss", "sza", "ury", "wid"]) - -# Collection size mapping -def _build_collection_size_map(): - """Build mapping of collection acronyms to their size categories.""" - size_map = {} - size_categories = { - "xlarge": EXTRA_LARGE_COLLECTIONS, - "large": LARGE_COLLECTIONS, - "medium": MEDIUM_COLLECTIONS, - "small": SMALL_COLLECTIONS, - } - - for size, collections in size_categories.items(): - for acron3 in collections: - size_map[acron3] = size - - return size_map - -COLLECTION_ACRON3_SIZE_MAP = _build_collection_size_map() +SUPPORTED_LOGFILE_EXTENSIONS = env.list("SUPPORTED_LOGFILE_EXTENSIONS", default=[".log", ".gz", ".zip"]) diff --git a/metrics/wagtail_hooks.py 
b/metrics/wagtail_hooks.py new file mode 100644 index 0000000..94c2ffb --- /dev/null +++ b/metrics/wagtail_hooks.py @@ -0,0 +1,22 @@ +from django.utils.translation import gettext_lazy as _ +from wagtail.snippets.views.snippets import SnippetViewSet + +from metrics.models import DailyMetricJob + +class DailyMetricJobSnippetViewSet(SnippetViewSet): + model = DailyMetricJob + menu_label = _("Daily Metric Jobs") + icon = "history" + menu_order = 600 + list_display = ( + "collection", + "access_date", + "status", + "input_log_count", + "attempts", + "export_started_at", + "exported_at", + "updated", + ) + list_filter = ("status", "collection", "access_date") + search_fields = ("collection__acron3", "error_message") From 95b5ab5d520ec97a2b445cf7cf21279f53b46585 Mon Sep 17 00:00:00 2001 From: Rafael JP Damaceno Date: Fri, 1 May 2026 22:40:37 -0300 Subject: [PATCH 23/31] refactor: simplificar modelo LogFile, adicionar parse_heartbeat_at --- log_manager/choices.py | 10 -- log_manager/models.py | 240 ++++++----------------------------------- 2 files changed, 30 insertions(+), 220 deletions(-) diff --git a/log_manager/choices.py b/log_manager/choices.py index e98c8f2..c6e461a 100644 --- a/log_manager/choices.py +++ b/log_manager/choices.py @@ -19,13 +19,3 @@ (LOG_FILE_STATUS_IGNORED, _("Ignored")), ] - -COLLECTION_LOG_FILE_DATE_COUNT_OK = 'OK' -COLLECTION_LOG_FILE_DATE_COUNT_MISSING_FILES = 'MIS' -COLLECTION_LOG_FILE_DATE_COUNT_EXTRA_FILES = 'EXT' - -COLLECTION_LOG_FILE_DATE_COUNT = [ - (COLLECTION_LOG_FILE_DATE_COUNT_OK, _("OK")), - (COLLECTION_LOG_FILE_DATE_COUNT_MISSING_FILES, _("Missing Files")), - (COLLECTION_LOG_FILE_DATE_COUNT_EXTRA_FILES, _("Extra files")), -] diff --git a/log_manager/models.py b/log_manager/models.py index fc3a8b6..6bf04d8 100644 --- a/log_manager/models.py +++ b/log_manager/models.py @@ -1,209 +1,20 @@ import logging -from django.db import models -from django.db.models import Q +from django.db import IntegrityError, models from django.utils import 
timezone from django.utils.translation import gettext_lazy as _ from wagtail.admin.panels import FieldPanel from wagtailautocomplete.edit_handlers import AutocompletePanel from collection.models import Collection -from core.forms import CoreAdminModelForm -from core.models import CommonControlField from . import choices -class LogFileDate(CommonControlField): - date = models.DateField( - verbose_name=_("Date"), - null=False, - blank=False, - db_index=True, - ) - - log_file = models.ForeignKey( - 'LogFile', - verbose_name=_('Log File'), - blank=True, - on_delete=models.DO_NOTHING, - db_index=True, - ) - - base_form_class = CoreAdminModelForm - - panel = [ - FieldPanel('date'), - AutocompletePanel('log_file') - ] - - class Meta: - ordering = ['-date'] - verbose_name = _("Log File Date") - verbose_name_plural = _("Log File Dates") - unique_together = ( - 'date', - 'log_file', - ) - indexes = [ - models.Index(fields=['date', 'log_file']), - ] - - @classmethod - def create_or_update(cls, user, log_file, date): - obj, created = cls.objects.get_or_create( - log_file=log_file, - date=date, - ) - - if not created: - obj.updated_by = user - obj.updated = timezone.now() - else: - obj.creator = user - obj.created = timezone.now() - - return obj - - @classmethod - def filter_by_collection_and_date(cls, collection, date): - return cls.objects.filter( - ~Q(log_file__status__in=[ - choices.LOG_FILE_STATUS_CREATED, - choices.LOG_FILE_STATUS_INVALIDATED - ]), - log_file__collection__acron3=collection, - date=date, - ) - - @classmethod - def get_number_of_found_files_for_date(cls, collection, date): - return cls.objects.filter( - ~Q(log_file__status__in=[ - choices.LOG_FILE_STATUS_CREATED, - choices.LOG_FILE_STATUS_INVALIDATED - ]), - log_file__collection__acron3=collection, - date=date, - ).count() - - def __str__(self): - return f'{self.log_file.path}-{self.date}' - - -class CollectionLogFileDateCount(CommonControlField): - collection = models.ForeignKey( - Collection, - 
verbose_name=_('Collection'), - on_delete=models.DO_NOTHING, - null=False, - blank=False, - ) - - date = models.DateField( - _('Date'), - null=False, - blank=False, - ) - - year = models.IntegerField( - _('Year'), - null=False, - blank=False, - ) - - month = models.IntegerField( - _('Month'), - null=False, - blank=False, - ) - - found_log_files = models.IntegerField( - verbose_name=_('Number of Found Valid Log Files'), - default=0, - ) - - expected_log_files = models.IntegerField( - verbose_name=_('Number of Expected Valid Log Files'), - blank=True, - null=True, - ) - - is_usage_metric_computed = models.BooleanField( - verbose_name=_('Is Usage Metric Computed'), - default=False, - ) - - exported_files_count = models.SmallIntegerField( - verbose_name=_('Exported Files Count'), - default=0, - ) - - status = models.CharField( - verbose_name=_('Status'), - choices=choices.COLLECTION_LOG_FILE_DATE_COUNT, - max_length=3, - ) - - def set_status(self): - if self.found_log_files < self.expected_log_files: - self.status = choices.COLLECTION_LOG_FILE_DATE_COUNT_MISSING_FILES - elif self.found_log_files > self.expected_log_files: - self.status = choices.COLLECTION_LOG_FILE_DATE_COUNT_EXTRA_FILES - else: - self.status = choices.COLLECTION_LOG_FILE_DATE_COUNT_OK - - def set_is_usage_metric_computed(self): - if self.exported_files_count == self.found_log_files: - self.is_usage_metric_computed = True - - @classmethod - def create_or_update(cls, user, collection, date, expected_log_files, found_log_files): - obj, created = cls.objects.get_or_create( - collection=collection, - date=date, - month=date.month, - year=date.year, - ) - - if not created: - obj.updated_by = user - obj.updated = timezone.now() - else: - obj.creator = user - obj.created = timezone.now() - - obj.expected_log_files = expected_log_files - obj.found_log_files = found_log_files - obj.set_status() - - obj.save() - return obj - - class Meta: - ordering = ['-date'] - verbose_name = _("Collection Log File Date 
Count") - unique_together = ( - 'collection', - 'date', - ) - - panels = [ - AutocompletePanel('collection'), - FieldPanel('date'), - FieldPanel('year'), - FieldPanel('month'), - FieldPanel('found_log_files'), - FieldPanel('expected_log_files'), - FieldPanel('status'), - FieldPanel('is_usage_metric_computed'), - ] - - def __str__(self): - return f'{self.collection.acron3}-{self.date}' - - -class LogFile(CommonControlField): +class LogFile(models.Model): + created = models.DateTimeField(verbose_name=_("Creation date"), auto_now_add=True) + updated = models.DateTimeField(verbose_name=_("Last update date"), auto_now=True) + date = models.DateField(verbose_name=_("Date"), null=True, blank=True, db_index=True) hash = models.CharField(_("Hash MD5"), max_length=32, null=True, blank=True, unique=True) path = models.CharField(_("Name"), max_length=255, null=False, blank=False) @@ -246,19 +57,25 @@ class LogFile(CommonControlField): default=0, ) + parse_heartbeat_at = models.DateTimeField( + _("Parse Heartbeat At"), + null=True, + blank=True, + ) + panels = [ FieldPanel('hash'), + FieldPanel('date'), FieldPanel('path'), FieldPanel('stat_result'), FieldPanel('status'), FieldPanel('validation'), FieldPanel('summary'), FieldPanel('last_processed_line'), + FieldPanel('parse_heartbeat_at'), AutocompletePanel('collection'), ] - base_form_class = CoreAdminModelForm - class Meta: verbose_name = _("Log File") verbose_name_plural = _("Log Files") @@ -268,25 +85,28 @@ def get(cls, hash): return cls.objects.get(hash=hash) @classmethod - def create_or_update(cls, user, collection, path, stat_result, hash, status=None): + def create_or_update(cls, collection, path, stat_result, hash, status=None): try: + obj, created = cls.objects.get_or_create( + hash=hash, + defaults={ + "collection": collection, + "path": path, + "stat_result": stat_result, + "status": status or choices.LOG_FILE_STATUS_CREATED, + }, + ) + except IntegrityError: obj = cls.get(hash=hash) - obj.updated_by = user + created 
= False + + if created: + logging.info(f'File {path} added to the database.') + else: obj.updated = timezone.now() + obj.save(update_fields=["updated"]) logging.info(f'File {path} already exists in the database.') - except cls.DoesNotExist: - obj = cls() - obj.creator = user - obj.created = timezone.now() - obj.collection = collection - obj.path = path - obj.stat_result = stat_result - obj.hash = hash - obj.status = status or choices.LOG_FILE_STATUS_CREATED - logging.info(f'File {path} added to the database.') - - obj.save() return obj def __str__(self): From dfeaf9711715cd20db2857c491829a7072b8852d Mon Sep 17 00:00:00 2001 From: Rafael JP Damaceno Date: Fri, 1 May 2026 22:40:37 -0300 Subject: [PATCH 24/31] =?UTF-8?q?refactor:=20adicionar=20migra=C3=A7=C3=A3?= =?UTF-8?q?o=20log=5Fmanager=200010=20(remover=20modelos=20legados)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...er_logfiledate_unique_together_and_more.py | 52 +++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 log_manager/migrations/0010_alter_logfiledate_unique_together_and_more.py diff --git a/log_manager/migrations/0010_alter_logfiledate_unique_together_and_more.py b/log_manager/migrations/0010_alter_logfiledate_unique_together_and_more.py new file mode 100644 index 0000000..d30cdf4 --- /dev/null +++ b/log_manager/migrations/0010_alter_logfiledate_unique_together_and_more.py @@ -0,0 +1,52 @@ +# Generated by Django 5.2.12 on 2026-05-01 22:23 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("log_manager", "0009_collectionlogfiledatecount_exported_files_count"), + ] + + operations = [ + migrations.RemoveField( + model_name="logfiledate", + name="creator", + ), + migrations.RemoveField( + model_name="logfiledate", + name="log_file", + ), + migrations.RemoveField( + model_name="logfiledate", + name="updated_by", + ), + migrations.RemoveField( + model_name="logfile", + 
name="creator", + ), + migrations.RemoveField( + model_name="logfile", + name="updated_by", + ), + migrations.AddField( + model_name="logfile", + name="date", + field=models.DateField( + blank=True, db_index=True, null=True, verbose_name="Date" + ), + ), + migrations.AddField( + model_name="logfile", + name="parse_heartbeat_at", + field=models.DateTimeField( + blank=True, null=True, verbose_name="Parse Heartbeat At" + ), + ), + migrations.DeleteModel( + name="CollectionLogFileDateCount", + ), + migrations.DeleteModel( + name="LogFileDate", + ), + ] From 495500549cbab1d0ac698cf8629523d881076ccd Mon Sep 17 00:00:00 2001 From: Rafael JP Damaceno Date: Fri, 1 May 2026 22:40:37 -0300 Subject: [PATCH 25/31] refactor: refatorar modelos log_manager_config com ParentalKey --- log_manager_config/exceptions.py | 6 - log_manager_config/models.py | 408 ++++++++----------------------- 2 files changed, 108 insertions(+), 306 deletions(-) diff --git a/log_manager_config/exceptions.py b/log_manager_config/exceptions.py index ad7581a..0a6a6a9 100644 --- a/log_manager_config/exceptions.py +++ b/log_manager_config/exceptions.py @@ -4,11 +4,5 @@ class UndefinedCollectionLogDirectoryError(Exception): class UndefinedCollectionEmailError(Exception): ... -class UndefinedCollectionFilesPerDayError(Exception): - ... - class UndefinedSupportedLogFile(Exception): ... - -class MultipleFilesPerDayForTheSameDateError(Exception): - ... 
diff --git a/log_manager_config/models.py b/log_manager_config/models.py index 384368e..8cf3e34 100644 --- a/log_manager_config/models.py +++ b/log_manager_config/models.py @@ -4,38 +4,57 @@ from django.utils import timezone from django.utils.translation import gettext_lazy as _ +from modelcluster.models import ClusterableModel +from modelcluster.fields import ParentalKey +from wagtail.models import Orderable +from wagtail.admin.panels import FieldPanel, InlinePanel +from wagtailautocomplete.edit_handlers import AutocompletePanel + from collection.models import Collection from core.models import CommonControlField -from .exceptions import MultipleFilesPerDayForTheSameDateError, UndefinedCollectionFilesPerDayError -class CollectionLogDirectory(CommonControlField): - collection = models.ForeignKey( +class LogManagerCollectionConfig(ClusterableModel, CommonControlField): + collection = models.OneToOneField( Collection, verbose_name=_('Collection'), - on_delete=models.DO_NOTHING, + on_delete=models.CASCADE, + related_name="log_manager_config" ) - path = models.CharField( - verbose_name=_('Path'), - max_length=255, - blank=False, + sample_size = models.FloatField( + verbose_name=_('Sample Size'), + blank=False, null=False, + default=0.1, ) - directory_name = models.CharField( - verbose_name=_('Directory Name'), - max_length=255, - blank=True, - null=True, + buffer_size = models.IntegerField( + verbose_name=_('Buffer Size'), + blank=False, + null=False, + default=2048, ) - active = models.BooleanField( - verbose_name=_('Active'), - default=True, + expected_logs_per_day = models.IntegerField( + verbose_name=_('Expected Logs Per Day'), + default=1, ) + panels = [ + AutocompletePanel("collection"), + FieldPanel("sample_size"), + FieldPanel("buffer_size"), + FieldPanel("expected_logs_per_day"), + InlinePanel("directories", label=_("Directories")), + InlinePanel("emails", label=_("Emails")), + ] + def __str__(self): - return f'{self.collection} - {self.path} - 
{self.directory_name}' - + return f'{self.collection.acron3} Config' + + class Meta: + verbose_name = _('Log Manager Collection Config') + verbose_name_plural = _('Log Manager Collection Configs') + @classmethod def load(cls, data, user): for item in data: @@ -45,13 +64,12 @@ def load(cls, data, user): logging.warning(f'Collection {item.get("acronym")} not found.') continue - logging.info(item) cls.create_or_update( user=user, collection=collection, - directory_name=item.get('directory_name'), - path=item.get('path'), - active=item.get('active', True), + sample_size=item.get('sample_size', 0.1), + buffer_size=item.get('buffer_size', 2048), + expected_logs_per_day=item.get('quantity', 1), ) @classmethod @@ -59,81 +77,66 @@ def create_or_update( cls, user, collection, - directory_name, - path, - active, + sample_size, + buffer_size, + expected_logs_per_day, ): - try: - obj = cls.objects.get(collection=collection, path=path) - except cls.DoesNotExist: - obj = cls() + obj, created = cls.objects.get_or_create(collection=collection) + if created: obj.creator = user obj.created = timezone.now() - obj.collection = collection obj.updated_by = user obj.updated = timezone.now() - obj.directory_name = directory_name - obj.path = path - obj.active = active - + obj.sample_size = sample_size + obj.buffer_size = buffer_size + obj.expected_logs_per_day = expected_logs_per_day obj.save() - logging.info(f'{collection.acron3} - {directory_name} - {path}') + logging.info(f'Config for {collection.acron3} updated.') return obj - class Meta: - verbose_name = _('Collection Log Directory') - verbose_name_plural = _('Collection Log Directories') - constraints = [ - models.UniqueConstraint(fields=['collection', 'path'], name='unique_collection_path') - ] -class CollectionLogFilesPerDay(CommonControlField): - collection = models.ForeignKey( - Collection, - verbose_name=_('Collection'), - on_delete=models.DO_NOTHING, +class CollectionLogDirectory(Orderable, CommonControlField): + config = 
ParentalKey( + 'LogManagerCollectionConfig', + related_name='directories', + on_delete=models.CASCADE, + null=True, + blank=True, ) - start_date = models.DateField( - verbose_name=_('Start Date'), - blank=False, + path = models.CharField( + verbose_name=_('Path'), + max_length=255, + blank=False, null=False, ) - end_date = models.DateField( - verbose_name=_('End Date'), + directory_name = models.CharField( + verbose_name=_('Directory Name'), + max_length=255, blank=True, null=True, ) - quantity = models.IntegerField( - verbose_name=_('Quantity'), - default=1, + active = models.BooleanField( + verbose_name=_('Active'), + default=True, + ) + translator_class = models.CharField( + verbose_name=_('URL Translator Class'), + blank=False, + null=False, + default='URLTranslatorClassicSite', ) def __str__(self): - return f'{self.start_date} - {self.quantity}' + return f'{self.config.collection} - {self.path} - {self.directory_name}' - @classmethod - def get_number_of_expected_files_by_day(cls, collection, date): - files_by_day = cls.objects.filter( - models.Q(collection__acron3=collection) & - models.Q(start_date__lte=date) & - (models.Q(end_date__gte=date) | models.Q(end_date__isnull=True)) - ) - - if files_by_day.count() > 1: - raise MultipleFilesPerDayForTheSameDateError(_("ERROR. Please, set the field end_date for the collection {collection}.")) - - if files_by_day.count() == 0: - raise UndefinedCollectionFilesPerDayError(_("ERROR. 
Please, set the number of files per day for the collection {collection}.")) - - return int(files_by_day.get().quantity) - @classmethod def load(cls, data, user): for item in data: try: collection = Collection.objects.get(acron3=item.get('acronym')) + config, _ = LogManagerCollectionConfig.objects.get_or_create(collection=collection) except Collection.DoesNotExist: logging.warning(f'Collection {item.get("acronym")} not found.') continue @@ -141,52 +144,55 @@ def load(cls, data, user): logging.info(item) cls.create_or_update( user=user, - collection=collection, - start_date=item.get('start_date'), - quantity=item.get('quantity'), - end_date=item.get('end_date'), + config=config, + directory_name=item.get('directory_name'), + path=item.get('path'), + active=item.get('active', True), ) @classmethod def create_or_update( cls, user, - collection, - start_date, - quantity, - end_date, + config, + directory_name, + path, + active, ): try: - obj = cls.objects.get(collection=collection, start_date=start_date) + obj = cls.objects.get(config=config, path=path) except cls.DoesNotExist: obj = cls() obj.creator = user obj.created = timezone.now() - obj.collection = collection - + obj.config = config + obj.updated_by = user obj.updated = timezone.now() - obj.start_date = start_date - obj.quantity = quantity - obj.end_date = end_date - + obj.directory_name = directory_name + obj.path = path + obj.active = active + obj.save() - logging.info(f'{collection.acron3} - {start_date} - {quantity}') + logging.info(f'{config.collection.acron3} - {directory_name} - {path}') return obj class Meta: - verbose_name = _('Collection Log Files Per Day') - verbose_name_plural = _('Collection Log Files Per Day') + verbose_name = _('Collection Log Directory') + verbose_name_plural = _('Collection Log Directories') constraints = [ - models.UniqueConstraint(fields=['collection', 'start_date'], name='unique_collection_start_date') + models.UniqueConstraint(fields=['config', 'path'], 
name='unique_config_path') ] -class CollectionEmail(CommonControlField): - collection = models.ForeignKey( - Collection, - verbose_name=_('Collection'), - on_delete=models.DO_NOTHING, + +class CollectionEmail(Orderable, CommonControlField): + config = ParentalKey( + 'LogManagerCollectionConfig', + related_name='emails', + on_delete=models.CASCADE, + null=True, + blank=True, ) name = models.CharField( verbose_name=_('Name'), @@ -218,6 +224,7 @@ def load(cls, data, user): for item in data: try: collection = Collection.objects.get(acron3=item.get('acronym')) + config, _ = LogManagerCollectionConfig.objects.get_or_create(collection=collection) except Collection.DoesNotExist: logging.warning(f'Collection {item.get("acronym")} not found.') continue @@ -225,7 +232,7 @@ def load(cls, data, user): logging.info(item) cls.create_or_update( user=user, - collection=collection, + config=config, email=item.get('e-mail'), name=item.get('name'), position=item.get('position'), @@ -236,19 +243,19 @@ def load(cls, data, user): def create_or_update( cls, user, - collection, + config, email, name, position, active, ): try: - obj = cls.objects.get(collection=collection, email=email) + obj = cls.objects.get(config=config, email=email) except cls.DoesNotExist: obj = cls() obj.creator = user obj.created = timezone.now() - obj.collection = collection + obj.config = config obj.email = email obj.updated_by = user @@ -258,213 +265,14 @@ def create_or_update( obj.active = active obj.save() - logging.info(f'{collection.acron3} - {name} - {position} - {email}') + logging.info(f'{config.collection.acron3} - {name} - {position} - {email}') return obj class Meta: verbose_name = _('Collection Email') verbose_name_plural = _('Collection Emails') constraints = [ - models.UniqueConstraint(fields=['collection', 'email'], name='unique_collection_email') - ] - - -class CollectionValidationParameters(CommonControlField): - collection = models.ForeignKey( - Collection, - verbose_name=_('Collection'), - 
on_delete=models.DO_NOTHING, - primary_key=True, - ) - sample_size = models.FloatField( - verbose_name=_('Sample Size'), - blank=False, - null=False, - default=0.1, - ) - buffer_size = models.IntegerField( - verbose_name=_('Buffer Size'), - blank=False, - null=False, - default=2048, - ) - - def __str__(self): - return f'{self.collection.acron3} - {self.sample_size} - {self.buffer_size}' - - @classmethod - def load(cls, data, user): - for item in data: - try: - collection = Collection.objects.get(acron3=item.get('acronym')) - except Collection.DoesNotExist: - logging.warning(f'Collection {item.get("acronym")} not found.') - continue - - logging.info(item) - cls.create_or_update( - user=user, - collection=collection, - sample_size=item.get('sample_size'), - buffer_size=item.get('buffer_size'), - ) - - @classmethod - def create_or_update( - cls, - user, - collection, - sample_size, - buffer_size, - ): - try: - obj = cls.objects.get(collection=collection) - except cls.DoesNotExist: - obj = cls() - obj.creator = user - obj.created = timezone.now() - obj.collection = collection - - obj.updated_by = user - obj.updated = timezone.now() - obj.sample_size = sample_size - obj.buffer_size = buffer_size - - obj.save() - logging.info(f'{collection.acron3} - {sample_size} - {buffer_size}') - return obj - - class Meta: - verbose_name = _('Collection Validation Parameters') - verbose_name_plural = _('Collection Validation Parameters') - - -class CollectionURLTranslatorClass(CommonControlField): - collection = models.ForeignKey( - Collection, - verbose_name=_('Collection'), - on_delete=models.DO_NOTHING, - ) - directory = models.ForeignKey( - CollectionLogDirectory, - verbose_name=_('Directory'), - on_delete=models.DO_NOTHING, - ) - translator_class = models.CharField( - verbose_name=_('URL Translator Class'), - blank=False, - null=False, - default='URLTranslatorClassicSite', - ) - - def __str__(self): - return f'{self.collection.acron3} - {self.directory} - {self.translator_class}' 
- - class Meta: - verbose_name = _('Collection URL Translator Class') - verbose_name_plural = _('Collection URL Translator Classes') - constraints = [ - models.UniqueConstraint(fields=['collection', 'directory'], name='unique_collection_directory') + models.UniqueConstraint(fields=['config', 'email'], name='unique_config_email') ] - @classmethod - def load(cls, data, user): - for item in data: - try: - collection = Collection.objects.get(acron3=item.get('acronym')) - except Collection.DoesNotExist: - logging.warning(f'Collection {item.get("acronym")} not found.') - continue - - try: - directory = CollectionLogDirectory.objects.get(collection=collection, path=item.get('path')) - logging.info(item) - cls.create_or_update( - user=user, - collection=collection, - directory=directory, - translator_class=item.get('translator_class'), - ) - except CollectionLogDirectory.DoesNotExist: - logging.warning(f'Directory {item.get("path")} not found.') - continue - @classmethod - def create_or_update( - cls, - user, - collection, - directory, - translator_class, - ): - try: - obj = cls.objects.get(collection=collection) - except cls.DoesNotExist: - obj = cls() - obj.creator = user - obj.created = timezone.now() - obj.collection = collection - obj.directory = directory - - obj.updated_by = user - obj.updated = timezone.now() - obj.translator_class = translator_class - - obj.save() - logging.info(f'{collection.acron3} - {directory.path} - {translator_class}') - return obj - - -class SupportedLogFile(CommonControlField): - file_extension = models.CharField( - verbose_name=_('File Extension'), - max_length=255, - unique=True, - blank=False, - null=False, - ) - description = models.TextField( - verbose_name=_('Description'), - blank=True, - null=True, - ) - - def __str__(self): - return f'{self.file_extension}' - - @classmethod - def load(cls, data, user): - for item in data: - logging.info(item) - cls.create_or_update( - user=user, - file_extension=item.get('file_extension'), - 
description=item.get('description'), - ) - - @classmethod - def create_or_update( - cls, - user, - file_extension, - description, - ): - try: - obj = cls.objects.get(file_extension=file_extension) - except cls.DoesNotExist: - obj = cls() - obj.creator = user - obj.created = timezone.now() - - obj.updated_by = user - obj.updated = timezone.now() - obj.file_extension = file_extension - obj.description = description - - obj.save() - logging.info(f'{file_extension}') - return obj - - class Meta: - verbose_name = _('Supported Log File') - verbose_name_plural = _('Supported Log Files') From 2b69fc13e40f5f1ed428eb1b5c93c73b1aaba5e3 Mon Sep 17 00:00:00 2001 From: Rafael JP Damaceno Date: Fri, 1 May 2026 22:40:37 -0300 Subject: [PATCH 26/31] =?UTF-8?q?refactor:=20adicionar=20migra=C3=A7=C3=A3?= =?UTF-8?q?o=20log=5Fmanager=5Fconfig=200004?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...004_logmanagercollectionconfig_and_more.py | 223 ++++++++++++++++++ 1 file changed, 223 insertions(+) create mode 100644 log_manager_config/migrations/0004_logmanagercollectionconfig_and_more.py diff --git a/log_manager_config/migrations/0004_logmanagercollectionconfig_and_more.py b/log_manager_config/migrations/0004_logmanagercollectionconfig_and_more.py new file mode 100644 index 0000000..5b6351c --- /dev/null +++ b/log_manager_config/migrations/0004_logmanagercollectionconfig_and_more.py @@ -0,0 +1,223 @@ +# Generated by Django 5.2.12 on 2026-05-01 22:27 + +import django.db.models.deletion +import modelcluster.fields +from django.conf import settings +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("collection", "0001_initial"), + ("log_manager_config", "0003_alter_collectionemail_options_and_more"), + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ] + + operations = [ + migrations.CreateModel( + name="LogManagerCollectionConfig", + fields=[ + ( + "id", + 
models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ( + "created", + models.DateTimeField( + auto_now_add=True, verbose_name="Creation date" + ), + ), + ( + "updated", + models.DateTimeField( + auto_now=True, verbose_name="Last update date" + ), + ), + ( + "sample_size", + models.FloatField(default=0.1, verbose_name="Sample Size"), + ), + ( + "buffer_size", + models.IntegerField(default=2048, verbose_name="Buffer Size"), + ), + ( + "expected_logs_per_day", + models.IntegerField( + default=1, verbose_name="Expected Logs Per Day" + ), + ), + ], + options={ + "verbose_name": "Log Manager Collection Config", + "verbose_name_plural": "Log Manager Collection Configs", + }, + ), + migrations.RemoveField( + model_name="collectionlogfilesperday", + name="collection", + ), + migrations.RemoveField( + model_name="collectionlogfilesperday", + name="creator", + ), + migrations.RemoveField( + model_name="collectionlogfilesperday", + name="updated_by", + ), + migrations.RemoveField( + model_name="collectionurltranslatorclass", + name="collection", + ), + migrations.RemoveField( + model_name="collectionurltranslatorclass", + name="creator", + ), + migrations.RemoveField( + model_name="collectionurltranslatorclass", + name="directory", + ), + migrations.RemoveField( + model_name="collectionurltranslatorclass", + name="updated_by", + ), + migrations.RemoveField( + model_name="collectionvalidationparameters", + name="collection", + ), + migrations.RemoveField( + model_name="collectionvalidationparameters", + name="creator", + ), + migrations.RemoveField( + model_name="collectionvalidationparameters", + name="updated_by", + ), + migrations.RemoveField( + model_name="supportedlogfile", + name="creator", + ), + migrations.RemoveField( + model_name="supportedlogfile", + name="updated_by", + ), + migrations.RemoveConstraint( + model_name="collectionemail", + name="unique_collection_email", + ), + migrations.RemoveConstraint( + 
model_name="collectionlogdirectory", + name="unique_collection_path", + ), + migrations.RemoveField( + model_name="collectionemail", + name="collection", + ), + migrations.RemoveField( + model_name="collectionlogdirectory", + name="collection", + ), + migrations.AddField( + model_name="collectionemail", + name="sort_order", + field=models.IntegerField(blank=True, editable=False, null=True), + ), + migrations.AddField( + model_name="collectionlogdirectory", + name="sort_order", + field=models.IntegerField(blank=True, editable=False, null=True), + ), + migrations.AddField( + model_name="collectionlogdirectory", + name="translator_class", + field=models.CharField( + default="URLTranslatorClassicSite", verbose_name="URL Translator Class" + ), + ), + migrations.AddField( + model_name="logmanagercollectionconfig", + name="collection", + field=models.OneToOneField( + on_delete=django.db.models.deletion.CASCADE, + related_name="log_manager_config", + to="collection.collection", + verbose_name="Collection", + ), + ), + migrations.AddField( + model_name="logmanagercollectionconfig", + name="creator", + field=models.ForeignKey( + editable=False, + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="%(class)s_creator", + to=settings.AUTH_USER_MODEL, + verbose_name="Creator", + ), + ), + migrations.AddField( + model_name="logmanagercollectionconfig", + name="updated_by", + field=models.ForeignKey( + blank=True, + editable=False, + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="%(class)s_last_mod_user", + to=settings.AUTH_USER_MODEL, + verbose_name="Updater", + ), + ), + migrations.AddField( + model_name="collectionemail", + name="config", + field=modelcluster.fields.ParentalKey( + blank=True, + null=True, + on_delete=django.db.models.deletion.CASCADE, + related_name="emails", + to="log_manager_config.logmanagercollectionconfig", + ), + ), + migrations.AddField( + model_name="collectionlogdirectory", + name="config", + 
field=modelcluster.fields.ParentalKey( + blank=True, + null=True, + on_delete=django.db.models.deletion.CASCADE, + related_name="directories", + to="log_manager_config.logmanagercollectionconfig", + ), + ), + migrations.AddConstraint( + model_name="collectionemail", + constraint=models.UniqueConstraint( + fields=("config", "email"), name="unique_config_email" + ), + ), + migrations.AddConstraint( + model_name="collectionlogdirectory", + constraint=models.UniqueConstraint( + fields=("config", "path"), name="unique_config_path" + ), + ), + migrations.DeleteModel( + name="CollectionLogFilesPerDay", + ), + migrations.DeleteModel( + name="CollectionURLTranslatorClass", + ), + migrations.DeleteModel( + name="CollectionValidationParameters", + ), + migrations.DeleteModel( + name="SupportedLogFile", + ), + ] From efdc7e06ef93e96425bb89a4ae4f0cfd9f43d708 Mon Sep 17 00:00:00 2001 From: Rafael JP Damaceno Date: Fri, 1 May 2026 22:40:49 -0300 Subject: [PATCH 27/31] refactor: simplificar modelos tracker, remover ArticleEvent e UnexpectedEvent --- tracker/choices.py | 46 +--- tracker/exceptions.py | 24 -- .../0002_top100articlesfileevent.py | 93 ------- ...rdedline_delete_top100articlesfileevent.py | 98 -------- ...4_alter_logfilediscardedline_error_type.py | 28 --- tracker/migrations/0005_articleevent.py | 86 ------- ...6_alter_logfilediscardedline_error_type.py | 29 --- ...7_alter_logfilediscardedline_error_type.py | 29 --- tracker/models.py | 228 +----------------- tracker/tasks.py | 34 --- 10 files changed, 7 insertions(+), 688 deletions(-) delete mode 100644 tracker/migrations/0002_top100articlesfileevent.py delete mode 100644 tracker/migrations/0003_logfilediscardedline_delete_top100articlesfileevent.py delete mode 100644 tracker/migrations/0004_alter_logfilediscardedline_error_type.py delete mode 100644 tracker/migrations/0005_articleevent.py delete mode 100644 tracker/migrations/0006_alter_logfilediscardedline_error_type.py delete mode 100644 
tracker/migrations/0007_alter_logfilediscardedline_error_type.py delete mode 100644 tracker/tasks.py diff --git a/tracker/choices.py b/tracker/choices.py index e2c80e2..dfc562c 100644 --- a/tracker/choices.py +++ b/tracker/choices.py @@ -1,54 +1,16 @@ from django.utils.translation import gettext_lazy as _ -ERROR = "ERROR" -EXCEPTION = "EXCEPTION" -INFO = "INFO" -WARNING = "WARNING" - -EVENT_MSG_TYPE = [ - (ERROR, _("error")), - (WARNING, _("warning")), - (INFO, _("info")), - (EXCEPTION, _("exception")), -] - - -PROGRESS_STATUS_IGNORED = "IGNORED" -PROGRESS_STATUS_REPROC = "REPROC" -PROGRESS_STATUS_TODO = "TODO" -PROGRESS_STATUS_DOING = "DOING" -PROGRESS_STATUS_DONE = "DONE" -PROGRESS_STATUS_PENDING = "PENDING" - -PROGRESS_STATUS = ( - (PROGRESS_STATUS_REPROC, _("To reprocess")), - (PROGRESS_STATUS_TODO, _("To do")), - (PROGRESS_STATUS_DONE, _("Done")), - (PROGRESS_STATUS_DOING, _("Doing")), - (PROGRESS_STATUS_PENDING, _("Pending")), - (PROGRESS_STATUS_IGNORED, _("ignored")), -) - LOG_FILE_DISCARDED_LINE_REASON_MISSING_METADATA = 'MET' -LOG_FILE_DISCARDED_LINE_REASON_MISSING_ARTICLE = 'ART' -LOG_FILE_DISCARDED_LINE_REASON_MISSING_JOURNAL = 'JOU' +LOG_FILE_DISCARDED_LINE_REASON_MISSING_DOCUMENT = 'DOC' +LOG_FILE_DISCARDED_LINE_REASON_MISSING_SOURCE = 'SRC' LOG_FILE_DISCARDED_LINE_REASON_URL_TRANSLATION = 'URL' LOG_FILE_DISCARDED_LINE_REASON_DATABASE_ERROR = 'DBE' LOG_FILE_DISCARDED_LINE_REASON = [ (LOG_FILE_DISCARDED_LINE_REASON_MISSING_METADATA, _("Missing Metadata")), - (LOG_FILE_DISCARDED_LINE_REASON_MISSING_ARTICLE, _("Missing PIDv2 or PIDv3 or PID Generic")), - (LOG_FILE_DISCARDED_LINE_REASON_MISSING_JOURNAL, _("Missing ISSN")), + (LOG_FILE_DISCARDED_LINE_REASON_MISSING_DOCUMENT, _("Missing Document")), + (LOG_FILE_DISCARDED_LINE_REASON_MISSING_SOURCE, _("Missing Source")), (LOG_FILE_DISCARDED_LINE_REASON_URL_TRANSLATION, _("URL Translation")), (LOG_FILE_DISCARDED_LINE_REASON_DATABASE_ERROR, _("Database Error")), ] - - -ARTICLE_EVENT_TYPE_MULTIPLE_OBJS_RETURNED 
= 'MUL' -ARTICLE_EVENT_TYPE_DATA_ERROR = 'ERR' - -ARTICLE_EVENT_TYPE = [ - (ARTICLE_EVENT_TYPE_MULTIPLE_OBJS_RETURNED, _("Multiple Articles Returned")), - (ARTICLE_EVENT_TYPE_DATA_ERROR, _("Data Error")), -] diff --git a/tracker/exceptions.py b/tracker/exceptions.py index 31ed8c8..9ef3267 100644 --- a/tracker/exceptions.py +++ b/tracker/exceptions.py @@ -1,26 +1,2 @@ -class ProcEventCreateError(Exception): - ... - -class UnexpectedEventCreateError(Exception): - ... - -class EventCreateError(Exception): - ... - -class EventReportCreateError(Exception): - ... - -class EventReportSaveFileError(Exception): - ... - -class EventReportCreateError(Exception): - ... - -class EventReportDeleteEventsError(Exception): - ... - class LogFileDiscardedLineCreateError(Exception): ... - -class ArticleEventError(Exception): - ... diff --git a/tracker/migrations/0002_top100articlesfileevent.py b/tracker/migrations/0002_top100articlesfileevent.py deleted file mode 100644 index 230fb8a..0000000 --- a/tracker/migrations/0002_top100articlesfileevent.py +++ /dev/null @@ -1,93 +0,0 @@ -# Generated by Django 5.0.7 on 2024-08-30 21:52 - -import django.db.models.deletion -from django.conf import settings -from django.db import migrations, models - - -class Migration(migrations.Migration): - dependencies = [ - ("metrics", "0002_alter_top100articlesfile_status"), - ("tracker", "0001_initial"), - migrations.swappable_dependency(settings.AUTH_USER_MODEL), - ] - - operations = [ - migrations.CreateModel( - name="Top100ArticlesFileEvent", - fields=[ - ( - "id", - models.BigAutoField( - auto_created=True, - primary_key=True, - serialize=False, - verbose_name="ID", - ), - ), - ( - "created", - models.DateTimeField( - auto_now_add=True, verbose_name="Creation date" - ), - ), - ( - "updated", - models.DateTimeField( - auto_now=True, verbose_name="Last update date" - ), - ), - ( - "status", - models.CharField( - blank=True, max_length=64, null=True, verbose_name="Status" - ), - ), - ( - "lines", - 
models.IntegerField( - blank=True, default=0, null=True, verbose_name="Lines" - ), - ), - ( - "message", - models.TextField(blank=True, null=True, verbose_name="Message"), - ), - ( - "creator", - models.ForeignKey( - editable=False, - null=True, - on_delete=django.db.models.deletion.SET_NULL, - related_name="%(class)s_creator", - to=settings.AUTH_USER_MODEL, - verbose_name="Creator", - ), - ), - ( - "file", - models.ForeignKey( - blank=True, - null=True, - on_delete=django.db.models.deletion.SET_NULL, - to="metrics.top100articlesfile", - ), - ), - ( - "updated_by", - models.ForeignKey( - blank=True, - editable=False, - null=True, - on_delete=django.db.models.deletion.SET_NULL, - related_name="%(class)s_last_mod_user", - to=settings.AUTH_USER_MODEL, - verbose_name="Updater", - ), - ), - ], - options={ - "verbose_name_plural": "Top 100 Article File Events", - }, - ), - ] diff --git a/tracker/migrations/0003_logfilediscardedline_delete_top100articlesfileevent.py b/tracker/migrations/0003_logfilediscardedline_delete_top100articlesfileevent.py deleted file mode 100644 index 6e37a9f..0000000 --- a/tracker/migrations/0003_logfilediscardedline_delete_top100articlesfileevent.py +++ /dev/null @@ -1,98 +0,0 @@ -# Generated by Django 5.0.7 on 2025-03-07 16:55 - -import django.db.models.deletion -from django.conf import settings -from django.db import migrations, models - - -class Migration(migrations.Migration): - dependencies = [ - ("log_manager", "0002_alter_collectionconfig_unique_together_and_more"), - ("tracker", "0002_top100articlesfileevent"), - migrations.swappable_dependency(settings.AUTH_USER_MODEL), - ] - - operations = [ - migrations.CreateModel( - name="LogFileDiscardedLine", - fields=[ - ( - "id", - models.BigAutoField( - auto_created=True, - primary_key=True, - serialize=False, - verbose_name="ID", - ), - ), - ( - "created", - models.DateTimeField( - auto_now_add=True, verbose_name="Creation date" - ), - ), - ( - "updated", - models.DateTimeField( - 
auto_now=True, verbose_name="Last update date" - ), - ), - ( - "error_type", - models.CharField( - blank=True, - choices=[ - ("MET", "Missing Metadata"), - ("ART", "Missing Article"), - ("JOU", "Missing Journal"), - ], - max_length=3, - null=True, - verbose_name="Error Type", - ), - ), - ("data", models.JSONField(default=dict, verbose_name="Data")), - ( - "message", - models.TextField(blank=True, null=True, verbose_name="Message"), - ), - ("handled", models.BooleanField(default=False, verbose_name="Handled")), - ( - "creator", - models.ForeignKey( - editable=False, - null=True, - on_delete=django.db.models.deletion.SET_NULL, - related_name="%(class)s_creator", - to=settings.AUTH_USER_MODEL, - verbose_name="Creator", - ), - ), - ( - "log_file", - models.ForeignKey( - on_delete=django.db.models.deletion.CASCADE, - to="log_manager.logfile", - ), - ), - ( - "updated_by", - models.ForeignKey( - blank=True, - editable=False, - null=True, - on_delete=django.db.models.deletion.SET_NULL, - related_name="%(class)s_last_mod_user", - to=settings.AUTH_USER_MODEL, - verbose_name="Updater", - ), - ), - ], - options={ - "abstract": False, - }, - ), - migrations.DeleteModel( - name="Top100ArticlesFileEvent", - ), - ] diff --git a/tracker/migrations/0004_alter_logfilediscardedline_error_type.py b/tracker/migrations/0004_alter_logfilediscardedline_error_type.py deleted file mode 100644 index 1061793..0000000 --- a/tracker/migrations/0004_alter_logfilediscardedline_error_type.py +++ /dev/null @@ -1,28 +0,0 @@ -# Generated by Django 5.0.7 on 2025-03-27 20:40 - -from django.db import migrations, models - - -class Migration(migrations.Migration): - dependencies = [ - ("tracker", "0003_logfilediscardedline_delete_top100articlesfileevent"), - ] - - operations = [ - migrations.AlterField( - model_name="logfilediscardedline", - name="error_type", - field=models.CharField( - blank=True, - choices=[ - ("MET", "Missing Metadata"), - ("ART", "Missing Article"), - ("JOU", "Missing Journal"), - 
("URL", "URL Translation"), - ], - max_length=3, - null=True, - verbose_name="Error Type", - ), - ), - ] diff --git a/tracker/migrations/0005_articleevent.py b/tracker/migrations/0005_articleevent.py deleted file mode 100644 index 859910e..0000000 --- a/tracker/migrations/0005_articleevent.py +++ /dev/null @@ -1,86 +0,0 @@ -# Generated by Django 5.0.7 on 2025-05-23 17:27 - -import django.db.models.deletion -from django.conf import settings -from django.db import migrations, models - - -class Migration(migrations.Migration): - dependencies = [ - ("tracker", "0004_alter_logfilediscardedline_error_type"), - migrations.swappable_dependency(settings.AUTH_USER_MODEL), - ] - - operations = [ - migrations.CreateModel( - name="ArticleEvent", - fields=[ - ( - "id", - models.BigAutoField( - auto_created=True, - primary_key=True, - serialize=False, - verbose_name="ID", - ), - ), - ( - "created", - models.DateTimeField( - auto_now_add=True, verbose_name="Creation date" - ), - ), - ( - "updated", - models.DateTimeField( - auto_now=True, verbose_name="Last update date" - ), - ), - ( - "event_type", - models.CharField( - blank=True, - choices=[ - ("MUL", "Multiple Articles Returned"), - ("ERR", "Data Error"), - ], - max_length=3, - null=True, - verbose_name="Event Type", - ), - ), - ( - "message", - models.TextField(blank=True, null=True, verbose_name="Message"), - ), - ("data", models.JSONField(default=dict, verbose_name="Data")), - ("handled", models.BooleanField(default=False, verbose_name="Handled")), - ( - "creator", - models.ForeignKey( - editable=False, - null=True, - on_delete=django.db.models.deletion.SET_NULL, - related_name="%(class)s_creator", - to=settings.AUTH_USER_MODEL, - verbose_name="Creator", - ), - ), - ( - "updated_by", - models.ForeignKey( - blank=True, - editable=False, - null=True, - on_delete=django.db.models.deletion.SET_NULL, - related_name="%(class)s_last_mod_user", - to=settings.AUTH_USER_MODEL, - verbose_name="Updater", - ), - ), - ], - options={ - 
"abstract": False, - }, - ), - ] diff --git a/tracker/migrations/0006_alter_logfilediscardedline_error_type.py b/tracker/migrations/0006_alter_logfilediscardedline_error_type.py deleted file mode 100644 index fb7f74a..0000000 --- a/tracker/migrations/0006_alter_logfilediscardedline_error_type.py +++ /dev/null @@ -1,29 +0,0 @@ -# Generated by Django 5.0.7 on 2025-06-14 10:46 - -from django.db import migrations, models - - -class Migration(migrations.Migration): - dependencies = [ - ("tracker", "0005_articleevent"), - ] - - operations = [ - migrations.AlterField( - model_name="logfilediscardedline", - name="error_type", - field=models.CharField( - blank=True, - choices=[ - ("MET", "Missing Metadata"), - ("ART", "Missing Article"), - ("JOU", "Missing Journal"), - ("URL", "URL Translation"), - ("DBE", "Database Error"), - ], - max_length=3, - null=True, - verbose_name="Error Type", - ), - ), - ] diff --git a/tracker/migrations/0007_alter_logfilediscardedline_error_type.py b/tracker/migrations/0007_alter_logfilediscardedline_error_type.py deleted file mode 100644 index f9ffebe..0000000 --- a/tracker/migrations/0007_alter_logfilediscardedline_error_type.py +++ /dev/null @@ -1,29 +0,0 @@ -# Generated by Django 5.0.7 on 2025-08-09 21:04 - -from django.db import migrations, models - - -class Migration(migrations.Migration): - dependencies = [ - ("tracker", "0006_alter_logfilediscardedline_error_type"), - ] - - operations = [ - migrations.AlterField( - model_name="logfilediscardedline", - name="error_type", - field=models.CharField( - blank=True, - choices=[ - ("MET", "Missing Metadata"), - ("ART", "Missing PIDv2 or PIDv3 or PID Generic"), - ("JOU", "Missing ISSN"), - ("URL", "URL Translation"), - ("DBE", "Database Error"), - ], - max_length=3, - null=True, - verbose_name="Error Type", - ), - ), - ] diff --git a/tracker/models.py b/tracker/models.py index 77086ee..a394ed6 100644 --- a/tracker/models.py +++ b/tracker/models.py @@ -1,65 +1,13 @@ -import json -import logging 
-import traceback -import uuid - -from datetime import datetime - -from django.core.files.base import ContentFile from django.db import models from django.utils.translation import gettext_lazy as _ -from core.models import CommonControlField from log_manager.models import LogFile from tracker import choices - -from .exceptions import * +from .exceptions import LogFileDiscardedLineCreateError -class ArticleEvent(CommonControlField): - event_type = models.CharField( - _("Event Type"), - choices=choices.ARTICLE_EVENT_TYPE, - max_length=3, - null=True, - blank=True, - ) - - message = models.TextField( - _("Message"), - null=True, - blank=True, - ) - - data = models.JSONField( - _("Data"), - default=dict, - ) - - handled = models.BooleanField( - _("Handled"), - default=False - ) - - @classmethod - def create(cls, event_type, message, data): - try: - obj = cls() - obj.event_type = event_type - obj.message = message - obj.data = data - obj.save() - except Exception as exc: - raise ArticleEventError( - f"Unable to create ArticleEvent ({data} - {event_type} - {message}). 
EXCEPTION {exc}" - ) - return obj - - def __str__(self): - return f"{self.event_type} - {self.message}" - - -class LogFileDiscardedLine(CommonControlField): +class LogFileDiscardedLine(models.Model): + created = models.DateTimeField(verbose_name=_("Creation date"), auto_now_add=True) log_file = models.ForeignKey( LogFile, on_delete=models.CASCADE, @@ -108,174 +56,4 @@ def __str__(self): return f"{self.data} - {self.message}" -class UnexpectedEvent(models.Model): - id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) - created = models.DateTimeField(verbose_name=_("Creation date"), auto_now_add=True) - exception_type = models.TextField(_("Exception Type"), null=True, blank=True) - exception_msg = models.TextField(_("Exception Msg"), null=True, blank=True) - traceback = models.JSONField(null=True, blank=True) - detail = models.JSONField(null=True, blank=True) - - class Meta: - indexes = [ - models.Index(fields=["exception_type"]), - ] - - def __str__(self): - return f"{self.exception_msg}" - - @property - def data(self): - return dict( - created=self.created.isoformat(), - exception_type=self.exception_type, - exception_msg=self.exception_msg, - traceback=json.dumps(self.traceback), - detail=json.dumps(self.detail), - ) - - @classmethod - def create( - cls, - exception=None, - exc_traceback=None, - detail=None, - ): - try: - if exception: - logging.exception(exception) - - obj = cls() - obj.exception_msg = str(exception) - obj.exception_type = str(type(exception)) - try: - json.dumps(detail) - obj.detail = detail - except Exception as e: - obj.detail = str(detail) - if exc_traceback: - obj.traceback = traceback.format_tb(exc_traceback) - obj.save() - return obj - except Exception as exc: - raise UnexpectedEventCreateError( - f"Unable to create unexpected event ({exception} {exc_traceback}). 
EXCEPTION {exc}" - ) - - -class Event(CommonControlField): - id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) - message = models.TextField(_("Message"), null=True, blank=True) - message_type = models.CharField( - _("Message type"), - choices=choices.EVENT_MSG_TYPE, - max_length=16, - null=True, - blank=True, - ) - detail = models.JSONField(null=True, blank=True) - unexpected_event = models.ForeignKey( - 'UnexpectedEvent', on_delete=models.SET_NULL, null=True, blank=True - ) - - class Meta: - abstract = True - indexes = [ - models.Index(fields=["message_type"]), - ] - - @property - def data(self): - d = {} - d["created"] = self.created.isoformat() - d["user"] = self.user.username - d.update( - dict( - message=self.message, message_type=self.message_type, detail=self.detail - ) - ) - if self.unexpected_event: - d.update(self.unexpected_event.data) - return d - - @classmethod - def create( - cls, - user=None, - message_type=None, - message=None, - e=None, - exc_traceback=None, - detail=None, - ): - try: - obj = cls() - obj.creator = user - obj.message = message - obj.message_type = message_type - obj.detail = detail - obj.save() - - if e: - logging.exception(f"{message}: {e}") - obj.unexpected_event = UnexpectedEvent.create( - exception=e, - exc_traceback=exc_traceback, - ) - obj.save() - except Exception as exc: - raise EventCreateError( - f"Unable to create Event ({message} {e}). 
EXCEPTION: {exc}" - ) - return obj - - -def tracker_file_directory_path(instance, filename): - d = datetime.now(datetime.timezone.utc) - return f"tracker/{d.year}/{d.month}/{d.day}/{filename}" - - -class EventReport(CommonControlField): - file = models.FileField( - upload_to=tracker_file_directory_path, null=True, blank=True - ) - - class Meta: - abstract = True - - def save_file(self, events, ext=None): - if not events: - return - try: - ext = ".json" - content = json.dumps(list([item.data for item in events])) - name = datetime.now(datetime.timezone.utc).isoformat() + ext - self.file.save(name, ContentFile(content)) - self.delete_events(events) - except Exception as e: - raise EventReportSaveFileError( - f"Unable to save EventReport.file ({name}). Exception: {e}" - ) - - def delete_events(self, events): - for item in events: - try: - item.unexpected_event.delete() - except: - pass - try: - item.delete() - except: - pass - - @classmethod - def create(cls, user): - try: - obj = cls() - obj.creator = user - obj.save() - except Exception as e: - raise EventReportCreateError( - f"Unable to create EventReport. Exception: {e}" - ) diff --git a/tracker/tasks.py b/tracker/tasks.py deleted file mode 100644 index ace8145..0000000 --- a/tracker/tasks.py +++ /dev/null @@ -1,34 +0,0 @@ -# tasks.py -from datetime import datetime - -from django.contrib.auth import get_user_model - -from config import celery_app -from core.utils.utils import _get_user - -from .models import UnexpectedEvent - - -User = get_user_model() - - -@celery_app.task(bind=True, name="Cleanup unexpected events") -def delete_unexpected_events(self, exception_type, start_date=None, end_date=None, user_id=None, username=None): - """ - Delete UnexpectedEvent records based on exception type and optional date range. 
- """ - user = _get_user(self.request, username=username, user_id=user_id) - - if exception_type == '__all__': - UnexpectedEvent.objects.all().delete() - return - - filters = {'exception_type__icontains': exception_type} - if start_date: - start_date = datetime.fromisoformat(start_date) - filters['created__gte'] = start_date - if end_date: - end_date = datetime.fromisoformat(end_date) - filters['created__lte'] = end_date - - UnexpectedEvent.objects.filter(**filters).delete() From 0873ba7edd5096d05c2173517002e09028fb8565 Mon Sep 17 00:00:00 2001 From: Rafael JP Damaceno Date: Fri, 1 May 2026 22:40:49 -0300 Subject: [PATCH 28/31] =?UTF-8?q?refactor:=20adicionar=20migra=C3=A7=C3=A3?= =?UTF-8?q?o=20tracker=200002=20e=20atualizar=200001=5Finitial?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tracker/migrations/0001_initial.py | 162 ++++++++++++++++-- ...02_remove_articleevent_creator_and_more.py | 38 ++++ 2 files changed, 188 insertions(+), 12 deletions(-) create mode 100644 tracker/migrations/0002_remove_articleevent_creator_and_more.py diff --git a/tracker/migrations/0001_initial.py b/tracker/migrations/0001_initial.py index f207722..04fdc35 100644 --- a/tracker/migrations/0001_initial.py +++ b/tracker/migrations/0001_initial.py @@ -1,13 +1,18 @@ -# Generated by Django 5.0.7 on 2024-08-30 00:52 +# Generated by Codex on 2026-04-27 +import django.db.models.deletion import uuid +from django.conf import settings from django.db import migrations, models class Migration(migrations.Migration): initial = True - dependencies = [] + dependencies = [ + ("log_manager", "0001_initial"), + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ] operations = [ migrations.CreateModel( @@ -24,21 +29,15 @@ class Migration(migrations.Migration): ), ( "created", - models.DateTimeField( - auto_now_add=True, verbose_name="Creation date" - ), + models.DateTimeField(auto_now_add=True, verbose_name="Creation date"), ), ( 
"exception_type", - models.TextField( - blank=True, null=True, verbose_name="Exception Type" - ), + models.TextField(blank=True, null=True, verbose_name="Exception Type"), ), ( "exception_msg", - models.TextField( - blank=True, null=True, verbose_name="Exception Msg" - ), + models.TextField(blank=True, null=True, verbose_name="Exception Msg"), ), ("traceback", models.JSONField(blank=True, null=True)), ("detail", models.JSONField(blank=True, null=True)), @@ -46,9 +45,148 @@ class Migration(migrations.Migration): options={ "indexes": [ models.Index( - fields=["exception_type"], name="tracker_une_excepti_47ede4_idx" + fields=["exception_type"], + name="tracker_une_excepti_47ede4_idx", ) ], }, ), + migrations.CreateModel( + name="ArticleEvent", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ( + "created", + models.DateTimeField(auto_now_add=True, verbose_name="Creation date"), + ), + ( + "updated", + models.DateTimeField(auto_now=True, verbose_name="Last update date"), + ), + ( + "event_type", + models.CharField( + blank=True, + choices=[ + ("MUL", "Multiple Articles Returned"), + ("ERR", "Data Error"), + ], + max_length=3, + null=True, + verbose_name="Event Type", + ), + ), + ( + "message", + models.TextField(blank=True, null=True, verbose_name="Message"), + ), + ("data", models.JSONField(default=dict, verbose_name="Data")), + ("handled", models.BooleanField(default=False, verbose_name="Handled")), + ( + "creator", + models.ForeignKey( + editable=False, + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="%(class)s_creator", + to=settings.AUTH_USER_MODEL, + verbose_name="Creator", + ), + ), + ( + "updated_by", + models.ForeignKey( + blank=True, + editable=False, + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="%(class)s_last_mod_user", + to=settings.AUTH_USER_MODEL, + verbose_name="Updater", + ), + ), + ], + ), + 
migrations.CreateModel( + name="LogFileDiscardedLine", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ( + "created", + models.DateTimeField(auto_now_add=True, verbose_name="Creation date"), + ), + ( + "updated", + models.DateTimeField(auto_now=True, verbose_name="Last update date"), + ), + ( + "error_type", + models.CharField( + blank=True, + choices=[ + ("MET", "Missing Metadata"), + ("DOC", "Missing Document"), + ("SRC", "Missing Source"), + ("URL", "URL Translation"), + ("DBE", "Database Error"), + ], + max_length=3, + null=True, + verbose_name="Error Type", + ), + ), + ("data", models.JSONField(default=dict, verbose_name="Data")), + ( + "message", + models.TextField(blank=True, null=True, verbose_name="Message"), + ), + ("handled", models.BooleanField(default=False, verbose_name="Handled")), + ( + "creator", + models.ForeignKey( + editable=False, + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="%(class)s_creator", + to=settings.AUTH_USER_MODEL, + verbose_name="Creator", + ), + ), + ( + "log_file", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + to="log_manager.logfile", + ), + ), + ( + "updated_by", + models.ForeignKey( + blank=True, + editable=False, + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="%(class)s_last_mod_user", + to=settings.AUTH_USER_MODEL, + verbose_name="Updater", + ), + ), + ], + ), ] diff --git a/tracker/migrations/0002_remove_articleevent_creator_and_more.py b/tracker/migrations/0002_remove_articleevent_creator_and_more.py new file mode 100644 index 0000000..ee23c85 --- /dev/null +++ b/tracker/migrations/0002_remove_articleevent_creator_and_more.py @@ -0,0 +1,38 @@ +# Generated by Django 5.2.12 on 2026-05-01 22:23 + +from django.db import migrations + + +class Migration(migrations.Migration): + dependencies = [ + ("tracker", "0001_initial"), + ] + + operations = [ + 
migrations.RemoveField( + model_name="articleevent", + name="creator", + ), + migrations.RemoveField( + model_name="articleevent", + name="updated_by", + ), + migrations.DeleteModel( + name="UnexpectedEvent", + ), + migrations.RemoveField( + model_name="logfilediscardedline", + name="creator", + ), + migrations.RemoveField( + model_name="logfilediscardedline", + name="updated", + ), + migrations.RemoveField( + model_name="logfilediscardedline", + name="updated_by", + ), + migrations.DeleteModel( + name="ArticleEvent", + ), + ] From 38944b7ae2b8a1a6b0373cdc6424131ef6e8706e Mon Sep 17 00:00:00 2001 From: Rafael JP Damaceno Date: Fri, 1 May 2026 22:40:59 -0300 Subject: [PATCH 29/31] refactor: simplificar modelos resources (remover CommonControlField) --- resources/constants.py | 2 +- resources/models.py | 83 +++++++++++++++++++++++++++++++++++--- resources/tasks.py | 56 ++++++++++++++----------- resources/tests.py | 3 -- resources/wagtail_hooks.py | 12 ++++++ 5 files changed, 122 insertions(+), 34 deletions(-) delete mode 100644 resources/tests.py diff --git a/resources/constants.py b/resources/constants.py index feba18d..2ce64da 100644 --- a/resources/constants.py +++ b/resources/constants.py @@ -1,2 +1,2 @@ DEFAULT_COUNTER_ROBOTS_URL = 'https://raw.githubusercontent.com/atmire/COUNTER-Robots/master/COUNTER_Robots_list.json' -DEFAULT_MMDB_URL = 'https://download.db-ip.com/free/dbip-country-lite-2025-02.mmdb.gz' +DEFAULT_MMDB_URL = 'https://download.db-ip.com/free/dbip-country-lite-2026-03.mmdb.gz' diff --git a/resources/models.py b/resources/models.py index a30b8d3..22663e2 100644 --- a/resources/models.py +++ b/resources/models.py @@ -2,11 +2,26 @@ from django.db import models from django.utils.translation import gettext_lazy as _ +from wagtail.admin.panels import FieldPanel -from core.models import CommonControlField +class RobotUserAgent(models.Model): + SOURCE_ALL = "all" + SOURCE_COUNTER = "counter" + SOURCE_SCIELO = "scielo" + SOURCE_CHOICES = [SOURCE_ALL, 
SOURCE_COUNTER, SOURCE_SCIELO] + panels = [ + FieldPanel("pattern"), + FieldPanel("source_counter"), + FieldPanel("source_scielo"), + FieldPanel("is_active"), + FieldPanel("source_url"), + FieldPanel("last_changed"), + ] + + created = models.DateTimeField(verbose_name=_("Creation date"), auto_now_add=True) + updated = models.DateTimeField(verbose_name=_("Last update date"), auto_now=True) -class RobotUserAgent(CommonControlField): pattern = models.CharField( verbose_name=_('Pattern'), max_length=255, @@ -14,21 +29,77 @@ class RobotUserAgent(CommonControlField): blank=False, primary_key=True, ) + source_counter = models.BooleanField( + verbose_name=_("From Atmire/COUNTER"), + default=False, + db_index=True, + ) + source_scielo = models.BooleanField( + verbose_name=_("From SciELO"), + default=False, + db_index=True, + ) + is_active = models.BooleanField( + verbose_name=_("Active"), + default=True, + db_index=True, + ) + source_url = models.URLField( + verbose_name=_("Source URL"), + max_length=255, + null=True, + blank=True, + ) last_changed = models.DateField( verbose_name=_('Last Changed'), - null=False, - blank=False, + null=True, + blank=True, ) @classmethod def get_all_patterns(cls): - return cls.objects.values_list('pattern', flat=True) + return cls.get_patterns(source=cls.SOURCE_ALL) + + @classmethod + def normalize_source(cls, source=None): + normalized = (source or cls.SOURCE_ALL).lower() + if normalized not in cls.SOURCE_CHOICES: + raise ValueError(f"Unsupported robots source: {source}") + return normalized + + @classmethod + def get_patterns(cls, source=None): + source = cls.normalize_source(source) + queryset = cls.objects.filter(is_active=True) + + if source == cls.SOURCE_COUNTER: + queryset = queryset.filter(source_counter=True) + elif source == cls.SOURCE_SCIELO: + queryset = queryset.filter(source_scielo=True) + + return queryset.values_list("pattern", flat=True) + + @property + def source_labels(self): + labels = [] + if self.source_counter: + 
labels.append("Atmire/COUNTER") + if self.source_scielo: + labels.append("SciELO") + return ", ".join(labels) or "-" + + def save(self, *args, **kwargs): + if not self.source_counter and not self.source_scielo: + self.source_scielo = True + super().save(*args, **kwargs) def __str__(self): return self.pattern -class MMDB(CommonControlField): +class MMDB(models.Model): + created = models.DateTimeField(verbose_name=_("Creation date"), auto_now_add=True) + updated = models.DateTimeField(verbose_name=_("Last update date"), auto_now=True) id = models.CharField( verbose_name=_('ID (HASH)'), max_length=64, diff --git a/resources/tasks.py b/resources/tasks.py index e67cea1..4df60a9 100644 --- a/resources/tasks.py +++ b/resources/tasks.py @@ -1,19 +1,13 @@ import logging -from django.contrib.auth import get_user_model -from django.utils import timezone from django.utils.translation import gettext as _ from config import celery_app -from core.utils.utils import _get_user from . import constants, models, utils - -User = get_user_model() - -@celery_app.task(bind=True, name=_('Load robots data')) -def task_load_robots(self, url_robots=None, user_id=None, username=None): +@celery_app.task(bind=True, name=_('[Resources] Load Robots Data')) +def task_load_robots(self, url_robots=None): """ Load robots from a given URL and save them to the database. This function fetches robot data from a specified URL (or a default URL if none is provided), @@ -32,8 +26,6 @@ def task_load_robots(self, url_robots=None, user_id=None, username=None): - Error if there is an issue downloading or saving the robots. - Debug information for each robot saved. """ - user = _get_user(self.request, username=username, user_id=user_id) - if not url_robots: url_robots = constants.DEFAULT_COUNTER_ROBOTS_URL logging.warning(f'No robots URL provided. 
Using default: {url_robots}') @@ -45,43 +37,63 @@ def task_load_robots(self, url_robots=None, user_id=None, username=None): return False cleaned_robots_data = utils.clean_robots_list(robots_data) + fetched_patterns = set() try: for r_str in cleaned_robots_data: pattern = r_str.get('pattern') last_changed = r_str.get('last_changed') + fetched_patterns.add(pattern) - r_obj, created = models.RobotUserAgent.objects.get_or_create(pattern=pattern, last_changed=last_changed) + r_obj = models.RobotUserAgent.objects.filter(pattern=pattern).first() + created = r_obj is None if created: - r_obj.creator = user - - r_obj.updated = timezone.now() - r_obj.updated_by = user + r_obj = models.RobotUserAgent( + pattern=pattern, + source_counter=True, + source_scielo=False, + ) + r_obj.source_counter = True + r_obj.is_active = True + r_obj.source_url = url_robots + r_obj.last_changed = last_changed r_obj.save() logging.debug(f'Robot saved: {r_obj}') + + stale_counter_patterns = models.RobotUserAgent.objects.filter( + source_counter=True + ).exclude(pattern__in=fetched_patterns) + + for r_obj in stale_counter_patterns: + r_obj.source_counter = False + r_obj.source_url = None + r_obj.last_changed = None + if not r_obj.source_scielo: + r_obj.is_active = False + r_obj.save() + logging.debug(f'Robot deactivated or detached from COUNTER source: {r_obj}') + return True except Exception as e: logging.error(f'Error saving robots: {e}') + return False -@celery_app.task(bind=True, name=_('Load geolocation and country data')) -def task_load_geoip(self, url_geoip=None, user_id=None, username=None, validate=True): +@celery_app.task(bind=True, name=_('[Resources] Load Geolocation Data')) +def task_load_geoip(self, url_geoip=None, validate=True): """ Load GeoIP data from a specified URL, validate it, and save it to the database. Args: url_geoip (str, optional): The URL to download the GeoIP data from. Defaults to None. - user_id (int, optional): The ID of the user performing the task. 
Defaults to None. - username (str, optional): The username of the user performing the task. Defaults to None. validate (bool, optional): Whether to validate the GeoIP data. Defaults to True. Returns: bool: True if the GeoIP data was successfully loaded and saved, False otherwise. Raises: Exception: If there is an error downloading, decompressing, or validating the GeoIP data. """ - user = _get_user(self.request, username=username, user_id=user_id) if not url_geoip: url_geoip = constants.DEFAULT_MMDB_URL @@ -115,10 +127,6 @@ def task_load_geoip(self, url_geoip=None, user_id=None, username=None, validate= except models.MMDB.DoesNotExist: mmdb_obj = models.MMDB.objects.create(id=mmdb_hash, data=mmdb_data) mmdb_obj.url = url_geoip or constants.DEFAULT_MMDB_URL - mmdb_obj.creator = user - - mmdb_obj.updated = timezone.now() - mmdb_obj.updated_by = user mmdb_obj.save() logging.debug(f'GeoIP data has been saved: {mmdb_obj}') diff --git a/resources/tests.py b/resources/tests.py deleted file mode 100644 index 7ce503c..0000000 --- a/resources/tests.py +++ /dev/null @@ -1,3 +0,0 @@ -from django.test import TestCase - -# Create your tests here. 
diff --git a/resources/wagtail_hooks.py b/resources/wagtail_hooks.py index 758bb53..c347b22 100644 --- a/resources/wagtail_hooks.py +++ b/resources/wagtail_hooks.py @@ -15,13 +15,25 @@ class RobotUserAgentSnippetViewSet(SnippetViewSet): list_display = ( "pattern", + "source_labels", + "is_active", "last_changed", ) search_fields = ( "pattern", + "source_url", + ) + list_filter = ( + "source_counter", + "source_scielo", + "is_active", ) list_export = ( "pattern", + "source_counter", + "source_scielo", + "is_active", + "source_url", "last_changed", ) export_filename = "robots" From 970c8310d38f0b6c9bb213f53a0b693d7afae6cb Mon Sep 17 00:00:00 2001 From: Rafael JP Damaceno Date: Fri, 1 May 2026 22:40:59 -0300 Subject: [PATCH 30/31] =?UTF-8?q?refactor:=20adicionar=20migra=C3=A7=C3=A3?= =?UTF-8?q?o=20resources=200002?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...creator_remove_mmdb_updated_by_and_more.py | 61 +++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 resources/migrations/0002_remove_mmdb_creator_remove_mmdb_updated_by_and_more.py diff --git a/resources/migrations/0002_remove_mmdb_creator_remove_mmdb_updated_by_and_more.py b/resources/migrations/0002_remove_mmdb_creator_remove_mmdb_updated_by_and_more.py new file mode 100644 index 0000000..80bb0cc --- /dev/null +++ b/resources/migrations/0002_remove_mmdb_creator_remove_mmdb_updated_by_and_more.py @@ -0,0 +1,61 @@ +# Generated by Django 5.2.12 on 2026-05-01 22:23 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("resources", "0001_initial"), + ] + + operations = [ + migrations.RemoveField( + model_name="mmdb", + name="creator", + ), + migrations.RemoveField( + model_name="mmdb", + name="updated_by", + ), + migrations.RemoveField( + model_name="robotuseragent", + name="creator", + ), + migrations.RemoveField( + model_name="robotuseragent", + name="updated_by", + ), + 
migrations.AddField( + model_name="robotuseragent", + name="is_active", + field=models.BooleanField( + db_index=True, default=True, verbose_name="Active" + ), + ), + migrations.AddField( + model_name="robotuseragent", + name="source_counter", + field=models.BooleanField( + db_index=True, default=False, verbose_name="From Atmire/COUNTER" + ), + ), + migrations.AddField( + model_name="robotuseragent", + name="source_scielo", + field=models.BooleanField( + db_index=True, default=False, verbose_name="From SciELO" + ), + ), + migrations.AddField( + model_name="robotuseragent", + name="source_url", + field=models.URLField( + blank=True, max_length=255, null=True, verbose_name="Source URL" + ), + ), + migrations.AlterField( + model_name="robotuseragent", + name="last_changed", + field=models.DateField(blank=True, null=True, verbose_name="Last Changed"), + ), + ] From e563354ad3ac41affaf14890f0273c79e5d37ae3 Mon Sep 17 00:00:00 2001 From: Rafael JP Damaceno Date: Fri, 1 May 2026 22:41:11 -0300 Subject: [PATCH 31/31] =?UTF-8?q?refactor:=20novo=20modelo=20DailyMetricJo?= =?UTF-8?q?b,=20remover=20ES=20e=20migra=C3=A7=C3=B5es=20antigas?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- metrics/es.py | 385 ------------------ metrics/fixtures/top100articles.csv | 97 ----- metrics/fixtures/top100articles.tar.gz | Bin 1675 -> 0 bytes metrics/migrations/0001_initial.py | 171 +++----- .../0002_alter_top100articlesfile_status.py | 27 -- ..._top100articlesfile_attachment_and_more.py | 187 --------- ...0004_delete_top100articlesfile_and_more.py | 49 --- ...ter_itemaccess_unique_together_and_more.py | 49 --- .../0006_alter_itemaccess_content_type.py | 17 - ...007_alter_usersession_datetime_and_more.py | 23 -- .../migrations/0008_remove_a_few_models.py | 48 --- metrics/models.py | 108 +++++ .../indexes/metrics/top100articles_text.txt | 10 - 13 files changed, 172 insertions(+), 999 deletions(-) delete mode 100644 metrics/es.py delete mode 
100755 metrics/fixtures/top100articles.csv delete mode 100644 metrics/fixtures/top100articles.tar.gz delete mode 100644 metrics/migrations/0002_alter_top100articlesfile_status.py delete mode 100644 metrics/migrations/0003_remove_top100articlesfile_attachment_and_more.py delete mode 100644 metrics/migrations/0004_delete_top100articlesfile_and_more.py delete mode 100644 metrics/migrations/0005_alter_itemaccess_unique_together_and_more.py delete mode 100644 metrics/migrations/0006_alter_itemaccess_content_type.py delete mode 100644 metrics/migrations/0007_alter_usersession_datetime_and_more.py delete mode 100644 metrics/migrations/0008_remove_a_few_models.py delete mode 100644 metrics/templates/search/indexes/metrics/top100articles_text.txt diff --git a/metrics/es.py b/metrics/es.py deleted file mode 100644 index 25ad701..0000000 --- a/metrics/es.py +++ /dev/null @@ -1,385 +0,0 @@ -import logging - -from elasticsearch import Elasticsearch, helpers, NotFoundError -from django.conf import settings - -from .utils import index_utils - - -DEFAULT_ES_INDEX_USAGE_MAPPINGS = { - "properties": { - "collection": { - "type": "keyword" - }, - "journal": { - "properties": { - "scielo_issn": { - "type": "keyword" - }, - "main_title": { - "type": "keyword" - }, - "subject_area_capes": { - "type": "keyword" - }, - "subject_area_wos": { - "type": "keyword" - }, - "acronym": { - "type": "keyword" - }, - "publisher": { - "type": "keyword" - } - } - }, - "pid": { - "type": "keyword" - }, - "pid_v2": { - "type": "keyword" - }, - "pid_v3": { - "type": "keyword" - }, - "pid_generic": { - "type": "keyword" - }, - "year_of_publication": { - "type": "integer" - }, - "media_language": { - "type": "keyword" - }, - "country_code": { - "type": "keyword" - }, - "date": { - "type": "date", - "format": "yyyy-MM-dd" - }, - "total_requests": { - "type": "integer" - }, - "total_investigations": { - "type": "integer" - }, - "unique_requests": { - "type": "integer" - }, - "unique_investigations": { - 
"type": "integer" - } - } -} - - -class ElasticSearchUsageWrapper: - """ - Wrapper for Elasticsearch usage metrics operations. - This class provides methods to interact with Elasticsearch for indexing, - deleting, and managing usage metrics data. - """ - - def __init__(self, url=None, basic_auth=None, api_key=None, verify_certs=False): - self.client = self.get_elasticsearch_client(url, basic_auth, api_key, verify_certs) - - - def get_elasticsearch_client(self, url=None, basic_auth=None, api_key=None, verify_certs=False): - """ - Create an Elasticsearch client instance using Django settings. - - :param url: Elasticsearch URL. If None, it will be taken from Django settings. - :param basic_auth: Basic authentication credentials. If None, it will be taken from Django settings. - :param api_key: API key. If None, it will be taken from Django settings. - :param verify_certs: Whether to verify SSL certificates. If None, it will be taken from Django settings. - """ - if not url: - url = getattr(settings, "ES_URL", None) - - if not basic_auth: - basic_auth = getattr(settings, "ES_BASIC_AUTH", None) - - if not api_key: - api_key = getattr(settings, "ES_API_KEY", None) - - if not verify_certs: - verify_certs = getattr(settings, "ES_VERIFY_CERTS", False) - - if basic_auth: - client = Elasticsearch(url, basic_auth=basic_auth, verify_certs=verify_certs) - elif api_key: - client = Elasticsearch(url, api_key=api_key, verify_certs=verify_certs) - else: - client = Elasticsearch(url, verify_certs=verify_certs) - - return client - - - def ping(self): - """ - Check if the Elasticsearch client is available. - Returns True if the client is available, False otherwise. - """ - try: - return self.client.ping() - except Exception as e: - logging.error(f"Error pinging Elasticsearch client: {e}") - return False - - - def create_index(self, index_name, mappings=None, ping_client=False): - """ - Create an Elasticsearch index. - - :param index_name: Name of the index to create. 
- :param mappings: Mappings for the index. If None, default mappings will be used. - :param ping_client: If True, checks if the Elasticsearch client is available before creating the index. - """ - if ping_client and not self.ping(): - return - - if not mappings: - mappings = DEFAULT_ES_INDEX_USAGE_MAPPINGS - - resp = self.client.indices.create( - index=index_name, - mappings=mappings, - ) - logging.info(f"Index {index_name} created: {resp}") - - - def create_index_if_not_exists(self, index_name, mappings=None, ping_client=False): - """ - Create an Elasticsearch index if it does not already exist. - - :param index_name: Name of the index to create. - :param mappings: Mappings for the index. If None, default mappings will be used. - :param ping_client: If True, checks if the Elasticsearch client is available before creating the index. - """ - if ping_client and not self.ping(): - return - - if not self.client.indices.exists(index=index_name): - self.create_index(index_name, mappings, ping_client) - else: - logging.info(f"Index {index_name} already exists. Skipping creation.") - - - def delete_index(self, index_name, ping_client=False): - """ - Delete an Elasticsearch index. - - :param index_name: Name of the index to delete. - :param ping_client: If True, checks if the Elasticsearch client is available before deleting the index. - """ - if ping_client and not self.ping(): - return - - self.client.indices.delete(index=index_name) - - - def index_document(self, index_name, doc_id, document, ping_client=False): - """ - Index a document in Elasticsearch. - - :param index_name: Name of the index. - :param doc_id: ID of the document. - :param document: Document to index. - :param ping_client: If True, checks if the Elasticsearch client is available before indexing the document. 
- """ - if ping_client and not self.ping(): - return - - self.client.index(index=index_name, id=doc_id, document=document) - - - def index_documents(self, index_name, documents, ping_client=False): - """ - Index multiple documents in Elasticsearch. - - :param index_name: Name of the index. - :param documents: Dictionary of documents to index, where keys are document IDs and values are the documents. - :param ping_client: If True, checks if the Elasticsearch client is available before indexing the documents. - """ - if ping_client and not self.ping(): - return - - helpers.bulk( - self.client, - ( - { - "_index": index_name, - "_id": doc_id, - "_source": document, - } - for doc_id, document in documents.items() - ), - ) - - - def delete_document(self, index_name, doc_id, ping_client=False): - """ - Delete a document from Elasticsearch. - - :param index_name: Name of the index. - :param doc_id: ID of the document to delete. - :param ping_client: If True, checks if the Elasticsearch client is available before deleting the document. - """ - if ping_client and not self.ping(): - return - - try: - self.client.delete(index=index_name, id=doc_id) - except NotFoundError as e: - logging.error(f"Failed to delete document {doc_id} from Elasticsearch: {e}") - - - def delete_documents(self, index_name, doc_ids, ping_client=False): - """ - Delete multiple documents from Elasticsearch using bulk. - :param index_name: Name of the index. - :param doc_ids: List of document IDs to delete. - :param ping_client: If True, checks if the Elasticsearch client is available before deleting the documents. 
- """ - if ping_client and not self.ping(): - return - - actions = ( - { - "_op_type": "delete", - "_index": index_name, - "_id": doc_id, - } - for doc_id in doc_ids - ) - - try: - helpers.bulk(self.client, actions) - except helpers.BulkIndexError as e: - logging.error(f"BulkIndexError occurred: {e.errors}") - - - def delete_documents_by_key(self, index_name, data, ping_client=False): - """ - Delete multiple documents from Elasticsearch based on specific key-value pairs. - - :param index_name: Name of the index. - :param data: Dictionary where keys are field names and values are single values or lists of values. - :param ping_client: If True, checks if the Elasticsearch client is available before deleting the documents. - """ - if ping_client and not self.ping(): - return - - query = { - "query": { - "bool": { - "must": [ - { - "terms": { - key: values if isinstance(values, list) else [values] - } - } - for key, values in data.items() - ] - } - } - } - - try: - self.client.delete_by_query(index=index_name, body=query) - return True - except Exception as e: - logging.error(f"Failed to delete documents: {e}") - - return False - - - def fetch_and_update_documents_locally(self, index_name, documents, batch_size=5000, ping_client=False): - """ - Fetch existing documents from Elasticsearch and update local documents with accumulated metrics. - This function retrieves documents from Elasticsearch in batches and merges their metric fields - with the provided local documents. The merge operation adds values for specific metric fields - or sets them if they don't exist in the local documents. - - Args: - index_name (str): Name of the Elasticsearch index to fetch documents from. - documents (dict): Dictionary of documents to be updated, where keys are document IDs and values - are dictionaries containing metric data. - batch_size (int, optional): Number of documents to fetch in each batch from Elasticsearch. - Defaults to 5000. 
- ping_client (bool, optional): If True, checks if the Elasticsearch client is available before - fetching documents. Defaults to False. - - Returns: - None: The function modifies the input documents dictionary in-place. - """ - if ping_client and not self.ping(): - return - - existing_docs = {} - ids = list(documents.keys()) - - for i in range(0, len(ids), batch_size): - batch_ids = ids[i:i+batch_size] - resp = self.client.mget(index=index_name, ids=batch_ids) - for doc in resp.get('docs', []): - if doc.get('found'): - existing_docs[doc['_id']] = doc['_source'] - logging.info(f'Found {len(existing_docs)} existing documents in Elasticsearch for update.') - - for doc_id, existing in existing_docs.items(): - current = documents[doc_id] - for field in [ - "total_requests", - "unique_requests", - "total_investigations", - "unique_investigations", - ]: - if field in existing and field in current: - current[field] += existing[field] - elif field in existing: - current[field] = existing[field] - - - def export_to_index(self, index_name, data, batch_size=5000, ping_client=False): - """ - Export data to Elasticsearch index in bulk operations. - This function converts input data to index documents, processes them locally, - and then indexes them to Elasticsearch in batches to optimize performance. - - Args: - index_name (str): Name of the Elasticsearch index to export data to. - data: The data to be exported to the Elasticsearch index - batch_size (int, optional): Number of documents to process in each bulk operation. - Defaults to 5000. 
- ping_client (bool, optional): If True, checks if the Elasticsearch client is available - - Returns: - None: Function performs side effects by indexing data to Elasticsearch - """ - if ping_client and not self.ping(): - return - - bulk_data = [] - documents = index_utils.convert_to_index_documents(data) - self.fetch_and_update_documents_locally(index_name=index_name, documents=documents) - - for key, metric_data in documents.items(): - metric_data['pid'] = metric_data.get('pid_v3') or metric_data.get('pid_v2') or metric_data.get('pid_generic', '') - bulk_data.append({ - "_id": key, - "_source": metric_data, - }) - - if len(bulk_data) >= batch_size: - self.index_documents( - index_name=index_name, - documents={doc["_id"]: doc["_source"] for doc in bulk_data}, - ) - bulk_data = [] - - self.index_documents( - index_name=index_name, - documents={doc["_id"]: doc["_source"] for doc in bulk_data}, - ) diff --git a/metrics/fixtures/top100articles.csv b/metrics/fixtures/top100articles.csv deleted file mode 100755 index 9d979f3..0000000 --- a/metrics/fixtures/top100articles.csv +++ /dev/null @@ -1,97 +0,0 @@ -print_issn online_issn pid_issn collection pid yop year_month_day total_item_requests total_item_investigations unique_item_requests unique_item_investigations -0002-7014 1851-8044 0002-7014 arg S0002-70142005000300005 2005 2024-05-26 13 16 13 16 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000100020 2009 2024-05-26 9 10 8 9 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000200012 2009 2024-05-26 8 9 8 9 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000200018 2009 2024-05-26 8 8 8 8 -0002-7014 1851-8044 0002-7014 arg S0002-70142005000300004 2005 2024-05-26 8 11 8 11 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000200011 2009 2024-05-26 8 9 8 9 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000200001 2009 2024-05-26 7 7 7 7 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000200010 2009 2024-05-26 7 9 7 9 -0002-7014 1851-8044 0002-7014 arg 
S0002-70142005000300007 2005 2024-05-26 7 10 7 10 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000200003 2009 2024-05-26 7 9 7 9 -0002-7014 1851-8044 0002-7014 arg S0002-70142008000400010 2008 2024-05-26 7 7 7 7 -0002-7014 1851-8044 0002-7014 arg S0002-70142005000300008 2005 2024-05-26 7 9 7 9 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000400008 2009 2024-05-26 7 7 7 7 -0002-7014 1851-8044 0002-7014 arg S0002-70142006000400009 2006 2024-05-26 6 6 6 6 -0002-7014 1851-8044 0002-7014 arg S0002-70142006000200009 2006 2024-05-26 6 7 6 7 -0002-7014 1851-8044 0002-7014 arg S0002-70142010000100007 2010 2024-05-26 6 6 6 6 -0002-7014 1851-8044 0002-7014 arg S0002-70142007000300003 2007 2024-05-26 6 6 6 6 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000100022 2009 2024-05-26 6 6 6 6 -0002-7014 1851-8044 0002-7014 arg S0002-70142010000100006 2010 2024-05-26 6 6 6 6 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000200002 2009 2024-05-26 6 7 6 7 -0002-7014 1851-8044 0002-7014 arg S0002-70142010000100002 2010 2024-05-26 6 6 6 6 -0002-7014 1851-8044 0002-7014 arg S0002-70142007000200014 2007 2024-05-26 6 6 6 6 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000100021 2009 2024-05-26 6 6 6 6 -0002-7014 1851-8044 0002-7014 arg S0002-70142010000400010 2010 2024-05-26 6 6 6 6 -0002-7014 1851-8044 0002-7014 arg S0002-70142010000200001 2010 2024-05-26 6 6 6 6 -0002-7014 1851-8044 0002-7014 arg S0002-70142010000200002 2010 2024-05-26 6 7 6 7 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000200014 2009 2024-05-26 5 6 5 6 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000100014 2009 2024-05-26 5 5 5 5 -0002-7014 1851-8044 0002-7014 arg S0002-70142005000200009 2005 2024-05-26 5 5 5 5 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000200004 2009 2024-05-26 5 5 5 5 -0002-7014 1851-8044 0002-7014 arg S0002-70142006000100016 2006 2024-05-26 5 5 5 5 -0002-7014 1851-8044 0002-7014 arg S0002-70142006000200015 2006 2024-05-26 5 5 5 5 -0002-7014 1851-8044 0002-7014 arg 
S0002-70142007000300005 2007 2024-05-26 5 5 5 5 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000300009 2009 2024-05-26 5 5 5 5 -0002-7014 1851-8044 0002-7014 arg S0002-70142010000200010 2010 2024-05-26 4 4 4 4 -0002-7014 1851-8044 0002-7014 arg S0002-70142008000100015 2008 2024-05-26 3 4 3 4 -0002-7014 1851-8044 0002-7014 arg S0002-70142005000300002 2005 2024-05-26 2 5 2 5 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000200015 2009 2024-05-26 2 3 2 3 -0002-7014 1851-8044 0002-7014 arg S0002-70142005000300001 2005 2024-05-26 2 5 2 5 -0002-7014 1851-8044 0002-7014 arg S0002-70142005000300009 2005 2024-05-26 2 4 2 4 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000200005 2009 2024-05-26 2 4 2 4 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000200008 2009 2024-05-26 2 3 2 3 -0002-7014 1851-8044 0002-7014 arg S0002-70142005000300006 2005 2024-05-26 2 5 2 5 -0002-7014 1851-8044 0002-7014 arg S0002-70142005000300010 2005 2024-05-26 2 2 2 2 -0002-7014 1851-8044 0002-7014 arg S0002-70142005000300003 2005 2024-05-26 2 5 2 5 -0002-7014 1851-8044 0002-7014 arg S0002-70142006000300001 2006 2024-05-26 2 2 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000100005 2009 2024-05-26 1 1 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000200016 2009 2024-05-26 1 1 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142005000400004 2005 2024-05-26 1 1 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142008000100009 2008 2024-05-26 1 1 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142005000100014 2005 2024-05-26 1 1 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000200009 2009 2024-05-26 1 2 1 2 -0002-7014 1851-8044 0002-7014 arg S0002-70142006000100019 2006 2024-05-26 1 1 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000200013 2009 2024-05-26 1 3 1 3 -0002-7014 1851-8044 0002-7014 arg S0002-70142008000400007 2008 2024-05-26 1 2 1 2 -0002-7014 1851-8044 0002-7014 arg S0002-70142008000300010 2008 2024-05-26 1 1 1 1 -0002-7014 1851-8044 0002-7014 arg 
S0002-70142009000200006 2009 2024-05-26 1 3 1 3 -0002-7014 1851-8044 0002-7014 arg S0002-70142006000200018 2006 2024-05-26 1 1 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142008000400002 2008 2024-05-26 1 1 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142010000300005 2010 2024-05-26 1 1 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142006000200007 2006 2024-05-26 1 3 1 3 -0002-7014 1851-8044 0002-7014 arg S0002-70142006000400004 2006 2024-05-26 1 2 1 2 -0002-7014 1851-8044 0002-7014 arg S0002-70142007000100004 2007 2024-05-26 1 3 1 3 -0002-7014 1851-8044 0002-7014 arg S0002-70142007000200021 2007 2024-05-26 1 1 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142007000100002 2007 2024-05-26 1 1 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000100004 2009 2024-05-26 1 1 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000400004 2009 2024-05-26 1 1 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142008000400006 2008 2024-05-26 1 1 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142006000400005 2006 2024-05-26 1 1 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142008000300006 2008 2024-05-26 1 1 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142008000400011 2008 2024-05-26 1 1 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142007000300001 2007 2024-05-26 1 1 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142007000100020 2007 2024-05-26 1 1 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142006000400002 2006 2024-05-26 1 1 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142005000100005 2005 2024-05-26 1 1 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000200017 2009 2024-05-26 1 1 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142008000100005 2008 2024-05-26 1 1 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000200007 2009 2024-05-26 1 4 1 4 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000100023 2009 2024-05-26 1 1 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142008000100008 2008 2024-05-26 0 1 0 1 -0002-7014 1851-8044 0002-7014 arg 
S0002-70142006000400008 2006 2024-05-26 0 1 0 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142005000400005 2005 2024-05-26 0 1 0 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142006000200006 2006 2024-05-26 0 1 0 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142005000400007 2005 2024-05-26 0 1 0 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142008000200013 2008 2024-05-26 0 1 0 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142006000400003 2006 2024-05-26 0 1 0 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000400006 2009 2024-05-26 0 1 0 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142007000300008 2007 2024-05-26 0 1 0 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142005000200008 2005 2024-05-26 0 1 0 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142008000200006 2008 2024-05-26 0 1 0 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142008000400004 2008 2024-05-26 0 1 0 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142005000400006 2005 2024-05-26 0 1 0 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142007000300006 2007 2024-05-26 0 1 0 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142006000300003 2006 2024-05-26 0 1 0 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142008000100007 2008 2024-05-26 0 1 0 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142006000100009 2006 2024-05-26 0 1 0 1 diff --git a/metrics/fixtures/top100articles.tar.gz b/metrics/fixtures/top100articles.tar.gz deleted file mode 100644 index cd49556706cc65863c0fc842519af13978638501..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1675 zcmV;626Xu!iwFP?L#bu}1MOOCZreBz?qBXHa)B)PCUuTIfMFzV>qTm(wo@RtUy?FM zl844Z!$FDyk!U+sjj!LlXCz7~^<$9AZnV_)QMyO%|9(3E`oeffDT8wl=~D$K=b!mV ze>?S{tT)bR7mRw4%6X%|JV;Iw_l~#oWqV3gPTQ~B-OInP&v$f_FW0cm4>Du=6 zP5c92wUjP>G1&l4=<%&OwNOT=>-$>NA-ie5AT#PPM;(NcB8q_msWYrbkO{2|F*Xp( zNmiys%8Wv(@H>WDD2Rq0n0Gx5j?yRg?2nC9e@HoE;;~{ z@XUHd=m0eGN1bHWbblN<4ogInUu2Goiz6W7DAY-hdc7VY3R_b5A>vM|SC5`sk7GxA zgIG>7N5jR@FTBU!a#VseBdNBn)p(L)#&aQNJzA&=jsdaeo-t-36UV8#a&Uy0!8u$N 
zTd(#Eb$qw(q)s=_{9Vzyn4vte#!O9Uo2OuKG>fwgnGKNcPs57`XBO*e6j3}FR zSZ11rIW5zuGaZOiz2-o43(Igw4VLj-mVx=2%^x9~;d!Qm<21WAIsWbk{dbP6*GHp* zoc4bVAG`lUORH?&|KYSYMh0K@e`x!;|KlT$KVbjI=$}IQAv*!qvKlWoXdUfFUmtsfL6+-SAK|}=2>aiBCC1DEh8u(dsCK5 zYb~m*c{5iFE$67t!J?o#;JQGyQP}G{*=!D(&HA!zS`>>p(7?iC)|F+_qHy(hxJqG_ zFhQ8r-eoFR?6}64rPHFImPr6}H3k+1wdi)hs4|6@(5a@b_5oeFYGkd9dR=6H^~hiI z|B>P~g)fOKO#C;wAe5CF2a>-Qz1~I5i=qMD&I5Tr+;x$&A&#w_3>{4a)w4oGo|W-N zR8r1@eIz#;Fl8k-SRGrSSA|hEEUIUX`SqsClqIiw#Yj=m@Bun1M1e$H$J3;eODTph zKrD;lP&NpX3b$*k^m>w$wL)$>+iFR0d?j&8oo#Ddi)h*$I}qW9A1_DCj))7MFI#k0 zCl4q~0eR1CW71j=*Wize0kIYi@3}aFhhxjkHmi^jv1MkPrzpk32QH3s!BH|G)@(L5 z3W@FpLC*m|AaoM_LTzqS-!`3BJ;2X9b96jVm#IZx_$c9*ZibmhlVe2UhS+kkpw9J$r zIrIo|&7U75rydx86mZ`Q#Pw%)OEbnF}_@8RHBqtN1crc;lN38}*$r|E;kA0boV`xF)Nckjjc VJU)-l