Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
e0932f8
feat(harvesters): adiciona suporte a verify, issn e stop em AMHarvest…
robertatakenaka Mar 13, 2026
c53e1ab
feat(journal/sources): refatora _get_collection_journals para usar AM…
robertatakenaka Mar 13, 2026
1f45d34
feat(issue/loader): propaga verify, limit, issn e stop nas funções de…
robertatakenaka Mar 13, 2026
e4b65aa
feat(preprint): adiciona parâmetros verify, timeout e stop a harvest_…
robertatakenaka Mar 13, 2026
4e96509
feat(controller): adiciona verify, issn e stop ao ArticleIteratorBuilder
robertatakenaka Mar 13, 2026
868279b
feat(issue/tasks): adiciona verify, limit, issn e stop às tasks de is…
robertatakenaka Mar 13, 2026
f2e86ae
feat(journal/tasks): adiciona issn e stop às tasks de carga de journals
robertatakenaka Mar 13, 2026
2456cec
feat(article/tasks): adiciona verify, issn e stop às tasks de artigos…
robertatakenaka Mar 13, 2026
de86672
feat(tasks_scheduler): adiciona verify, issn, limit e stop nos agenda…
robertatakenaka Mar 13, 2026
49597ab
fix(harvesters): corrige filtro issn por tipo de registro e adiciona …
robertatakenaka Mar 13, 2026
3c7cf0f
fix(controller): extrai kwargs do queryset e adiciona log de total po…
robertatakenaka Mar 13, 2026
6702cae
fix(article/tasks): remove stop guard duplicado e log verboso do disp…
robertatakenaka Mar 13, 2026
d7798c3
feat(tasks_scheduler): adiciona stop=None nos agendamentos de dispatc…
robertatakenaka Mar 13, 2026
b502d13
feat(collection): add verify param to Collection.load
robertatakenaka Mar 13, 2026
043d19d
feat(collection): pass verify param in task_load_collections
robertatakenaka Mar 13, 2026
a41adbd
feat(bigbang): add verify param to task_start and pass to Collection.…
robertatakenaka Mar 13, 2026
7f328c2
feat(issue): propagate verify param through issue loading functions
robertatakenaka Mar 13, 2026
a26ce26
feat(issue): pass verify param in task_update_issues_from_amissue
robertatakenaka Mar 13, 2026
007e438
feat(article): add verify param to get_or_create_toc_sections and pas…
robertatakenaka Mar 13, 2026
9995ebc
feat(article): pass verify param to Collection.load in ArticleIterato…
robertatakenaka Mar 13, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 18 additions & 4 deletions article/controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,6 +445,9 @@ def __init__(
limit=None,
timeout=None,
opac_url=None,
verify=True,
issn=None,
stop=None,
force_update=None,
):
self.user = user
Expand All @@ -460,6 +463,9 @@ def __init__(
self.limit = limit
self.timeout = timeout
self.opac_url = opac_url
self.verify = verify
self.issn = issn
self.stop = stop
self.force_update = force_update

self._iter_from_harvest_count = 0
Expand Down Expand Up @@ -492,18 +498,23 @@ def _iter_from_pid_provider(self):
issn_list = [i for i in journal_issns if i] if journal_issns else None
if journal_issns and not issn_list:
continue
qs = PidProviderXML.get_queryset(

kwargs = dict(
issn_list=issn_list,
from_pub_year=self.from_pub_year,
until_pub_year=self.until_pub_year,
from_updated_date=self.from_date,
until_updated_date=self.until_date,
proc_status_list=self.proc_status_list or [PPXML_STATUS_TODO, PPXML_STATUS_INVALID],
)
self._iter_from_pid_provider_count += qs.count()
qs = PidProviderXML.get_queryset(**kwargs)
total = qs.count()

logging.info(f"PidProviderXML queryset total: {total}, kwargs: {kwargs}")

self._iter_from_pid_provider_count += total
for item in qs.iterator():
yield {"pp_xml_id": item.id}
logging.info(f"_iter_from_pid_provider: yielded {self._iter_from_pid_provider_count} items")

def _iter_from_article(self):
"""
Expand Down Expand Up @@ -550,7 +561,7 @@ def _iter_from_harvest(self):
"""Itera documentos coletados via OPAC ou ArticleMeta."""

if Collection.objects.count() == 0:
Collection.load(self.user)
Collection.load(self.user, verify=self.verify)

count = 0
for collection_acron in self.collection_acron_list or list(Collection.get_acronyms()):
Expand Down Expand Up @@ -594,6 +605,9 @@ def _build_harvester(self, collection_acron):
until_date=self.until_date,
limit=self.limit,
timeout=self.timeout,
verify=self.verify,
issn=self.issn,
stop=self.stop,
)
if collection_acron == "scl":
return OPACHarvester(self.opac_url or "www.scielo.br", collection_acron, **kwargs)
Expand Down
10 changes: 9 additions & 1 deletion article/sources/preprint.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,16 @@
class PreprintArticleSaveError(Exception): ...


def harvest_preprints(URL, user):
def harvest_preprints(URL, user, verify=True, timeout=30, stop=None):
sickle = Sickle(URL)
recs = sickle.ListRecords(metadataPrefix="oai_dc")
Comment on lines +22 to 24
Comment on lines +22 to 24

count = 0
for rec in recs:
if stop and count >= stop:
logging.info(f"Reached stop limit of {stop} preprints")
break

article_info = get_info_article(rec)
identifier = get_doi(article_info["identifier"])
doi = get_or_create_doi(doi=identifier, user=user)
Expand Down Expand Up @@ -73,6 +79,8 @@ def harvest_preprints(URL, user):
# acessíveis na área administrativa
# para que o usuário fique sabendo quais itens falharam
raise PreprintArticleSaveError(e)

count += 1


def get_info_article(rec):
Expand Down
6 changes: 3 additions & 3 deletions article/sources/xmlsps.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ def load_article(user, xml=None, file_path=None, v3=None, pp_xml=None):
article.languages.add(main_lang)

article.sections.set(
get_or_create_toc_sections(xmltree=xmltree, user=user, errors=errors, issue=article.issue)
get_or_create_toc_sections(xmltree=xmltree, user=user, errors=errors, issue=article.issue, verify=True)
)
article.titles.set(
create_or_update_titles(
Expand Down Expand Up @@ -522,7 +522,7 @@ def get_or_create_fundings(xmltree, user, item, errors):
return data


def get_or_create_toc_sections(xmltree, user, errors, issue):
def get_or_create_toc_sections(xmltree, user, errors, issue, verify=True):
"""
Extrai e cria seções do sumário (TOC) a partir do XML.

Expand All @@ -538,7 +538,7 @@ def get_or_create_toc_sections(xmltree, user, errors, issue):
try:
if not issue.table_of_contents.exists():
for am_issue in AMIssue.objects.filter(new_record=issue):
load_issue_sections(user, issue, am_issue=am_issue)
load_issue_sections(user, issue, am_issue=am_issue, verify=verify)
toc_sections = ArticleTocSections(xmltree=xmltree).sections
for item in toc_sections:
section_title = item.get("section")
Expand Down
23 changes: 18 additions & 5 deletions article/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def load_funding_data(user, file_path):


@celery_app.task(bind=True, name=_('load_preprints'))
def load_preprint(self, user_id, oai_pmh_preprint_uri):
def load_preprint(self, user_id, oai_pmh_preprint_uri, verify=True, timeout=30, stop=None):
"""
Coleta e carrega preprints de um endpoint OAI-PMH específico.

Expand All @@ -62,6 +62,9 @@ def load_preprint(self, user_id, oai_pmh_preprint_uri):
self: Instância da tarefa Celery
user_id (int): ID do usuário executando a tarefa (obrigatório)
oai_pmh_preprint_uri (str): URI do endpoint OAI-PMH para coleta (obrigatório)
verify (bool): Verificação SSL para requisições HTTP
timeout (int): Timeout para requisições HTTP
stop (int, optional): Número máximo de preprints a processar

Returns:
None
Expand Down Expand Up @@ -89,7 +92,7 @@ def load_preprint(self, user_id, oai_pmh_preprint_uri):
"""
user = User.objects.get(pk=user_id)
## fazer filtro para não coletar tudo sempre
harvest_preprints(oai_pmh_preprint_uri, user)
harvest_preprints(oai_pmh_preprint_uri, user, verify=verify, timeout=timeout, stop=stop)


@celery_app.task(bind=True)
Expand Down Expand Up @@ -643,6 +646,7 @@ def task_check_article_availability(
article_id=None,
collection_acron_list=None,
timeout=None,
verify=True,
is_activate=None,
force_update=False,
):
Expand All @@ -659,6 +663,7 @@ def task_check_article_availability(
article_id (int, optional): ID do artigo a verificar (obrigatório)
collection_acron_list (list, optional): Lista de acrônimos de coleções para filtro
timeout (int, optional): Timeout em segundos para verificações HTTP
verify (bool): Verificação SSL para requisições HTTP
is_activate (bool, optional): Se deve ativar artigo após verificação
force_update (bool): Força nova verificação mesmo se recente

Expand Down Expand Up @@ -719,6 +724,9 @@ def task_dispatch_articles(
limit=None,
timeout=None,
opac_url=None,
verify=True,
issn=None,
stop=None,
# --- ativa article_source ---
article_source_status_list=None,
):
Expand Down Expand Up @@ -747,6 +755,9 @@ def task_dispatch_articles(
limit (int, optional): Limite máximo de artigos a processar
timeout (int, optional): Timeout para operações HTTP
opac_url (str, optional): URL base do OPAC para harvest
verify (bool): Verificação SSL para requisições HTTP
issn (str, optional): ISSN para filtrar por journal específico
stop (int, optional): Número máximo de itens a processar
article_source_status_list (list, optional): Status do article_source para filtro

Returns:
Expand Down Expand Up @@ -795,12 +806,14 @@ def task_dispatch_articles(
limit=limit,
timeout=timeout,
opac_url=opac_url,
verify=verify,
issn=issn,
stop=stop,
force_update=force_update,
):
if item_kwargs is None:
skipped += 1
continue
logging.info(f"Dispatching article with kwargs: {item_kwargs}")
task_process_article_pipeline.delay(**item_kwargs, **common_kwargs)
dispatched += 1

Expand Down Expand Up @@ -937,7 +950,7 @@ def task_process_article_pipeline(
am_article=am_article,
auto_solve_pid_conflict=auto_solve_pid_conflict,
)
pp_xml_id = article_source.pid_provider_xml.id
pp_xml_id = article_source.pid_provider_xml.id if article_source.pid_provider_xml else None

if article_source_id:
article_source = ArticleSource.objects.get(id=article_source_id)
Expand All @@ -946,7 +959,7 @@ def task_process_article_pipeline(
force_update=force_update,
auto_solve_pid_conflict=auto_solve_pid_conflict,
)
pp_xml_id = article_source.pid_provider_xml.id
pp_xml_id = article_source.pid_provider_xml.id if article_source.pid_provider_xml else None

if not pp_xml_id:
raise ValueError(
Expand Down
3 changes: 2 additions & 1 deletion bigbang/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,12 @@ def task_start(
self,
user_id=None,
username=None,
verify=True,
):
try:
user = _get_user(user_id, username)
Language.load(user)
Collection.load(user)
Collection.load(user, verify=verify)
Vocabulary.load(user)
Standard.load(user)
Subject.load(user)
Expand Down
21 changes: 18 additions & 3 deletions bigbang/tasks_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,10 @@ def schedule_task_dispatch_articles(username, enabled=False):
limit=None,
timeout=None,
opac_url=None,
verify=True,
issn=None,
article_source_status_list=None,
stop=None,
),
description=_("Dispatch articles to processing pipeline"),
priority=TASK_PRIORITY,
Expand Down Expand Up @@ -271,7 +274,7 @@ def schedule_load_journal_from_article_meta(username, enabled=False):
"""
Agenda a tarefa de carga de dados de journals obtidos do AM e Core.

Configura verify=True para verificação SSL nas requisições HTTP.
Configura verify=True para verificação SSL e limit=1000 para limitar coleta.
"""
schedule_task(
task="journal.tasks.load_journal_from_article_meta",
Expand All @@ -280,6 +283,9 @@ def schedule_load_journal_from_article_meta(username, enabled=False):
load_data=False,
collection_acron="scl",
verify=True,
limit=1000,
issn=None,
stop=None,
),
description=_("Carga de dados de journals obtidos do AM e Core"),
priority=1,
Expand All @@ -295,7 +301,7 @@ def schedule_collect_journals_from_am(username, enabled=False):
"""
Agenda a tarefa de coleta de journals da fonte AM.

Configura verify=True para verificação SSL nas requisições HTTP.
Configura verify=True para verificação SSL e limit=1000 para limitar coleta.
"""
schedule_task(
task="journal.tasks.load_journal_from_article_meta",
Expand All @@ -304,6 +310,9 @@ def schedule_collect_journals_from_am(username, enabled=False):
load_data=True,
collection_acron="scl",
verify=True,
limit=1000,
issn=None,
stop=None,
),
description=_("Coleta de journals da fonte AM"),
priority=1,
Expand Down Expand Up @@ -416,7 +425,9 @@ def schedule_export_journal_to_articlemeta(username, enabled=False):

def schedule_load_issue_from_articlemeta(username, enabled=False):
"""
Agenda a tarefa de carregar issues do ArticleMeta
Agenda a tarefa de carregar issues do ArticleMeta.

Configura verify=True para verificação SSL e limit=1000 para limitar coleta.
"""
schedule_task(
task="issue.tasks.load_issue_from_articlemeta",
Expand All @@ -429,6 +440,10 @@ def schedule_load_issue_from_articlemeta(username, enabled=False):
until_date=None,
force_update=None,
timeout=30,
verify=True,
limit=1000,
issn=None,
stop=None,
Comment on lines 426 to +446
),
Comment on lines 426 to 447
description=_("Load issues from ArticleMeta"),
priority=TASK_PRIORITY,
Expand Down
4 changes: 2 additions & 2 deletions collection/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,12 +223,12 @@ def __str__(self):
base_form_class = CoreAdminModelForm

@classmethod
def load(cls, user, collections_data=None):
def load(cls, user, collections_data=None, verify=True):
if not collections_data:
collections_data = fetch_data(
"https://articlemeta.scielo.org/api/v1/collection/identifiers/",
json=True,
verify=False,
verify=verify,
)

for collection_data in collections_data:
Expand Down
4 changes: 2 additions & 2 deletions collection/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@


@celery_app.task(bind=True)
def task_load_collections(self, user_id=None, username=None):
def task_load_collections(self, user_id=None, username=None, verify=True):
if user_id:
user = User.objects.get(pk=user_id)
if username:
user = User.objects.get(username=username)
Collection.load(user)
Collection.load(user, verify=verify)
Loading
Loading