diff --git a/services/data-topcitation/config.json b/services/data-topcitation/config.json index 16931e1b..e367a556 100644 --- a/services/data-topcitation/config.json +++ b/services/data-topcitation/config.json @@ -5,7 +5,7 @@ "EZS_METRICS": true, "EZS_CONCURRENCY": 2, "EZS_CONTINUE_DELAY": 60, - "EZS_PIPELINE_DELAY": 600, + "EZS_PIPELINE_DELAY": 18000, "EZS_NSHARDS": 32, "EZS_CACHE": true, "EZS_VERBOSE": false, diff --git a/services/data-topcitation/tests.hurl b/services/data-topcitation/tests.hurl index c5cf3e5e..4e215b50 100644 --- a/services/data-topcitation/tests.hurl +++ b/services/data-topcitation/tests.hurl @@ -75,10 +75,10 @@ Content-Type: application/json "id": "https://doi.org/10.1093/mnras/sts454" }, { - "id": "https://openalex.org/W3125762999" + "id": "https://openalex.org/W3121968724" }, { - "id": "https://openalex.org/W3121968724" + "id": "https://openalex.org/W3125762999" } ] }, @@ -133,10 +133,10 @@ Content-Type: application/json "id": "https://doi.org/10.1093/mnras/sts454" }, { - "id": "https://openalex.org/W3125762999" + "id": "https://openalex.org/W3121968724" }, { - "id": "https://openalex.org/W3121968724" + "id": "https://openalex.org/W3125762999" } ] }, @@ -252,9 +252,9 @@ Content-Type: application/json } }, { - "id": "https://openalex.org/W3125762999", + "id": "https://openalex.org/W3121968724", "value": { - "label": "https://openalex.org/W3125762999", + "label": "https://openalex.org/W3121968724", "viz$color": { "r": "255", "g": "0", @@ -271,9 +271,9 @@ Content-Type: application/json } }, { - "id": "https://openalex.org/W3121968724", + "id": "https://openalex.org/W3125762999", "value": { - "label": "https://openalex.org/W3121968724", + "label": "https://openalex.org/W3125762999", "viz$color": { "r": "255", "g": "0", diff --git a/services/data-topcitation/v1/retrieve-json.ini b/services/data-topcitation/v1/retrieve-json.ini index 12fe784d..8dafd6b0 100644 --- a/services/data-topcitation/v1/retrieve-json.ini +++ b/services/data-topcitation/v1/retrieve-json.ini @@ -19,6 +19,10 @@ plugin = spawn [JSONParse] separator = * +[env] +path = identifier +value = get("value") + [exchange] value = get('value') @@ -31,10 +35,10 @@ path = */*.json # Convert XML to JSON -[group] - [exec] command = ./v1/topcitation-segment.py +args = fix('-p') +args = env('identifier') [exchange] value = get('value') diff --git a/services/data-topcitation/v1/topcitation-segment.py b/services/data-topcitation/v1/topcitation-segment.py index 004862fa..47914658 100755 --- a/services/data-topcitation/v1/topcitation-segment.py +++ b/services/data-topcitation/v1/topcitation-segment.py @@ -1,15 +1,25 @@ #!/usr/bin/env python3 +# -*- coding: utf-8 -*- import networkx as nx import json import math import sys +import os +import tarfile, json -json_data = json.load(sys.stdin) + +retrieve_id = str(sys.argv[sys.argv.index('-p') + 1]) +path_file = str(os.path.join("/tmp/retrieve", retrieve_id)) + +with tarfile.open(path_file, "r:gz") as tar: + json_data = [json.load(f) for f in (tar.extractfile(m) for m in tar) if f] G = nx.DiGraph() doi_info = {} for citation in json_data: + if "value" not in citation: + continue if "message" in citation["value"].keys() : continue citation_id = citation["id"] diff --git a/services/data-topcitation/v1/topcitation.py b/services/data-topcitation/v1/topcitation.py index bbd1575c..e557346a 100755 --- a/services/data-topcitation/v1/topcitation.py +++ b/services/data-topcitation/v1/topcitation.py @@ -4,19 +4,55 @@ import json import sys import os +import time OPENALEX_TOKEN = os.getenv("OPENALEX_API_KEY") - def get_openalex_info(doi): - url = f"https://api.openalex.org/works/doi:{doi}?api_key={OPENALEX_TOKEN}" - response = requests.get(url) + url = f"https://api.openalex.org/works/doi:{doi}" + + for attempt in range(3): # 3 tentatives maximum + try: + print(f"[DEBUG] Starting request for DOI {doi}, attempt {attempt+1}", file=sys.stderr) + + time.sleep(0.12) + response = requests.get(url, timeout=(5, 10)) + + print(f"HTTP={response.status_code} for DOI {doi}", file=sys.stderr) + + # 1. Erreurs HTTP + if response.status_code != 200: + print(f"[ERROR] API OpenAlex HTTP {response.status_code}", file=sys.stderr) + continue # on réessaie + + # 2. JSON validation + try: + data = response.json() + except ValueError: + print("[ERROR] Réponse non-JSON reçue", file=sys.stderr) + continue # on réessaie + + # 3. Vérification des champs attendus + if "id" not in data: + print("[ERROR] JSON invalide ou incomplet", file=sys.stderr) + continue + + return data + + except requests.exceptions.Timeout: + pass + print(f"[ERROR] Timeout pour DOI {doi} (tentative {attempt+1})", file=sys.stderr) + + except requests.exceptions.RequestException as e: + pass + print(f"[ERROR] Exception réseau pour DOI {doi} : {e} (tentative {attempt+1})", file=sys.stderr) + + # Si on arrive ici = les 3 tentatives ont échoué + print(f"[ERROR] Abandon du DOI {doi} après 3 tentatives", file=sys.stderr) + return None + + - if response.status_code == 200: - return response.json() - else: - return None - def extract_doi_referenced_works(data): if data: doi_url = data.get('doi') @@ -32,8 +68,9 @@ def openAlex_to_doi(url) : url_parse = url.split("/") id = url_parse[-1] - url = f"https://api.openalex.org/works/{id}?api_key={OPENALEX_TOKEN}" + url = f"https://api.openalex.org/works/{id}" + time.sleep(0.12) response = requests.get(url) if response.status_code == 200: @@ -57,11 +94,15 @@ def main(): all_references = {} - for doi in dois: + for i, doi in enumerate(dois, start=1): + # --- traitement du DOI --- get_info_doi = get_openalex_info(doi) references = extract_doi_referenced_works(get_info_doi) all_references.update(references) + print(f"DOI {i}/{len(dois)} MB", file=sys.stderr) + + # inialisation du dict avec deux clés possibles : # le compteur du nombre de citation à 0 et la liste pour les futurs dois citation_count = defaultdict(lambda: {"count": 0, "doi": []}) @@ -70,6 +111,7 @@ def main(): for doi, references in all_references.items(): if references == "champ referenced_works vide": # Ajouter une entrée dans le JSON indiquant que le champ referenced_works est vide + #print({"id": doi, "value": {"message": "champ referenced_works vide"}}) sys.stdout.write(json.dumps({"id": doi, "value": {"message": "champ referenced_works vide"}})) sys.stdout.write("\n") else : @@ -88,10 +130,16 @@ def main(): # voir pour modifier ce paramètre top_citations = sorted_citations[:nbCitations] + result = [] # on itère sur la liste qui contient les tuples citation, count et doi pour les ajouter aux différents champs for citation, info in top_citations: + # result.append({"id": openAlex_to_doi(citation),"value": {"count": info["count"],"citing_doi": info["doi"]}}) sys.stdout.write(json.dumps({"id":openAlex_to_doi(citation), "value":{"count": info["count"],"citing_doi": info["doi"]}})) sys.stdout.write("\n") + # with open("output-topcitation.json", "w", encoding="utf-8") as f: + # json.dump(result, f, ensure_ascii=False, indent=2) + + if __name__ == "__main__": main()