Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion services/data-topcitation/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"EZS_METRICS": true,
"EZS_CONCURRENCY": 2,
"EZS_CONTINUE_DELAY": 60,
"EZS_PIPELINE_DELAY": 600,
"EZS_PIPELINE_DELAY": 18000,
"EZS_NSHARDS": 32,
"EZS_CACHE": true,
"EZS_VERBOSE": false,
Expand Down
16 changes: 8 additions & 8 deletions services/data-topcitation/tests.hurl
Original file line number Diff line number Diff line change
Expand Up @@ -75,10 +75,10 @@ Content-Type: application/json
"id": "https://doi.org/10.1093/mnras/sts454"
},
{
"id": "https://openalex.org/W3125762999"
"id": "https://openalex.org/W3121968724"
},
{
"id": "https://openalex.org/W3121968724"
"id": "https://openalex.org/W3125762999"
}
]
},
Expand Down Expand Up @@ -133,10 +133,10 @@ Content-Type: application/json
"id": "https://doi.org/10.1093/mnras/sts454"
},
{
"id": "https://openalex.org/W3125762999"
"id": "https://openalex.org/W3121968724"
},
{
"id": "https://openalex.org/W3121968724"
"id": "https://openalex.org/W3125762999"
}
]
},
Expand Down Expand Up @@ -252,9 +252,9 @@ Content-Type: application/json
}
},
{
"id": "https://openalex.org/W3125762999",
"id": "https://openalex.org/W3121968724",
"value": {
"label": "https://openalex.org/W3125762999",
"label": "https://openalex.org/W3121968724",
"viz$color": {
"r": "255",
"g": "0",
Expand All @@ -271,9 +271,9 @@ Content-Type: application/json
}
},
{
"id": "https://openalex.org/W3121968724",
"id": "https://openalex.org/W3125762999",
"value": {
"label": "https://openalex.org/W3121968724",
"label": "https://openalex.org/W3125762999",
"viz$color": {
"r": "255",
"g": "0",
Expand Down
8 changes: 6 additions & 2 deletions services/data-topcitation/v1/retrieve-json.ini
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ plugin = spawn
[JSONParse]
separator = *

[env]
path = identifier
value = get("value")

[exchange]
value = get('value')

Expand All @@ -31,10 +35,10 @@ path = */*.json

# Convert XML to JSON

[group]

[exec]
command = ./v1/topcitation-segment.py
args = fix('-p')
args = env('identifier')

[exchange]
value = get('value')
Expand Down
12 changes: 11 additions & 1 deletion services/data-topcitation/v1/topcitation-segment.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,25 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import networkx as nx
import json
import math
import sys
import os
import tarfile, json

json_data = json.load(sys.stdin)

retrieve_id = str(sys.argv[sys.argv.index('-p') + 1])
path_file = str(os.path.join("/tmp/retrieve", retrieve_id))

with tarfile.open(path_file, "r:gz") as tar:
json_data = [json.load(f) for f in (tar.extractfile(m) for m in tar) if f]

G = nx.DiGraph()

doi_info = {}
for citation in json_data:
if "value" not in citation:
continue
if "message" in citation["value"].keys() :
continue
citation_id = citation["id"]
Expand Down
68 changes: 58 additions & 10 deletions services/data-topcitation/v1/topcitation.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,55 @@
import json
import sys
import os
import time

OPENALEX_TOKEN = os.getenv("OPENALEX_API_KEY")


def get_openalex_info(doi, max_attempts=3):
    """Fetch the OpenAlex work record for a DOI, retrying on failure.

    The request goes to ``https://api.openalex.org/works/doi:<doi>``. An
    attempt counts as failed when the request times out, another network
    error occurs, the HTTP status is not 200, the body is not valid JSON,
    or the decoded payload is not an object containing an ``"id"`` key.
    Every step is traced on stderr so stdout stays reserved for results.

    Args:
        doi: DOI of the work to look up.
        max_attempts: Maximum number of requests to send before giving up.

    Returns:
        The decoded JSON object describing the work, or ``None`` when every
        attempt failed.
    """
    url = f"https://api.openalex.org/works/doi:{doi}"

    for attempt in range(1, max_attempts + 1):
        print(f"[DEBUG] Starting request for DOI {doi}, attempt {attempt}", file=sys.stderr)

        # Short pause before every request (retries included); presumably
        # there to stay under the API rate limit -- confirm the exact quota.
        time.sleep(0.12)

        # Only the network call lives in the try block, so an unrelated bug
        # in the validation below cannot be mistaken for a network error.
        try:
            # timeout=(connect, read) in seconds: a stalled connection must
            # not block the whole pipeline.
            response = requests.get(url, timeout=(5, 10))
        except requests.exceptions.Timeout:
            print(f"[ERROR] Timeout pour DOI {doi} (tentative {attempt})", file=sys.stderr)
            continue
        except requests.exceptions.RequestException as e:
            print(f"[ERROR] Exception réseau pour DOI {doi} : {e} (tentative {attempt})", file=sys.stderr)
            continue

        print(f"HTTP={response.status_code} for DOI {doi}", file=sys.stderr)

        # 1. HTTP errors: retry.
        if response.status_code != 200:
            print(f"[ERROR] API OpenAlex HTTP {response.status_code}", file=sys.stderr)
            continue

        # 2. The body must be valid JSON.
        try:
            data = response.json()
        except ValueError:
            print("[ERROR] Réponse non-JSON reçue", file=sys.stderr)
            continue

        # 3. The payload must be an object exposing the expected field; the
        # isinstance guard avoids a TypeError on payloads such as `null`.
        if not isinstance(data, dict) or "id" not in data:
            print("[ERROR] JSON invalide ou incomplet", file=sys.stderr)
            continue

        return data

    # Reaching this point means every attempt failed.
    print(f"[ERROR] Abandon du DOI {doi} après {max_attempts} tentatives", file=sys.stderr)
    return None

def extract_doi_referenced_works(data):
if data:
doi_url = data.get('doi')
Expand All @@ -32,8 +68,9 @@ def openAlex_to_doi(url) :

url_parse = url.split("/")
id = url_parse[-1]
url = f"https://api.openalex.org/works/{id}?api_key={OPENALEX_TOKEN}"
url = f"https://api.openalex.org/works/{id}"

time.sleep(0.12)
response = requests.get(url)

if response.status_code == 200:
Expand All @@ -57,11 +94,15 @@ def main():

all_references = {}

for doi in dois:
for i, doi in enumerate(dois, start=1):
# --- traitement du DOI ---
get_info_doi = get_openalex_info(doi)
references = extract_doi_referenced_works(get_info_doi)
all_references.update(references)

print(f"DOI {i}/{len(dois)} MB", file=sys.stderr)


# initialisation du dict avec deux clés possibles :
# le compteur du nombre de citations à 0 et la liste pour les futurs dois
citation_count = defaultdict(lambda: {"count": 0, "doi": []})
Expand All @@ -70,6 +111,7 @@ def main():
for doi, references in all_references.items():
if references == "champ referenced_works vide":
# Ajouter une entrée dans le JSON indiquant que le champ referenced_works est vide
#print({"id": doi, "value": {"message": "champ referenced_works vide"}})
sys.stdout.write(json.dumps({"id": doi, "value": {"message": "champ referenced_works vide"}}))
sys.stdout.write("\n")
else :
Expand All @@ -88,10 +130,16 @@ def main():
# voir pour modifier ce paramètre
top_citations = sorted_citations[:nbCitations]

result = []
# on itère sur la liste qui contient les tuples citation, count et doi pour les ajouter aux différents champs
for citation, info in top_citations:
# result.append({"id": openAlex_to_doi(citation),"value": {"count": info["count"],"citing_doi": info["doi"]}})
sys.stdout.write(json.dumps({"id":openAlex_to_doi(citation), "value":{"count": info["count"],"citing_doi": info["doi"]}}))
sys.stdout.write("\n")

# with open("output-topcitation.json", "w", encoding="utf-8") as f:
# json.dump(result, f, ensure_ascii=False, indent=2)


if __name__ == "__main__":
main()
Loading