Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion site/cds_rdm/inspire_harvester/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ def build_task_arguments(
},
}
],
"batch_size": 10,
"batch_size": 100,
"write_many": False,
"transformers": [{"type": "inspire-json-transformer"}],
}
Expand Down
26 changes: 20 additions & 6 deletions site/cds_rdm/inspire_harvester/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,19 +40,27 @@ def _iter(self, url, *args, **kwargs):
"""Yields HTTP response."""
# header set to include additional data (external file URLs and more detailed metadata)
headers = {"Accept": "application/vnd+inspire.record.expanded+json"}
initial_url = url

while url: # Continue until there is no "next" link
current_app.logger.info(f"Querying URL: {url}.")
response = requests.get(url, headers=headers)
data = response.json()
if response.status_code == 200:
current_app.logger.debug("Request response is successful (200).")
if data["hits"]["total"] == 0:
total = data["hits"]["total"]
hits = data["hits"]["hits"]

if total == 0:
current_app.logger.warning(
f"No results found when querying INSPIRE. See URL: {url}."
)
elif url == initial_url:
current_app.logger.info(
f"Records found: {total}."
)

for inspire_record in data["hits"]["hits"]:
for inspire_record in hits:
current_app.logger.debug(
f"Sending INSPIRE record #{inspire_record['id']} to transformer."
)
Expand All @@ -71,33 +79,39 @@ def read(self, item=None, *args, **kwargs):

# Fetch all document types marked for CDS via the OAI set
oai_set = "ForCDS"
document_type = "thesis"

q = f"_oai.sets:{oai_set}"
if document_type:
q += f" AND document_type:{document_type}"


if self._inspire_id:
# get by INSPIRE id
current_app.logger.info(
f"Fetching records by ID {self._inspire_id} from INSPIRE."
)
query_params = {"q": f"_oai.sets:{oai_set} AND id:{self._inspire_id}"}
query_params = {"q": f"{q} AND id:{self._inspire_id}"}
elif self._on_date:
# get by the exact date
current_app.logger.info(
f"Fetching records by exact date {self._on_date} from INSPIRE."
)
query_params = {"q": f"_oai.sets:{oai_set} AND du:{self._on_date}"}
query_params = {"q": f"{q} AND du:{self._on_date}"}
elif self._until:
# get by the date range
current_app.logger.info(
f"Fetching records by the date range {self._since} - {self._until} from INSPIRE."
)
query_params = {
"q": f"_oai.sets:{oai_set} AND du >= {self._since} AND du <= {self._until}"
"q": f"{q} AND du >= {self._since} AND du <= {self._until}"
}
else:
# get since specified date until now
current_app.logger.info(
f"Fetching records since {self._since} from INSPIRE."
)
query_params = {"q": f"_oai.sets:{oai_set} AND du >= {self._since}"}
query_params = {"q": f"{q} AND du >= {self._since}"}

base_url = "https://inspirehep.net/api/literature"
encoded_query = urlencode(query_params)
Expand Down
71 changes: 48 additions & 23 deletions site/cds_rdm/inspire_harvester/transform/mappers/basic_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ class ResourceTypeMapper(MapperBase):

id = "metadata.resource_type.id"

def map_value(self, src_metadata, ctx, logger):
def map_value(self, src_record, ctx, logger):
"""Map resource type value."""
return ctx.resource_type.value

Expand All @@ -34,8 +34,9 @@ class TitleMapper(MapperBase):

id = "metadata.title"

def map_value(self, src_metadata, ctx, logger):
def map_value(self, src_record, ctx, logger):
"""Map title value."""
src_metadata = src_record.get("metadata", {})
inspire_titles = src_metadata.get("titles", [])
return inspire_titles[0].get("title")

Expand All @@ -46,28 +47,41 @@ class AdditionalTitlesMapper(MapperBase):

id = "metadata.additional_titles"

def map_value(self, src_metadata, ctx, logger):
def map_value(self, src_record, ctx, logger):
"""Map additional titles."""
src_metadata = src_record.get("metadata", {})
inspire_titles = src_metadata.get("titles", [])
rdm_additional_titles = []
seen_titles = []
seen_subtitles = []
if len(inspire_titles) > 1:
seen_titles.append(inspire_titles[0].get("title"))
for i, inspire_title in enumerate(inspire_titles[1:]):
try:

alt_title = {
"title": inspire_title.get("title"),
"type": {
"id": "alternative-title",
},
}
rdm_additional_titles.append(alt_title)
if inspire_title.get("subtitle"):
_title = inspire_title.get("title")
if _title and _title not in seen_titles:
seen_titles.append(_title)
alt_title = {
"title": _title,
"type": {
"id": "alternative-title",
},
}

rdm_additional_titles.append(alt_title)

_subtitle = inspire_title.get("subtitle")
if _subtitle and _subtitle not in seen_subtitles:
seen_subtitles.append(_subtitle)
subtitle = {
"title": inspire_title.get("subtitle"),
"title": _subtitle,
"type": {
"id": "subtitle",
},
}
rdm_additional_titles.append(subtitle)

except Exception as e:
ctx.errors.append(
f"Title {inspire_title} transform failed. INSPIRE#{ctx.inspire_id}. Error: {e}."
Expand All @@ -88,8 +102,9 @@ def validate(self, src, ctx):
if len(imprints) > 1:
ctx.errors.append(f"More than 1 imprint found. INSPIRE#{ctx.inspire_id}.")

def map_value(self, src_metadata, ctx, logger):
def map_value(self, src_record, ctx, logger):
"""Map publisher value."""
src_metadata = src_record.get("metadata", {})
imprints = src_metadata.get("imprints", [])
imprint = None
publisher = None
Expand All @@ -115,15 +130,16 @@ class PublicationDateMapper(MapperBase):

id = "metadata.publication_date"

def map_value(self, src_metadata, ctx, logger):
def map_value(self, src_record, ctx, logger):
"""Transform publication date."""
src_metadata = src_record.get("metadata", {})
imprints = src_metadata.get("imprints", [])
imprint_date = imprints[0].get("date") if imprints else None

publication_info = src_metadata.get("publication_info", [])
publication_date = publication_info[0].get("year") if publication_info else None

creation_date = src_metadata.get("created")
creation_date = src_record.get("created")

date = publication_date or imprint_date or creation_date
if date and isinstance(date, int):
Expand All @@ -145,8 +161,9 @@ class CopyrightMapper(MapperBase):

id = "metadata.copyright"

def map_value(self, src_metadata, ctx, logger):
def map_value(self, src_record, ctx, logger):
"""Transform copyrights."""
src_metadata = src_record.get("metadata", {})
# format: "© {holder} {year}, {statement} {url}"
copyrights = src_metadata.get("copyright", [])
result_list = []
Expand Down Expand Up @@ -179,8 +196,9 @@ class DescriptionMapper(MapperBase):

id = "metadata.description"

def map_value(self, src_metadata, ctx, logger):
def map_value(self, src_record, ctx, logger):
"""Mapping of abstracts."""
src_metadata = src_record.get("metadata", {})
abstracts = src_metadata.get("abstracts", [])
if abstracts:
return abstracts[0]["value"]
Expand All @@ -192,16 +210,21 @@ class AdditionalDescriptionsMapper(MapperBase):

id = "metadata.additional_descriptions"

def map_value(self, src_metadata, ctx, logger):
def map_value(self, src_record, ctx, logger):
"""Mapping of additional descriptions."""
src_metadata = src_record.get("metadata", {})
abstracts = src_metadata.get("abstracts", [])
additional_descriptions = []

if len(abstracts) > 1:
seen_abstracts = [abstracts[0]["value"]]
for x in abstracts[1:]:
additional_descriptions.append(
{"description": x["value"], "type": {"id": "abstract"}}
)
new_abstract = x["value"]
if new_abstract not in seen_abstracts:
seen_abstracts.append(new_abstract)
additional_descriptions.append(
{"description": new_abstract, "type": {"id": "abstract"}}
)

# TODO move it to book resource?
book_series = src_metadata.get("book_series", [])
Expand All @@ -226,8 +249,9 @@ class SubjectsMapper(MapperBase):

id = "metadata.subjects"

def map_value(self, src_metadata, ctx, logger):
def map_value(self, src_record, ctx, logger):
"""Mapping of keywords to subjects."""
src_metadata = src_record.get("metadata", {})
keywords = src_metadata.get("keywords", [])
mapped_subjects = []
for keyword in keywords:
Expand All @@ -248,8 +272,9 @@ class LanguagesMapper(MapperBase):

id = "metadata.languages"

def map_value(self, src_metadata, ctx, logger):
def map_value(self, src_record, ctx, logger):
"""Mapping and converting of languages."""
src_metadata = src_record.get("metadata", {})
languages = src_metadata.get("languages", [])
mapped_langs = []
for lang in languages:
Expand Down
16 changes: 12 additions & 4 deletions site/cds_rdm/inspire_harvester/transform/mappers/contributors.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ def _transform_author_affiliations(self, author):
def _transform_creatibutors(self, authors, ctx):
"""Transform creatibutors."""
creatibutors = []
if not authors:
return creatibutors
try:
for author in authors:
first_name = author.get("first_name")
Expand Down Expand Up @@ -98,7 +100,7 @@ def _transform_creatibutors(self, authors, ctx):
)
return None

def map_value(self, src, ctx, logger):
def map_value(self, src_record, ctx, logger):
"""Map creatibutors value (to be implemented by subclasses)."""
pass

Expand All @@ -112,8 +114,9 @@ class AuthorsMapper(CreatibutorsMapper):

id = "metadata.creators"

def map_value(self, src_metadata, ctx, logger):
def map_value(self, src_record, ctx, logger):
"""Map authors to RDM creators."""
src_metadata = src_record.get("metadata", {})
authors = src_metadata.get("authors", [])
creators = []
for author in authors:
Expand All @@ -136,7 +139,11 @@ def map_value(self, src_metadata, ctx, logger):
}
mapped_corporate_authors.append(contributor)

return self._transform_creatibutors(creators, ctx) + mapped_corporate_authors
contributors = self._transform_creatibutors(creators, ctx)
if not contributors:
contributors = []

return contributors + mapped_corporate_authors


@dataclass(frozen=True)
Expand All @@ -145,8 +152,9 @@ class ContributorsMapper(CreatibutorsMapper):

id = "metadata.contributors"

def map_value(self, src_metadata, ctx, logger):
def map_value(self, src_record, ctx, logger):
"""Map authors to RDM contributors."""
src_metadata = src_record.get("metadata", {})
authors = src_metadata.get("authors", [])
contributors = []

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from idutils.normalizers import normalize_isbn

from cds_rdm.inspire_harvester.transform.mappers.mapper import MapperBase
from cds_rdm.inspire_harvester.transform.utils import search_vocabulary
from cds_rdm.inspire_harvester.utils import search_vocabulary


@dataclass(frozen=True)
Expand All @@ -21,8 +21,9 @@ class ImprintMapper(MapperBase):

id = "custom_fields.imprint:imprint"

def map_value(self, src_metadata, ctx, logger):
def map_value(self, src_record, ctx, logger):
"""Apply thesis field mapping."""
src_metadata = src_record.get("metadata", {})
imprints = src_metadata.get("imprints", [])
imprint = imprints[0] if imprints else None
isbns = src_metadata.get("isbns", [])
Expand Down Expand Up @@ -58,8 +59,9 @@ class CERNFieldsMapper(MapperBase):

id = "custom_fields"

def map_value(self, src_metadata, ctx, logger):
def map_value(self, src_record, ctx, logger):
"""Apply mapping."""
src_metadata = src_record.get("metadata", {})
acc_exp_list = src_metadata.get("accelerator_experiments", [])
_accelerators = []
_experiments = []
Expand All @@ -72,7 +74,10 @@ def map_value(self, src_metadata, ctx, logger):
logger.debug(
f"Searching vocabulary 'accelerator' for term: '{accelerator}'"
)
accelerator = f"{institution} {accelerator}"
if institution:
accelerator = f"{institution} {accelerator}"
else:
accelerator = f"{accelerator}"
result = search_vocabulary(accelerator, "accelerators", ctx, logger)
if result.total == 1:
logger.info(f"Found accelerator '{accelerator}'")
Expand Down
Loading
Loading