From 4b071f7c3d30343dc3858a976a1c90001848fdec Mon Sep 17 00:00:00 2001
From: Vittorio Meloni <vittorio.meloni@crs4.it>
Date: Tue, 14 Mar 2023 17:55:51 +0100
Subject: [PATCH 01/15] feat: adds script to synchronize biobanks and
 collections data with fdp

---
 sync_directory_with_fdp.py | 159 +++++++++++++++++++++++++++++++++++++
 1 file changed, 159 insertions(+)
 create mode 100644 sync_directory_with_fdp.py

diff --git a/sync_directory_with_fdp.py b/sync_directory_with_fdp.py
new file mode 100644
index 0000000..7193a7a
--- /dev/null
+++ b/sync_directory_with_fdp.py
@@ -0,0 +1,159 @@
+from molgenis import client
+from molgenis.client import MolgenisRequestError
+
+molgenisURL = 'http://localhost:82'
+directoryURL = 'http://localhost:82'
+
+MOLGENIS_URL = f'{molgenisURL}'  # URL of the molgenis to query
+DIRECTORY_URL = f'{directoryURL}'  # URL of the directory
+
+BBMRI_BIOBANK_ENTITY = 'eu_bbmri_eric_biobanks'
+BBMRI_COLLECTION_ENTITY = 'eu_bbmri_eric_collections'
+
+FDP_BIOBANK_ENTITY = 'fdp_Biobank'
+FDP_COLLECTION_ENTITY = 'fdp_Collection'
+
+ORPHA_ONTOLOGY_PREFIX = 'http://www.orpha.net/ORDO/Orphanet_'
+ORPHA_DIRECTORY_PREFIX = 'ORPHA:'
+ICD_10_ONTOLOGY_PREFIX = 'http://purl.bioontology.org/ontology/ICD10/'
+ICD_10_DIRECTORY_PREFIX = 'urn:miriam:icd:'
+
+BIOBANKS_ATTRIBUTES = 'id,name,acronym,description,country,juridical_person,collections,contact'
+BIOBANKS_EXPAND_ATTRIBUTES = 'country,juridical_person,collections,contact'
+COLLECTIONS_ATTRIBUTES = 'id,name,description,biobank,diagnosis_available,country,parent_collection'
+COLLECTIONS_EXPAND_ATTRIBUTES = 'biobank,diagnosis_available,country'
+
+COLLECTION_TYPES_ONTOLOGIES = {
+    'BIRTH_COHORT': 'http://purl.obolibrary.org/obo/OBI_0002614',
+    'CASE_CONTROL': 'http://purl.obolibrary.org/obo/OBIB_0000693',
+    'COHORT': 'http://purl.obolibrary.org/obo/OBIB_0000696',
+    'CROSS_SECTIONAL': 'http://purl.obolibrary.org/obo/OBIB_0000694',
+    'DISEASE_SPECIFIC': 'http://purl.obolibrary.org/obo/OBI_0002615',
+    'HOSPITAL': None,
+    'IMAGE': None,
+    'LONGITUDINAL': 'http://purl.obolibrary.org/obo/OBIB_0000695',
+    'NON_HUMAN': None,
+    'OTHER': None,
+    'POPULATION_BASED': 'http://purl.obolibrary.org/obo/OBIB_0000698',
+    'PROSPECTIVE_COLLECTION': None,
+    'QUALITY_CONTROL': 'http://purl.obolibrary.org/obo/OBIB_0000699',
+    'RD': None,
+    'SAMPLE': None,
+    'TWIN_STUDY': 'http://purl.obolibrary.org/obo/OBIB_0000700'
+}
+
+
+def _get_missing_biobanks(session, **kwargs):
+    print("Getting source entities from {}".format(BBMRI_BIOBANK_ENTITY))
+    source_records = session.get(BBMRI_BIOBANK_ENTITY, **kwargs)
+    print("Getting ids already present")
+    dest_records_ids = [r['identifier'] for r in session.get(FDP_BIOBANK_ENTITY, attributes='identifier')]
+    new_records = [sr for sr in source_records if sr['id'] not in dest_records_ids]
+    print("Found {} new records to insert".format(len(new_records)))
+    return source_records
+
+
+def _add_new_records(session, entity, records):
+    created_records = []
+    for i in range(0, len(records), 1000):
+        try:
+            created_records.extend(session.add_all(entity, records[i:i + 1000]))
+        except MolgenisRequestError as ex:
+            print("Error adding records")
+            print(ex)
+    print("Added {} record(s) of type {}".format(len(created_records), entity))
+
+
+def _convert_country(country):
+    return 'GB' if country == 'UK' else country
+
+
+def _get_disease_ontology_code(disease_code):
+    if ORPHA_DIRECTORY_PREFIX in disease_code:
+        return disease_code.replace(ORPHA_ONTOLOGY_PREFIX, ORPHA_ONTOLOGY_PREFIX)
+    if ICD_10_DIRECTORY_PREFIX in disease_code:
+        return disease_code.replace(ICD_10_DIRECTORY_PREFIX, ICD_10_ONTOLOGY_PREFIX)
+
+
+def _get_collection_type_ontology_code(collection_type):
+    return COLLECTION_TYPES_ONTOLOGIES.get(collection_type, None)
+
+
+def get_records_to_add(biobank_data, session):
+    missing_iris = []
+    for c in biobank_data['collections']:
+        for d in c['diagnosis_available']:
+            try:
+                session.get_by_id('fdp_IRI', d['id'], attributes='id')
+            except MolgenisRequestError:
+                missing_iris.append((d['id'], _get_disease_ontology_code(d['id'])))
+
+        for t in c['type']:
+            try:
+                session.get_by_id('fdp_IRI', t['id'], attributes='id')
+            except MolgenisRequestError:
+                ontology_code = _get_collection_type_ontology_code(t['id'])
+                if ontology_code is not None:
+                    missing_iris.append((t['id'], ontology_code))
+
+    return {
+        'fdp_Biobank': {
+            'IRI': f'{DIRECTORY_URL}/api/fdp/fdp_Biobank/{biobank_data["id"]}',
+            'catalog': 'bbmri-directory',  # TODO: get it dynamically
+            'identifier': biobank_data['id'],
+            'title': biobank_data['name'],
+            'acronym': biobank_data['acronym'] if 'acronym' in biobank_data else None,
+            'description': biobank_data['description'] if 'description' in biobank_data else None,
+            'publisher': f'{biobank_data["id"]}-pub',
+            'landingPage': f'{DIRECTORY_URL}/#/biobank/{biobank_data["id"]}',
+            'contactPoint': f'{biobank_data["id"]}-cp' if 'contact' in biobank_data else None,
+            'country': _convert_country(biobank_data['country']['id'])
+        },
+        'fdp_Publisher': {
+            'identifier': f'{biobank_data["id"]}-pub',
+            'label': biobank_data['juridical_person']
+        },
+        'fdp_ContactPointIndividual': {
+            'identifier': f'{biobank_data["id"]}-cp',
+            'email': f'mailto:{biobank_data["contact"]["email"]}' if 'contact' in biobank_data else '',
+            'address': '',
+            'telephone': '',
+        },
+        'fdp_Collection': [{
+            'identifier': c['id'],
+            'title': c['name'],
+            'biobank': biobank_data["id"],
+            'description': c['description'] if 'description' in c else None,
+            'type': [t['id'] for t in c['type'] if _get_collection_type_ontology_code(t['id']) is not None],
+            'theme': [d['id'] for d in c['diagnosis_available']]
+        } for c in biobank_data['collections']],
+        'fdp_IRI': missing_iris
+    }
+
+
+def sync_biobanks(session, **kwargs):
+    missing_biobanks = _get_missing_biobanks(session, attributes=BIOBANKS_ATTRIBUTES,
+                                             expand=BIOBANKS_EXPAND_ATTRIBUTES, **kwargs)
+    biobanks = []
+    publishers = []
+    contacts = []
+    collections = []
+    iris = set()
+    for b in missing_biobanks:
+        records = get_records_to_add(b, session)
+        biobanks.append(records['fdp_Biobank'])
+        publishers.append(records['fdp_Publisher'])
+        contacts.append(records['fdp_ContactPointIndividual'])
+        collections.extend(records['fdp_Collection'])
+        iris.update(records['fdp_IRI'])
+
+    _add_new_records(session, 'fdp_Publisher', publishers)
+    _add_new_records(session, 'fdp_ContactPointIndividual', contacts)
+    _add_new_records(session, 'fdp_IRI', [{'id': i[0], 'IRI': i[1]} for i in iris])
+    _add_new_records(session, 'fdp_Biobank', biobanks)
+    _add_new_records(session, 'fdp_Collection', collections)
+
+s = client.Session(MOLGENIS_URL)
+s.login('admin', 'admin')
+
+sync_biobanks(s, num=1)

From b2396a9a0a8274dcd387ea6ca5a31bfa41eff111 Mon Sep 17 00:00:00 2001
From: Vittorio Meloni <vittorio.meloni@crs4.it>
Date: Fri, 17 Mar 2023 16:04:39 +0100
Subject: [PATCH 02/15] feat: adds DataService support for Collection to
 potentially describe a record service endpoint

---
 sync_directory_with_fdp.py | 50 ++++++++++++++++++++++++++++++--------
 1 file changed, 40 insertions(+), 10 deletions(-)

diff --git a/sync_directory_with_fdp.py b/sync_directory_with_fdp.py
index 7193a7a..334e79d 100644
--- a/sync_directory_with_fdp.py
+++ b/sync_directory_with_fdp.py
@@ -9,6 +9,7 @@
 
 BBMRI_BIOBANK_ENTITY = 'eu_bbmri_eric_biobanks'
 BBMRI_COLLECTION_ENTITY = 'eu_bbmri_eric_collections'
+BBMRI_DATA_SERVICE_ENTITY = 'eu_bbmri_eric_record_service'
 
 FDP_BIOBANK_ENTITY = 'fdp_Biobank'
 FDP_COLLECTION_ENTITY = 'fdp_Collection'
@@ -18,10 +19,10 @@
 ICD_10_ONTOLOGY_PREFIX = 'http://purl.bioontology.org/ontology/ICD10/'
 ICD_10_DIRECTORY_PREFIX = 'urn:miriam:icd:'
 
-BIOBANKS_ATTRIBUTES = 'id,name,acronym,description,country,juridical_person,collections,contact'
-BIOBANKS_EXPAND_ATTRIBUTES = 'country,juridical_person,collections,contact'
-COLLECTIONS_ATTRIBUTES = 'id,name,description,biobank,diagnosis_available,country,parent_collection'
-COLLECTIONS_EXPAND_ATTRIBUTES = 'biobank,diagnosis_available,country'
+COLLECTIONS_ATTRIBUTES = '*,record_service(*)'
+BIOBANKS_ATTRIBUTES = f'id,name,acronym,description,country,juridical_person,contact,collections'
+BIOBANKS_EXPAND_ATTRIBUTES = f'country,juridical_person,contact,collections'
+COLLECTIONS_EXPAND_ATTRIBUTES = 'biobank,diagnosis_available,country,record_service'
 
 COLLECTION_TYPES_ONTOLOGIES = {
     'BIRTH_COHORT': 'http://purl.obolibrary.org/obo/OBI_0002614',
@@ -45,7 +46,7 @@
 
 def _get_missing_biobanks(session, **kwargs):
     print("Getting source entities from {}".format(BBMRI_BIOBANK_ENTITY))
-    source_records = session.get(BBMRI_BIOBANK_ENTITY, **kwargs)
+    source_records = session.get(BBMRI_BIOBANK_ENTITY, q='id==bbmri-eric:ID:RD_ES:44001', **kwargs)
     print("Getting ids already present")
     dest_records_ids = [r['identifier'] for r in session.get(FDP_BIOBANK_ENTITY, attributes='identifier')]
     new_records = [sr for sr in source_records if sr['id'] not in dest_records_ids]
@@ -81,6 +82,7 @@ def _get_collection_type_ontology_code(collection_type):
 
 def get_records_to_add(biobank_data, session):
     missing_iris = []
+    dataServices = []
     for c in biobank_data['collections']:
         for d in c['diagnosis_available']:
             try:
@@ -96,7 +98,16 @@ def get_records_to_add(biobank_data, session):
                 if ontology_code is not None:
                     missing_iris.append((t['id'], ontology_code))
 
-    return {
+        if 'record_service' in c:
+            rs_data = session.get_by_id(BBMRI_DATA_SERVICE_ENTITY, c['record_service']['id'])
+            dataServices.append({
+                'identifier': rs_data['id'],
+                'endpointUrl': rs_data['url'],
+                'endpointDescription': rs_data['description'] if 'description' in rs_data else None,
+                'conformsTo': rs_data['conformsTo']
+            })
+
+    res = {
         'fdp_Biobank': {
             'IRI': f'{DIRECTORY_URL}/api/fdp/fdp_Biobank/{biobank_data["id"]}',
             'catalog': 'bbmri-directory',  # TODO: get it dynamically
@@ -125,20 +136,27 @@ def get_records_to_add(biobank_data, session):
             'biobank': biobank_data["id"],
             'description': c['description'] if 'description' in c else None,
             'type': [t['id'] for t in c['type'] if _get_collection_type_ontology_code(t['id']) is not None],
-            'theme': [d['id'] for d in c['diagnosis_available']]
+            'theme': [d['id'] for d in c['diagnosis_available']],
+            'service': c['record_service']['id'] if 'record_service' in c else None
         } for c in biobank_data['collections']],
-        'fdp_IRI': missing_iris
+        'fdp_IRI': missing_iris,
     }
+    if len(dataServices) > 0:
+        res.update({
+            'fdp_DataService': dataServices
+        })
+    return res
 
 
 def sync_biobanks(session, **kwargs):
-    missing_biobanks = _get_missing_biobanks(session, attributes=BIOBANKS_ATTRIBUTES,
-                                             expand=BIOBANKS_EXPAND_ATTRIBUTES, **kwargs)
+    missing_biobanks = _get_missing_biobanks(session, attributes=BIOBANKS_ATTRIBUTES, expand=BIOBANKS_EXPAND_ATTRIBUTES,
+                                             **kwargs)
     biobanks = []
     publishers = []
     contacts = []
     collections = []
     iris = set()
+    data_services = []
     for b in missing_biobanks:
         records = get_records_to_add(b, session)
         biobanks.append(records['fdp_Biobank'])
@@ -147,12 +165,24 @@ def sync_biobanks(session, **kwargs):
         collections.extend(records['fdp_Collection'])
         iris.update(records['fdp_IRI'])
 
+        if 'fdp_DataService' in records:
+            data_services.extend(records['fdp_DataService'])
+
+    print(data_services)
+    session.delete_list('fdp_DataService', [d['identifier'] for d in data_services])
+    session.delete_list('fdp_Collection', [c['identifier'] for c in collections])
+    session.delete_list('fdp_Biobank', [b['identifier'] for b in biobanks])
+    session.delete_list('fdp_Publisher', [p['identifier'] for p in publishers])
+    session.delete_list('fdp_ContactPointIndividual', [c['identifier'] for c in contacts])
+
     _add_new_records(session, 'fdp_Publisher', publishers)
     _add_new_records(session, 'fdp_ContactPointIndividual', contacts)
     _add_new_records(session, 'fdp_IRI', [{'id': i[0], 'IRI': i[1]} for i in iris])
+    _add_new_records(session, 'fdp_DataService', data_services)
     _add_new_records(session, 'fdp_Biobank', biobanks)
     _add_new_records(session, 'fdp_Collection', collections)
 
+
 s = client.Session(MOLGENIS_URL)
 s.login('admin', 'admin')
 

From 6cd2b5dcefad3a539cda5913703b91dd846df2a9 Mon Sep 17 00:00:00 2001
From: Vittorio Meloni <vittorio.meloni@crs4.it>
Date: Mon, 20 Mar 2023 16:54:23 +0100
Subject: [PATCH 03/15] feat: adds option to delete records before adding them

---
 sync_directory_with_fdp.py | 151 +++++++++++++++++++++----------------
 1 file changed, 86 insertions(+), 65 deletions(-)

diff --git a/sync_directory_with_fdp.py b/sync_directory_with_fdp.py
index 334e79d..67252c0 100644
--- a/sync_directory_with_fdp.py
+++ b/sync_directory_with_fdp.py
@@ -1,18 +1,23 @@
+from collections import OrderedDict
+
 from molgenis import client
 from molgenis.client import MolgenisRequestError
 
-molgenisURL = 'http://localhost:82'
-directoryURL = 'http://localhost:82'
+molgenis_url = 'http://localhost:82'
+directory_url = 'http://localhost:82'
 
-MOLGENIS_URL = f'{molgenisURL}'  # URL of the molgenis to query
-DIRECTORY_URL = f'{directoryURL}'  # URL of the directory
+MOLGENIS_URL = f'{molgenis_url}'  # URL of the molgenis to query
+DIRECTORY_URL = f'{directory_url}'  # URL of the directory
 
 BBMRI_BIOBANK_ENTITY = 'eu_bbmri_eric_biobanks'
-BBMRI_COLLECTION_ENTITY = 'eu_bbmri_eric_collections'
 BBMRI_DATA_SERVICE_ENTITY = 'eu_bbmri_eric_record_service'
 
-FDP_BIOBANK_ENTITY = 'fdp_Biobank'
-FDP_COLLECTION_ENTITY = 'fdp_Collection'
+FDP_BIOBANK = 'fdp_Biobank'
+FDP_COLLECTION = 'fdp_Collection'
+FDP_PUBLISHER = 'fdp_Publisher'
+FDP_CONTACT = 'fdp_ContactPointIndividual'
+FDP_DATA_SERVICE = 'fdp_DataService'
+FDP_IRI = 'fdp_IRI'
 
 ORPHA_ONTOLOGY_PREFIX = 'http://www.orpha.net/ORDO/Orphanet_'
 ORPHA_DIRECTORY_PREFIX = 'ORPHA:'
@@ -44,17 +49,21 @@
 }
 
 
-def _get_missing_biobanks(session, **kwargs):
+def get_missing_biobanks(session, reset, **kwargs):
     print("Getting source entities from {}".format(BBMRI_BIOBANK_ENTITY))
-    source_records = session.get(BBMRI_BIOBANK_ENTITY, q='id==bbmri-eric:ID:RD_ES:44001', **kwargs)
+    source_records = session.get(BBMRI_BIOBANK_ENTITY, **kwargs)
+    # if reset is True the missing biobanks are all
+    if reset:
+        return source_records
+
     print("Getting ids already present")
-    dest_records_ids = [r['identifier'] for r in session.get(FDP_BIOBANK_ENTITY, attributes='identifier')]
+    dest_records_ids = [r['identifier'] for r in session.get(FDP_BIOBANK, attributes='identifier')]
     new_records = [sr for sr in source_records if sr['id'] not in dest_records_ids]
     print("Found {} new records to insert".format(len(new_records)))
-    return source_records
+    return new_records
 
 
-def _add_new_records(session, entity, records):
+def create_records(session, entity, records):
     created_records = []
     for i in range(0, len(records), 1000):
         try:
@@ -65,42 +74,54 @@ def _add_new_records(session, entity, records):
     print("Added {} record(s) of type {}".format(len(created_records), entity))
 
 
-def _convert_country(country):
+def delete_records(session, entity, records):
+    removed_records = []
+    for i in range(0, len(records), 1000):
+        try:
+            removed_records.extend(
+                session.delete_list(entity, [record['identifier'] for record in records[i:i + 1000]]))
+        except MolgenisRequestError as ex:
+            print("Error removing records")
+            print(ex)
+    print(f"Removed {len(removed_records)} of type {entity}")
+
+
+def get_country(country):
     return 'GB' if country == 'UK' else country
 
 
-def _get_disease_ontology_code(disease_code):
+def get_disease_ontology_code(disease_code):
     if ORPHA_DIRECTORY_PREFIX in disease_code:
-        return disease_code.replace(ORPHA_ONTOLOGY_PREFIX, ORPHA_ONTOLOGY_PREFIX)
+        return disease_code.replace(ORPHA_DIRECTORY_PREFIX, ORPHA_ONTOLOGY_PREFIX)
     if ICD_10_DIRECTORY_PREFIX in disease_code:
         return disease_code.replace(ICD_10_DIRECTORY_PREFIX, ICD_10_ONTOLOGY_PREFIX)
 
 
-def _get_collection_type_ontology_code(collection_type):
+def get_collection_type_ontology_code(collection_type):
     return COLLECTION_TYPES_ONTOLOGIES.get(collection_type, None)
 
 
 def get_records_to_add(biobank_data, session):
     missing_iris = []
-    dataServices = []
+    data_services = []
     for c in biobank_data['collections']:
         for d in c['diagnosis_available']:
             try:
                 session.get_by_id('fdp_IRI', d['id'], attributes='id')
             except MolgenisRequestError:
-                missing_iris.append((d['id'], _get_disease_ontology_code(d['id'])))
+                missing_iris.append((d['id'], get_disease_ontology_code(d['id'])))
 
         for t in c['type']:
             try:
                 session.get_by_id('fdp_IRI', t['id'], attributes='id')
             except MolgenisRequestError:
-                ontology_code = _get_collection_type_ontology_code(t['id'])
+                ontology_code = get_collection_type_ontology_code(t['id'])
                 if ontology_code is not None:
                     missing_iris.append((t['id'], ontology_code))
 
         if 'record_service' in c:
             rs_data = session.get_by_id(BBMRI_DATA_SERVICE_ENTITY, c['record_service']['id'])
-            dataServices.append({
+            data_services.append({
                 'identifier': rs_data['id'],
                 'endpointUrl': rs_data['url'],
                 'endpointDescription': rs_data['description'] if 'description' in rs_data else None,
@@ -108,82 +129,82 @@ def get_records_to_add(biobank_data, session):
             })
 
     res = {
-        'fdp_Biobank': {
+        FDP_BIOBANK: {
+            'identifier': biobank_data['id'],
             'IRI': f'{DIRECTORY_URL}/api/fdp/fdp_Biobank/{biobank_data["id"]}',
             'catalog': 'bbmri-directory',  # TODO: get it dynamically
-            'identifier': biobank_data['id'],
             'title': biobank_data['name'],
             'acronym': biobank_data['acronym'] if 'acronym' in biobank_data else None,
             'description': biobank_data['description'] if 'description' in biobank_data else None,
             'publisher': f'{biobank_data["id"]}-pub',
             'landingPage': f'{DIRECTORY_URL}/#/biobank/{biobank_data["id"]}',
             'contactPoint': f'{biobank_data["id"]}-cp' if 'contact' in biobank_data else None,
-            'country': _convert_country(biobank_data['country']['id'])
+            'country': get_country(biobank_data['country']['id'])
         },
-        'fdp_Publisher': {
+        FDP_PUBLISHER: {
             'identifier': f'{biobank_data["id"]}-pub',
             'label': biobank_data['juridical_person']
         },
-        'fdp_ContactPointIndividual': {
+        FDP_CONTACT: {
             'identifier': f'{biobank_data["id"]}-cp',
             'email': f'mailto:{biobank_data["contact"]["email"]}' if 'contact' in biobank_data else '',
             'address': '',
             'telephone': '',
         },
-        'fdp_Collection': [{
+        FDP_COLLECTION: [{
             'identifier': c['id'],
             'title': c['name'],
             'biobank': biobank_data["id"],
             'description': c['description'] if 'description' in c else None,
-            'type': [t['id'] for t in c['type'] if _get_collection_type_ontology_code(t['id']) is not None],
+            'type': [t['id'] for t in c['type'] if get_collection_type_ontology_code(t['id']) is not None],
             'theme': [d['id'] for d in c['diagnosis_available']],
             'service': c['record_service']['id'] if 'record_service' in c else None
         } for c in biobank_data['collections']],
-        'fdp_IRI': missing_iris,
+        FDP_IRI: missing_iris,
+        FDP_DATA_SERVICE: data_services
     }
-    if len(dataServices) > 0:
-        res.update({
-            'fdp_DataService': dataServices
-        })
     return res
 
 
-def sync_biobanks(session, **kwargs):
-    missing_biobanks = _get_missing_biobanks(session, attributes=BIOBANKS_ATTRIBUTES, expand=BIOBANKS_EXPAND_ATTRIBUTES,
-                                             **kwargs)
-    biobanks = []
-    publishers = []
-    contacts = []
-    collections = []
-    iris = set()
-    data_services = []
+def sync(session, reset, **kwargs):
+    missing_biobanks = get_missing_biobanks(session, reset=reset, attributes=BIOBANKS_ATTRIBUTES,
+                                            expand=BIOBANKS_EXPAND_ATTRIBUTES,
+                                            **kwargs)
+
+    records = OrderedDict({
+        FDP_PUBLISHER: [],
+        FDP_CONTACT: [],
+        FDP_IRI: set(),
+        FDP_DATA_SERVICE: [],
+        FDP_BIOBANK: [],
+        FDP_COLLECTION: []
+    })
     for b in missing_biobanks:
-        records = get_records_to_add(b, session)
-        biobanks.append(records['fdp_Biobank'])
-        publishers.append(records['fdp_Publisher'])
-        contacts.append(records['fdp_ContactPointIndividual'])
-        collections.extend(records['fdp_Collection'])
-        iris.update(records['fdp_IRI'])
-
-        if 'fdp_DataService' in records:
-            data_services.extend(records['fdp_DataService'])
-
-    print(data_services)
-    session.delete_list('fdp_DataService', [d['identifier'] for d in data_services])
-    session.delete_list('fdp_Collection', [c['identifier'] for c in collections])
-    session.delete_list('fdp_Biobank', [b['identifier'] for b in biobanks])
-    session.delete_list('fdp_Publisher', [p['identifier'] for p in publishers])
-    session.delete_list('fdp_ContactPointIndividual', [c['identifier'] for c in contacts])
-
-    _add_new_records(session, 'fdp_Publisher', publishers)
-    _add_new_records(session, 'fdp_ContactPointIndividual', contacts)
-    _add_new_records(session, 'fdp_IRI', [{'id': i[0], 'IRI': i[1]} for i in iris])
-    _add_new_records(session, 'fdp_DataService', data_services)
-    _add_new_records(session, 'fdp_Biobank', biobanks)
-    _add_new_records(session, 'fdp_Collection', collections)
+        new_records = get_records_to_add(b, session)
+        for k, v in new_records.items():
+            if type(records[k]) == list:
+                if type(new_records[k]) == list:
+                    records[k].extend(new_records[k])
+                else:
+                    records[k].append(new_records[k])
+            else:
+                records[k].update(new_records[k])
+
+    if reset:
+        for k, v in reversed(records.items()):
+            if k != 'fdp_IRI' and len(v) > 0:
+                delete_records(session, k, v)
+
+    for k, v in records.items():
+        if len(v) > 0:
+            if k == FDP_IRI:
+                create_records(session, k, [{'id': i[0], 'IRI': i[1]} for i in v])
+            else:
+                create_records(session, k, v)
 
 
 s = client.Session(MOLGENIS_URL)
 s.login('admin', 'admin')
 
-sync_biobanks(s, num=1)
+reset = True
+sync(s, reset)

From 7a0c97498729f2d8c6f8609e4946ecbbb09b9f89 Mon Sep 17 00:00:00 2001
From: Vittorio Meloni <vittorio.meloni@crs4.it>
Date: Tue, 21 Mar 2023 17:16:28 +0100
Subject: [PATCH 04/15] feat: adds command-line parameters for the Molgenis
 URL, credentials, directory prefix and reset

---
 sync_directory_with_fdp.py | 37 ++++++++++++++++++++-----------------
 1 file changed, 20 insertions(+), 17 deletions(-)

diff --git a/sync_directory_with_fdp.py b/sync_directory_with_fdp.py
index 67252c0..8e6a1e6 100644
--- a/sync_directory_with_fdp.py
+++ b/sync_directory_with_fdp.py
@@ -1,14 +1,9 @@
+import logging
 from collections import OrderedDict
 
 from molgenis import client
 from molgenis.client import MolgenisRequestError
 
-molgenis_url = 'http://localhost:82'
-directory_url = 'http://localhost:82'
-
-MOLGENIS_URL = f'{molgenis_url}'  # URL of the molgenis to query
-DIRECTORY_URL = f'{directory_url}'  # URL of the directory
-
 BBMRI_BIOBANK_ENTITY = 'eu_bbmri_eric_biobanks'
 BBMRI_DATA_SERVICE_ENTITY = 'eu_bbmri_eric_record_service'
 
@@ -24,10 +19,8 @@
 ICD_10_ONTOLOGY_PREFIX = 'http://purl.bioontology.org/ontology/ICD10/'
 ICD_10_DIRECTORY_PREFIX = 'urn:miriam:icd:'
 
-COLLECTIONS_ATTRIBUTES = '*,record_service(*)'
 BIOBANKS_ATTRIBUTES = f'id,name,acronym,description,country,juridical_person,contact,collections'
 BIOBANKS_EXPAND_ATTRIBUTES = f'country,juridical_person,contact,collections'
-COLLECTIONS_EXPAND_ATTRIBUTES = 'biobank,diagnosis_available,country,record_service'
 
 COLLECTION_TYPES_ONTOLOGIES = {
     'BIRTH_COHORT': 'http://purl.obolibrary.org/obo/OBI_0002614',
@@ -101,7 +94,7 @@ def get_collection_type_ontology_code(collection_type):
     return COLLECTION_TYPES_ONTOLOGIES.get(collection_type, None)
 
 
-def get_records_to_add(biobank_data, session):
+def get_records_to_add(biobank_data, session, directory_prefix):
     missing_iris = []
     data_services = []
     for c in biobank_data['collections']:
@@ -131,13 +124,13 @@ def get_records_to_add(biobank_data, session):
     res = {
         FDP_BIOBANK: {
             'identifier': biobank_data['id'],
-            'IRI': f'{DIRECTORY_URL}/api/fdp/fdp_Biobank/{biobank_data["id"]}',
+            'IRI': f'{directory_prefix}/api/fdp/fdp_Biobank/{biobank_data["id"]}',
             'catalog': 'bbmri-directory',  # TODO: get it dynamically
             'title': biobank_data['name'],
             'acronym': biobank_data['acronym'] if 'acronym' in biobank_data else None,
             'description': biobank_data['description'] if 'description' in biobank_data else None,
             'publisher': f'{biobank_data["id"]}-pub',
-            'landingPage': f'{DIRECTORY_URL}/#/biobank/{biobank_data["id"]}',
+            'landingPage': f'{directory_prefix}/#/biobank/{biobank_data["id"]}',
             'contactPoint': f'{biobank_data["id"]}-cp' if 'contact' in biobank_data else None,
             'country': get_country(biobank_data['country']['id'])
         },
@@ -166,7 +159,7 @@ def get_records_to_add(biobank_data, session):
     return res
 
 
-def sync(session, reset, **kwargs):
+def sync(session, reset, directory_prefix, **kwargs):
     missing_biobanks = get_missing_biobanks(session, reset=reset, attributes=BIOBANKS_ATTRIBUTES,
                                             expand=BIOBANKS_EXPAND_ATTRIBUTES,
                                             **kwargs)
@@ -180,7 +173,7 @@ def sync(session, reset, **kwargs):
         FDP_COLLECTION: []
     })
     for b in missing_biobanks:
-        new_records = get_records_to_add(b, session)
+        new_records = get_records_to_add(b, session, directory_prefix)
         for k, v in new_records.items():
             if type(records[k]) == list:
                 if type(new_records[k]) == list:
@@ -203,8 +196,18 @@ def sync(session, reset, **kwargs):
                 create_records(session, k, v)
 
 
-s = client.Session(MOLGENIS_URL)
-s.login('admin', 'admin')
+if __name__ == '__main__':
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--molgenis-url', '-U')
+    parser.add_argument('--molgenis-user', '-u')
+    parser.add_argument('--molgenis-password', '-p')
+    parser.add_argument('--directory-prefix', '-d', help='The main prefix of the url to be used to generate IRIs', default='https://directory.bbmri-eric.eu/')
+    parser.add_argument('--reset', '-r', dest='reset', action='store_true')
+    args = parser.parse_args()
+
+    s = client.Session(args.molgenis_url)
+    s.login(args.molgenis_user, args.molgenis_password)
+
+    sync(s, args.directory_prefix, args.reset)
 
-reset = True
-sync(s, reset)

From adeb53c4bcf769cbda342586ff141851a6340590 Mon Sep 17 00:00:00 2001
From: Vittorio Meloni <vittorio.meloni@crs4.it>
Date: Wed, 22 Mar 2023 12:32:00 +0100
Subject: [PATCH 05/15] fix: corrects the order of the sync() parameters

---
 sync_directory_with_fdp.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sync_directory_with_fdp.py b/sync_directory_with_fdp.py
index 8e6a1e6..6da3698 100644
--- a/sync_directory_with_fdp.py
+++ b/sync_directory_with_fdp.py
@@ -159,7 +159,7 @@ def get_records_to_add(biobank_data, session, directory_prefix):
     return res
 
 
-def sync(session, reset, directory_prefix, **kwargs):
+def sync(session, directory_prefix, reset, **kwargs):
     missing_biobanks = get_missing_biobanks(session, reset=reset, attributes=BIOBANKS_ATTRIBUTES,
                                             expand=BIOBANKS_EXPAND_ATTRIBUTES,
                                             **kwargs)

From bc7abbbc8ae50916cf6c94c483c61560251e3051 Mon Sep 17 00:00:00 2001
From: Vittorio Meloni <vittorio.meloni@crs4.it>
Date: Tue, 4 Apr 2023 17:14:38 +0200
Subject: [PATCH 06/15] feat: adds contact for collections

---
 sync_directory_with_fdp.py | 105 +++++++++++++++++++++++++------------
 1 file changed, 72 insertions(+), 33 deletions(-)

diff --git a/sync_directory_with_fdp.py b/sync_directory_with_fdp.py
index 6da3698..74e140e 100644
--- a/sync_directory_with_fdp.py
+++ b/sync_directory_with_fdp.py
@@ -5,11 +5,12 @@
 from molgenis.client import MolgenisRequestError
 
 BBMRI_BIOBANK_ENTITY = 'eu_bbmri_eric_biobanks'
+BBMRI_CONTACT_ENTITY = 'eu_bbmri_eric_persons'
 BBMRI_DATA_SERVICE_ENTITY = 'eu_bbmri_eric_record_service'
 
 FDP_BIOBANK = 'fdp_Biobank'
 FDP_COLLECTION = 'fdp_Collection'
-FDP_PUBLISHER = 'fdp_Publisher'
+FDP_BIOBANK_ORGANIZATION = 'fdp_BiobankOrganization'
 FDP_CONTACT = 'fdp_ContactPointIndividual'
 FDP_DATA_SERVICE = 'fdp_DataService'
 FDP_IRI = 'fdp_IRI'
@@ -67,12 +68,12 @@ def create_records(session, entity, records):
     print("Added {} record(s) of type {}".format(len(created_records), entity))
 
 
-def delete_records(session, entity, records):
+def delete_records(session, entity, records_ids):
     removed_records = []
-    for i in range(0, len(records), 1000):
+    for i in range(0, len(records_ids), 1000):
         try:
             removed_records.extend(
-                session.delete_list(entity, [record['identifier'] for record in records[i:i + 1000]]))
+                session.delete_list(entity, [record_id for record_id in records_ids[i:i + 1000]]))
         except MolgenisRequestError as ex:
             print("Error removing records")
             print(ex)
@@ -94,17 +95,38 @@ def get_collection_type_ontology_code(collection_type):
     return COLLECTION_TYPES_ONTOLOGIES.get(collection_type, None)
 
 
+def get_contact_record(session, contact_id):
+    contact = session.get_by_id(BBMRI_CONTACT_ENTITY, contact_id)
+    return (
+        f'{contact["id"]}',
+        f'mailto:{contact["email"]}',
+        f'tel:{contact["phone"].replace(" ", "")}' if 'phone' in contact else None,
+        contact['first_name'] if 'first_name' in contact else None,
+        contact['last_name'] if 'last_name' in contact else None,
+        contact['title_before_name'] if 'title_before_name' in contact else None,
+        contact['title_after_name'] if 'title_after_name' in contact else None
+    )
+
+
 def get_records_to_add(biobank_data, session, directory_prefix):
     missing_iris = []
     data_services = []
-    for c in biobank_data['collections']:
-        for d in c['diagnosis_available']:
+    contacts = []
+    print('processing biobank', biobank_data['id'])
+
+    print("getting biobank's contact data")
+    if 'contact' in biobank_data:
+        contacts.append(get_contact_record(session, biobank_data['contact']['id']))
+
+    for collection in biobank_data['collections']:
+        print("processing collection", collection['id'])
+        for d in collection['diagnosis_available']:
             try:
                 session.get_by_id('fdp_IRI', d['id'], attributes='id')
             except MolgenisRequestError:
                 missing_iris.append((d['id'], get_disease_ontology_code(d['id'])))
 
-        for t in c['type']:
+        for t in collection['type']:
             try:
                 session.get_by_id('fdp_IRI', t['id'], attributes='id')
             except MolgenisRequestError:
@@ -112,45 +134,45 @@ def get_records_to_add(biobank_data, session, directory_prefix):
                 if ontology_code is not None:
                     missing_iris.append((t['id'], ontology_code))
 
-        if 'record_service' in c:
-            rs_data = session.get_by_id(BBMRI_DATA_SERVICE_ENTITY, c['record_service']['id'])
+        if 'contact' in collection:
+            contacts.append(get_contact_record(session, collection['contact']['id']))
+
+        if 'record_service' in collection:
+            rs = session.get_by_id(BBMRI_DATA_SERVICE_ENTITY, collection['record_service']['id'])
             data_services.append({
-                'identifier': rs_data['id'],
-                'endpointUrl': rs_data['url'],
-                'endpointDescription': rs_data['description'] if 'description' in rs_data else None,
-                'conformsTo': rs_data['conformsTo']
+                'identifier': rs['id'],
+                'endpointUrl': rs['url'],
+                'endpointDescription': rs['description'] if 'description' in rs else None,
+                'conformsTo': rs['conformsTo']
             })
 
     res = {
         FDP_BIOBANK: {
             'identifier': biobank_data['id'],
-            'IRI': f'{directory_prefix}/api/fdp/fdp_Biobank/{biobank_data["id"]}',
-            'catalog': 'bbmri-directory',  # TODO: get it dynamically
+            'IRI': f'{directory_prefix}/api/fdp/fdp_Biobank/{biobank_data["id"]}',  # TODO: use the PID
             'title': biobank_data['name'],
             'acronym': biobank_data['acronym'] if 'acronym' in biobank_data else None,
             'description': biobank_data['description'] if 'description' in biobank_data else None,
             'publisher': f'{biobank_data["id"]}-pub',
             'landingPage': f'{directory_prefix}/#/biobank/{biobank_data["id"]}',
-            'contactPoint': f'{biobank_data["id"]}-cp' if 'contact' in biobank_data else None,
+            'contactPoint': f'{biobank_data["contact"]["id"]}' if 'contact' in biobank_data else None,
             'country': get_country(biobank_data['country']['id'])
         },
-        FDP_PUBLISHER: {
+        FDP_BIOBANK_ORGANIZATION: {
             'identifier': f'{biobank_data["id"]}-pub',
-            'label': biobank_data['juridical_person']
-        },
-        FDP_CONTACT: {
-            'identifier': f'{biobank_data["id"]}-cp',
-            'email': f'mailto:{biobank_data["contact"]["email"]}' if 'contact' in biobank_data else '',
-            'address': '',
-            'telephone': '',
+            'name': biobank_data['juridical_person']
         },
+        FDP_CONTACT: contacts,
         FDP_COLLECTION: [{
             'identifier': c['id'],
+            'IRI': f'{directory_prefix}/api/fdp/fdp_Collection/{c["id"]}',
+            'contactPoint': f'{c["contact"]["id"]}' if 'contact' in c else None,
+            'catalog': 'bbmri-directory',  # TODO: get it dynamically
             'title': c['name'],
             'biobank': biobank_data["id"],
             'description': c['description'] if 'description' in c else None,
-            'type': [t['id'] for t in c['type'] if get_collection_type_ontology_code(t['id']) is not None],
-            'theme': [d['id'] for d in c['diagnosis_available']],
+            'type': [t['id'] for t in collection['type'] if get_collection_type_ontology_code(t['id']) is not None],
+            'theme': [d['id'] for d in collection['diagnosis_available']],
             'service': c['record_service']['id'] if 'record_service' in c else None
         } for c in biobank_data['collections']],
         FDP_IRI: missing_iris,
@@ -165,8 +187,8 @@ def sync(session, directory_prefix, reset, **kwargs):
                                             **kwargs)
 
     records = OrderedDict({
-        FDP_PUBLISHER: [],
-        FDP_CONTACT: [],
+        FDP_BIOBANK_ORGANIZATION: [],
+        FDP_CONTACT: set(),
         FDP_IRI: set(),
         FDP_DATA_SERVICE: [],
         FDP_BIOBANK: [],
@@ -185,29 +207,46 @@ def sync(session, directory_prefix, reset, **kwargs):
 
     if reset:
         for k, v in reversed(records.items()):
-            if k != 'fdp_IRI' and len(v) > 0:
-                delete_records(session, k, v)
+            if k not in (FDP_IRI, FDP_CONTACT) and len(v) > 0:
+                delete_records(session, k, [i['identifier'] for i in v])
+            if k == FDP_CONTACT and len(v) > 0:
+                delete_records(session, k, [i[0] for i in v])
 
     for k, v in records.items():
         if len(v) > 0:
             if k == FDP_IRI:
                 create_records(session, k, [{'id': i[0], 'IRI': i[1]} for i in v])
+            elif k == FDP_CONTACT:
+                create_records(session, k, [{
+                    'identifier': i[0],
+                    'email': i[1],
+                    'telephone': i[2],
+                    'given_name': i[3],
+                    'family_name': i[4],
+                    'honorific_prefix': i[5],
+                    'honorific_suffix': i[6]
+                } for i in v])
             else:
                 create_records(session, k, v)
 
 
 if __name__ == '__main__':
     import argparse
+
     parser = argparse.ArgumentParser()
     parser.add_argument('--molgenis-url', '-U')
     parser.add_argument('--molgenis-user', '-u')
     parser.add_argument('--molgenis-password', '-p')
-    parser.add_argument('--directory-prefix', '-d', help='The main prefix of the url to be used to generate IRIs', default='https://directory.bbmri-eric.eu/')
+    parser.add_argument('--directory-prefix', '-d',
+                        help='The main prefix of the url to be used to generate IRIs',
+                        default='https://directory.bbmri-eric.eu/')
     parser.add_argument('--reset', '-r', dest='reset', action='store_true')
     args = parser.parse_args()
 
+    directory_prefix = args.directory_prefix.replace('/', '',
+                                                     -1)  # just in case the input put the last /, it removes it
+
     s = client.Session(args.molgenis_url)
     s.login(args.molgenis_user, args.molgenis_password)
 
-    sync(s, args.directory_prefix, args.reset)
-
+    sync(s, directory_prefix, args.reset)

From d11c52233b8168f8a337ea552aeb952cc02b136d Mon Sep 17 00:00:00 2001
From: Vittorio Meloni <vittorio.meloni@crs4.it>
Date: Wed, 5 Apr 2023 16:35:49 +0200
Subject: [PATCH 07/15] fixes collection type and theme

---
 sync_directory_with_fdp.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sync_directory_with_fdp.py b/sync_directory_with_fdp.py
index 74e140e..71ea631 100644
--- a/sync_directory_with_fdp.py
+++ b/sync_directory_with_fdp.py
@@ -171,8 +171,8 @@ def get_records_to_add(biobank_data, session, directory_prefix):
             'title': c['name'],
             'biobank': biobank_data["id"],
             'description': c['description'] if 'description' in c else None,
-            'type': [t['id'] for t in collection['type'] if get_collection_type_ontology_code(t['id']) is not None],
-            'theme': [d['id'] for d in collection['diagnosis_available']],
+            'type': [t['id'] for t in c['type'] if get_collection_type_ontology_code(t['id']) is not None],
+            'theme': [d['id'] for d in c['diagnosis_available']],
             'service': c['record_service']['id'] if 'record_service' in c else None
         } for c in biobank_data['collections']],
         FDP_IRI: missing_iris,

From bea5998bb9a466b1da7390c259d4d88d5e184687 Mon Sep 17 00:00:00 2001
From: Vittorio Meloni <vittorio.meloni@crs4.it>
Date: Wed, 21 Jun 2023 11:59:07 +0200
Subject: [PATCH 08/15] fix(fdp): fixes to not override previously added
 collections to the catalog

---
 sync_directory_with_fdp.py | 88 ++++++++++++++++++++++++++++++++++----
 1 file changed, 80 insertions(+), 8 deletions(-)

diff --git a/sync_directory_with_fdp.py b/sync_directory_with_fdp.py
index 71ea631..75a6d2e 100644
--- a/sync_directory_with_fdp.py
+++ b/sync_directory_with_fdp.py
@@ -1,4 +1,11 @@
-import logging
+"""
+Script to synchronize the BBMRI Directory data of biobanks and collections with their FAIR Data Point/DCAT representation.
+It gets the data of biobanks and collections from a Molgenis instance of the Directory, converts it and upload the
+converted one into another Molgenis instance. The Molgenis instance can be the same or different. The destination
+instance needs to be deployed with the FDP instance already deployed. Also, the FDP must have the fdp_Catalog already
+created.
+"""
+
 from collections import OrderedDict
 
 from molgenis import client
@@ -8,6 +15,7 @@
 BBMRI_CONTACT_ENTITY = 'eu_bbmri_eric_persons'
 BBMRI_DATA_SERVICE_ENTITY = 'eu_bbmri_eric_record_service'
 
+FDP_CATALOG = 'fdp_Catalog'
 FDP_BIOBANK = 'fdp_Biobank'
 FDP_COLLECTION = 'fdp_Collection'
 FDP_BIOBANK_ORGANIZATION = 'fdp_BiobankOrganization'
@@ -42,7 +50,10 @@
     'TWIN_STUDY': 'http://purl.obolibrary.org/obo/OBIB_0000700'
 }
 
-
+"""
+It gets the data from the source Molgenis and filters the one that are already it the destination Molgenis.
+If reset flag is True it doesn't filter the biobanks to add 
+"""
 def get_missing_biobanks(session, reset, **kwargs):
     print("Getting source entities from {}".format(BBMRI_BIOBANK_ENTITY))
     source_records = session.get(BBMRI_BIOBANK_ENTITY, **kwargs)
@@ -57,6 +68,13 @@ def get_missing_biobanks(session, reset, **kwargs):
     return new_records
 
 
+"""
+Send converted data to the destination
+
+:params session: Molgenis session to use
+:params entity: the name of Molgenis entity type of the records to add 
+:params records: lists of dictionary with data of the FDP entity to add
+"""
 def create_records(session, entity, records):
     created_records = []
     for i in range(0, len(records), 1000):
@@ -68,6 +86,13 @@ def create_records(session, entity, records):
     print("Added {} record(s) of type {}".format(len(created_records), entity))
 
 
+"""
+Delete records in the destination Molgenis. Used when reset flag is True
+
+:params session: Molgenis session to use
+:params entity: the name of Molgenis entity type of the records to delete
+:params records_ids: the ids of the records of type :entity: to delete 
+"""
 def delete_records(session, entity, records_ids):
     removed_records = []
     for i in range(0, len(records_ids), 1000):
@@ -80,10 +105,16 @@ def delete_records(session, entity, records_ids):
     print(f"Removed {len(removed_records)} of type {entity}")
 
 
+"""
+Returns the country code correspondent to the country in input 
+"""
 def get_country(country):
     return 'GB' if country == 'UK' else country
 
 
+"""
+Returns the IRI of the disease code to use in the FDP
+"""
 def get_disease_ontology_code(disease_code):
     if ORPHA_DIRECTORY_PREFIX in disease_code:
         return disease_code.replace(ORPHA_DIRECTORY_PREFIX, ORPHA_ONTOLOGY_PREFIX)
@@ -91,10 +122,22 @@ def get_disease_ontology_code(disease_code):
         return disease_code.replace(ICD_10_DIRECTORY_PREFIX, ICD_10_ONTOLOGY_PREFIX)
 
 
+"""
+Return the IRI of the collection type
+
+:params collection_type: the collection type code in the directory 
+"""
 def get_collection_type_ontology_code(collection_type):
     return COLLECTION_TYPES_ONTOLOGIES.get(collection_type, None)
 
 
+"""
+Gets the contact data of the contact with id :contact_id: from the source Molgenis and 
+returns the FDP corresponding FDP record
+
+:params session: Molgenis session to use
+:params contact_id: the id of the contact in the Directory
+"""
 def get_contact_record(session, contact_id):
     contact = session.get_by_id(BBMRI_CONTACT_ENTITY, contact_id)
     return (
@@ -108,6 +151,16 @@ def get_contact_record(session, contact_id):
     )
 
 
+"""
+It generates the FDP records related to a biobank from the representation of the biobank in the Directory.
+It returns a dictionary with data for entities:
+fdp_Biobank: data of the Biobank as organization
+fdp_BiobankOrganization: data of the Jurystic Person that manage the Biobank
+fdp_Collection: list of collections of the biobank
+fdp_Contacts: contact of the biobank and the collections to add
+fdp_IRI: codes of diseases and collection types (if not already present in the destination)
+fdp_DataService: data related to the Data service, if present
+"""
 def get_records_to_add(biobank_data, session, directory_prefix):
     missing_iris = []
     data_services = []
@@ -121,12 +174,14 @@ def get_records_to_add(biobank_data, session, directory_prefix):
     for collection in biobank_data['collections']:
         print("processing collection", collection['id'])
         for d in collection['diagnosis_available']:
+            # it checks if the diagnosis is already present in the destination, if not it adds it to the ones to insert
             try:
                 session.get_by_id('fdp_IRI', d['id'], attributes='id')
             except MolgenisRequestError:
                 missing_iris.append((d['id'], get_disease_ontology_code(d['id'])))
 
         for t in collection['type']:
+            # same as diagnosis for collection type
             try:
                 session.get_by_id('fdp_IRI', t['id'], attributes='id')
             except MolgenisRequestError:
@@ -135,9 +190,11 @@ def get_records_to_add(biobank_data, session, directory_prefix):
                     missing_iris.append((t['id'], ontology_code))
 
         if 'contact' in collection:
+            # it adds data about the contacts
             contacts.append(get_contact_record(session, collection['contact']['id']))
 
         if 'record_service' in collection:
+            # if the collection has a record service it generates the corresponding DataService
             rs = session.get_by_id(BBMRI_DATA_SERVICE_ENTITY, collection['record_service']['id'])
             data_services.append({
                 'identifier': rs['id'],
@@ -167,7 +224,7 @@ def get_records_to_add(biobank_data, session, directory_prefix):
             'identifier': c['id'],
             'IRI': f'{directory_prefix}/api/fdp/fdp_Collection/{c["id"]}',
             'contactPoint': f'{c["contact"]["id"]}' if 'contact' in c else None,
-            'catalog': 'bbmri-directory',  # TODO: get it dynamically
+            # 'catalog': 'bbmri-directory',  # it
             'title': c['name'],
             'biobank': biobank_data["id"],
             'description': c['description'] if 'description' in c else None,
@@ -180,12 +237,21 @@ def get_records_to_add(biobank_data, session, directory_prefix):
     }
     return res
 
+def update_catalog(session, new_collections):
+    prev_collections = session.get_by_id(FDP_CATALOG, 'bbmri-directory', attributes='collection')
+
+    collections = set([c['identifier'] for c in prev_collections['collection'] + new_collections])
+
+    session.update_one(FDP_CATALOG, 'bbmri-directory', 'collection', list(collections))
 
+"""
+Main function that gets the data of the missing biobanks, convert it and upload the new records.
+"""
 def sync(session, directory_prefix, reset, **kwargs):
+    # it gets the missing biobanks
     missing_biobanks = get_missing_biobanks(session, reset=reset, attributes=BIOBANKS_ATTRIBUTES,
-                                            expand=BIOBANKS_EXPAND_ATTRIBUTES,
-                                            **kwargs)
-
+                                            expand=BIOBANKS_EXPAND_ATTRIBUTES, **kwargs)
+    # it gathers the data for all the biobanks
     records = OrderedDict({
         FDP_BIOBANK_ORGANIZATION: [],
         FDP_CONTACT: set(),
@@ -195,7 +261,9 @@ def sync(session, directory_prefix, reset, **kwargs):
         FDP_COLLECTION: []
     })
     for b in missing_biobanks:
+        # it gets the records to add for a biobank
         new_records = get_records_to_add(b, session, directory_prefix)
+        # it updates the overall records with the ones from of the processed biobank
         for k, v in new_records.items():
             if type(records[k]) == list:
                 if type(new_records[k]) == list:
@@ -205,13 +273,16 @@ def sync(session, directory_prefix, reset, **kwargs):
             else:
                 records[k].update(new_records[k])
 
+    # if the reset flag is True, it deletes the old records
     if reset:
+        session.update_one('fdp_Catalog', 'bbmri-directory', 'collection', [])
         for k, v in reversed(records.items()):
             if k not in (FDP_IRI, FDP_CONTACT) and len(v) > 0:
                 delete_records(session, k, [i['identifier'] for i in v])
             if k == FDP_CONTACT and len(v) > 0:
                 delete_records(session, k, [i[0] for i in v])
 
+    # it creates all the
     for k, v in records.items():
         if len(v) > 0:
             if k == FDP_IRI:
@@ -229,6 +300,8 @@ def sync(session, directory_prefix, reset, **kwargs):
             else:
                 create_records(session, k, v)
 
+    update_catalog(session, records[FDP_COLLECTION])
+
 
 if __name__ == '__main__':
     import argparse
@@ -238,8 +311,7 @@ def sync(session, directory_prefix, reset, **kwargs):
     parser.add_argument('--molgenis-user', '-u')
     parser.add_argument('--molgenis-password', '-p')
     parser.add_argument('--directory-prefix', '-d',
-                        help='The main prefix of the url to be used to generate IRIs',
-                        default='https://directory.bbmri-eric.eu/')
+                        help='The main prefix of the url to be used to generate IRIs')
     parser.add_argument('--reset', '-r', dest='reset', action='store_true')
     args = parser.parse_args()
 

From d8791f29d2b579b74a2a581fae442aaefcf69daa Mon Sep 17 00:00:00 2001
From: Vittorio Meloni <vittorio.meloni@crs4.it>
Date: Wed, 21 Jun 2023 12:09:10 +0200
Subject: [PATCH 09/15] docs(fdp): changes description of the fdp_sync script

---
 sync_directory_with_fdp.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sync_directory_with_fdp.py b/sync_directory_with_fdp.py
index 75a6d2e..34fed0f 100644
--- a/sync_directory_with_fdp.py
+++ b/sync_directory_with_fdp.py
@@ -2,8 +2,8 @@
 Script to synchronize the BBMRI Directory data of biobanks and collections with their FAIR Data Point/DCAT representation.
 It gets the data of biobanks and collections from a Molgenis instance of the Directory, converts it and upload the
 converted one into another Molgenis instance. The Molgenis instance can be the same or different. The destination
-instance needs to be deployed with the FDP instance already deployed. Also, the FDP must have the fdp_Catalog already
-created.
+instance needs the FDP EMX model and the "bbmri-directory" FDP_Catalog (i.e., the FDP Catalog with data of the directory)
+already deployed
 """
 
 from collections import OrderedDict

From cf076ddc5d17bee571153f224afdcb94738c6230 Mon Sep 17 00:00:00 2001
From: Vittorio Meloni <vittorio.meloni@crs4.it>
Date: Wed, 28 Jun 2023 10:37:00 +0200
Subject: [PATCH 10/15] docs: moves docstrings

---
 sync_directory_with_fdp.py | 100 +++++++++++++++++++------------------
 1 file changed, 51 insertions(+), 49 deletions(-)

diff --git a/sync_directory_with_fdp.py b/sync_directory_with_fdp.py
index 34fed0f..ec86dbc 100644
--- a/sync_directory_with_fdp.py
+++ b/sync_directory_with_fdp.py
@@ -50,11 +50,12 @@
     'TWIN_STUDY': 'http://purl.obolibrary.org/obo/OBIB_0000700'
 }
 
-"""
-It gets the data from the source Molgenis and filters the one that are already it the destination Molgenis.
-If reset flag is True it doesn't filter the biobanks to add 
-"""
+
 def get_missing_biobanks(session, reset, **kwargs):
+    """
+    It gets the data from the source Molgenis and filters the one that are already it the destination Molgenis.
+    If reset flag is True it doesn't filter the biobanks to add
+    """
     print("Getting source entities from {}".format(BBMRI_BIOBANK_ENTITY))
     source_records = session.get(BBMRI_BIOBANK_ENTITY, **kwargs)
     # if reset is True the missing biobanks are all
@@ -68,14 +69,14 @@ def get_missing_biobanks(session, reset, **kwargs):
     return new_records
 
 
-"""
-Send converted data to the destination
-
-:params session: Molgenis session to use
-:params entity: the name of Molgenis entity type of the records to add 
-:params records: lists of dictionary with data of the FDP entity to add
-"""
 def create_records(session, entity, records):
+    """
+    Send converted data to the destination
+
+    :params session: Molgenis session to use
+    :params entity: the name of Molgenis entity type of the records to add
+    :params records: lists of dictionary with data of the FDP entity to add
+    """
     created_records = []
     for i in range(0, len(records), 1000):
         try:
@@ -86,14 +87,14 @@ def create_records(session, entity, records):
     print("Added {} record(s) of type {}".format(len(created_records), entity))
 
 
-"""
-Delete records in the destination Molgenis. Used when reset flag is True
-
-:params session: Molgenis session to use
-:params entity: the name of Molgenis entity type of the records to delete
-:params records_ids: the ids of the records of type :entity: to delete 
-"""
 def delete_records(session, entity, records_ids):
+    """
+    Delete records in the destination Molgenis. Used when reset flag is True
+
+    :params session: Molgenis session to use
+    :params entity: the name of Molgenis entity type of the records to delete
+    :params records_ids: the ids of the records of type :entity: to delete
+    """
     removed_records = []
     for i in range(0, len(records_ids), 1000):
         try:
@@ -105,40 +106,40 @@ def delete_records(session, entity, records_ids):
     print(f"Removed {len(removed_records)} of type {entity}")
 
 
-"""
-Returns the country code correspondent to the country in input 
-"""
 def get_country(country):
+    """
+    Returns the country code correspondent to the country in input
+    """
     return 'GB' if country == 'UK' else country
 
 
-"""
-Returns the IRI of the disease code to use in the FDP
-"""
 def get_disease_ontology_code(disease_code):
+    """
+    Returns the IRI of the disease code to use in the FDP
+    """
     if ORPHA_DIRECTORY_PREFIX in disease_code:
         return disease_code.replace(ORPHA_DIRECTORY_PREFIX, ORPHA_ONTOLOGY_PREFIX)
     if ICD_10_DIRECTORY_PREFIX in disease_code:
         return disease_code.replace(ICD_10_DIRECTORY_PREFIX, ICD_10_ONTOLOGY_PREFIX)
 
 
-"""
-Return the IRI of the collection type
-
-:params collection_type: the collection type code in the directory 
-"""
 def get_collection_type_ontology_code(collection_type):
-    return COLLECTION_TYPES_ONTOLOGIES.get(collection_type, None)
+    """
+    Return the IRI of the collection type
 
+    :params collection_type: the collection type code in the directory
+    """
+    return COLLECTION_TYPES_ONTOLOGIES.get(collection_type, None)
 
-"""
-Gets the contact data of the contact with id :contact_id: from the source Molgenis and 
-returns the FDP corresponding FDP record
 
-:params session: Molgenis session to use
-:params contact_id: the id of the contact in the Directory
-"""
 def get_contact_record(session, contact_id):
+    """
+    Gets the contact data of the contact with id :contact_id: from the source Molgenis and
+    returns the FDP corresponding FDP record
+
+    :params session: Molgenis session to use
+    :params contact_id: the id of the contact in the Directory
+    """
     contact = session.get_by_id(BBMRI_CONTACT_ENTITY, contact_id)
     return (
         f'{contact["id"]}',
@@ -151,17 +152,18 @@ def get_contact_record(session, contact_id):
     )
 
 
-"""
-It generates the FDP records related to a biobank from the representation of the biobank in the Directory.
-It returns a dictionary with data for entities:
-fdp_Biobank: data of the Biobank as organization
-fdp_BiobankOrganization: data of the Jurystic Person that manage the Biobank
-fdp_Collection: list of collections of the biobank
-fdp_Contacts: contact of the biobank and the collections to add
-fdp_IRI: codes of diseases and collection types (if not already present in the destination)
-fdp_DataService: data related to the Data service, if present
-"""
 def get_records_to_add(biobank_data, session, directory_prefix):
+    """
+    It generates the FDP records related to a biobank from the representation of the biobank in the Directory.
+    It returns a dictionary with data for entities:
+    fdp_Biobank: data of the Biobank as organization
+    fdp_BiobankOrganization: data of the Jurystic Person that manage the Biobank
+    fdp_Collection: list of collections of the biobank
+    fdp_Contacts: contact of the biobank and the collections to add
+    fdp_IRI: codes of diseases and collection types (if not already present in the destination)
+    fdp_DataService: data related to the Data service, if present
+    """
+
     missing_iris = []
     data_services = []
     contacts = []
@@ -244,10 +246,10 @@ def update_catalog(session, new_collections):
 
     session.update_one(FDP_CATALOG, 'bbmri-directory', 'collection', list(collections))
 
-"""
-Main function that gets the data of the missing biobanks, convert it and upload the new records.
-"""
 def sync(session, directory_prefix, reset, **kwargs):
+    """
+    Main function that gets the data of the missing biobanks, convert it and upload the new records.
+    """
     # it gets the missing biobanks
     missing_biobanks = get_missing_biobanks(session, reset=reset, attributes=BIOBANKS_ATTRIBUTES,
                                             expand=BIOBANKS_EXPAND_ATTRIBUTES, **kwargs)

From f7f3c6ba640a8bd0eb35cc596fb817eb6e619a64 Mon Sep 17 00:00:00 2001
From: Vittorio Meloni <vittorio.meloni@crs4.it>
Date: Mon, 6 Nov 2023 15:33:39 +0100
Subject: [PATCH 11/15] feat: minor changes to fdp

---
 sync_directory_with_fdp.py | 36 +++++++++++++++++++++---------------
 1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/sync_directory_with_fdp.py b/sync_directory_with_fdp.py
index ec86dbc..a50350b 100644
--- a/sync_directory_with_fdp.py
+++ b/sync_directory_with_fdp.py
@@ -18,7 +18,7 @@
 FDP_CATALOG = 'fdp_Catalog'
 FDP_BIOBANK = 'fdp_Biobank'
 FDP_COLLECTION = 'fdp_Collection'
-FDP_BIOBANK_ORGANIZATION = 'fdp_BiobankOrganization'
+FDP_BIOBANK_LEGAL_PERSON = 'fdp_BiobankLegalPerson'
 FDP_CONTACT = 'fdp_ContactPointIndividual'
 FDP_DATA_SERVICE = 'fdp_DataService'
 FDP_IRI = 'fdp_IRI'
@@ -58,13 +58,18 @@ def get_missing_biobanks(session, reset, **kwargs):
     """
     print("Getting source entities from {}".format(BBMRI_BIOBANK_ENTITY))
     source_records = session.get(BBMRI_BIOBANK_ENTITY, **kwargs)
+    biobanks_with_record_service = []
+    for sr in source_records:
+        for c in sr["collections"]:
+            if "record_service" in c:
+                biobanks_with_record_service.append(sr)
     # if reset is True the missing biobanks are all
     if reset:
-        return source_records
+        return biobanks_with_record_service
 
     print("Getting ids already present")
     dest_records_ids = [r['identifier'] for r in session.get(FDP_BIOBANK, attributes='identifier')]
-    new_records = [sr for sr in source_records if sr['id'] not in dest_records_ids]
+    new_records = [sr for sr in biobanks_with_record_service if sr['id'] not in dest_records_ids]
     print("Found {} new records to insert".format(len(new_records)))
     return new_records
 
@@ -217,23 +222,25 @@ def get_records_to_add(biobank_data, session, directory_prefix):
             'contactPoint': f'{biobank_data["contact"]["id"]}' if 'contact' in biobank_data else None,
             'country': get_country(biobank_data['country']['id'])
         },
-        FDP_BIOBANK_ORGANIZATION: {
+        FDP_BIOBANK_LEGAL_PERSON: {
             'identifier': f'{biobank_data["id"]}-pub',
             'name': biobank_data['juridical_person']
         },
         FDP_CONTACT: contacts,
         FDP_COLLECTION: [{
-            'identifier': c['id'],
             'IRI': f'{directory_prefix}/api/fdp/fdp_Collection/{c["id"]}',
-            'contactPoint': f'{c["contact"]["id"]}' if 'contact' in c else None,
-            # 'catalog': 'bbmri-directory',  # it
-            'title': c['name'],
+            'identifier': c['id'],
             'biobank': biobank_data["id"],
+            'title': c['name'],
             'description': c['description'] if 'description' in c else None,
-            'type': [t['id'] for t in c['type'] if get_collection_type_ontology_code(t['id']) is not None],
             'theme': [d['id'] for d in c['diagnosis_available']],
-            'service': c['record_service']['id'] if 'record_service' in c else None
-        } for c in biobank_data['collections']],
+            'type': [t['id'] for t in c['type'] if get_collection_type_ontology_code(t['id']) is not None],
+            'landingPage': f'{directory_prefix}/#/biobank/{c["id"]}',
+            'contactPoint': f'{c["contact"]["id"]}' if 'contact' in c else None,
+            'service': c['record_service']['id'] if 'record_service' in c else None,
+            'vpConnection': 'ejprd-vp-discoverable' if 'record_service' in c else None,
+            'personalData': 'true'
+        } for c in biobank_data['collections'] if len(f'{directory_prefix}/#/biobank/{c["id"]}') < 255] ,
         FDP_IRI: missing_iris,
         FDP_DATA_SERVICE: data_services
     }
@@ -253,9 +260,10 @@ def sync(session, directory_prefix, reset, **kwargs):
     # it gets the missing biobanks
     missing_biobanks = get_missing_biobanks(session, reset=reset, attributes=BIOBANKS_ATTRIBUTES,
                                             expand=BIOBANKS_EXPAND_ATTRIBUTES, **kwargs)
+
     # it gathers the data for all the biobanks
     records = OrderedDict({
-        FDP_BIOBANK_ORGANIZATION: [],
+        FDP_BIOBANK_LEGAL_PERSON: [],
         FDP_CONTACT: set(),
         FDP_IRI: set(),
         FDP_DATA_SERVICE: [],
@@ -317,9 +325,7 @@ def sync(session, directory_prefix, reset, **kwargs):
     parser.add_argument('--reset', '-r', dest='reset', action='store_true')
     args = parser.parse_args()
 
-    directory_prefix = args.directory_prefix.replace('/', '',
-                                                     -1)  # just in case the input put the last /, it removes it
-
+    directory_prefix = args.directory_prefix # .replace('/', '', -1)  # just in case the input put the last /, it removes it
     s = client.Session(args.molgenis_url)
     s.login(args.molgenis_user, args.molgenis_password)
 

From 58e8639dfce4c3676326668abd935f56ee640e36 Mon Sep 17 00:00:00 2001
From: Vittorio Meloni <vittorio.meloni@crs4.it>
Date: Wed, 24 Jan 2024 17:02:49 +0100
Subject: [PATCH 12/15] fix: fixes biobanks duplication

---
 sync_directory_with_fdp.py | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/sync_directory_with_fdp.py b/sync_directory_with_fdp.py
index a50350b..7581f21 100644
--- a/sync_directory_with_fdp.py
+++ b/sync_directory_with_fdp.py
@@ -5,9 +5,9 @@
 instance needs the FDP EMX model and the "bbmri-directory" FDP_Catalog (i.e., the FDP Catalog with data of the directory)
 already deployed
 """
-
 from collections import OrderedDict
 
+import pprint
 from molgenis import client
 from molgenis.client import MolgenisRequestError
 
@@ -60,9 +60,12 @@ def get_missing_biobanks(session, reset, **kwargs):
     source_records = session.get(BBMRI_BIOBANK_ENTITY, **kwargs)
     biobanks_with_record_service = []
     for sr in source_records:
+        add_biobank = False
         for c in sr["collections"]:
             if "record_service" in c:
-                biobanks_with_record_service.append(sr)
+                add_biobank = True
+        if add_biobank:
+            biobanks_with_record_service.append(sr)
     # if reset is True the missing biobanks are all
     if reset:
         return biobanks_with_record_service
@@ -82,6 +85,12 @@ def create_records(session, entity, records):
     :params entity: the name of Molgenis entity type of the records to add
     :params records: lists of dictionary with data of the FDP entity to add
     """
+    # with open(f'fdp_data/{entity}.csv', 'w' ) as csvfile:
+    #     writer = csv.DictWriter(csvfile, fieldnames=records[0].keys(), dialect="excel", quotechar="\"")
+    #     writer.writeheader()
+    #     for r in records:
+    #         writer.writerow(r)
+
     created_records = []
     for i in range(0, len(records), 1000):
         try:
@@ -207,13 +216,14 @@ def get_records_to_add(biobank_data, session, directory_prefix):
                 'identifier': rs['id'],
                 'endpointUrl': rs['url'],
                 'endpointDescription': rs['description'] if 'description' in rs else None,
-                'conformsTo': rs['conformsTo']
+                'conformsTo': rs['conformsTo'],
+                'type': rs['type']
             })
 
     res = {
         FDP_BIOBANK: {
             'identifier': biobank_data['id'],
-            'IRI': f'{directory_prefix}/api/fdp/fdp_Biobank/{biobank_data["id"]}',  # TODO: use the PID
+            # 'IRI': f'{directory_prefix}/api/fdp/fdp_Biobank/{biobank_data["id"]}',  # TODO: use the PID
             'title': biobank_data['name'],
             'acronym': biobank_data['acronym'] if 'acronym' in biobank_data else None,
             'description': biobank_data['description'] if 'description' in biobank_data else None,
@@ -240,12 +250,14 @@ def get_records_to_add(biobank_data, session, directory_prefix):
             'service': c['record_service']['id'] if 'record_service' in c else None,
             'vpConnection': 'ejprd-vp-discoverable' if 'record_service' in c else None,
             'personalData': 'true'
-        } for c in biobank_data['collections'] if len(f'{directory_prefix}/#/biobank/{c["id"]}') < 255] ,
+        } for c in biobank_data['collections'] if
+            'record_service' in c and len(f'{directory_prefix}/#/biobank/{c["id"]}') < 255],
         FDP_IRI: missing_iris,
         FDP_DATA_SERVICE: data_services
     }
     return res
 
+
 def update_catalog(session, new_collections):
     prev_collections = session.get_by_id(FDP_CATALOG, 'bbmri-directory', attributes='collection')
 
@@ -253,6 +265,7 @@ def update_catalog(session, new_collections):
 
     session.update_one(FDP_CATALOG, 'bbmri-directory', 'collection', list(collections))
 
+
 def sync(session, directory_prefix, reset, **kwargs):
     """
     Main function that gets the data of the missing biobanks, convert it and upload the new records.
@@ -291,7 +304,7 @@ def sync(session, directory_prefix, reset, **kwargs):
                 delete_records(session, k, [i['identifier'] for i in v])
             if k == FDP_CONTACT and len(v) > 0:
                 delete_records(session, k, [i[0] for i in v])
-
+    pprint.pprint(records)
     # it creates all the
     for k, v in records.items():
         if len(v) > 0:
@@ -325,7 +338,7 @@ def sync(session, directory_prefix, reset, **kwargs):
     parser.add_argument('--reset', '-r', dest='reset', action='store_true')
     args = parser.parse_args()
 
-    directory_prefix = args.directory_prefix # .replace('/', '', -1)  # just in case the input put the last /, it removes it
+    directory_prefix = args.directory_prefix  # .replace('/', '', -1)  # just in case the input put the last /, it removes it
     s = client.Session(args.molgenis_url)
     s.login(args.molgenis_user, args.molgenis_password)
 

From 5174ad76813e4ac9f2c46996a959f5a8a5ea1fef Mon Sep 17 00:00:00 2001
From: Vittorio Meloni <vittorio.meloni@crs4.it>
Date: Mon, 9 Sep 2024 15:06:29 +0200
Subject: [PATCH 13/15] fix: adapts to new model

---
 sync_directory_with_fdp.py | 82 ++++++++++++++++++++++++++------------
 1 file changed, 56 insertions(+), 26 deletions(-)

diff --git a/sync_directory_with_fdp.py b/sync_directory_with_fdp.py
index 7581f21..816a9d2 100644
--- a/sync_directory_with_fdp.py
+++ b/sync_directory_with_fdp.py
@@ -8,6 +8,8 @@
 from collections import OrderedDict
 
 import pprint
+from datetime import datetime
+
 from molgenis import client
 from molgenis.client import MolgenisRequestError
 
@@ -58,21 +60,22 @@ def get_missing_biobanks(session, reset, **kwargs):
     """
     print("Getting source entities from {}".format(BBMRI_BIOBANK_ENTITY))
     source_records = session.get(BBMRI_BIOBANK_ENTITY, **kwargs)
-    biobanks_with_record_service = []
+    print("Found {}".format(source_records))
+    missing_biobanks = []
     for sr in source_records:
-        add_biobank = False
-        for c in sr["collections"]:
-            if "record_service" in c:
-                add_biobank = True
+        add_biobank = True
+        # for c in sr["collections"]:
+        #     if "record_service" in c:
+        #         add_biobank = True
         if add_biobank:
-            biobanks_with_record_service.append(sr)
+            missing_biobanks.append(sr)
     # if reset is True the missing biobanks are all
     if reset:
-        return biobanks_with_record_service
+        return missing_biobanks
 
     print("Getting ids already present")
     dest_records_ids = [r['identifier'] for r in session.get(FDP_BIOBANK, attributes='identifier')]
-    new_records = [sr for sr in biobanks_with_record_service if sr['id'] not in dest_records_ids]
+    new_records = [sr for sr in missing_biobanks if sr['id'] not in dest_records_ids]
     print("Found {} new records to insert".format(len(new_records)))
     return new_records
 
@@ -85,12 +88,6 @@ def create_records(session, entity, records):
     :params entity: the name of Molgenis entity type of the records to add
     :params records: lists of dictionary with data of the FDP entity to add
     """
-    # with open(f'fdp_data/{entity}.csv', 'w' ) as csvfile:
-    #     writer = csv.DictWriter(csvfile, fieldnames=records[0].keys(), dialect="excel", quotechar="\"")
-    #     writer.writeheader()
-    #     for r in records:
-    #         writer.writerow(r)
-
     created_records = []
     for i in range(0, len(records), 1000):
         try:
@@ -217,7 +214,12 @@ def get_records_to_add(biobank_data, session, directory_prefix):
                 'endpointUrl': rs['url'],
                 'endpointDescription': rs['description'] if 'description' in rs else None,
                 'conformsTo': rs['conformsTo'],
-                'type': rs['type']
+                'type': rs['type'],
+                'issued': datetime.now().isoformat(),
+                'modified': datetime.now().isoformat(),
+                'publisher': 'bbmri',
+                'title': f'Data Service of {collection["name"]}',
+                'language': ['eng-eu']
             })
 
     res = {
@@ -239,29 +241,57 @@ def get_records_to_add(biobank_data, session, directory_prefix):
         FDP_CONTACT: contacts,
         FDP_COLLECTION: [{
             'IRI': f'{directory_prefix}/api/fdp/fdp_Collection/{c["id"]}',
+            'additionalRDFType': "http://www.w3.org/ns/dcat#Dataset",
             'identifier': c['id'],
             'biobank': biobank_data["id"],
+            'publisher': 'bbmri',
             'title': c['name'],
             'description': c['description'] if 'description' in c else None,
-            'theme': [d['id'] for d in c['diagnosis_available']],
-            'type': [t['id'] for t in c['type'] if get_collection_type_ontology_code(t['id']) is not None],
-            'landingPage': f'{directory_prefix}/#/biobank/{c["id"]}',
+            'diseases': [d['id'].replace("urn:miriam:icd:", "ICD10:") for d in c['diagnosis_available']],
+            'theme': 'EU:HEALTH',
+            'type': '',
+            'landingPage': f'{directory_prefix}/#/collection/{c["id"]}',
             'contactPoint': f'{c["contact"]["id"]}' if 'contact' in c else None,
             'service': c['record_service']['id'] if 'record_service' in c else None,
             'vpConnection': 'ejprd-vp-discoverable' if 'record_service' in c else None,
-            'personalData': 'true'
-        } for c in biobank_data['collections'] if
-            'record_service' in c and len(f'{directory_prefix}/#/biobank/{c["id"]}') < 255],
+            'issued': datetime.now().isoformat(),
+            'modified': datetime.now().isoformat(),
+            'personalData': 'true',
+            'version': '',
+            'language': ['eng-eu'],
+            'country': 'EU',
+            'rights': 'restricted',
+            'policy': '', # CRC Cohort 'https://www.bbmri-eric.eu/services/access-policies/',
+            'minAge': c['age_low'] if 'age_low' in c else None,
+            'maxAge': c['age_high'] if 'age_high' in c else None,
+            'numberOfRecords': c['size'] if 'size' in c else None,
+            'numberOfUniqueIndividuals': c['number_of_donors']
+
+        } for c in biobank_data['collections']],
         FDP_IRI: missing_iris,
         FDP_DATA_SERVICE: data_services
     }
     return res
 
 
+def get_collections_in_catalog(session):
+    catalog = session.get_by_id(FDP_CATALOG, 'bbmri-directory', attributes='collection')
+    return [c['identifier'] for c in catalog['collection']]
+
+
+def reset_catalog(session, collections_to_update):
+    collections_in_catalog = get_collections_in_catalog(session)
+    collections_to_update_ids = list([c['IRI'].split('/')[-1] for c in collections_to_update])
+    for c in collections_in_catalog[:]:
+        if c in collections_to_update_ids:
+            collections_in_catalog.remove(c)
+    session.update_one('fdp_Catalog', 'bbmri-directory', 'collection', collections_in_catalog)
+
+
 def update_catalog(session, new_collections):
-    prev_collections = session.get_by_id(FDP_CATALOG, 'bbmri-directory', attributes='collection')
+    prev_collections = get_collections_in_catalog(session)
 
-    collections = set([c['identifier'] for c in prev_collections['collection'] + new_collections])
+    collections = set(prev_collections + [c['identifier'] for c in new_collections])
 
     session.update_one(FDP_CATALOG, 'bbmri-directory', 'collection', list(collections))
 
@@ -298,14 +328,14 @@ def sync(session, directory_prefix, reset, **kwargs):
 
     # if the reset flag is True, it deletes the old records
     if reset:
-        session.update_one('fdp_Catalog', 'bbmri-directory', 'collection', [])
+        reset_catalog(session, new_records[FDP_COLLECTION])
+
         for k, v in reversed(records.items()):
             if k not in (FDP_IRI, FDP_CONTACT) and len(v) > 0:
                 delete_records(session, k, [i['identifier'] for i in v])
             if k == FDP_CONTACT and len(v) > 0:
                 delete_records(session, k, [i[0] for i in v])
-    pprint.pprint(records)
-    # it creates all the
+
     for k, v in records.items():
         if len(v) > 0:
             if k == FDP_IRI:

From 45262a8eab0b41522e5b811cf381e5f73ec7c0a9 Mon Sep 17 00:00:00 2001
From: Vittorio Meloni <vittorio.meloni@crs4.it>
Date: Mon, 30 Sep 2024 11:59:00 +0200
Subject: [PATCH 14/15] adds: EU:HEALTH as theme

---
 sync_directory_with_fdp.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sync_directory_with_fdp.py b/sync_directory_with_fdp.py
index 816a9d2..2e16082 100644
--- a/sync_directory_with_fdp.py
+++ b/sync_directory_with_fdp.py
@@ -248,7 +248,7 @@ def get_records_to_add(biobank_data, session, directory_prefix):
             'title': c['name'],
             'description': c['description'] if 'description' in c else None,
             'diseases': [d['id'].replace("urn:miriam:icd:", "ICD10:") for d in c['diagnosis_available']],
-            'theme': 'EU:HEALTH',
+            'theme': ['EU:HEALTH'] + [d['id'] for d in c['diagnosis_available']],
             'type': '',
             'landingPage': f'{directory_prefix}/#/collection/{c["id"]}',
             'contactPoint': f'{c["contact"]["id"]}' if 'contact' in c else None,
@@ -372,4 +372,4 @@ def sync(session, directory_prefix, reset, **kwargs):
     s = client.Session(args.molgenis_url)
     s.login(args.molgenis_user, args.molgenis_password)
 
-    sync(s, directory_prefix, args.reset)
+    sync(s, directory_prefix, args.reset, q="id==bbmri-eric:ID:EU_BBMRI-ERIC")

From 3731746d598ee0608177bd1bf66a40238a33368f Mon Sep 17 00:00:00 2001
From: Vittorio Meloni <vitto.meloni@gmail.com>
Date: Thu, 20 Feb 2025 11:21:51 +0100
Subject: [PATCH 15/15] Add authorship and acknowledgements to
 sync_directory_with_fdp.py

---
 sync_directory_with_fdp.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/sync_directory_with_fdp.py b/sync_directory_with_fdp.py
index 2e16082..facfcde 100644
--- a/sync_directory_with_fdp.py
+++ b/sync_directory_with_fdp.py
@@ -4,6 +4,20 @@
 converted one into another Molgenis instance. The Molgenis instance can be the same or different. The destination
 instance needs the FDP EMX model and the "bbmri-directory" FDP_Catalog (i.e., the FDP Catalog with data of the directory)
 already deployed
+
+--------------------------------------------------------------------------------------------------------------------------
+
+Authors:
+
+ - Vittorio Meloni <vittorio.meloni@crs4.it>
+
+Acknowledgments:
+
+This work has been partially funded by the following sources:
+
+ - The European Joint Programme on Rare Diseases (EJPRD) project (grant agreement N. 825575);
+
+and has evolved within the context of the BBMRI-ERIC Common Service IT.
 """
 from collections import OrderedDict