diff --git a/src/mtd_sync/mtd_sync.py b/src/mtd_sync/mtd_sync.py index bbc0390..bb6e844 100644 --- a/src/mtd_sync/mtd_sync.py +++ b/src/mtd_sync/mtd_sync.py @@ -69,7 +69,7 @@ def _get_xml(self, path): def _get_af_xml(self): return self._get_xml(self.af_path) - def get_af_list(self): + def get_af_list(self) -> list: xml = self._get_af_xml() _xml_parser = etree.XMLParser(ns_clean=True, recover=True, encoding="utf-8") root = etree.fromstring(xml, parser=_xml_parser) @@ -82,7 +82,7 @@ def get_af_list(self): def _get_ds_xml(self): return self._get_xml(self.ds_path) - def get_ds_list(self): + def get_ds_list(self) -> list: xml = self._get_ds_xml() return parse_jdd_xml(xml) @@ -221,11 +221,19 @@ def process_af_and_ds(af_list, ds_list, id_role=None): continue user_add_total_time += time.time() - start_add_user_time af = sync_af(af) + # TODO: choose whether or not to commit retrieval of the AF before association of actors + # and possibly retrieve an AF without any actor associated to it + db.session.commit() + # If AF has not been synchronized ; due to the lack of a UUID ; actor cannot be associated to it + # and thus we skip to the next AF + if not af: + continue associate_actors( actors, CorAcquisitionFrameworkActor, "id_acquisition_framework", af.id_acquisition_framework, + af.unique_acquisition_framework_id, ) # TODO: remove actors removed from MTD db.session.commit() @@ -242,7 +250,13 @@ def process_af_and_ds(af_list, ds_list, id_role=None): user_add_total_time += time.time() - start_add_user_time ds = sync_ds(ds, list_cd_nomenclature) if ds is not None: - associate_actors(actors, CorDatasetActor, "id_dataset", ds.id_dataset) + associate_actors( + actors, + CorDatasetActor, + "id_dataset", + ds.id_dataset, + ds.unique_dataset_id, + ) user_add_total_time = round(user_add_total_time, 2) db.session.commit() diff --git a/src/mtd_sync/mtd_utils.py b/src/mtd_sync/mtd_utils.py index bafa976..fad7719 100644 --- a/src/mtd_sync/mtd_utils.py +++ b/src/mtd_sync/mtd_utils.py @@ -1,10 +1,12 @@ import logging import json from copy import copy +import pprint +from typing import Literal from flask import current_app from sqlalchemy import select, exists -from sqlalchemy.exc import SQLAlchemyError +from sqlalchemy.exc import SQLAlchemyError, IntegrityError from sqlalchemy.sql import func, update from sqlalchemy.dialects.postgresql import insert as pg_insert @@ -122,7 +124,19 @@ def sync_af(af): TAcquisitionFramework The updated or inserted acquisition framework. """ + # TODO: handle case where af_uuid is None ; as will raise an error at database level when executing the statement below ; + # af_uuid being None, i.e. af UUID is missing, could be due to no UUID specified in `` tag in the XML file + # Solutions - if UUID is missing: + # - Just pass the sync of the AF + # - Generate a UUID for the AF af_uuid = af["unique_acquisition_framework_id"] + + if not af_uuid: + log.warning( + f"No UUID provided for the AF with UUID '{af_uuid}' - SKIPPING SYNCHRONIZATION FOR THIS AF." + ) + return None + af_exists = DB.session.scalar( exists().where(TAcquisitionFramework.unique_acquisition_framework_id == af_uuid).select() ) @@ -185,21 +199,34 @@ def add_or_update_organism(uuid, nom, email): return DB.session.execute(statement).scalar() -def associate_actors(actors, CorActor, pk_name, pk_value): +def associate_actors( + actors, + CorActor: CorAcquisitionFrameworkActor | CorDatasetActor, + pk_name: Literal["id_acquisition_framework", "id_dataset"], + pk_value: str, + uuid_mtd: str, +): """ - Associate actor and DS or AF according to CorActor value. + Associate actors with either a given : + - Acquisition framework - writing to the table `gn_meta.cor_acquisition_framework_actor`. + - Dataset - writing to the table `gn_meta.cor_dataset_actor`. Parameters ---------- actors : list list of actors - CorActor : db.Model - table model - pk_name : str - pk attribute name + CorActor : CorAcquisitionFrameworkActor | CorDatasetActor + the SQLAlchemy model corresponding to the destination table + pk_name : Literal['id_acquisition_framework', 'id_dataset'] + pk attribute name: + - 'id_acquisition_framework' for AF + - 'id_dataset' for DS pk_value : str - pk value + pk value: ID of the AF or DS + uuid_mtd : str + UUID of the AF or DS """ + type_mtd = "AF" if pk_name == "id_acquisition_framework" else "DS" for actor in actors: id_organism = None uuid_organism = actor["uuid_organism"] @@ -218,20 +245,49 @@ def associate_actors(actors, CorActor, pk_name, pk_value): ), **{pk_name: pk_value}, ) - if not id_organism: - values["id_role"] = DB.session.scalar( - select(User.id_role).filter_by(email=actor["email"]) - ) - else: + # TODO: choose wether to: + # - (retained) Try to associate to an organism first and then to a user + # - Try to associate to a user first and then to an organism + if id_organism: values["id_organism"] = id_organism - statement = ( - pg_insert(CorActor) - .values(**values) - .on_conflict_do_nothing( - index_elements=[pk_name, "id_organism", "id_nomenclature_actor_role"], + # TODO: handle case where no user is retrieved for the actor email: + # - (retained) Just do not try to associate the actor with the metadata + # - Try to retrieve and id_organism from the organism name - field `organism` + # - Try to retrieve and id_organism from the actor email considered as an organism email - field `email` + # - Try to insert a new user from the actor name - field `name` - and possibly also email - field `email` + else: + id_user_from_email = DB.session.scalar( + select(User.id_role).filter_by(email=actor["email"]).where(User.groupe.is_(False)) + ) + if id_user_from_email: + values["id_role"] = id_user_from_email + else: + log.warning( + f"MTD - actor association impossible for {type_mtd} with UUID '{uuid_mtd}' because no id_organism nor id_organism could be retrieved - with the following actor information:\n" + + format_str_dict_actor_for_logging(actor) + ) + continue + try: + statement = ( + pg_insert(CorActor) + .values(**values) + .on_conflict_do_nothing( + index_elements=[ + pk_name, + "id_organism" if id_organism else "id_role", + "id_nomenclature_actor_role", + ], + ) + ) + DB.session.execute(statement) + except IntegrityError as I: + db.session.rollback() + # TODO: uncomment following lines when work is done - commented to limit log while developping + log.error( + f"MTD - DB INTEGRITY ERROR - actor association failed for {type_mtd} with UUID '{uuid_mtd}' and following actor information:\n" + + format_sqlalchemy_error_for_logging(I) + + format_str_dict_actor_for_logging(actor) ) - ) - DB.session.execute(statement) def associate_dataset_modules(dataset): @@ -251,6 +307,54 @@ def associate_dataset_modules(dataset): ) +def format_sqlalchemy_error_for_logging(error: SQLAlchemyError): + """ + Format SQLAlchemy error information in a nice way for MTD logging + + Parameters + ---------- + error : SQLAlchemyError + the SQLAlchemy error + + Returns + ------- + str + formatted error information + """ + indented_original_error_message = str(error.orig).replace("\n", "\n\t") + + formatted_error_message = "".join( + [ + f"\t{indented_original_error_message}", + f"SQL QUERY: {error.statement}\n", + f"\tSQL PARAMS: {error.params}\n", + ] + ) + + return formatted_error_message + + +def format_str_dict_actor_for_logging(actor: dict): + """ + Format actor information in a nice way for MTD logging + + Parameters + ---------- + actor : dict + actor information: actor_role, email, name, organism, uuid_organism, ... + + Returns + ------- + str + formatted actor information + """ + formatted_str_dict_actor = "\tACTOR:\n\t\t" + pprint.pformat(actor).replace( + "\n", "\n\t\t" + ).rstrip("\t") + + return formatted_str_dict_actor + + class CasAuthentificationError(GeonatureApiError): pass @@ -259,7 +363,8 @@ def insert_user_and_org(info_user, update_user_organism: bool = True): id_provider_inpn = current_app.config["MTD_SYNC"]["ID_PROVIDER_INPN"] idprov = AuthenficationCASINPN() idprov.id_provider = id_provider_inpn - auth_manager.add_provider(id_provider_inpn, idprov) + if id_provider_inpn not in auth_manager: + auth_manager.add_provider(id_provider_inpn, idprov) # if not id_provider_inpn in auth_manager: # raise GeonatureApiError( diff --git a/src/mtd_sync/xml_parser.py b/src/mtd_sync/xml_parser.py index 7b1d86d..b550cde 100644 --- a/src/mtd_sync/xml_parser.py +++ b/src/mtd_sync/xml_parser.py @@ -130,10 +130,35 @@ def parse_jdd_xml(xml): root = ET.fromstring(xml, parser=_xml_parser) jdd_list = [] + + def format_acquisition_framework_id_from_xml(provided_af_uuid) -> str | None: + """ + Format the acquisition framework UUID provided for the dataset + i.e. the value for the tag `` in the XML file + + Args: + provided_af_uuid (str): The acquisition framework UUID + Returns: + str | None: The formatted acquisition framework UUID, or None if none was provided + """ + if not provided_af_uuid: + return None + + if provided_af_uuid.startswith("http://oafs.fr/meta/ca/"): + return provided_af_uuid.split("/")[-1] + + return provided_af_uuid + for jdd in root.findall(".//" + namespace + "JeuDeDonnees"): # We extract all the required informations from the different tags of the XML file jdd_uuid = get_tag_content(jdd, "identifiantJdd") - ca_uuid = get_tag_content(jdd, "identifiantCadre") + # TODO: handle case where value for the tag `` in the XML file is not of the form `xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx` + # Solutions - if in the form `http://oafs.fr/meta/ca/xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx` (has some entries for INPN MTD PREPROD and instance 'Nationale') : + # - (retained) Format by keeping only the `xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx` part + # - Add a check further in the MTD sync to process only if ca_uuid is in the right format + ca_uuid = format_acquisition_framework_id_from_xml( + get_tag_content(jdd, "identifiantCadre") + ) dataset_name = get_tag_content(jdd, "libelle") dataset_shortname = get_tag_content(jdd, "libelleCourt", default_value="") dataset_desc = get_tag_content(jdd, "description", default_value="")