diff --git a/src/mtd_sync/mtd_sync.py b/src/mtd_sync/mtd_sync.py
index bbc0390..bb6e844 100644
--- a/src/mtd_sync/mtd_sync.py
+++ b/src/mtd_sync/mtd_sync.py
@@ -69,7 +69,7 @@ def _get_xml(self, path):
def _get_af_xml(self):
return self._get_xml(self.af_path)
- def get_af_list(self):
+ def get_af_list(self) -> list:
xml = self._get_af_xml()
_xml_parser = etree.XMLParser(ns_clean=True, recover=True, encoding="utf-8")
root = etree.fromstring(xml, parser=_xml_parser)
@@ -82,7 +82,7 @@ def get_af_list(self):
def _get_ds_xml(self):
return self._get_xml(self.ds_path)
- def get_ds_list(self):
+ def get_ds_list(self) -> list:
xml = self._get_ds_xml()
return parse_jdd_xml(xml)
@@ -221,11 +221,19 @@ def process_af_and_ds(af_list, ds_list, id_role=None):
continue
user_add_total_time += time.time() - start_add_user_time
af = sync_af(af)
+ # TODO: choose whether or not to commit retrieval of the AF before association of actors
+ # and possibly retrieve an AF without any actor associated to it
+ db.session.commit()
+ # If AF has not been synchronized ; due to the lack of a UUID ; actor cannot be associated to it
+ # and thus we skip to the next AF
+ if not af:
+ continue
associate_actors(
actors,
CorAcquisitionFrameworkActor,
"id_acquisition_framework",
af.id_acquisition_framework,
+ af.unique_acquisition_framework_id,
)
# TODO: remove actors removed from MTD
db.session.commit()
@@ -242,7 +250,13 @@ def process_af_and_ds(af_list, ds_list, id_role=None):
user_add_total_time += time.time() - start_add_user_time
ds = sync_ds(ds, list_cd_nomenclature)
if ds is not None:
- associate_actors(actors, CorDatasetActor, "id_dataset", ds.id_dataset)
+ associate_actors(
+ actors,
+ CorDatasetActor,
+ "id_dataset",
+ ds.id_dataset,
+ ds.unique_dataset_id,
+ )
user_add_total_time = round(user_add_total_time, 2)
db.session.commit()
diff --git a/src/mtd_sync/mtd_utils.py b/src/mtd_sync/mtd_utils.py
index bafa976..fad7719 100644
--- a/src/mtd_sync/mtd_utils.py
+++ b/src/mtd_sync/mtd_utils.py
@@ -1,10 +1,12 @@
import logging
import json
from copy import copy
+import pprint
+from typing import Literal
from flask import current_app
from sqlalchemy import select, exists
-from sqlalchemy.exc import SQLAlchemyError
+from sqlalchemy.exc import SQLAlchemyError, IntegrityError
from sqlalchemy.sql import func, update
from sqlalchemy.dialects.postgresql import insert as pg_insert
@@ -122,7 +124,19 @@ def sync_af(af):
TAcquisitionFramework
The updated or inserted acquisition framework.
"""
+ # TODO: handle case where af_uuid is None ; as will raise an error at database level when executing the statement below ;
+ # af_uuid being None, i.e. af UUID is missing, could be due to no UUID specified in `` tag in the XML file
+ # Solutions - if UUID is missing:
+ # - Just pass the sync of the AF
+ # - Generate a UUID for the AF
af_uuid = af["unique_acquisition_framework_id"]
+
+ if not af_uuid:
+ log.warning(
+ f"No UUID provided for the AF with UUID '{af_uuid}' - SKIPPING SYNCHRONIZATION FOR THIS AF."
+ )
+ return None
+
af_exists = DB.session.scalar(
exists().where(TAcquisitionFramework.unique_acquisition_framework_id == af_uuid).select()
)
@@ -185,21 +199,34 @@ def add_or_update_organism(uuid, nom, email):
return DB.session.execute(statement).scalar()
-def associate_actors(actors, CorActor, pk_name, pk_value):
+def associate_actors(
+ actors,
+ CorActor: CorAcquisitionFrameworkActor | CorDatasetActor,
+ pk_name: Literal["id_acquisition_framework", "id_dataset"],
+ pk_value: str,
+ uuid_mtd: str,
+):
"""
- Associate actor and DS or AF according to CorActor value.
+ Associate actors with either a given :
+ - Acquisition framework - writing to the table `gn_meta.cor_acquisition_framework_actor`.
+ - Dataset - writing to the table `gn_meta.cor_dataset_actor`.
Parameters
----------
actors : list
list of actors
- CorActor : db.Model
- table model
- pk_name : str
- pk attribute name
+ CorActor : CorAcquisitionFrameworkActor | CorDatasetActor
+ the SQLAlchemy model corresponding to the destination table
+ pk_name : Literal['id_acquisition_framework', 'id_dataset']
+ pk attribute name:
+ - 'id_acquisition_framework' for AF
+ - 'id_dataset' for DS
pk_value : str
- pk value
+ pk value: ID of the AF or DS
+ uuid_mtd : str
+ UUID of the AF or DS
"""
+ type_mtd = "AF" if pk_name == "id_acquisition_framework" else "DS"
for actor in actors:
id_organism = None
uuid_organism = actor["uuid_organism"]
@@ -218,20 +245,49 @@ def associate_actors(actors, CorActor, pk_name, pk_value):
),
**{pk_name: pk_value},
)
- if not id_organism:
- values["id_role"] = DB.session.scalar(
- select(User.id_role).filter_by(email=actor["email"])
- )
- else:
+ # TODO: choose wether to:
+ # - (retained) Try to associate to an organism first and then to a user
+ # - Try to associate to a user first and then to an organism
+ if id_organism:
values["id_organism"] = id_organism
- statement = (
- pg_insert(CorActor)
- .values(**values)
- .on_conflict_do_nothing(
- index_elements=[pk_name, "id_organism", "id_nomenclature_actor_role"],
+ # TODO: handle case where no user is retrieved for the actor email:
+ # - (retained) Just do not try to associate the actor with the metadata
+ # - Try to retrieve and id_organism from the organism name - field `organism`
+ # - Try to retrieve and id_organism from the actor email considered as an organism email - field `email`
+ # - Try to insert a new user from the actor name - field `name` - and possibly also email - field `email`
+ else:
+ id_user_from_email = DB.session.scalar(
+ select(User.id_role).filter_by(email=actor["email"]).where(User.groupe.is_(False))
+ )
+ if id_user_from_email:
+ values["id_role"] = id_user_from_email
+ else:
+ log.warning(
+ f"MTD - actor association impossible for {type_mtd} with UUID '{uuid_mtd}' because no id_organism nor id_organism could be retrieved - with the following actor information:\n"
+ + format_str_dict_actor_for_logging(actor)
+ )
+ continue
+ try:
+ statement = (
+ pg_insert(CorActor)
+ .values(**values)
+ .on_conflict_do_nothing(
+ index_elements=[
+ pk_name,
+ "id_organism" if id_organism else "id_role",
+ "id_nomenclature_actor_role",
+ ],
+ )
+ )
+ DB.session.execute(statement)
+ except IntegrityError as I:
+ db.session.rollback()
+ # TODO: uncomment following lines when work is done - commented to limit log while developping
+ log.error(
+ f"MTD - DB INTEGRITY ERROR - actor association failed for {type_mtd} with UUID '{uuid_mtd}' and following actor information:\n"
+ + format_sqlalchemy_error_for_logging(I)
+ + format_str_dict_actor_for_logging(actor)
)
- )
- DB.session.execute(statement)
def associate_dataset_modules(dataset):
@@ -251,6 +307,54 @@ def associate_dataset_modules(dataset):
)
+def format_sqlalchemy_error_for_logging(error: SQLAlchemyError):
+ """
+ Format SQLAlchemy error information in a nice way for MTD logging
+
+ Parameters
+ ----------
+ error : SQLAlchemyError
+ the SQLAlchemy error
+
+ Returns
+ -------
+ str
+ formatted error information
+ """
+ indented_original_error_message = str(error.orig).replace("\n", "\n\t")
+
+ formatted_error_message = "".join(
+ [
+ f"\t{indented_original_error_message}",
+ f"SQL QUERY: {error.statement}\n",
+ f"\tSQL PARAMS: {error.params}\n",
+ ]
+ )
+
+ return formatted_error_message
+
+
+def format_str_dict_actor_for_logging(actor: dict):
+ """
+ Format actor information in a nice way for MTD logging
+
+ Parameters
+ ----------
+ actor : dict
+ actor information: actor_role, email, name, organism, uuid_organism, ...
+
+ Returns
+ -------
+ str
+ formatted actor information
+ """
+ formatted_str_dict_actor = "\tACTOR:\n\t\t" + pprint.pformat(actor).replace(
+ "\n", "\n\t\t"
+ ).rstrip("\t")
+
+ return formatted_str_dict_actor
+
+
class CasAuthentificationError(GeonatureApiError):
pass
@@ -259,7 +363,8 @@ def insert_user_and_org(info_user, update_user_organism: bool = True):
id_provider_inpn = current_app.config["MTD_SYNC"]["ID_PROVIDER_INPN"]
idprov = AuthenficationCASINPN()
idprov.id_provider = id_provider_inpn
- auth_manager.add_provider(id_provider_inpn, idprov)
+ if id_provider_inpn not in auth_manager:
+ auth_manager.add_provider(id_provider_inpn, idprov)
# if not id_provider_inpn in auth_manager:
# raise GeonatureApiError(
diff --git a/src/mtd_sync/xml_parser.py b/src/mtd_sync/xml_parser.py
index 7b1d86d..b550cde 100644
--- a/src/mtd_sync/xml_parser.py
+++ b/src/mtd_sync/xml_parser.py
@@ -130,10 +130,35 @@ def parse_jdd_xml(xml):
root = ET.fromstring(xml, parser=_xml_parser)
jdd_list = []
+
+ def format_acquisition_framework_id_from_xml(provided_af_uuid) -> str | None:
+ """
+ Format the acquisition framework UUID provided for the dataset
+ i.e. the value for the tag `` in the XML file
+
+ Args:
+ provided_af_uuid (str): The acquisition framework UUID
+ Returns:
+ str | None: The formatted acquisition framework UUID, or None if none was provided
+ """
+ if not provided_af_uuid:
+ return None
+
+ if provided_af_uuid.startswith("http://oafs.fr/meta/ca/"):
+ return provided_af_uuid.split("/")[-1]
+
+ return provided_af_uuid
+
for jdd in root.findall(".//" + namespace + "JeuDeDonnees"):
# We extract all the required informations from the different tags of the XML file
jdd_uuid = get_tag_content(jdd, "identifiantJdd")
- ca_uuid = get_tag_content(jdd, "identifiantCadre")
+ # TODO: handle case where value for the tag `` in the XML file is not of the form `xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx`
+ # Solutions - if in the form `http://oafs.fr/meta/ca/xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx` (has some entries for INPN MTD PREPROD and instance 'Nationale') :
+ # - (retained) Format by keeping only the `xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx` part
+ # - Add a check further in the MTD sync to process only if ca_uuid is in the right format
+ ca_uuid = format_acquisition_framework_id_from_xml(
+ get_tag_content(jdd, "identifiantCadre")
+ )
dataset_name = get_tag_content(jdd, "libelle")
dataset_shortname = get_tag_content(jdd, "libelleCourt", default_value="")
dataset_desc = get_tag_content(jdd, "description", default_value="")