Feat (import): Support constant values in the mapping (#3271)
* feat(import): can post fieldmapping in new structured format
* fix(import): insert_import_data_in_transient_table
* fix(import): get_import_values
* fix(import): prepare_import
* feat(import): support default values
* feat(import): default value editing supports the nomenclature widget
* refactor(import): minor refactor of getFieldMappingValues()
* feat(import): improve default value JSON editing
* feat(import): present default values on report
* feat(import): db migration
* feat(import): test default value
* test(import): support fieldmapping format v2
* feat(import): dynamic form on default values
* fix(import): no longer send empty string for date default value
* fix(import): test field jsonschema_definitions
* feat(import): fix occhab revisions
* feat(import): rename obsolete revision
* style(import): code format
* feat(import): fieldmapping validators deal with default values
* feat(import): occhab revision to set bib_fields.type_field
* fix(migration): move migration to geonature branch

---------

Co-authored-by: Pierre-Narcisi <[email protected]>
20cents and Pierre-Narcisi authored Dec 16, 2024
1 parent 08ec80d commit 38aea2f
Showing 30 changed files with 780 additions and 267 deletions.
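At a glance, the commit replaces plain column-name values in `TImports.fieldmapping` with structured objects. A minimal sketch of the two formats, with made-up field and column names (the structure follows the JSON schema added in backend/geonature/core/imports/models.py below):

# Hypothetical mappings; the names are illustrative, not from the commit.

# v1: each destination field maps directly to a source column name.
fieldmapping_v1 = {
    "date_min": "date_releve",
    "count_min": "effectif",
}

# v2: each destination field maps to an object carrying the source column
# and/or a constant fallback; at least one of the two keys is required.
fieldmapping_v2 = {
    "date_min": {"column_src": "date_releve"},
    "count_min": {"column_src": "effectif", "default_value": 1},
    "comment_description": {"default_value": "imported from legacy file"},
}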
20 changes: 16 additions & 4 deletions backend/geonature/core/gn_synthese/imports/actions.py
@@ -88,7 +88,7 @@ def check_transient_data(task, logger, imprt: TImports):
     selected_fields = {
         field_name: fields[field_name]
         for field_name, source_field in imprt.fieldmapping.items()
-        if source_field in imprt.columns
+        if source_field.get("column_src", None) in imprt.columns
     }
     init_rows_validity(imprt)
     task.update_state(state="PROGRESS", meta={"progress": 0.05})
@@ -218,7 +218,15 @@ def update_batch_progress(batch, step):
     do_nomenclatures_mapping(
         imprt,
         entity,
-        selected_fields,
+        {
+            field_name: fields[field_name]
+            for field_name, mapping in imprt.fieldmapping.items()
+            if field_name in fields
+            and (
+                mapping.get("column_src", None) in imprt.columns
+                or mapping.get("default_value") is not None
+            )
+        },
         fill_with_defaults=current_app.config["IMPORT"][
             "FILL_MISSING_NOMENCLATURE_WITH_DEFAULT_VALUE"
         ],
@@ -339,11 +347,15 @@ def import_data_to_destination(imprt: TImports) -> None:
         if field_name not in fields:  # not a destination field
             continue
         field = fields[field_name]
+        column_src = source_field.get("column_src", None)
         if field.multi:
-            if not set(source_field).isdisjoint(imprt.columns):
+            if not set(column_src).isdisjoint(imprt.columns):
                 insert_fields |= {field}
         else:
-            if source_field in imprt.columns:
+            if (
+                column_src in imprt.columns
+                or source_field.get("default_value", None) is not None
+            ):
                 insert_fields |= {field}
 
     insert_fields -= {fields["unique_dataset_id"]}  # Column only used for filling `id_dataset`
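A toy sketch of the new selection rule used above (data invented for illustration): a destination field is now kept when its mapped column is present in the file, or when the mapping carries a constant default value.

imprt_columns = ["col_date"]
fieldmapping = {
    "date_min": {"column_src": "col_date"},         # kept: column present
    "count_min": {"default_value": 3},              # kept: constant value
    "altitude_min": {"column_src": "col_missing"},  # dropped: neither
}
selected = {
    name
    for name, mapping in fieldmapping.items()
    if mapping.get("column_src") in imprt_columns
    or mapping.get("default_value") is not None
}
assert selected == {"date_min", "count_min"}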
6 changes: 3 additions & 3 deletions backend/geonature/core/imports/checks/dataframe/utils.py
@@ -7,7 +7,7 @@
 
 from geonature.utils.env import db
 
-from geonature.core.imports.models import ImportUserError, ImportUserErrorType
+from geonature.core.imports.models import ImportUserError, ImportUserErrorType, TImports
 from geonature.core.imports.utils import generated_fields
 
 
@@ -101,7 +101,7 @@ def __error_replace(*args, **kwargs):
     return _error_replace
 
 
-def report_error(imprt, entity, df, error):
+def report_error(imprt: TImports, entity, df, error):
     """
     Reports an error found in the dataframe, updates the validity column and insert
     the error in the `t_user_errors` table.
@@ -147,7 +147,7 @@ def report_error(imprt, entity, df, error):
         # f'{error_type.name}' # FIXME comment
     ordered_invalid_rows = sorted(invalid_rows["line_no"])
     column = generated_fields.get(error["column"], error["column"])
-    column = imprt.fieldmapping.get(column, column)
+    column = imprt.fieldmapping.get(column, {}).get("column_src", column)
     # If an error for same import, same column and of the same type already exists,
     # we concat existing erroneous rows with current rows.
     stmt = pg_insert(ImportUserError).values(
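The change to report_error reads as follows (a small sketch with invented names): error reports now resolve the user-facing column name through `column_src`, since the mapping value is no longer a plain string.

fieldmapping = {"date_min": {"column_src": "date_releve"}}
column = "date_min"
# Under the v1 format, fieldmapping.get(column, column) returned the source
# column string directly; with v2 the dict must be unwrapped.
column = fieldmapping.get(column, {}).get("column_src", column)
assert column == "date_releve"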
16 changes: 9 additions & 7 deletions backend/geonature/core/imports/checks/sql/core.py
@@ -36,9 +36,10 @@ def init_rows_validity(imprt: TImports, dataset_name_field: str = "id_dataset"):
     # as rows with multi-entity field only will raise an ORPHAN_ROW error
     selected_fields_names = []
     for field_name, source_field in imprt.fieldmapping.items():
-        if type(source_field) == list:
-            selected_fields_names.extend(set(source_field) & set(imprt.columns))
-        elif source_field in imprt.columns:
+        column_src = source_field.get("column_src", None)
+        if type(column_src) == list:
+            selected_fields_names.extend(set(column_src) & set(imprt.columns))
+        elif column_src in imprt.columns:
             selected_fields_names.append(field_name)
     for entity in entities:
         # Select fields associated to this entity *and only to this entity*
@@ -64,15 +65,16 @@ def init_rows_validity(imprt: TImports, dataset_name_field: str = "id_dataset"):
     )
 
 
-def check_orphan_rows(imprt):
+def check_orphan_rows(imprt: TImports):
     transient_table = imprt.destination.get_transient_table()
     # TODO: handle multi-source fields
     # This is actually not a big issue as multi-source fields are unlikely to also be multi-entity fields.
     selected_fields_names = []
     for field_name, source_field in imprt.fieldmapping.items():
-        if type(source_field) == list:
-            selected_fields_names.extend(set(source_field) & set(imprt.columns))
-        elif source_field in imprt.columns:
+        column_src = source_field.get("column_src", None)
+        if type(column_src) == list:
+            selected_fields_names.extend(set(column_src) & set(imprt.columns))
+        elif column_src in imprt.columns:
             selected_fields_names.append(field_name)
     # Select fields associated to multiple entities
     AllEntityField = sa.orm.aliased(EntityField)
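As both loops above show, `column_src` may be a list for multi-source fields. A small illustration (invented data) of how single and multi mappings contribute to the selection:

imprt_columns = ["observer_1", "date"]
fieldmapping = {
    "observers": {"column_src": ["observer_1", "observer_2"]},  # multi field
    "date_min": {"column_src": "date"},
}
selected_fields_names = []
for field_name, source_field in fieldmapping.items():
    column_src = source_field.get("column_src", None)
    if type(column_src) == list:
        # multi field: keep only the mapped columns actually present
        selected_fields_names.extend(set(column_src) & set(imprt_columns))
    elif column_src in imprt_columns:
        selected_fields_names.append(field_name)
# selected_fields_names == ["observer_1", "date_min"]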
2 changes: 1 addition & 1 deletion backend/geonature/core/imports/checks/sql/utils.py
@@ -64,7 +64,7 @@ def report_erroneous_rows(
     transient_table = imprt.destination.get_transient_table()
     error_type = ImportUserErrorType.query.filter_by(name=error_type).one()
     error_column = generated_fields.get(error_column, error_column)
-    error_column = imprt.fieldmapping.get(error_column, error_column)
+    error_column = imprt.fieldmapping.get(error_column, {}).get("column_src", error_column)
     if error_type.level in level_validity_mapping:
         assert entity is not None
         cte = (
27 changes: 23 additions & 4 deletions backend/geonature/core/imports/models.py
@@ -479,6 +479,7 @@ class BibFields(db.Model):
     fr_label = db.Column(db.Unicode, nullable=False)
     eng_label = db.Column(db.Unicode, nullable=True)
     type_field = db.Column(db.Unicode, nullable=True)
+    type_field_params = db.Column(MutableDict.as_mutable(JSON))
     mandatory = db.Column(db.Boolean, nullable=False)
     autogenerated = db.Column(db.Boolean, nullable=False)
     mnemonique = db.Column(db.Unicode, db.ForeignKey(BibNomenclaturesTypes.mnemonique))
@@ -608,7 +609,7 @@ def optional_conditions_to_jsonschema(name_field: str, optional_conditions: Iter
         "if": {
             "not": {
                 "properties": {
-                    field_opt: {"type": "string"} for field_opt in optional_conditions
+                    field_opt: {"type": "object"} for field_opt in optional_conditions
                 }
             }
         },
@@ -726,9 +727,27 @@ def validate_values(field_mapping_json):
         "type": "object",
         "properties": {
             field.name_field: {
-                "type": (
-                    "boolean" if field.autogenerated else ("array" if field.multi else "string")
-                ),
+                "type": "object",
+                "properties": {
+                    "column_src": {
+                        "type": (
+                            "boolean"
+                            if field.autogenerated
+                            else ("array" if field.multi else "string")
+                        ),
+                    },
+                    "default_value": {
+                        "oneOf": [
+                            {"type": "boolean"},
+                            {"type": "number"},
+                            {"type": "string"},
+                            {"type": "array"},
+                        ]
+                    },
+                },
+                "required": [],
+                "additionalProperties": False,
+                "anyOf": [{"required": ["column_src"]}, {"required": ["default_value"]}],
             }
             for field in fields
         },
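To make the schema change concrete, here is a hedged sketch validating mappings against a hand-written fragment of what validate_values now generates (one assumed field named date_min, using the jsonschema package):

from jsonschema import ValidationError, validate

schema_fragment = {
    "type": "object",
    "properties": {
        "date_min": {  # assumed non-autogenerated, non-multi field
            "type": "object",
            "properties": {
                "column_src": {"type": "string"},
                "default_value": {
                    "oneOf": [
                        {"type": "boolean"},
                        {"type": "number"},
                        {"type": "string"},
                        {"type": "array"},
                    ]
                },
            },
            "additionalProperties": False,
            "anyOf": [{"required": ["column_src"]}, {"required": ["default_value"]}],
        }
    },
}

validate({"date_min": {"default_value": "2024-01-01"}}, schema_fragment)  # accepted
try:
    validate({"date_min": {}}, schema_fragment)  # neither key present: rejected
except ValidationError:
    pass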
3 changes: 3 additions & 0 deletions backend/geonature/core/imports/routes/fields.py
@@ -70,8 +70,11 @@ def get_fields(scope, destination):
         fields=[
             "id_field",
             "name_field",
+            "type_field",
+            "type_field_params",
             "fr_label",
             "eng_label",
+            "mnemonique",
             "mandatory",
             "autogenerated",
             "multi",
7 changes: 5 additions & 2 deletions backend/geonature/core/imports/routes/imports.py
@@ -368,8 +368,11 @@ def get_import_values(scope, imprt):
             # this nomenclated field is not mapped
             continue
         source = imprt.fieldmapping[field.name_field]
-        if source not in imprt.columns:
-            # the file do not contain this field expected by the mapping
+        if (
+            source.get("column_src", None) not in imprt.columns
+            and source.get("default_value", None) is None
+        ):
+            # the file does not contain this field expected by the mapping and there is no default value
             continue
         # TODO: check that we do not have too many distinct values?
         column = field.source_column
53 changes: 39 additions & 14 deletions backend/geonature/core/imports/utils.py
@@ -4,7 +4,7 @@
 import json
 from enum import IntEnum
 from datetime import datetime, timedelta
-from typing import IO, Any, Dict, Iterable, List, Optional, Set, Tuple
+from typing import IO, Any, Dict, Iterable, List, Optional, Set, Tuple, Union
 
 from flask import current_app, render_template
 import sqlalchemy as sa
@@ -163,7 +163,9 @@ def detect_separator(file_: IO, encoding: str) -> Optional[str]:
     return dialect.delimiter
 
 
-def preprocess_value(dataframe: pd.DataFrame, field: BibFields, source_col: str) -> pd.Series:
+def preprocess_value(
+    dataframe: pd.DataFrame, field: BibFields, source_col: Union[str, List[str]], default_value: Any
+) -> pd.Series:
     """
     Preprocesses values in a DataFrame depending if the field contains multiple values (e.g. additional_data) or not.
@@ -184,8 +186,14 @@ def build_additional_data(columns: dict):
     """
 
     def build_additional_data(columns: dict):
+        try:
+            default_values = json.loads(default_value)
+        except Exception:
+            default_values = {}
         result = {}
         for key, value in columns.items():
+            if value is None or value == "":
+                value = default_values.get(key, None)
             if value is None:
                 continue
             try:
@@ -198,9 +206,17 @@ def build_additional_data(columns: dict):
 
     if field.multi:
         assert type(source_col) is list
+        for col in source_col:
+            if col not in dataframe.columns:
+                dataframe[col] = None
         col = dataframe[source_col].apply(build_additional_data, axis=1)
     else:
+        if source_col not in dataframe.columns:
+            dataframe[source_col] = None
         col = dataframe[source_col]
+        if default_value is not None:
+            col = col.replace({"": default_value, None: default_value})
 
     return col


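The single-column branch of preprocess_value boils down to a pandas replace; a toy run (invented column name, relying on pandas treating a None key as matching missing values):

import pandas as pd

df = pd.DataFrame({"effectif": ["4", "", None]})
default_value = 2
col = df["effectif"].replace({"": default_value, None: default_value})
print(col.tolist())  # expected: ['4', 2, 2]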
@@ -244,8 +260,10 @@ def insert_import_data_in_transient_table(imprt: TImports) -> int:
     }
     data.update(
         {
-            dest_field: preprocess_value(chunk, source_field["field"], source_field["value"])
-            for dest_field, source_field in fieldmapping.items()
+            dest_field: preprocess_value(
+                chunk, mapping["field"], mapping["column_src"], mapping["default_value"]
+            )
+            for dest_field, mapping in fieldmapping.items()
         }
     )
     # XXX keep extra_fields in t_imports_synthese? or add config argument?
@@ -293,21 +311,25 @@ def build_fieldmapping(
 
     for field in fields:
         if field.name_field in imprt.fieldmapping:
+            mapping = imprt.fieldmapping[field.name_field]
+            column_src = mapping.get("column_src", None)
+            default_value = mapping.get("default_value", None)
             if field.multi:
-                correct = list(set(columns) & set(imprt.fieldmapping[field.name_field]))
+                correct = list(set(columns) & set(column_src))
                 if len(correct) > 0:
                     fieldmapping[field.source_column] = {
-                        "value": correct,
                         "field": field,
+                        "column_src": correct,
+                        "default_value": default_value,
                     }
                     used_columns.extend(correct)
             else:
-                if imprt.fieldmapping[field.name_field] in columns:
-                    fieldmapping[field.source_column] = {
-                        "value": imprt.fieldmapping[field.name_field],
-                        "field": field,
-                    }
-                    used_columns.append(imprt.fieldmapping[field.name_field])
+                fieldmapping[field.source_column] = {
+                    "field": field,
+                    "column_src": column_src,
+                    "default_value": default_value,
+                }
+                used_columns.append(column_src)
     return fieldmapping, used_columns


@@ -442,8 +464,11 @@ def get_mapping_data(import_: TImports, entity: Entity):
     fields = {ef.field.name_field: ef.field for ef in entity.fields}
     selected_fields = {
         field_name: fields[field_name]
-        for field_name, source_field in import_.fieldmapping.items()
-        if source_field in import_.columns and field_name in fields
+        for field_name, mapping in import_.fieldmapping.items()
+        if (
+            mapping.get("column_src") in import_.columns or mapping.get("default_value") is not None
+        )
+        and field_name in fields
     }
     source_cols = set()
     for field in selected_fields.values():
59 additions & 0 deletions: new Alembic migration (revision a94bea44ab56)
@@ -0,0 +1,59 @@
"""bib_field.type_field conforms to dynamic_form.type_widget
Revision ID: a94bea44ab56
Revises: e43b01a18850
Create Date: 2024-12-11 15:44:52.912515
"""

from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision = "a94bea44ab56"
down_revision = "e43b01a18850"
branch_labels = None
depends_on = None


def upgrade():
    op.execute(
        """
        ALTER TABLE gn_imports.bib_fields ADD type_field_params jsonb NULL;
        """
    )
    op.execute(
        """
        UPDATE gn_imports.bib_fields
        SET type_field =
            CASE
                -- mnemonique is handled front side
                WHEN mnemonique IS NOT NULL AND mnemonique != '' THEN NULL
                -- multi is handled front side
                WHEN multi = true THEN NULL
                WHEN type_field IN ('integer', 'real') THEN 'number'
                WHEN type_field IN ('geometry', 'jsonb', 'json', 'wkt') THEN 'textarea'
                WHEN type_field LIKE 'timestamp%' THEN 'date'
                WHEN type_field ~ '^character varying\((\d+)\)$'
                    AND COALESCE(substring(type_field FROM '\d+')::int, 0) > 68 THEN 'textarea'
                -- Default: keep the current value.
                ELSE NULL
            END;
        """
    )


def downgrade():
    op.execute(
        """
        ALTER TABLE gn_imports.bib_fields DROP COLUMN type_field_params;
        """
    )
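For orientation, a few representative inputs and outputs of the CASE expression above (values assumed for illustration, not exhaustive):

expected_widget = {
    "integer": "number",
    "real": "number",
    "jsonb": "textarea",
    "timestamp without time zone": "date",
    "character varying(255)": "textarea",  # declared length 255 > 68
    "character varying(20)": None,         # matches the regex but length <= 68
    "text": None,                          # no branch matches, falls to ELSE NULL
}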