From e8a76ff37fac44fd51306a62bd11b74dcb539edf Mon Sep 17 00:00:00 2001 From: nazarfil Date: Tue, 22 Oct 2024 11:17:11 +0200 Subject: [PATCH 1/3] fix: fills in empty field with NaN --- hexa/datasets/queue.py | 2 +- hexa/datasets/tests/fixtures/example_with_nan.csv | 4 ++++ hexa/datasets/tests/test_generate_sample.py | 15 +++++++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) create mode 100644 hexa/datasets/tests/fixtures/example_with_nan.csv diff --git a/hexa/datasets/queue.py b/hexa/datasets/queue.py index 81c8f19a3..996052cba 100644 --- a/hexa/datasets/queue.py +++ b/hexa/datasets/queue.py @@ -76,7 +76,7 @@ def generate_sample( random_state=SAMPLING_SEED, replace=True, ) - dataset_file_sample.sample = sample.to_dict(orient="records") + dataset_file_sample.sample = sample.fillna("NaN").to_dict(orient="records") dataset_file_sample.status = DatasetFileSample.STATUS_FINISHED except Exception as e: logger.exception( diff --git a/hexa/datasets/tests/fixtures/example_with_nan.csv b/hexa/datasets/tests/fixtures/example_with_nan.csv new file mode 100644 index 000000000..4998f7e8c --- /dev/null +++ b/hexa/datasets/tests/fixtures/example_with_nan.csv @@ -0,0 +1,4 @@ +name,surname,age,married +Joe,Doe,10,True +Liam,Smith,,False +Emma,Johnson,,False \ No newline at end of file diff --git a/hexa/datasets/tests/test_generate_sample.py b/hexa/datasets/tests/test_generate_sample.py index c1e0c7d70..9e8df33f0 100644 --- a/hexa/datasets/tests/test_generate_sample.py +++ b/hexa/datasets/tests/test_generate_sample.py @@ -116,6 +116,21 @@ def test_generate_sample( ], None, ), + ( + "example_with_nan.csv", + DatasetFileSample.STATUS_FINISHED, + [ + { + "age": "NaN", + "name": "Liam", + "married": False, + "surname": "Smith", + }, + {"age": 10.0, "name": "Joe", "married": True, "surname": "Doe"}, + {"age": 10.0, "name": "Joe", "married": True, "surname": "Doe"}, + ], + None, + ), ] for ( fixture_name, From 0b38d465dc8d052c3b9293ddb44778d3b78c7c08 Mon Sep 17 00:00:00 2001 From: nazarfil Date: Tue, 22 Oct 2024 14:43:05 +0200 Subject: [PATCH 2/3] try encoder --- .../0012_alter_datasetfilesample_sample.py | 24 +++++++++++++++++++ hexa/datasets/models.py | 23 +++++++++++++++++- hexa/datasets/queue.py | 2 +- 3 files changed, 47 insertions(+), 2 deletions(-) create mode 100644 hexa/datasets/migrations/0012_alter_datasetfilesample_sample.py diff --git a/hexa/datasets/migrations/0012_alter_datasetfilesample_sample.py b/hexa/datasets/migrations/0012_alter_datasetfilesample_sample.py new file mode 100644 index 000000000..f22437ca9 --- /dev/null +++ b/hexa/datasets/migrations/0012_alter_datasetfilesample_sample.py @@ -0,0 +1,24 @@ +# Generated by Django 4.2.16 on 2024-10-22 12:00 + +from django.db import migrations, models + +import hexa.datasets.models + + +class Migration(migrations.Migration): + dependencies = [ + ("datasets", "0011_alter_datasetfilesample_sample"), + ] + + operations = [ + migrations.AlterField( + model_name="datasetfilesample", + name="sample", + field=models.JSONField( + blank=True, + default=list, + encoder=hexa.datasets.models.DataframeJsonEncoder, + null=True, + ), + ), + ] diff --git a/hexa/datasets/models.py b/hexa/datasets/models.py index 94d178d66..2adce3654 100644 --- a/hexa/datasets/models.py +++ b/hexa/datasets/models.py @@ -1,4 +1,5 @@ import logging +import math import secrets from functools import cached_property @@ -350,6 +351,24 @@ class Meta: ordering = ["uri"] +class DataframeJsonEncoder(DjangoJSONEncoder): + def default(self, obj): + if isinstance(obj, float) and (math.isnan(obj) or math.isinf(obj)): + return None + elif isinstance(obj, (list, dict)): + return self.clean_nan_values(obj) + return super().default(obj) + + def clean_nan_values(self, data): + if isinstance(data, dict): + return {k: self.clean_nan_values(v) for k, v in data.items()} + elif isinstance(data, list): + return [self.clean_nan_values(i) for i in data] + elif isinstance(data, float) and (math.isnan(data) or math.isinf(data)): + return None + return data + + class DatasetFileSample(Base): STATUS_PROCESSING = "PROCESSING" STATUS_FAILED = "FAILED" @@ -360,7 +379,9 @@ class DatasetFileSample(Base): (STATUS_FAILED, _("Failed")), (STATUS_FINISHED, _("Finished")), ] - sample = JSONField(blank=True, default=list, null=True, encoder=DjangoJSONEncoder) + sample = JSONField( + blank=True, default=list, null=True, encoder=DataframeJsonEncoder + ) status = models.CharField( max_length=10, choices=STATUS_CHOICES, diff --git a/hexa/datasets/queue.py b/hexa/datasets/queue.py index 996052cba..81c8f19a3 100644 --- a/hexa/datasets/queue.py +++ b/hexa/datasets/queue.py @@ -76,7 +76,7 @@ def generate_sample( random_state=SAMPLING_SEED, replace=True, ) - dataset_file_sample.sample = sample.fillna("NaN").to_dict(orient="records") + dataset_file_sample.sample = sample.to_dict(orient="records") dataset_file_sample.status = DatasetFileSample.STATUS_FINISHED except Exception as e: logger.exception( From f441e8aab406be80f0d752e214b8f4283d2afcad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20G=C3=A9r=C3=B4me?= Date: Tue, 22 Oct 2024 17:08:14 +0200 Subject: [PATCH 3/3] fix(jsonfield): Override encode() to remove NaN values (#841) --- hexa/datasets/models.py | 35 +++++++++++---------- hexa/datasets/tests/test_generate_sample.py | 15 +++++++-- 2 files changed, 32 insertions(+), 18 deletions(-) diff --git a/hexa/datasets/models.py b/hexa/datasets/models.py index 2adce3654..590a4167b 100644 --- a/hexa/datasets/models.py +++ b/hexa/datasets/models.py @@ -352,21 +352,21 @@ class Meta: class DataframeJsonEncoder(DjangoJSONEncoder): - def default(self, obj): - if isinstance(obj, float) and (math.isnan(obj) or math.isinf(obj)): - return None - elif isinstance(obj, (list, dict)): - return self.clean_nan_values(obj) - return super().default(obj) - - def clean_nan_values(self, data): - if isinstance(data, dict): - return {k: self.clean_nan_values(v) for k, v in data.items()} - elif isinstance(data, list): - return [self.clean_nan_values(i) for i in data] - elif isinstance(data, float) and (math.isnan(data) or math.isinf(data)): - return None - return data + def encode(self, obj): + # Recursively replace NaN with None (since it's a float, it does not call 'default' method) + def replace_nan(item): + if isinstance(item, float) and math.isnan(item): + return None + elif isinstance(item, dict): + return {key: replace_nan(value) for key, value in item.items()} + elif isinstance(item, list): + return [replace_nan(element) for element in item] + return item + + # Preprocess the object to replace NaN values + obj = replace_nan(obj) + # Use the superclass's encode method to serialize the preprocessed object + return super().encode(obj) class DatasetFileSample(Base): @@ -380,7 +380,10 @@ class DatasetFileSample(Base): (STATUS_FINISHED, _("Finished")), ] sample = JSONField( - blank=True, default=list, null=True, encoder=DataframeJsonEncoder + blank=True, + default=list, + null=True, + encoder=DataframeJsonEncoder, ) status = models.CharField( max_length=10, diff --git a/hexa/datasets/tests/test_generate_sample.py b/hexa/datasets/tests/test_generate_sample.py index 9e8df33f0..1761908e2 100644 --- a/hexa/datasets/tests/test_generate_sample.py +++ b/hexa/datasets/tests/test_generate_sample.py @@ -7,7 +7,12 @@ from pandas.errors import EmptyDataError from hexa.core.test import TestCase -from hexa.datasets.models import Dataset, DatasetFileSample, DatasetVersionFile +from hexa.datasets.models import ( + DataframeJsonEncoder, + Dataset, + DatasetFileSample, + DatasetVersionFile, +) from hexa.datasets.queue import ( add_system_attributes, generate_sample, @@ -21,6 +26,12 @@ from hexa.workspaces.models import Workspace, WorkspaceMembershipRole +class TestDataframeJsonEncoder(TestCase): + def test_default(self): + encoder = DataframeJsonEncoder() + self.assertEqual(encoder.encode({"a": float("nan")}), '{"a": null}') + + class TestCreateDatasetFileSampleTask(TestCase, DatasetTestMixin): @classmethod def setUpTestData(cls): @@ -121,7 +132,7 @@ def test_generate_sample( DatasetFileSample.STATUS_FINISHED, [ { - "age": "NaN", + "age": None, "name": "Liam", "married": False, "surname": "Smith",