Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: fills in empty fields with NaN #840

Merged
merged 3 commits into from
Oct 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions hexa/datasets/migrations/0012_alter_datasetfilesample_sample.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Generated by Django 4.2.16 on 2024-10-22 12:00

from django.db import migrations, models

import hexa.datasets.models


class Migration(migrations.Migration):
    """Alter ``DatasetFileSample.sample`` so it is encoded with ``DataframeJsonEncoder``."""

    # Must run after the previous alteration of the same field.
    dependencies = [("datasets", "0011_alter_datasetfilesample_sample")]

    operations = [
        migrations.AlterField(
            model_name="datasetfilesample",
            name="sample",
            # Nullable, optional JSON column defaulting to an empty list and
            # serialized through the project's dataframe-aware encoder.
            field=models.JSONField(
                encoder=hexa.datasets.models.DataframeJsonEncoder,
                default=list,
                blank=True,
                null=True,
            ),
        ),
    ]
26 changes: 25 additions & 1 deletion hexa/datasets/models.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import logging
import math
import secrets
from functools import cached_property

Expand Down Expand Up @@ -350,6 +351,24 @@ class Meta:
ordering = ["uri"]


def _null_non_finite(item):
    """Return a copy of ``item`` in which every non-finite float is ``None``.

    Dicts, lists and tuples are traversed recursively (tuples come back as
    lists, which is how JSON serializes them anyway). Any other value is
    returned unchanged.
    """
    if isinstance(item, float):
        # NaN, +inf and -inf have no JSON representation; map them to null.
        return item if math.isfinite(item) else None
    if isinstance(item, dict):
        return {key: _null_non_finite(value) for key, value in item.items()}
    if isinstance(item, (list, tuple)):
        return [_null_non_finite(element) for element in item]
    return item


class DataframeJsonEncoder(DjangoJSONEncoder):
    """JSON encoder that writes NaN and infinite floats as ``null``.

    The standard encoder serializes floats natively, without calling
    ``default()``, and emits the bare tokens ``NaN`` / ``Infinity`` /
    ``-Infinity`` for non-finite values. Those tokens are not valid JSON, so
    the payload is sanitized before it is handed to the parent encoder.
    """

    def encode(self, obj):
        """Serialize ``obj`` to a JSON string, with non-finite floats as ``null``."""
        return super().encode(_null_non_finite(obj))

    def iterencode(self, o, _one_shot=False):
        """Yield JSON chunks for ``o``, with non-finite floats as ``null``.

        Overridden as well as ``encode`` so that streaming callers such as
        ``json.dump`` get the same sanitizing. The replacement is idempotent,
        so being applied on both entry points is harmless.
        """
        return super().iterencode(_null_non_finite(o), _one_shot)


class DatasetFileSample(Base):
STATUS_PROCESSING = "PROCESSING"
STATUS_FAILED = "FAILED"
Expand All @@ -360,7 +379,12 @@ class DatasetFileSample(Base):
(STATUS_FAILED, _("Failed")),
(STATUS_FINISHED, _("Finished")),
]
sample = JSONField(blank=True, default=list, null=True, encoder=DjangoJSONEncoder)
sample = JSONField(
blank=True,
default=list,
null=True,
encoder=DataframeJsonEncoder,
)
status = models.CharField(
max_length=10,
choices=STATUS_CHOICES,
Expand Down
4 changes: 4 additions & 0 deletions hexa/datasets/tests/fixtures/example_with_nan.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
name,surname,age,married
Joe,Doe,10,True
Liam,Smith,,False
Emma,Johnson,,False
28 changes: 27 additions & 1 deletion hexa/datasets/tests/test_generate_sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,12 @@
from pandas.errors import EmptyDataError

from hexa.core.test import TestCase
from hexa.datasets.models import Dataset, DatasetFileSample, DatasetVersionFile
from hexa.datasets.models import (
DataframeJsonEncoder,
Dataset,
DatasetFileSample,
DatasetVersionFile,
)
from hexa.datasets.queue import (
add_system_attributes,
generate_sample,
Expand All @@ -21,6 +26,12 @@
from hexa.workspaces.models import Workspace, WorkspaceMembershipRole


class TestDataframeJsonEncoder(TestCase):
    """Checks how ``DataframeJsonEncoder`` serializes NaN values."""

    def test_default(self):
        """A NaN float stored under a dict key is written as JSON ``null``."""
        payload = {"a": float("nan")}
        serialized = DataframeJsonEncoder().encode(payload)
        self.assertEqual(serialized, '{"a": null}')


class TestCreateDatasetFileSampleTask(TestCase, DatasetTestMixin):
@classmethod
def setUpTestData(cls):
Expand Down Expand Up @@ -116,6 +127,21 @@ def test_generate_sample(
],
None,
),
(
"example_with_nan.csv",
DatasetFileSample.STATUS_FINISHED,
[
{
"age": None,
"name": "Liam",
"married": False,
"surname": "Smith",
},
{"age": 10.0, "name": "Joe", "married": True, "surname": "Doe"},
{"age": 10.0, "name": "Joe", "married": True, "surname": "Doe"},
],
None,
),
]
for (
fixture_name,
Expand Down
Loading