From acb6d0a2d8b357dab7972a05b3d5ec1247008498 Mon Sep 17 00:00:00 2001 From: Robbe Sneyders Date: Tue, 12 Mar 2024 16:09:32 +0100 Subject: [PATCH] Add string as fallback index type when writing data (#904) `pa.from_numpy_dtype` fails when passing an object dtype, which Dask uses when the dtype is not explicitly known. I propose to fall back on `string` if this is the case. This will probably be correct for the index column in most cases when the `dtype` is unknown, and non-breaking in most other cases. --------- Co-authored-by: Matthias Richter --- src/fondant/component/data_io.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/fondant/component/data_io.py b/src/fondant/component/data_io.py index 48b4082c..703dd6a9 100644 --- a/src/fondant/component/data_io.py +++ b/src/fondant/component/data_io.py @@ -214,9 +214,19 @@ def _write_dataframe(self, dataframe: dd.DataFrame) -> None: # The id needs to be added explicitly since we will convert this to a PyArrow schema # later and use it in the `pandas.to_parquet` method. + try: + index_type = pa.from_numpy_dtype(dataframe.index.dtype) + except pa.lib.ArrowNotImplementedError: + # The dtype of the index is `np.object_`. Fall back on string instead. + logging.warning( + "Failed to infer dtype of index column, falling back to `string`. " + "Specify the dtype explicitly to prevent this.", + ) + index_type = pa.string() + schema.update( { - "id": pa.from_numpy_dtype(dataframe.index.dtype), + "id": index_type, }, )