✨ ETL convenience function for converting array to dataframe
Adding a new module to deepicedrain for Extract, Transform and Load (ETL) workflows! Putting slices of a 2D array into several columns inside a dataframe is now easier with the array_to_dataframe function. Inspired by dask/dask#5021. The function is generalized so that dask arrays convert to a dask DataFrame, and numpy arrays convert to a pandas DataFrame.
weiji14 committed Aug 18, 2020
1 parent 99f9750 commit 8c2bf32
Showing 4 changed files with 84 additions and 0 deletions.
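Before the file-by-file diff, here is a minimal usage sketch of the conversions described in the commit message above. It is illustrative only (not part of the commit) and assumes deepicedrain with this change installed alongside numpy, pandas and dask.

```python
# Minimal usage sketch (not part of the commit): numpy -> pandas and dask -> dask.
import dask.array
import dask.dataframe
import numpy as np
import pandas as pd

from deepicedrain import array_to_dataframe

# numpy array -> pandas DataFrame; default column names are '0', '1', ...
np_array: np.ndarray = np.zeros(shape=(10, 2))
pd_df = array_to_dataframe(array=np_array)
assert isinstance(pd_df, pd.DataFrame)
print(pd_df.columns.to_list())  # ['0', '1']

# dask Array -> dask DataFrame; columns default to the array's name attribute
dask_array = dask.array.ones(shape=(10, 2), name="height")
dask_df = array_to_dataframe(array=dask_array)
assert isinstance(dask_df, dask.dataframe.DataFrame)
print(dask_df.columns.to_list())  # ['height_0', 'height_1']
```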
3 changes: 3 additions & 0 deletions deepicedrain/README.md
@@ -16,3 +16,6 @@ Contents:
  - Region - Bounding box data class structure that has xarray subsetting capabilities and more!
  - deltatime_to_utctime - Converts GPS time from an epoch (default is 2018 Jan 1st) to UTC time
  - lonlat_to_xy - Reprojects longitude/latitude EPSG:4326 coordinates to x/y EPSG:3031 coordinates

- :card_file_box: extraload.py - Convenience functions for extracting, transforming and loading data
  - array_to_dataframe - Turns a 1D/2D numpy/dask array into a tidy pandas/dask dataframe table
1 change: 1 addition & 0 deletions deepicedrain/__init__.py
@@ -5,6 +5,7 @@

import deepicedrain
from deepicedrain.deltamath import calculate_delta, nanptp, nan_linregress
from deepicedrain.extraload import array_to_dataframe
from deepicedrain.spatiotemporal import Region, deltatime_to_utctime, lonlat_to_xy

__version__: str = "0.2.1"
43 changes: 43 additions & 0 deletions deepicedrain/extraload.py
@@ -0,0 +1,43 @@
"""
Extract, Tranform and Load (ETL) functions for handling ICESat-2 point clouds.
Copies data seamlessly between different array structures and file formats!
"""
import pandas as pd


def array_to_dataframe(array, colname: str = None, startcol: int = 0):
"""
Converts a 1D or 2D data array into a tidy dataframe structure.
An array of shape (m, n) will turn into a table with m rows and n columns.
These are the possible conversions:
- numpy array -> pandas DataFrame
- dask Array -> dask DataFrame
Pass in a colname to set the column name. By default, it will automatically
use the array.name attribute in dask Arrays, but this can be overriden.
For 2D arrays, columns will be formatted as 'col_0', 'col_1', 'col_2' and
so on. The startcol argument allows adjustment of the starting column
number, helpful if you prefer starting from 1, e.g. 'col_1', 'col_2', etc.
See also https://github.com/dask/dask/issues/5021
"""
if not colname:
colname = array.name if hasattr(array, "name") else ""

if array.ndim == 1: # 1-dimensional arrays
columns = colname
elif array.ndim == 2: # 2-dimensional arrays
colname += "_" if colname != "" else "" # add underscore to name
columns = [f"{colname}{i+startcol}" for i in range(array.shape[1])]

try:
# Attempt dask Array to dask DataFrame conversion
dataframe: dask.dataframe.core.DataFrame = array.to_dask_dataframe(
columns=columns
)
except AttributeError:
# Fallback to converting to pandas.DataFrame
dataframe: pd.DataFrame = pd.DataFrame.from_records(data=array, columns=columns)

return dataframe
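To complement the docstring above, a short hedged sketch (not part of the commit) of how the colname and startcol arguments might be used:

```python
# Illustrative sketch of the colname/startcol arguments (not part of the commit).
import numpy as np

from deepicedrain import array_to_dataframe

array_2d: np.ndarray = np.arange(6).reshape(3, 2)

# Override the column prefix and start numbering from 1 -> 'col_1', 'col_2'
dataframe = array_to_dataframe(array=array_2d, colname="col", startcol=1)
print(dataframe.columns.to_list())  # ['col_1', 'col_2']
print(dataframe.shape)  # (3, 2)
```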
37 changes: 37 additions & 0 deletions deepicedrain/tests/test_array_to_dataframe.py
@@ -0,0 +1,37 @@
"""
Tests the array_to_dataframe function
"""
import dask
import numpy as np
import pandas as pd
import pytest

from deepicedrain import array_to_dataframe


@pytest.mark.parametrize("shape", [(10, 1), (10, 2)])
def test_numpy_array_to_pandas_dataframe(shape):
"""
Test converting from a numpy.array to a pandas.Dataframe, and ensure that
the colname argument works.
"""
array: np.ndarray = np.ones(shape=shape)
dataframe = array_to_dataframe(array=array)

assert isinstance(dataframe, pd.DataFrame)
assert len(dataframe.columns) == shape[1]
assert dataframe.columns.to_list() == [str(i) for i in range(shape[1])]


@pytest.mark.parametrize("shape", [(10, 1), (10, 2)])
def test_dask_array_to_dask_dataframe(shape):
"""
Test converting from a dask.array to a dask.dataframe, and ensure that the
startcol argument works.
"""
array: dask.array.core.Array = dask.array.ones(shape=shape, name="varname")
dataframe = array_to_dataframe(array=array, startcol=1)

assert isinstance(dataframe, dask.dataframe.core.DataFrame)
assert len(dataframe.columns) == shape[1]
assert dataframe.columns.to_list() == [f"varname_{i+1}" for i in range(shape[1])]
