-
-
Notifications
You must be signed in to change notification settings - Fork 8
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
✨ ETL convenience function for converting array to dataframe
Adding a new module to deepicedrain for Extract, Transform and Load (ETL) workflows! Putting slices of a 2D array into several columns inside a dataframe is now easier with the array_to_dataframe function. Inspired by dask/dask#5021. The function is generalized so that dask arrays convert to a dask DataFrame, and numpy arrays convert to a pandas DataFrame.
- Loading branch information
Showing
4 changed files
with
84 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
""" | ||
Extract, Transform and Load (ETL) functions for handling ICESat-2 point clouds. | ||
Copies data seamlessly between different array structures and file formats! | ||
""" | ||
import pandas as pd | ||
|
||
|
||
def array_to_dataframe(array, colname: str = None, startcol: int = 0):
    """
    Converts a 1D or 2D data array into a tidy dataframe structure.

    An array of shape (m, n) will turn into a table with m rows and n columns.
    These are the possible conversions:

    - numpy array -> pandas DataFrame
    - dask Array -> dask DataFrame

    Pass in a colname to set the column name. By default, it will automatically
    use the array.name attribute (as found on dask Arrays) when one is present,
    but this can be overridden.

    For 2D arrays, columns will be formatted as 'col_0', 'col_1', 'col_2' and
    so on. The startcol argument allows adjustment of the starting column
    number, helpful if you prefer starting from 1, e.g. 'col_1', 'col_2', etc.
    When no column name is available at all, 2D columns are named by their
    number only, e.g. '0', '1', '2'.

    See also https://github.com/dask/dask/issues/5021

    Parameters
    ----------
    array
        Any object with ``ndim`` and ``shape`` attributes. Objects that provide
        a ``to_dask_dataframe`` method are converted with it; everything else
        is converted with pandas.
    colname : str, optional
        Column name (1D) or column name prefix (2D). Falls back to
        ``array.name`` if that exists and is not empty, otherwise to ''.
    startcol : int
        Number given to the first column of a 2D array. Default is 0.

    Returns
    -------
    dataframe
        The result of ``array.to_dask_dataframe(columns=...)`` when that
        method is available, otherwise a pandas.DataFrame.

    Raises
    ------
    ValueError
        If the array is neither 1-dimensional nor 2-dimensional.
    """
    if not colname:
        # A missing name and a name of None are both treated as "no name", so
        # the string formatting below never has to deal with None
        colname = getattr(array, "name", None) or ""

    if array.ndim == 1:  # 1-dimensional arrays
        columns = colname
    elif array.ndim == 2:  # 2-dimensional arrays
        prefix = f"{colname}_" if colname else ""  # add underscore to name
        columns = [f"{prefix}{i + startcol}" for i in range(array.shape[1])]
    else:
        raise ValueError(
            f"Expected a 1D or 2D array, but got {array.ndim} dimensions instead"
        )

    # Check for the dask conversion method up front instead of catching
    # AttributeError, so that an AttributeError raised *inside* dask is not
    # mistaken for "this is not a dask array"
    if hasattr(array, "to_dask_dataframe"):
        return array.to_dask_dataframe(columns=columns)

    # Fallback to converting to pandas.DataFrame
    if array.ndim == 1:
        # pandas needs a list of column labels, a bare string is rejected
        return pd.DataFrame(data=array, columns=[columns])
    return pd.DataFrame.from_records(data=array, columns=columns)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
""" | ||
Tests the array_to_dataframe function | ||
""" | ||
import dask | ||
import numpy as np | ||
import pandas as pd | ||
import pytest | ||
|
||
from deepicedrain import array_to_dataframe | ||
|
||
|
||
@pytest.mark.parametrize("shape", [(10, 1), (10, 2)])
def test_numpy_array_to_pandas_dataframe(shape):
    """
    Test converting from a numpy.array to a pandas.DataFrame, and ensure that
    both the default column names and the colname argument work.
    """
    array: np.ndarray = np.ones(shape=shape)

    # Without a colname, the columns are named by their number only
    dataframe = array_to_dataframe(array=array)

    assert isinstance(dataframe, pd.DataFrame)
    assert len(dataframe.columns) == shape[1]
    assert dataframe.columns.to_list() == [str(i) for i in range(shape[1])]

    # With a colname, it becomes an underscore-separated prefix of each column
    named_dataframe = array_to_dataframe(array=array, colname="col")

    assert isinstance(named_dataframe, pd.DataFrame)
    assert len(named_dataframe.columns) == shape[1]
    assert named_dataframe.columns.to_list() == [
        f"col_{i}" for i in range(shape[1])
    ]
|
||
|
||
@pytest.mark.parametrize("shape", [(10, 1), (10, 2)])
def test_dask_array_to_dask_dataframe(shape):
    """
    Check that a named dask.array is turned into a dask.dataframe, and that
    startcol=1 makes the column numbering begin at 1 instead of 0.
    """
    n_columns = shape[1]
    # Columns are expected to use the array's name as a prefix
    expected_names = [f"varname_{i+1}" for i in range(n_columns)]

    source: dask.array.core.Array = dask.array.ones(shape=shape, name="varname")
    result = array_to_dataframe(array=source, startcol=1)

    assert isinstance(result, dask.dataframe.core.DataFrame)
    assert len(result.columns) == n_columns
    assert result.columns.to_list() == expected_names