-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: add generalized transformation from pandas to numpy
- Loading branch information
1 parent
8572b4a
commit 0ed1e1f
Showing
1 changed file
with
141 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,141 @@ | ||
from typing import Union | ||
|
||
import numpy as np | ||
import pandas as pd | ||
|
||
|
||
def df_to_array(df: Union[pd.DataFrame, pd.Series]) -> np.ndarray:
    """
    Transforms a pandas data frame (or series) to a numpy array.

    Unlike a plain ``np.array(df)`` cast, cells that hold lists or arrays are
    stacked into additional numeric dimensions instead of being kept as
    Python objects inside an ``object`` array.

    Args:
        df: the pandas data frame. A pandas Series is accepted as well; each
            of its elements is then treated as one row.

    Returns:
        a numpy array. For a data frame the first axis indexes the rows and
        the second axis the columns; any further axes come from the cell
        contents. For a series the first axis indexes the elements.

    Examples:
        Same result as np.array(df) if rows of df are one dimensional:
        >>> df_one = pd.DataFrame({
        ...     'x_0': [1, 2, 3],
        ...     'x_1': [4, 5, 6],
        ...     'x_2': [7, 8, 9]})
        >>> np.array_equal(np.array(df_one), df_to_array(df_one))
        True

        If the rows contain lists ...
        >>> df_list = pd.DataFrame({
        ...     'x_0': [[0, 0], [1, 0], [2, 0]],
        ...     'x_1': [[0, 1], [1, 1], [2, 1]],
        ...     'x_2': [[0, 2], [1, 2], [2, 2]]
        ... })
        >>> array_transformed = df_to_array(df_list)
        >>> array_cast = np.array(df_list)

        the results are not equal:
        >>> np.array_equal(array_transformed, array_cast)
        False

        The cast array contains objects which are hard to work with:
        >>> array_cast
        array([[list([0, 0]), list([0, 1]), list([0, 2])],
               [list([1, 0]), list([1, 1]), list([1, 2])],
               [list([2, 0]), list([2, 1]), list([2, 2])]], dtype=object)

        The transformed array contains vectors (numbers):
        >>> array_transformed
        array([[[0, 0],
                [0, 1],
                [0, 2]],
        <BLANKLINE>
               [[1, 0],
                [1, 1],
                [1, 2]],
        <BLANKLINE>
               [[2, 0],
                [2, 1],
                [2, 2]]])

        ... the same is true for arrays:
        >>> df_array = pd.DataFrame({
        ...     'x_0': [np.array([0, 0]), np.array([1, 0]), np.array([2, 0])],
        ...     'x_1': [np.array([0, 1]), np.array([1, 1]), np.array([2, 1])],
        ...     'x_2': [np.array([0, 2]), np.array([1, 2]), np.array([2, 2])]
        ... })
        >>> array_transformed = df_to_array(df_array)
        >>> array_cast = np.array(df_array)

        the results are not equal:
        >>> np.array_equal(array_transformed, array_cast)
        False

        The cast array is a two dimensional array of objects which are hard
        to work with:
        >>> array_cast.dtype
        dtype('O')
        >>> array_cast.shape
        (3, 3)

        The transformed array contains vectors (numbers):
        >>> array_transformed
        array([[[0, 0],
                [0, 1],
                [0, 2]],
        <BLANKLINE>
               [[1, 0],
                [1, 1],
                [1, 2]],
        <BLANKLINE>
               [[2, 0],
                [2, 1],
                [2, 2]]])

        This also works with more nesting:
        >>> df_nested = pd.DataFrame({
        ...     'x_0': [[[0,0],[1,1]], [[0,0],[2,2]]],
        ...     'x_1': [[[1,1],[1,1]], [[1,1],[2,2]]]
        ... })
        >>> df_to_array(df_nested)
        array([[[[0, 0],
                 [1, 1]],
        <BLANKLINE>
                [[1, 1],
                 [1, 1]]],
        <BLANKLINE>
        <BLANKLINE>
               [[[0, 0],
                 [2, 2]],
        <BLANKLINE>
                [[1, 1],
                 [2, 2]]]])

        A series is transformed element by element:
        >>> df_to_array(pd.Series([[0, 0], [1, 1]]))
        array([[0, 0],
               [1, 1]])

        When the inner lists don't have the same shape, NumPy cannot build a
        regular array (recent NumPy versions raise a ValueError). In that
        case one can use the flattening version ``df_to_array_flatten``
        (ATTENTION: when using the flattening version, information about
        which entry belongs to which condition is lost).
    """
    if isinstance(df, pd.Series):
        # A Series has no ``iterrows``; every element is one "row".
        return np.array(list(df))

    # Turning each row into a plain list lets np.array recurse into nested
    # lists/arrays instead of storing them as opaque objects.
    rows = [list(row) for _, row in df.iterrows()]
    return np.array(rows)
|
||
|
||
def df_to_array_flatten(df: Union[pd.DataFrame, pd.Series]) -> np.ndarray:
    """
    Flattens elements in a pandas DataFrame to resolve shape inconsistencies.

    Every row is turned into one flat vector: sequence-like cells (lists,
    tuples, numpy arrays) are unravelled and scalar cells contribute a single
    entry. The information about which entry came from which column is lost.

    Args:
        df: A pandas DataFrame or Series with inconsistent element shapes.
            A Series is treated as a DataFrame with a single column.

    Returns:
        A numpy array where all elements are flattened (one row of the
        result per row of the input).

    Example:
        >>> df_inconsistent = pd.DataFrame({
        ...     'x_0': [0, 2, 4],
        ...     'x_1': [[1, 1], [3, 3], [5, 5]]
        ... })
        >>> df_to_array_flatten(df_inconsistent)
        array([[0, 1, 1],
               [2, 3, 3],
               [4, 5, 5]])
    """
    if isinstance(df, pd.Series):
        # A Series has no ``iterrows``; handle it as a one-column frame.
        df = df.to_frame()

    flat_rows = []
    for _, row in df.iterrows():
        # Sequence-like cells are unravelled to 1-D; everything else is
        # wrapped so np.concatenate sees only 1-D pieces.
        pieces = [
            np.ravel(cell) if isinstance(cell, (list, tuple, np.ndarray)) else [cell]
            for cell in row
        ]
        flat_rows.append(np.concatenate(pieces))
    return np.array(flat_rows)