Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add dataframe conversions to flascdataframe #211

Merged
merged 3 commits into from
Sep 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
264 changes: 174 additions & 90 deletions flasc/flasc_dataframe.py
Original file line number Diff line number Diff line change
@@ -1,73 +1,93 @@
"""FLASC DataFrame module."""

from pandas import DataFrame


# Create a new DataFrame subclass
class FlascDataFrame(DataFrame):
"""Subclass of pandas.DataFrame for working with FLASC data.

I think it makes most sense to store it as FLASC expects it:
- with the correct column names
- in wide format

Then, can offer a transformation to export as the user would like it, for them to work on it
further. How, then, would I revert it back to the needed format

Stores data in the preferred FLASC format or in the user format, with the option to convert between the two.

Two possible types of data we should try to handle:
1. Semiwide:
- One column for time stamp
- One column for turbine id
- Many data channel columns
2. Long:
- One column for time stamp
- One column for variable name
- One column for value

FLASC format is wide, i.e.
- One column for time stamp
- One column for each channel for each turbine

Want handling to go between long and wide and semiwide and wide.
Want handling to go between long and wide.
"""

# Attributes to pickle must be in this list
_metadata = ["name_map", "_user_format"]

def __init__(self, *args, name_map=None, **kwargs):
_metadata = [
"channel_name_map",
"_channel_name_map_to_user",
"_user_format",
"_long_data_columns",
]

def __init__(self, *args, channel_name_map=None, long_data_columns=None, **kwargs):
"""Initialize the FlascDataFrame class, a subclass of pandas.DataFrame.

Args:
*args: arguments to pass to the DataFrame constructor
name_map (dict): Dictionary of column names to map from the user format to the FLASC
format, where the key string is the user format and the value string is the FLASC
equivalent. Defaults to None.
channel_name_map (dict): Dictionary of column names to map from the user format to the
FLASC format, where the key string is the user format and the value string is the
FLASC equivalent. Defaults to None.
long_data_columns (dict): Dictionary of column names for long format data, with keys
"variable_column" and "value_column", e.g.
{"variable_column": "variable", "value_column": "value"}. Defaults to None, in which
case the user data format is assumed to be wide.
**kwargs: keyword arguments to pass to the DataFrame constructor
"""
super().__init__(*args, **kwargs)

self._user_format = "wide" # or "long" or "semiwide"
# Check that the time column is present
if "time" not in self.columns:
raise ValueError("Column 'time' must be present in the DataFrame")

# check that name_map dictionary is valid
if name_map is not None:
if not isinstance(name_map, dict):
raise ValueError("name_map must be a dictionary")
if not all(isinstance(k, str) and isinstance(v, str) for k, v in name_map.items()):
raise ValueError("name_map must be a dictionary of strings")
self.name_map = name_map
# Apply the name_map
self.convert_to_flasc_format(inplace=True) # Do we want to do this here?
if channel_name_map is not None:
if not isinstance(channel_name_map, dict):
raise ValueError("channel_name_map must be a dictionary")
if not all(
isinstance(k, str) and isinstance(v, str) for k, v in channel_name_map.items()
):
raise ValueError("channel_name_map must be a dictionary of strings")
self.channel_name_map = channel_name_map

# Save the reversed name_map (to go to user_format)
self._channel_name_map_to_user = (
{v: k for k, v in channel_name_map.items()} if channel_name_map is not None else None
)

# Determine the user format
if long_data_columns is None:
self._user_format = "wide"
self._long_data_columns = None
else:
self._user_format = "long"

# Confirm the long_data_columns is a dictionary with the correct keys
if not isinstance(long_data_columns, dict):
raise ValueError("long_data_columns must be a dictionary")
if not all(col in long_data_columns for col in ["variable_column", "value_column"]):
raise ValueError(
"long_data_columns must contain keys 'variable_column', " "and 'value_column'"
)
self._long_data_columns = long_data_columns

@property
def in_flasc_format(self):
"""Return True if the data is in FLASC format, False otherwise."""
if ("time" in self.columns) and ("pow_000" in self.columns):
return True
else:
return False

@property
def _constructor(self):
return FlascDataFrame

def __str__(self):
"""Printout when calling print(df)."""
if self._in_flasc_format:
if self.in_flasc_format:
return "FlascDataFrame in FLASC format\n" + super().__str__()
else:
return "FlascDataFrame in user format\n" + super().__str__()
return f"FlascDataFrame in user ({self._user_format}) format\n" + super().__str__()

@property
def n_turbines(self):
Expand All @@ -81,7 +101,7 @@ def n_turbines(self):

def check_flasc_format(self):
"""Raise an error if the data is not in FLASC format."""
if not self._in_flasc_format:
if not self.in_flasc_format:
raise ValueError(
(
"Data must be in FLASC format to perform this operation."
Expand All @@ -92,70 +112,134 @@ def check_flasc_format(self):
pass

def convert_to_user_format(self, inplace=False):
"""Convert the DataFrame to the format that the user expects, given the name_map."""
# Convert the format
if self._user_format == "long":
self._convert_wide_to_long() # Should this be assigned to something?
elif self._user_format == "semiwide":
self._convert_wide_to_semiwide() # Should this be assigned to something?
elif self._user_format == "wide":
pass
"""Convert the DataFrame to the format that the user expects, given the channel_name_map.

Args:
inplace (bool): If True, modify the DataFrame in place.
If False, return a new DataFrame.

Returns:
FlascDataFrame: FlascDataFrame in user format if inplace is False, None otherwise.

"""
# Check if already in user format
if not self.in_flasc_format:
if inplace:
return
else:
return self.copy()

# Make a copy of self
df_user = self.copy()

# Set the flag
self._in_flasc_format = False
# Rename the channel columns to user-specified names
if self.channel_name_map is not None:
df_user.rename(columns=self._channel_name_map_to_user, inplace=True)

# Convert column names and return
if self.name_map is not None:
return self.rename(columns={v: k for k, v in self.name_map.items()}, inplace=inplace)
# Convert the format to long if _user_format is long
if self._user_format == "long":
df_user = self._convert_wide_to_long(df_user)

# Assign to self or return
if inplace:
self.__init__(
df_user,
channel_name_map=self.channel_name_map,
long_data_columns=self._long_data_columns,
)
else:
return None if inplace else self.copy()
return df_user

def convert_to_flasc_format(self, inplace=False):
"""Convert the DataFrame to the format that FLASC expects."""
# Convert the format
if self._user_format == "long":
self._convert_long_to_wide() # Should this be assigned to something?
elif self._user_format == "semiwide":
self._convert_semiwide_to_wide() # Should this be assigned to something?
elif self._user_format == "wide":
pass
"""Convert the DataFrame to the format that FLASC expects.

Args:
inplace (bool): If True, modify the DataFrame in place. If False,
return a new DataFrame.

# Set the flag
self._in_flasc_format = True
Returns:
FlascDataFrame: FlascDataFrame in FLASC format if inplace is False, None otherwise

# Convert column names and return
if self.name_map is not None:
return self.rename(columns=self.name_map, inplace=inplace)
"""
# Check if already in flasc format
if self.in_flasc_format:
if inplace:
return
else:
return self.copy()

# Make a copy of self
df_flasc = self.copy()

# Convert back from long if necessary
if self._user_format == "long":
df_flasc = self._convert_long_to_wide(df_flasc)

# Rename the channel columns to flasc-naming convention
if self.channel_name_map is not None:
df_flasc.rename(columns=self.channel_name_map, inplace=True)

# Assign to self or return
if inplace:
self.__init__(
df_flasc,
channel_name_map=self.channel_name_map,
long_data_columns=self._long_data_columns,
)
else:
return None if inplace else self.copy()
return df_flasc

def _convert_long_to_wide(self):
"""Convert a long format DataFrame to a wide format DataFrame."""
# raise NotImplementedError("TO DO")
pass
def _convert_long_to_wide(self, df_):
"""Convert a long format DataFrame to a wide format DataFrame.

def _convert_semiwide_to_wide(self):
"""Convert a semiwide format DataFrame to a wide format DataFrame."""
raise NotImplementedError("TO DO")
Args:
df_ (FlascDataFrame): Long format FlascDataFrame

def _convert_wide_to_long(self):
"""Convert a wide format DataFrame to a long format DataFrame."""
if "time" not in self.columns:
raise ValueError("Column 'time' must be present in the DataFrame")
Returns:
FlascDataFrame: Wide format FlascDataFrame
"""
# Pivot the table so the variable column becomes the column names with time
# kept as the first column and value as the values
df_ = df_.pivot(
index="time",
columns=self._long_data_columns["variable_column"],
values=self._long_data_columns["value_column"],
).reset_index()

# Remove the name that pivot leaves on the columns axis
df_.columns.name = None

# Wrap the result in a FlascDataFrame (time is already a regular column after reset_index above)
return FlascDataFrame(
df_,
channel_name_map=self.channel_name_map,
long_data_columns=self._long_data_columns,
)

return self.melt(id_vars="time", var_name="variable", value_name="value")
def _convert_wide_to_long(self, df_):
"""Convert a wide format DataFrame to a long format DataFrame.

def _convert_wide_to_semiwide(self):
"""Convert a wide format DataFrame to a semiwide format DataFrame."""
if "time" not in self.columns:
raise ValueError("Column 'time' must be present in the DataFrame")
Args:
df_ (FlascDataFrame): Wide format FlascDataFrame

Returns:
FlascDataFrame: Long format FlascDataFrame

raise NotImplementedError("TO DO")
# Should have columns:
# time
# turbine_id (as specified by the user)
# variable
# value
"""
df_ = df_.melt(
id_vars="time",
var_name=self._long_data_columns["variable_column"],
value_name=self._long_data_columns["value_column"],
).sort_values(["time", self._long_data_columns["variable_column"]])

# Reset index for cleanliness
df_ = df_.reset_index(drop=True)

return FlascDataFrame(
df_,
channel_name_map=self.channel_name_map,
long_data_columns=self._long_data_columns,
)

def to_feather(self, path, **kwargs):
"""Raise warning about lost information and save to feather format."""
Expand Down
Loading
Loading