Skip to content

Commit

Permalink
MZ 20240402 update bug fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
ziast committed Apr 2, 2024
1 parent 011cd82 commit cc65ba3
Show file tree
Hide file tree
Showing 7 changed files with 188 additions and 31 deletions.
3 changes: 2 additions & 1 deletion bdarpack/CleanData.py
Original file line number Diff line number Diff line change
Expand Up @@ -840,7 +840,8 @@ def update_data(self, new_df, filename_suffix=""):
# Update latest clean df to output_df
old_df = deepcopy(self.clean_df)
self.clean_df = deepcopy(new_df)
self.clean_df.reset_index(drop=True, inplace=True) #reset the index
if not self.create_unique_index:
self.clean_df.reset_index(drop=True, inplace=True) #reset the index

self._save_data_to_file()

Expand Down
184 changes: 167 additions & 17 deletions bdarpack/Constraints.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from bdarpack import utils_ as ut_
from copy import deepcopy
import pandas as pd
import numpy as np

class Constraints:
"""
Expand Down Expand Up @@ -35,13 +36,16 @@ def output_log_to_file(self):
print('No logger is defined.')


def multiparent_conditions(self, df, var_array, dict_conditions_values):
def multiparent_conditions(self, df, var_array, dict_conditions_values, options={}):
"""Function for replacement of values in a dataframe based on multiple conditions evaluated from multiple columns.
Parameters:
df (dataframe): The dataframe to be updated.
var_array (list): A list of strings of column names to be updated.
dict_condiions_values (dict): A dictionary of conditions and values. The conditions are evaluated from multiple columns and the corresponding value is then inserted into the specified columns.
dict_conditions_values (dict): A dictionary of conditions and values. The conditions are evaluated from multiple columns and the corresponding value is then inserted into the specified columns.
options (dict): Options varying usage.
- "duplicate_output" (bool): Default is `False`. If `True`, the columns in `var_array` will not be replaced with new values. Instead, a duplicated `var_array` will be created and updated based on given `dict_conditions_values`.
- "duplicate_output_suffix" (str): Default is '_dup'.
Returns:
df (dataframe): The updated dataframe
Expand All @@ -59,10 +63,28 @@ def multiparent_conditions(self, df, var_array, dict_conditions_values):
This example updates the dataframe with the values 0 and 1 in "column1" and "column2" columns, according to the conditions given. In particular, the value 0 is inserted when "parent1_column" is greater than 5 AND "parent2_column" is less than 10. The value 1 is inserted when "parent3_column" is equal to "Yes".
"""

# (MZ): 03-22-2024: add duplicate option
if "duplicate_output" in options:
dup_output_bool = options['duplicate_output']
else:
dup_output_bool = False
if "duplicate_output_suffix" in options:
dup_output_suffix = options['duplicate_output_suffix']
else:
dup_output_suffix = '_dup'

# Duplicate columns in var_array
suffix = dup_output_suffix
var_array_use = [vari + suffix for vari in var_array]
for i in range(len(var_array)):
df[var_array_use[i]] = df[var_array[i]]

# Initialise log
for var in var_array:
self.init_log(var)
df[var] = df[var].convert_dtypes()
# for var in var_array:
for i in range(len(var_array)):
self.init_log(var_array[i])
df[var_array[i]] = df[var_array[i]].convert_dtypes()
df[var_array_use[i]] = df[var_array_use[i]].convert_dtypes()

# Iterate through conditions and values
for key, dict_condition_value in dict_conditions_values.items():
Expand All @@ -77,16 +99,31 @@ def multiparent_conditions(self, df, var_array, dict_conditions_values):
else:
cond_ = cond_ & (df[parent].map(condition))

df.loc[cond_,var_array] = dict_condition_value['value']
df.loc[cond_,var_array_use] = dict_condition_value['value']

# Convert dataframe to best possible dtype
for var in var_array:
for var in var_array_use:
df[var] = df[var].convert_dtypes()

# Generate mismatch list
mismatch_dict = {}
for i in range(len(var_array)):
mismatch_dict[var_array[i]] = self.find_mismatch(df, var_array[i], var_array_use[i])

# Replace if dup_output_bool=False
if (not dup_output_bool):
for i in range(len(var_array)):
df[var_array[i]] = df[var_array_use[i]]
df = df.drop(var_array_use, axis=1)

# Create log
for var in var_array:
mismatch_str = ','.join(mismatch_dict[var])
msg = f"Replaced {var} using conditions and values given in dict_conditions_values."
self.log[var]['evaluate_df_column'] = msg
self.log[var]['multiparent_conditions'] = {
"msg": msg,
"replaced": mismatch_str
}

if self.debug:
print(f"For variable: {var}: {msg}")
Expand All @@ -96,7 +133,7 @@ def multiparent_conditions(self, df, var_array, dict_conditions_values):
return df


def evaluate_df_column(self, df, column_names, dict_conditions_values=None, func=None, output_column_name=None):
def evaluate_df_column(self, df, column_names, dict_conditions_values=None, func=None, output_column_name=None, options={}):
"""This function takes a dataframe and column name(s) and evaluates the column based on the given conditions and values, creating a new column in the dataframe with the evaluated values. Optionally, a function can be passed in to evaluate the column.
Parameters:
Expand All @@ -111,6 +148,9 @@ def evaluate_df_column(self, df, column_names, dict_conditions_values=None, func
}
func (function): Optional. A function to be applied on the columns
output_column_name (str): Optional. A string containing the name of the output column. If not provided, the default is the name of the column plus '_evaluated'.
options (dict): Optional, options varying usage.
- "duplicate_output" (bool): Default is `False`. If `True`, the column in `output_column_name` will not be replaced with new values. Instead, a duplicated `output_column_name` will be created and updated based on given `dict_conditions_values`. This functionality applies only when `output_column_name` is an existing column in the dataframe.
- "duplicate_output_suffix" (str): Default is '_dup'.
Returns:
df (dataframe): the dataframe with the evaluated values in the new column
Expand All @@ -126,20 +166,53 @@ def evaluate_df_column(self, df, column_names, dict_conditions_values=None, func
output_column_name='item_type'
)
"""

# (MZ): 03-26-2024: add duplicate option
if "duplicate_output" in options:
dup_output_bool = options['duplicate_output']
else:
dup_output_bool = False
if "duplicate_output_suffix" in options:
dup_output_suffix = options['duplicate_output_suffix']
else:
dup_output_suffix = '_dup'

column_exist = False
# Create a new column with the new values
if isinstance(column_names, str):
if output_column_name is None:
output_column_name = column_name + '_evaluated'
df[output_column_name] = df[column_names].copy()

if output_column_name in df.columns: # column is to be replaced
column_exist = True
original_column_name = deepcopy(output_column_name)
output_column_name = output_column_name + dup_output_suffix #output in duplicate column

if column_exist:
df[output_column_name] = df[original_column_name].copy()
else:
df[output_column_name] = df[column_names].copy()
else:
column_name = column_names[0]
if output_column_name is None:
output_column_name = ''.join(column_names) + '_evaluated'
df[output_column_name] = df[column_name].copy()

if output_column_name in df.columns: # column is to be replaced
column_exist = True
original_column_name = deepcopy(output_column_name)
output_column_name = output_column_name + dup_output_suffix #output in duplicate column

if column_exist:
df[output_column_name] = df[original_column_name].copy()
else:
df[output_column_name] = df[column_name].copy()


# initialise log
self.init_log(output_column_name)
if column_exist:
self.init_log(original_column_name)
else:
self.init_log(output_column_name)

if func is None:
# Iterate through each condition and value
Expand Down Expand Up @@ -167,15 +240,49 @@ def evaluate_df_column(self, df, column_names, dict_conditions_values=None, func
# Convert dataframe to best possible dtype
df[output_column_name] = df[output_column_name].convert_dtypes()

# Generate mismatch list
mismatch_str = 'Secondary column not in original dataframe'
if column_exist: # column is to be replaced
# mismatch_dict = {}
mismatch_list = self.find_mismatch(df, original_column_name, output_column_name)
if len(mismatch_list)==0:
mismatch_str = 'No mismatches'
else:
# mismatch_str = ','.join(mismatch_list)
mismatch_str = ','.join(str(item) for item in mismatch_list)

# Replace if dup_output_bool = False
if column_exist:
if (not dup_output_bool):
df[original_column_name] = df[output_column_name]
df = df.drop([output_column_name], axis=1)

# Create log
msg = f"Replaced {output_column_name} using conditions and values given in dict_conditions_values."
self.log[output_column_name]['evaluate_df_column'] = msg
if column_exist:
if (not dup_output_bool):
msg = f"Replaced {original_column_name} using conditions and values given in dict_conditions_values."
self.log[original_column_name]['evaluate_df_column'] = {
"msg": msg,
"replaced": mismatch_str
}
else:
msg = f"Created secondary {output_column_name} using conditions and values given in dict_conditions_values."
self.log[original_column_name]['evaluate_df_column'] = {
"msg": msg,
"mismatch": mismatch_str
}
else:
msg = f"Created secondary {output_column_name} using conditions and values given in dict_conditions_values."
self.log[output_column_name]['evaluate_df_column'] = {
"msg": msg,
"replaced": mismatch_str
}

if self.debug:
print(f"For variable: {output_column_name}: {msg}")
if self.logging:
self.logger.info(f"For variable: {output_column_name}: {msg}")

return df


Expand Down Expand Up @@ -225,10 +332,12 @@ def convertBlankstoValue(self, df, var_array=None, value=None):
replace_value = value

# return the number of missing values converted
number_of_missing_values_converted = df[v].isnull().sum()
# number_of_missing_values_converted = df[v].isnull().sum() #(MZ): 20240322
number_of_missing_values_converted = df[v].isnull().sum() + (df[v] == "").sum()

# Convert missing values in a dataframe column to value
df[v].fillna(replace_value, inplace=True)
df[v].loc[df[v] == ""] = replace_value #(MZ): 20240322

# Convert dataframe to best possible dtype
df[v] = df[v].convert_dtypes()
Expand Down Expand Up @@ -297,4 +406,45 @@ def func(row):
if self.logging:
self.logger.debug(f"For variable: {A}: {msg}")

return df
return df

def find_mismatch(self, df, col1, col2):
    """Return the index labels of rows where two columns of `df` differ.

    Rows where both columns hold a missing value are treated as equal.
    Rows where the comparison itself is undefined (result is <NA>) and
    the values are not both missing are treated as mismatches.

    Parameters:
        df (dataframe): The dataframe containing both columns.
        col1 (str): Name of the first column.
        col2 (str): Name of the column compared against `col1`.

    Returns:
        mismatched_list (list): Index labels (not positions) of the rows
            whose values differ. Empty if the columns are identical.
    """
    if self.debug:
        print(f"Checking column: {col1} against {col2}")
    if self.logging:
        self.logger.debug(f"Checking column: {col1} against {col2}")

    # Element-wise inequality; nullable dtypes yield <NA> where either side
    # is missing, so count those as mismatches until proven otherwise.
    differs = (df[col1] != df[col2]).fillna(True)

    # Rows where both columns are missing are not mismatches.
    both_na = df[col1].isna() & df[col2].isna()
    differs.loc[both_na] = False

    if not differs.any():
        # Gate the logger on self.logging (not on `not self.debug`), so it
        # is never touched when logging is off.
        if self.debug:
            print("Columns are identical.")
        if self.logging:
            self.logger.debug("Columns are identical.")
        return []

    # Translate positional hits into the dataframe's index labels.
    positions = np.where(differs)[0]
    mismatched_list = df.index[positions].tolist()

    # Keep debug/log output short when many rows differ.
    if len(mismatched_list) < 20:
        shown = ','.join(str(item) for item in mismatched_list)
    else:
        shown = "Too many to show."

    if self.debug:
        print(f"Mismatched rows index: {shown}")
    if self.logging:
        self.logger.debug(f"Mismatched rows index: {shown}")

    return mismatched_list
7 changes: 5 additions & 2 deletions bdarpack/TabulaCopula.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,17 +182,21 @@ def _load_definitions(self, definitions):
self._update_defaults(var_to_update="privacy_batch_n", new_value="PRIVACY_BATCH_N", definitions=definitions)

self.prefix_path = definitions.PREFIX_PATH
self.prefix_path = self.prefix_path.replace("\\","/")

self.trainxlsx = definitions.TRAINXLSX
self.traindictxlsx = definitions.TRAINDICTXLSX

self.train_data_path = self.prefix_path + self.folder_trainData + "\\"
self.train_data_path = self.train_data_path.replace("\\","/")
self.train_data_filename = self.train_data_path + definitions.TRAINXLSX
self.train_data_dict_filename = self.train_data_path + definitions.TRAINDICTXLSX

self.syn_data_path = self.prefix_path + self.folder_synData + "\\"
self.privacyMetrics_path = self.prefix_path + self.folder_privacyMetrics + "\\"
self.syn_data_path = self.syn_data_path.replace("\\","/")

self.privacyMetrics_path = self.prefix_path + self.folder_privacyMetrics + "\\"
self.privacyMetrics_path = self.privacyMetrics_path.replace("\\","/")

self.train_df = None #initialise training data (dataframe)
self.dict_df = None #initialise data dictionary (dataframe)
Expand Down Expand Up @@ -278,7 +282,6 @@ def read_inputData(self, sheetname=None):
var_names = [row[self.dict_var_varname] for index, row in self.dict_df.iterrows() if row[self.dict_var_vartype] == 'string']
self.train_df[var_names] = self.train_df[var_names].astype("str")


if (self.debug):
print(f"Input data loaded.")

Expand Down
14 changes: 7 additions & 7 deletions bdarpack/VIsualPlot.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@



def hist(data_1d, fig=None, ax=None, position=None, title=None, alpha=0.8, color='blue', label=''):
def hist(data_1d, fig=None, ax=None, position=None, title=None, alpha=0.8, color='blue', label='', bins='auto'):
"""Function to generate a histogram plot.
Parameters:
Expand All @@ -41,14 +41,14 @@ def hist(data_1d, fig=None, ax=None, position=None, title=None, alpha=0.8, color

data_dropped_na = data_1d.dropna()

ax.hist(data_dropped_na, density=True, bins='auto', alpha=alpha, color=color, label=label)
(n, bins, patches) = ax.hist(data_dropped_na, density=True, bins=bins, alpha=alpha, color=color, label=label)

ax.legend(loc='best', frameon=False)
ax.set_title(title, fontsize=8)

return ax, fig

def hist_compare(real_data, syn_data, var_list, no_cols=2):
def hist_compare(real_data, syn_data, var_list, no_cols=2, bins='auto'):

no_rows = len(var_list) // no_cols + math.ceil(len(var_list) % no_cols)
ax_hist = []
Expand All @@ -60,13 +60,13 @@ def hist_compare(real_data, syn_data, var_list, no_cols=2):
position_tuple = (int(no_rows), int(no_cols), int(index+1))

if (index == 0):
(ax_hist_out, fig_histogram) = hist(x_real, position = position_tuple, label=f"Original: n = {len(x_real)}")
(ax_hist_out, fig_histogram) = hist(x_real, position = position_tuple, bins=bins, label=f"Original: n = {len(x_real)}")
ax_hist.append(ax_hist_out)
(ax_hist[index], fig_histogram) = hist(x_syn, ax=ax_hist[index], fig=fig_histogram, color='grey', title=f'Histogram Plot for {var}', label=f"Synthetic: n={len(x_syn)}")
(ax_hist[index], fig_histogram) = hist(x_syn, ax=ax_hist[index], fig=fig_histogram, color='grey', title=f'Histogram Plot for {var}', bins=bins, label=f"Synthetic: n={len(x_syn)}")
else:
(ax_hist_out, fig_histogram) = hist(x_real, fig=fig_histogram, position = position_tuple, label=f"Original: n = {len(x_real)}")
(ax_hist_out, fig_histogram) = hist(x_real, fig=fig_histogram, position = position_tuple, bins=bins, label=f"Original: n = {len(x_real)}")
ax_hist.append(ax_hist_out)
(ax_hist[index], fig_histogram) = hist(x_syn, ax=ax_hist[index], fig=fig_histogram, color='grey', title=f'Histogram Plot for {var}', label=f"Synthetic: n={len(x_syn)}")
(ax_hist[index], fig_histogram) = hist(x_syn, ax=ax_hist[index], fig=fig_histogram, color='grey', title=f'Histogram Plot for {var}', bins=bins, label=f"Synthetic: n={len(x_syn)}")

return ax_hist, fig_histogram

Expand Down
4 changes: 2 additions & 2 deletions bdarpack/utils_.py
Original file line number Diff line number Diff line change
Expand Up @@ -415,7 +415,7 @@ def mapping_dictDateFormatConversion(str):
str = str.replace("dd", r"%d")
elif ("d" in str): # day number without a leading zero
if OS_TYPE=='Windows': # use "%-d" for unix, "%#d" for windows
str = str.replace("d", r"%#d")
str = str.replace("d", r"%d")
elif OS_TYPE=='Linux' or 'Darwin':
str = str.replace("d", r"%-d")

Expand All @@ -429,7 +429,7 @@ def mapping_dictDateFormatConversion(str):
str = str.replace("mm", r"%m")
elif ("m" in str): # month number without a leading zero
if OS_TYPE=='Windows': # use "%-m" for unix, "%#m" for windows
str = str.replace("m", r"%#m")
str = str.replace("m", r"%m")
elif OS_TYPE=='Linux' or 'Darwin':
str = str.replace("m", r"%-m")

Expand Down
Loading

0 comments on commit cc65ba3

Please sign in to comment.