From cc65ba3f8cedb2fc2e072a3e8c6730bb91650ca5 Mon Sep 17 00:00:00 2001 From: mz Date: Tue, 2 Apr 2024 09:56:52 +0800 Subject: [PATCH] MZ 20240402 update bug fixes --- bdarpack/CleanData.py | 3 +- bdarpack/Constraints.py | 184 ++++++++++++++++-- bdarpack/TabulaCopula.py | 7 +- bdarpack/VIsualPlot.py | 14 +- bdarpack/utils_.py | 4 +- docs/gettingStarted/definitions.md | 5 +- ...Data_StandardiseDates_ConvertCharacters.md | 2 +- 7 files changed, 188 insertions(+), 31 deletions(-) diff --git a/bdarpack/CleanData.py b/bdarpack/CleanData.py index 89b8e85..855313e 100644 --- a/bdarpack/CleanData.py +++ b/bdarpack/CleanData.py @@ -840,7 +840,8 @@ def update_data(self, new_df, filename_suffix=""): # Update latest clean df to output_df old_df = deepcopy(self.clean_df) self.clean_df = deepcopy(new_df) - self.clean_df.reset_index(drop=True, inplace=True) #reset the index + if not self.create_unique_index: + self.clean_df.reset_index(drop=True, inplace=True) #reset the index self._save_data_to_file() diff --git a/bdarpack/Constraints.py b/bdarpack/Constraints.py index 2d20fb2..89cc361 100644 --- a/bdarpack/Constraints.py +++ b/bdarpack/Constraints.py @@ -1,6 +1,7 @@ from bdarpack import utils_ as ut_ from copy import deepcopy import pandas as pd +import numpy as np class Constraints: """ @@ -35,13 +36,16 @@ def output_log_to_file(self): print('No logger is defined.') - def multiparent_conditions(self, df, var_array, dict_conditions_values): + def multiparent_conditions(self, df, var_array, dict_conditions_values, options={}): """Function for replacement of values in a dataframe based on multiple conditions evaluated from multiple columns. Parameters: df (dataframe): The dataframe to be updated. var_array (list): A list of strings of column names to be updated. - dict_condiions_values (dict): A dictionary of conditions and values. The conditions are evaluated from multiple columns and the corresponding value is then inserted into the specified columns. 
+ dict_conditions_values (dict): A dictionary of conditions and values. The conditions are evaluated from multiple columns and the corresponding value is then inserted into the specified columns. + options (dict): Options varying usage. + - "duplicate_output" (bool): Default is `False`. If `True`, the columns in `var_array` will not be replaced with new values. Instead, a duplicated `var_array` will be created and updated based on given `dict_conditions_values`. + - "duplicate_output_suffix" (str): Default is '_dup'. Returns: df (dataframe): The updated dataframe @@ -59,10 +63,28 @@ def multiparent_conditions(self, df, var_array, dict_conditions_values): This example updates the dataframe with the values 0 and 1 in "column1" and "column2" columns, according to the conditions given. In particular, the value 0 is inserted when "parent1_column" is greater than 5 AND "parent2_column" is less than 10. The value 1 is inserted when "parent3_column" is equal to "Yes". """ + # (MZ): 03-22-2024: add duplicate option + if "duplicate_output" in options: + dup_output_bool = options['duplicate_output'] + else: + dup_output_bool = False + if "duplicate_output_suffix" in options: + dup_output_suffix = options['duplicate_output_suffix'] + else: + dup_output_suffix = '_dup' + + # Duplicate columns in var_array + suffix = dup_output_suffix + var_array_use = [vari + suffix for vari in var_array] + for i in range(len(var_array)): + df[var_array_use[i]] = df[var_array[i]] + # Initialise log - for var in var_array: - self.init_log(var) - df[var] = df[var].convert_dtypes() + # for var in var_array: + for i in range(len(var_array)): + self.init_log(var_array[i]) + df[var_array[i]] = df[var_array[i]].convert_dtypes() + df[var_array_use[i]] = df[var_array_use[i]].convert_dtypes() # Iterate through conditions and values for key, dict_condition_value in dict_conditions_values.items(): @@ -77,16 +99,31 @@ def multiparent_conditions(self, df, var_array, dict_conditions_values): else: cond_ = 
cond_ & (df[parent].map(condition)) - df.loc[cond_,var_array] = dict_condition_value['value'] + df.loc[cond_,var_array_use] = dict_condition_value['value'] # Convert dataframe to best possible dtype - for var in var_array: + for var in var_array_use: df[var] = df[var].convert_dtypes() + # Generate mismatch list + mismatch_dict = {} + for i in range(len(var_array)): + mismatch_dict[var_array[i]] = self.find_mismatch(df, var_array[i], var_array_use[i]) + + # Replace if dup_output_bool=False + if (not dup_output_bool): + for i in range(len(var_array)): + df[var_array[i]] = df[var_array_use[i]] + df = df.drop(var_array_use, axis=1) + # Create log for var in var_array: + mismatch_str = ','.join(mismatch_dict[var]) msg = f"Replaced {var} using conditions and values given in dict_conditions_values." - self.log[var]['evaluate_df_column'] = msg + self.log[var]['multiparent_conditions'] = { + "msg": msg, + "replaced": mismatch_str + } if self.debug: print(f"For variable: {var}: {msg}") @@ -96,7 +133,7 @@ def multiparent_conditions(self, df, var_array, dict_conditions_values): return df - def evaluate_df_column(self, df, column_names, dict_conditions_values=None, func=None, output_column_name=None): + def evaluate_df_column(self, df, column_names, dict_conditions_values=None, func=None, output_column_name=None, options={}): """This function takes a dataframe and column name(s) and evaluates the column based on the given conditions and values, creating a new column in the dataframe with the evaluated values. Optionally, a function can be passed in to evaluate the column. Parameters: @@ -111,6 +148,9 @@ def evaluate_df_column(self, df, column_names, dict_conditions_values=None, func } func (function): Optional. A function to be applied on the columns output_column_name (str): Optional. A string containing the name of the output column. If not provided, the default is the name of the column plus '_evaluated'. + options (dict): Optional, options varying usage. 
+ - "duplicate_output" (bool): Default is `False`. If `True`, the columns in `output_column_name` will not be replaced with new values. Instead, a duplicated `output_column_name` will be created and updated based on given `dict_conditions_values`. This functionality applies only when `output_column_name` is an existing column in the dataframe. + - "duplicate_output_suffix" (str): Default is '_dup'. Returns: df (dataframe): the dataframe with the evaluated values in the new column @@ -126,20 +166,53 @@ def evaluate_df_column(self, df, column_names, dict_conditions_values=None, func output_column_name='item_type' ) """ + + # (MZ): 03-26-2024: add duplicate option + if "duplicate_output" in options: + dup_output_bool = options['duplicate_output'] + else: + dup_output_bool = False + if "duplicate_output_suffix" in options: + dup_output_suffix = options['duplicate_output_suffix'] + else: + dup_output_suffix = '_dup' + column_exist = False # Create a new column with the new values if isinstance(column_names, str): if output_column_name is None: output_column_name = column_name + '_evaluated' - df[output_column_name] = df[column_names].copy() + + if output_column_name in df.columns: # column is to be replaced + column_exist = True + original_column_name = deepcopy(output_column_name) + output_column_name = output_column_name + dup_output_suffix #output in duplicate column + + if column_exist: + df[output_column_name] = df[original_column_name].copy() + else: + df[output_column_name] = df[column_names].copy() else: column_name = column_names[0] if output_column_name is None: output_column_name = ''.join(column_names) + '_evaluated' + if output_column_name in df.columns: # column is to be replaced + column_exist = True + original_column_name = deepcopy(output_column_name) + output_column_name = output_column_name + dup_output_suffix #output in duplicate column + + if column_exist: + df[output_column_name] = 
df[original_column_name].copy() + else: + df[output_column_name] = df[column_name].copy() + + # initialise log - self.init_log(output_column_name) + if column_exist: + self.init_log(original_column_name) + else: + self.init_log(output_column_name) if func is None: # Iterate through each condition and value @@ -167,15 +240,49 @@ def evaluate_df_column(self, df, column_names, dict_conditions_values=None, func # Convert dataframe to best possible dtype df[output_column_name] = df[output_column_name].convert_dtypes() + # Generate mismatch list + mismatch_str = 'Secondary column not in original dataframe' + if column_exist: # column is to be replaced + # mismatch_dict = {} + mismatch_list = self.find_mismatch(df, original_column_name, output_column_name) + if len(mismatch_list)==0: + mismatch_str = 'No mismatches' + else: + # mismatch_str = ','.join(mismatch_list) + mismatch_str = ','.join(str(item) for item in mismatch_list) + + # Replace if dup_output_false = False + if column_exist: + if (not dup_output_bool): + df[original_column_name] = df[output_column_name] + df = df.drop([output_column_name], axis=1) + # Create log - msg = f"Replaced {output_column_name} using conditions and values given in dict_conditions_values." - self.log[output_column_name]['evaluate_df_column'] = msg + if column_exist: + if (not dup_output_bool): + msg = f"Replaced {original_column_name} using conditions and values given in dict_conditions_values." + self.log[original_column_name]['evaluate_df_column'] = { + "msg": msg, + "replaced": mismatch_str + } + else: + msg = f"Created secondary {output_column_name} using conditions and values given in dict_conditions_values." + self.log[original_column_name]['evaluate_df_column'] = { + "msg": msg, + "mismatch": mismatch_str + } + else: + msg = f"Created secondary {output_column_name} using conditions and values given in dict_conditions_values." 
+ self.log[output_column_name]['evaluate_df_column'] = { + "msg": msg, + "replaced": mismatch_str + } if self.debug: print(f"For variable: {output_column_name}: {msg}") if self.logging: self.logger.info(f"For variable: {output_column_name}: {msg}") - + return df @@ -225,10 +332,12 @@ def convertBlankstoValue(self, df, var_array=None, value=None): replace_value = value # return the number of missing values converted - number_of_missing_values_converted = df[v].isnull().sum() + # number_of_missing_values_converted = df[v].isnull().sum() #(MZ): 20240322 + number_of_missing_values_converted = df[v].isnull().sum() + (df[v] == "").sum() # Convert missing values in a dataframe column to value df[v].fillna(replace_value, inplace=True) + df[v].loc[df[v] == ""] = replace_value #(MZ): 20240322 # Convert dataframe to best possible dtype df[v] = df[v].convert_dtypes() @@ -297,4 +406,45 @@ def func(row): if self.logging: self.logger.debug(f"For variable: {A}: {msg}") - return df \ No newline at end of file + return df + + def find_mismatch(self, df, col1, col2): + mismatched_list = [] + + if self.debug: + print(f"Checking column: {col1} against {col2}") + if self.logging: + self.logger.debug(f"Checking column: {col1} against {col2}") + + a = df[col1] != df[col2] + a_bool = a.fillna(True) + + # Get rows where both columns have values + na_rows = (df[col1].isna()) & (df[col2].isna()) + # Update a_bool values only for these rows + a_bool.loc[na_rows] = False + + if a_bool.any(): + # get index of rows where values are not equal + mismatched_index = np.where(a_bool)[0] + mismatched_row = df.iloc[mismatched_index] + mismatched_list = mismatched_row.index.tolist() + if self.debug: + if (len(mismatched_list)<20): + print("Mismatched rows index:", ','.join(str(item) for item in mismatched_list)) + else: + print("Mismatched rows index:", "Too many to show.") + if self.logging: + if (len(mismatched_list)<20): + # mismatched_str = ', '.join(mismatched_list) + mismatched_str = 
','.join(str(item) for item in mismatched_list) + self.logger.debug(f"Mismatched rows index: {mismatched_str}") + else: + self.logger.debug("Mismatched rows index: Too many to show.") + else: + if self.debug: + print("Columns are identical.") + else: + self.logger.debug("Columns are identical.") + + return mismatched_list \ No newline at end of file diff --git a/bdarpack/TabulaCopula.py b/bdarpack/TabulaCopula.py index 462f439..7b80f23 100644 --- a/bdarpack/TabulaCopula.py +++ b/bdarpack/TabulaCopula.py @@ -182,17 +182,21 @@ def _load_definitions(self, definitions): self._update_defaults(var_to_update="privacy_batch_n", new_value="PRIVACY_BATCH_N", definitions=definitions) self.prefix_path = definitions.PREFIX_PATH + self.prefix_path = self.prefix_path.replace("\\","/") self.trainxlsx = definitions.TRAINXLSX self.traindictxlsx = definitions.TRAINDICTXLSX self.train_data_path = self.prefix_path + self.folder_trainData + "\\" + self.train_data_path = self.train_data_path.replace("\\","/") self.train_data_filename = self.train_data_path + definitions.TRAINXLSX self.train_data_dict_filename = self.train_data_path + definitions.TRAINDICTXLSX self.syn_data_path = self.prefix_path + self.folder_synData + "\\" - self.privacyMetrics_path = self.prefix_path + self.folder_privacyMetrics + "\\" + self.syn_data_path = self.syn_data_path.replace("\\","/") + self.privacyMetrics_path = self.prefix_path + self.folder_privacyMetrics + "\\" + self.privacyMetrics_path = self.privacyMetrics_path.replace("\\","/") self.train_df = None #initialise training data (dataframe) self.dict_df = None #initialise data dictionary (dataframe) @@ -278,7 +282,6 @@ def read_inputData(self, sheetname=None): var_names = [row[self.dict_var_varname] for index, row in self.dict_df.iterrows() if row[self.dict_var_vartype] == 'string'] self.train_df[var_names] = self.train_df[var_names].astype("str") - if (self.debug): print(f"Input data loaded.") diff --git a/bdarpack/VIsualPlot.py b/bdarpack/VIsualPlot.py 
index f2229a1..9c309fe 100644 --- a/bdarpack/VIsualPlot.py +++ b/bdarpack/VIsualPlot.py @@ -14,7 +14,7 @@ -def hist(data_1d, fig=None, ax=None, position=None, title=None, alpha=0.8, color='blue', label=''): +def hist(data_1d, fig=None, ax=None, position=None, title=None, alpha=0.8, color='blue', label='', bins='auto'): """Function to generate a histogram plot. Parameters: @@ -41,14 +41,14 @@ def hist(data_1d, fig=None, ax=None, position=None, title=None, alpha=0.8, color data_dropped_na = data_1d.dropna() - ax.hist(data_dropped_na, density=True, bins='auto', alpha=alpha, color=color, label=label) + (n, bins, patches) = ax.hist(data_dropped_na, density=True, bins=bins, alpha=alpha, color=color, label=label) ax.legend(loc='best', frameon=False) ax.set_title(title, fontsize=8) return ax, fig -def hist_compare(real_data, syn_data, var_list, no_cols=2): +def hist_compare(real_data, syn_data, var_list, no_cols=2, bins='auto'): no_rows = len(var_list) // no_cols + math.ceil(len(var_list) % no_cols) ax_hist = [] @@ -60,13 +60,13 @@ def hist_compare(real_data, syn_data, var_list, no_cols=2): position_tuple = (int(no_rows), int(no_cols), int(index+1)) if (index == 0): - (ax_hist_out, fig_histogram) = hist(x_real, position = position_tuple, label=f"Original: n = {len(x_real)}") + (ax_hist_out, fig_histogram) = hist(x_real, position = position_tuple, bins=bins, label=f"Original: n = {len(x_real)}") ax_hist.append(ax_hist_out) - (ax_hist[index], fig_histogram) = hist(x_syn, ax=ax_hist[index], fig=fig_histogram, color='grey', title=f'Histogram Plot for {var}', label=f"Synthetic: n={len(x_syn)}") + (ax_hist[index], fig_histogram) = hist(x_syn, ax=ax_hist[index], fig=fig_histogram, color='grey', title=f'Histogram Plot for {var}', bins=bins, label=f"Synthetic: n={len(x_syn)}") else: - (ax_hist_out, fig_histogram) = hist(x_real, fig=fig_histogram, position = position_tuple, label=f"Original: n = {len(x_real)}") + (ax_hist_out, fig_histogram) = hist(x_real, fig=fig_histogram, 
position = position_tuple, bins=bins, label=f"Original: n = {len(x_real)}") ax_hist.append(ax_hist_out) - (ax_hist[index], fig_histogram) = hist(x_syn, ax=ax_hist[index], fig=fig_histogram, color='grey', title=f'Histogram Plot for {var}', label=f"Synthetic: n={len(x_syn)}") + (ax_hist[index], fig_histogram) = hist(x_syn, ax=ax_hist[index], fig=fig_histogram, color='grey', title=f'Histogram Plot for {var}', bins=bins, label=f"Synthetic: n={len(x_syn)}") return ax_hist, fig_histogram diff --git a/bdarpack/utils_.py b/bdarpack/utils_.py index 2e57e1f..5a34257 100644 --- a/bdarpack/utils_.py +++ b/bdarpack/utils_.py @@ -415,7 +415,7 @@ def mapping_dictDateFormatConversion(str): str = str.replace("dd", r"%d") elif ("d" in str): # day number without a leading zero if OS_TYPE=='Windows': # use "%-d" for unix, "%#d" for windows - str = str.replace("d", r"%#d") + str = str.replace("d", r"%d") elif OS_TYPE=='Linux' or 'Darwin': str = str.replace("d", r"%-d") @@ -429,7 +429,7 @@ def mapping_dictDateFormatConversion(str): str = str.replace("mm", r"%m") elif ("m" in str): # month number without a leading zero if OS_TYPE=='Windows': # use "%-m" for unix, "%#m" for windows - str = str.replace("m", r"%#m") + str = str.replace("m", r"%m") elif OS_TYPE=='Linux' or 'Darwin': str = str.replace("m", r"%-m") diff --git a/docs/gettingStarted/definitions.md b/docs/gettingStarted/definitions.md index 804ae7b..0f81028 100644 --- a/docs/gettingStarted/definitions.md +++ b/docs/gettingStarted/definitions.md @@ -18,7 +18,10 @@ nav_order: 5 | RAWDICTXLSX | Defines filename containing the data dictionary. E.g. "xx.xlsx", "xx.csv" | Data loading | | RAWDICTXLSX_SHEETNAME | if RAWDICTXLSX is an excel file, assign the sheetname from which to load the dictionary. If not specified, will read the first sheet. E.g. "Sheet1" | Data loading | | LOGGING | Option to output logfile. If `True`, logfile will be built. If not specified, default is `True`. | Log | -| LOG_FILENAME | Defines filename of logfile. 
If not defined, default is `logfile.txt`. | +| LOG_FILENAME | Defines filename of logfile. If not defined, default is `logfile.txt`. | Log | +| CREATE_UNIQUE_INDEX | Option to create unique row index from existing columns. If `True`, new index will be created. If not specified, default is `False`. | Indexing | +| UNIQUE_INDEX_COMPOSITION_LIST | List of column names to create new index from. If not specified, default is `[]`. E.g.: `["subject_id", "visit"]` | Indexing | +| UNIQUE_INDEX_DELIMITER | Delimiter to separate values from composition list. If not specified, default is `_` | Indexing | | LONG_VAR_MARKER | Defines the variable name that indicates which longitudinal group that row belongs to. If not specified, default is `None`. | Longitudinal data | | DICT_VAR_VARNAME | Column in data dictionary containing variable names in input data. If not specified, set as "`NAME`". | Data Dictionary settings | | DICT_VAR_VARCATEGORY | Column in data dictionary setting the category of the variable name. If not specified, set as "`CATEGORY`" | Data Dictionary settings | diff --git a/docs/gettingStarted/examples/CleanData_StandardiseDates_ConvertCharacters.md b/docs/gettingStarted/examples/CleanData_StandardiseDates_ConvertCharacters.md index 20eba81..908e973 100644 --- a/docs/gettingStarted/examples/CleanData_StandardiseDates_ConvertCharacters.md +++ b/docs/gettingStarted/examples/CleanData_StandardiseDates_ConvertCharacters.md @@ -103,7 +103,7 @@ print(cd.raw_df) Print Original Data Dictionary -Notice that we have not specified `CODINGS` for the date variables `date_1` and `date_3`. CleanData, nevertheless, tries to match the data to common date formats and perform the conversion accordingly. We have specified `CODINGS` for `date_1` and `date_4`, but have used the wrong format (`mm-dd-yyyy`) for date_4. The proper format should have been `dd-mm-yyyy`. +Notice that we have not specified `CODINGS` for the date variables `date_1` and `date_3`. 
CleanData, nevertheless, tries to match the data to common date formats and perform the conversion accordingly. We have specified `CODINGS` for `date_2` and `date_4`, but have used the wrong format (`mm-dd-yyyy`) for date_4. The proper format should have been `dd-mm-yyyy`. The format for `date_2` is also wrong, but because we have used the `Date` format in MS Excel, the dates were still read correctly. ``` print(cd.dict_df)