From cc65ba3f8cedb2fc2e072a3e8c6730bb91650ca5 Mon Sep 17 00:00:00 2001 From: mz Date: Tue, 2 Apr 2024 09:56:52 +0800 Subject: [PATCH] MZ 20240402 update bug fixes --- bdarpack/CleanData.py | 3 +- bdarpack/Constraints.py | 184 ++++++++++++++++-- bdarpack/TabulaCopula.py | 7 +- bdarpack/VIsualPlot.py | 14 +- bdarpack/utils_.py | 4 +- docs/gettingStarted/definitions.md | 5 +- ...Data_StandardiseDates_ConvertCharacters.md | 2 +- 7 files changed, 188 insertions(+), 31 deletions(-) diff --git a/bdarpack/CleanData.py b/bdarpack/CleanData.py index 89b8e85..855313e 100644 --- a/bdarpack/CleanData.py +++ b/bdarpack/CleanData.py @@ -840,7 +840,8 @@ def update_data(self, new_df, filename_suffix=""): # Update latest clean df to output_df old_df = deepcopy(self.clean_df) self.clean_df = deepcopy(new_df) - self.clean_df.reset_index(drop=True, inplace=True) #reset the index + if not self.create_unique_index: + self.clean_df.reset_index(drop=True, inplace=True) #reset the index self._save_data_to_file() diff --git a/bdarpack/Constraints.py b/bdarpack/Constraints.py index 2d20fb2..89cc361 100644 --- a/bdarpack/Constraints.py +++ b/bdarpack/Constraints.py @@ -1,6 +1,7 @@ from bdarpack import utils_ as ut_ from copy import deepcopy import pandas as pd +import numpy as np class Constraints: """ @@ -35,13 +36,16 @@ def output_log_to_file(self): print('No logger is defined.') - def multiparent_conditions(self, df, var_array, dict_conditions_values): + def multiparent_conditions(self, df, var_array, dict_conditions_values, options={}): """Function for replacement of values in a dataframe based on multiple conditions evaluated from multiple columns. Parameters: df (dataframe): The dataframe to be updated. var_array (list): A list of strings of column names to be updated. - dict_condiions_values (dict): A dictionary of conditions and values. The conditions are evaluated from multiple columns and the corresponding value is then inserted into the specified columns. 
+ dict_conditions_values (dict): A dictionary of conditions and values. The conditions are evaluated from multiple columns and the corresponding value is then inserted into the specified columns. + options (dict): Options varying usage. + - "duplicate_output" (bool): Default is `False`. If `True`, the columns in `var_array` will not be replaced with new values. Instead, a duplicated `var_array` will be created and updated based on given `dict_conditions_values`. + - "duplicate_output_suffix" (str): Default is '_dup'. Returns: df (dataframe): The updated dataframe @@ -59,10 +63,28 @@ def multiparent_conditions(self, df, var_array, dict_conditions_values): This example updates the dataframe with the values 0 and 1 in "column1" and "column2" columns, according to the conditions given. In particular, the value 0 is inserted when "parent1_column" is greater than 5 AND "parent2_column" is less than 10. The value 1 is inserted when "parent3_column" is equal to "Yes". """ + # (MZ): 03-22-2024: add duplicate option + if "duplicate_output" in options: + dup_output_bool = options['duplicate_output'] + else: + dup_output_bool = False + if "duplicate_output_suffix" in options: + dup_output_suffix = options['duplicate_output_suffix'] + else: + dup_output_suffix = '_dup' + + # Duplicate columns in var_array + suffix = dup_output_suffix + var_array_use = [vari + suffix for vari in var_array] + for i in range(len(var_array)): + df[var_array_use[i]] = df[var_array[i]] + # Initialise log - for var in var_array: - self.init_log(var) - df[var] = df[var].convert_dtypes() + # for var in var_array: + for i in range(len(var_array)): + self.init_log(var_array[i]) + df[var_array[i]] = df[var_array[i]].convert_dtypes() + df[var_array_use[i]] = df[var_array_use[i]].convert_dtypes() # Iterate through conditions and values for key, dict_condition_value in dict_conditions_values.items(): @@ -77,16 +99,31 @@ def multiparent_conditions(self, df, var_array, dict_conditions_values): else: cond_ = 
cond_ & (df[parent].map(condition)) - df.loc[cond_,var_array] = dict_condition_value['value'] + df.loc[cond_,var_array_use] = dict_condition_value['value'] # Convert dataframe to best possible dtype - for var in var_array: + for var in var_array_use: df[var] = df[var].convert_dtypes() + # Generate mismatch list + mismatch_dict = {} + for i in range(len(var_array)): + mismatch_dict[var_array[i]] = self.find_mismatch(df, var_array[i], var_array_use[i]) + + # Replace if dup_output_bool=False + if (not dup_output_bool): + for i in range(len(var_array)): + df[var_array[i]] = df[var_array_use[i]] + df = df.drop(var_array_use, axis=1) + # Create log for var in var_array: + mismatch_str = ','.join(mismatch_dict[var]) msg = f"Replaced {var} using conditions and values given in dict_conditions_values." - self.log[var]['evaluate_df_column'] = msg + self.log[var]['multiparent_conditions'] = { + "msg": msg, + "replaced": mismatch_str + } if self.debug: print(f"For variable: {var}: {msg}") @@ -96,7 +133,7 @@ def multiparent_conditions(self, df, var_array, dict_conditions_values): return df - def evaluate_df_column(self, df, column_names, dict_conditions_values=None, func=None, output_column_name=None): + def evaluate_df_column(self, df, column_names, dict_conditions_values=None, func=None, output_column_name=None, options={}): """This function takes a dataframe and column name(s) and evaluates the column based on the given conditions and values, creating a new column in the dataframe with the evaluated values. Optionally, a function can be passed in to evaluate the column. Parameters: @@ -111,6 +148,9 @@ def evaluate_df_column(self, df, column_names, dict_conditions_values=None, func } func (function): Optional. A function to be applied on the columns output_column_name (str): Optional. A string containing the name of the output column. If not provided, the default is the name of the column plus '_evaluated'. + options (dict): Optional, options varying usage. 
+ - "duplicate_output" (bool): Default is `False`. If `True`, the columns in `output_column_name` will not be replaced with new values. Instead, a duplicated `output_column_name` will be created and updated based on given `dict_conditions_values`. This functionality applies only when `output_column_name` is an existing column in the dataframe. + - "duplicate_output_suffix" (str): Default is '_dup'. Returns: df (dataframe): the dataframe with the evaluated values in the new column @@ -126,20 +166,53 @@ def evaluate_df_column(self, df, column_names, dict_conditions_values=None, func output_column_name='item_type' ) """ + + # (MZ): 03-26-2024: add duplicate option + if "duplicate_output" in options: + dup_output_bool = options['duplicate_output'] + else: + dup_output_bool = False + if "duplicate_output_suffix" in options: + dup_output_suffix = options['duplicate_output_suffix'] + else: + dup_output_suffix = '_dup' + column_exist = False # Create a new column with the new values if isinstance(column_names, str): if output_column_name is None: output_column_name = column_name + '_evaluated' - df[output_column_name] = df[column_names].copy() + + if output_column_name in df.columns: # column is to be replaced + column_exist = True + original_column_name = deepcopy(output_column_name) + output_column_name = output_column_name + dup_output_suffix #output in duplicate column + + if column_exist: + df[output_column_name] = df[original_column_name].copy() + else: + df[output_column_name] = df[column_names].copy() else: column_name = column_names[0] if output_column_name is None: output_column_name = ''.join(column_names) + '_evaluated' + if output_column_name in df.columns: # column is to be replaced + column_exist = True + original_column_name = deepcopy(output_column_name) + output_column_name = output_column_name + dup_output_suffix #output in duplicate column + + if column_exist: + df[output_column_name] = 
df[original_column_name].copy() + else: + df[output_column_name] = df[column_name].copy() + + # initialise log - self.init_log(output_column_name) + if column_exist: + self.init_log(original_column_name) + else: + self.init_log(output_column_name) if func is None: # Iterate through each condition and value @@ -167,15 +240,49 @@ def evaluate_df_column(self, df, column_names, dict_conditions_values=None, func # Convert dataframe to best possible dtype df[output_column_name] = df[output_column_name].convert_dtypes() + # Generate mismatch list + mismatch_str = 'Secondary column not in original dataframe' + if column_exist: # column is to be replaced + # mismatch_dict = {} + mismatch_list = self.find_mismatch(df, original_column_name, output_column_name) + if len(mismatch_list)==0: + mismatch_str = 'No mismatches' + else: + # mismatch_str = ','.join(mismatch_list) + mismatch_str = ','.join(str(item) for item in mismatch_list) + + # Replace if dup_output_false = False + if column_exist: + if (not dup_output_bool): + df[original_column_name] = df[output_column_name] + df = df.drop([output_column_name], axis=1) + # Create log - msg = f"Replaced {output_column_name} using conditions and values given in dict_conditions_values." - self.log[output_column_name]['evaluate_df_column'] = msg + if column_exist: + if (not dup_output_bool): + msg = f"Replaced {original_column_name} using conditions and values given in dict_conditions_values." + self.log[original_column_name]['evaluate_df_column'] = { + "msg": msg, + "replaced": mismatch_str + } + else: + msg = f"Created secondary {output_column_name} using conditions and values given in dict_conditions_values." + self.log[original_column_name]['evaluate_df_column'] = { + "msg": msg, + "mismatch": mismatch_str + } + else: + msg = f"Created secondary {output_column_name} using conditions and values given in dict_conditions_values." 
+ self.log[output_column_name]['evaluate_df_column'] = { + "msg": msg, + "replaced": mismatch_str + } if self.debug: print(f"For variable: {output_column_name}: {msg}") if self.logging: self.logger.info(f"For variable: {output_column_name}: {msg}") - + return df @@ -225,10 +332,12 @@ def convertBlankstoValue(self, df, var_array=None, value=None): replace_value = value # return the number of missing values converted - number_of_missing_values_converted = df[v].isnull().sum() + # number_of_missing_values_converted = df[v].isnull().sum() #(MZ): 20240322 + number_of_missing_values_converted = df[v].isnull().sum() + (df[v] == "").sum() # Convert missing values in a dataframe column to value df[v].fillna(replace_value, inplace=True) + df[v].loc[df[v] == ""] = replace_value #(MZ): 20240322 # Convert dataframe to best possible dtype df[v] = df[v].convert_dtypes() @@ -297,4 +406,45 @@ def func(row): if self.logging: self.logger.debug(f"For variable: {A}: {msg}") - return df \ No newline at end of file + return df + + def find_mismatch(self, df, col1, col2): + mismatched_list = [] + + if self.debug: + print(f"Checking column: {col1} against {col2}") + if self.logging: + self.logger.debug(f"Checking column: {col1} against {col2}") + + a = df[col1] != df[col2] + a_bool = a.fillna(True) + + # Get rows where both columns have values + na_rows = (df[col1].isna()) & (df[col2].isna()) + # Update a_bool values only for these rows + a_bool.loc[na_rows] = False + + if a_bool.any(): + # get index of rows where values are not equal + mismatched_index = np.where(a_bool)[0] + mismatched_row = df.iloc[mismatched_index] + mismatched_list = mismatched_row.index.tolist() + if self.debug: + if (len(mismatched_list)<20): + print("Mismatched rows index:", ','.join(str(item) for item in mismatched_list)) + else: + print("Mismatched rows index:", "Too many to show.") + if self.logging: + if (len(mismatched_list)<20): + # mismatched_str = ', '.join(mismatched_list) + mismatched_str = 
','.join(str(item) for item in mismatched_list) + self.logger.debug(f"Mismatched rows index: {mismatched_str}") + else: + self.logger.debug("Mismatched rows index: Too many to show.") + else: + if self.debug: + print("Columns are identical.") + else: + self.logger.debug("Columns are identical.") + + return mismatched_list \ No newline at end of file diff --git a/bdarpack/TabulaCopula.py b/bdarpack/TabulaCopula.py index 462f439..7b80f23 100644 --- a/bdarpack/TabulaCopula.py +++ b/bdarpack/TabulaCopula.py @@ -182,17 +182,21 @@ def _load_definitions(self, definitions): self._update_defaults(var_to_update="privacy_batch_n", new_value="PRIVACY_BATCH_N", definitions=definitions) self.prefix_path = definitions.PREFIX_PATH + self.prefix_path = self.prefix_path.replace("\\","/") self.trainxlsx = definitions.TRAINXLSX self.traindictxlsx = definitions.TRAINDICTXLSX self.train_data_path = self.prefix_path + self.folder_trainData + "\\" + self.train_data_path = self.train_data_path.replace("\\","/") self.train_data_filename = self.train_data_path + definitions.TRAINXLSX self.train_data_dict_filename = self.train_data_path + definitions.TRAINDICTXLSX self.syn_data_path = self.prefix_path + self.folder_synData + "\\" - self.privacyMetrics_path = self.prefix_path + self.folder_privacyMetrics + "\\" + self.syn_data_path = self.syn_data_path.replace("\\","/") + self.privacyMetrics_path = self.prefix_path + self.folder_privacyMetrics + "\\" + self.privacyMetrics_path = self.privacyMetrics_path.replace("\\","/") self.train_df = None #initialise training data (dataframe) self.dict_df = None #initialise data dictionary (dataframe) @@ -278,7 +282,6 @@ def read_inputData(self, sheetname=None): var_names = [row[self.dict_var_varname] for index, row in self.dict_df.iterrows() if row[self.dict_var_vartype] == 'string'] self.train_df[var_names] = self.train_df[var_names].astype("str") - if (self.debug): print(f"Input data loaded.") diff --git a/bdarpack/VIsualPlot.py b/bdarpack/VIsualPlot.py 
index f2229a1..9c309fe 100644 --- a/bdarpack/VIsualPlot.py +++ b/bdarpack/VIsualPlot.py @@ -14,7 +14,7 @@ -def hist(data_1d, fig=None, ax=None, position=None, title=None, alpha=0.8, color='blue', label=''): +def hist(data_1d, fig=None, ax=None, position=None, title=None, alpha=0.8, color='blue', label='', bins='auto'): """Function to generate a histogram plot. Parameters: @@ -41,14 +41,14 @@ def hist(data_1d, fig=None, ax=None, position=None, title=None, alpha=0.8, color data_dropped_na = data_1d.dropna() - ax.hist(data_dropped_na, density=True, bins='auto', alpha=alpha, color=color, label=label) + (n, bins, patches) = ax.hist(data_dropped_na, density=True, bins=bins, alpha=alpha, color=color, label=label) ax.legend(loc='best', frameon=False) ax.set_title(title, fontsize=8) return ax, fig -def hist_compare(real_data, syn_data, var_list, no_cols=2): +def hist_compare(real_data, syn_data, var_list, no_cols=2, bins='auto'): no_rows = len(var_list) // no_cols + math.ceil(len(var_list) % no_cols) ax_hist = [] @@ -60,13 +60,13 @@ def hist_compare(real_data, syn_data, var_list, no_cols=2): position_tuple = (int(no_rows), int(no_cols), int(index+1)) if (index == 0): - (ax_hist_out, fig_histogram) = hist(x_real, position = position_tuple, label=f"Original: n = {len(x_real)}") + (ax_hist_out, fig_histogram) = hist(x_real, position = position_tuple, bins=bins, label=f"Original: n = {len(x_real)}") ax_hist.append(ax_hist_out) - (ax_hist[index], fig_histogram) = hist(x_syn, ax=ax_hist[index], fig=fig_histogram, color='grey', title=f'Histogram Plot for {var}', label=f"Synthetic: n={len(x_syn)}") + (ax_hist[index], fig_histogram) = hist(x_syn, ax=ax_hist[index], fig=fig_histogram, color='grey', title=f'Histogram Plot for {var}', bins=bins, label=f"Synthetic: n={len(x_syn)}") else: - (ax_hist_out, fig_histogram) = hist(x_real, fig=fig_histogram, position = position_tuple, label=f"Original: n = {len(x_real)}") + (ax_hist_out, fig_histogram) = hist(x_real, fig=fig_histogram, 
position = position_tuple, bins=bins, label=f"Original: n = {len(x_real)}") ax_hist.append(ax_hist_out) - (ax_hist[index], fig_histogram) = hist(x_syn, ax=ax_hist[index], fig=fig_histogram, color='grey', title=f'Histogram Plot for {var}', label=f"Synthetic: n={len(x_syn)}") + (ax_hist[index], fig_histogram) = hist(x_syn, ax=ax_hist[index], fig=fig_histogram, color='grey', title=f'Histogram Plot for {var}', bins=bins, label=f"Synthetic: n={len(x_syn)}") return ax_hist, fig_histogram diff --git a/bdarpack/utils_.py b/bdarpack/utils_.py index 2e57e1f..5a34257 100644 --- a/bdarpack/utils_.py +++ b/bdarpack/utils_.py @@ -415,7 +415,7 @@ def mapping_dictDateFormatConversion(str): str = str.replace("dd", r"%d") elif ("d" in str): # day number without a leading zero if OS_TYPE=='Windows': # use "%-d" for unix, "%#d" for windows - str = str.replace("d", r"%#d") + str = str.replace("d", r"%d") elif OS_TYPE=='Linux' or 'Darwin': str = str.replace("d", r"%-d") @@ -429,7 +429,7 @@ def mapping_dictDateFormatConversion(str): str = str.replace("mm", r"%m") elif ("m" in str): # month number without a leading zero if OS_TYPE=='Windows': # use "%-m" for unix, "%#m" for windows - str = str.replace("m", r"%#m") + str = str.replace("m", r"%m") elif OS_TYPE=='Linux' or 'Darwin': str = str.replace("m", r"%-m") diff --git a/docs/gettingStarted/definitions.md b/docs/gettingStarted/definitions.md index 804ae7b..0f81028 100644 --- a/docs/gettingStarted/definitions.md +++ b/docs/gettingStarted/definitions.md @@ -18,7 +18,10 @@ nav_order: 5 | RAWDICTXLSX | Defines filename containing the data dictionary. E.g. "xx.xlsx", "xx.csv" | Data loading | | RAWDICTXLSX_SHEETNAME | if RAWDICTXLSX is an excel file, assign the sheetname from which to load the dictionary. If not specified, will read the first sheet. E.g. "Sheet1" | Data loading | | LOGGING | Option to output logfile. If `True`, logfile will be built. If not specified, default is `True`. | Log | -| LOG_FILENAME | Defines filename of logfile. 
If not defined, default is `logfile.txt`. | +| LOG_FILENAME | Defines filename of logfile. If not defined, default is `logfile.txt`. | Log | +| CREATE_UNIQUE_INDEX | Option to create unique row index from existing columns. If `True`, new index will be created. If not specified, default is `False`. | Indexing | +| UNIQUE_INDEX_COMPOSITION_LIST | List of column names to create new index from. If not specified, default is `[]`. E.g.: `["subject_id", "visit"]` | Indexing | +| UNIQUE_INDEX_DELIMITER | Delimiter to separate values from composition list. If not specified, default is `_` | Indexing | | LONG_VAR_MARKER | Defines the variable name that indicates which longitudinal group that row belongs to. If not specified, default is `None`. | Longitudinal data | | DICT_VAR_VARNAME | Column in data dictionary containing variable names in input data. If not specified, set as "`NAME`". | Data Dictionary settings | | DICT_VAR_VARCATEGORY | Column in data dictionary setting the category of the variable name. If not specified, set as "`CATEGORY`" | Data Dictionary settings | diff --git a/docs/gettingStarted/examples/CleanData_StandardiseDates_ConvertCharacters.md b/docs/gettingStarted/examples/CleanData_StandardiseDates_ConvertCharacters.md index 20eba81..908e973 100644 --- a/docs/gettingStarted/examples/CleanData_StandardiseDates_ConvertCharacters.md +++ b/docs/gettingStarted/examples/CleanData_StandardiseDates_ConvertCharacters.md @@ -103,7 +103,7 @@ print(cd.raw_df) Print Original Data Dictionary -Notice that we have not specified `CODINGS` for the date variables `date_1` and `date_3`. CleanData, nevertheless, tries to match the data to common date formats and perform the conversion accordingly. We have specified `CODINGS` for `date_1` and `date_4`, but have used the wrong format (`mm-dd-yyyy`) for date_4. The proper format should have been `dd-mm-yyyy`. +Notice that we have not specified `CODINGS` for the date variables `date_1` and `date_3`. 
CleanData, nevertheless, tries to match the data to common date formats and perform the conversion accordingly. We have specified `CODINGS` for `date_2` and `date_4`, but have used the wrong format (`mm-dd-yyyy`) for date_4. The proper format should have been `dd-mm-yyyy`. The format for `date_2` is also wrong, but because we have used the `Date` format in MS Excel, the dates were still read correctly. ``` print(cd.dict_df)