fix(clean): clean_duplication issue 646

fix(clean): clean_duplication issue 646
sfu-db · Jun 5, 2021 · ca9f708 · ca9f708
1 parent 47eda68
commit ca9f708
Show file tree

Hide file tree

Showing 4 changed files with 161 additions and 38 deletions.
diff --git a/dataprep/clean/clean_duplication.py b/dataprep/clean/clean_duplication.py
@@ -7,6 +7,7 @@
 from ipywidgets.widgets import Label, Dropdown, Checkbox, Button, HBox, VBox, Box, Layout, Text
 import pandas as pd
 import dask.dataframe as dd
+from varname import argname
 
 from .clean_duplication_utils import Clusterer
 
@@ -16,10 +17,13 @@
 
 
 def clean_duplication(
-    df: Union[pd.DataFrame, dd.DataFrame], column: str, df_var_name: str = "df", page_size: int = 5
+    df: Union[pd.DataFrame, dd.DataFrame],
+    column: str,
+    df_var_name: str = "default",
+    page_size: int = 5,
 ) -> Box:
     """
-    Cleans and standardized duplicate values in a DataFrame.
+    Cleans and standardizes duplicate values in a DataFrame.
 
     Read more in the :ref:`User Guide <duplication_userguide>`.
 
@@ -30,11 +34,11 @@ def clean_duplication(
     column
         The name of the column containing duplicate values.
     df_var_name
-        The variable name of the DataFrame being cleaned.
-        This is only needed for creating exported code snippets with
-        the correct variable name.
+        Optional parameter containing the variable name of the DataFrame being cleaned.
+        This is only needed for legacy compatibility with the original version of this
+        function, which needed it to produce correct exported code.
 
-        (default: 'df')
+        (default: 'default')
     page_size
         The number of clusters to display on each page.
 
@@ -53,6 +57,8 @@ def clean_duplication(
     0    New York
     1    New York
     """
+    if df_var_name == "default":
+        df_var_name = argname(df)
 
     return UserInterface(df, column, df_var_name, page_size).display()
 

diff --git a/dataprep/clean/clean_duplication_utils.py b/dataprep/clean/clean_duplication_utils.py
@@ -277,18 +277,18 @@ def final_df(self) -> None:
         Writes the final dataframe to a pickle file then reads the file from
         inside the IPython kernel.
         """
-        code = "# dataframe with cleaned string values\ndf_clean"
+        code = f"# dataframe with cleaned string values\n{self._df_name}_clean"
         encoded_code = (b64encode(str.encode(code))).decode()
         final_df = self._df.compute()
         # create a temporary directory for the dataframe file
-        tmp_dir = mkdtemp()
-        df_file = path.join(tmp_dir, "clean_duplication_output.pkl")
+        tmp_dir = mkdtemp().replace("\\", "/")
+        df_file = path.join(tmp_dir, "clean_duplication_output.pkl").replace("\\", "/")
         final_df.to_pickle(df_file)
         # code to read the file and delete the temporary directory afterwards
         execute_code = (
             "import pandas as pd\n"
             "import shutil\n"
-            f"df_clean = pd.read_pickle('{df_file}')\n"
+            f"{self._df_name}_clean = pd.read_pickle('{df_file}')\n"
             f"shutil.rmtree('{tmp_dir}')"
         )
         encoded_execute = (b64encode(str.encode(execute_code))).decode()