Skip to content

Commit

Permalink
fix(clean): clean_duplication issue 646
Browse files Browse the repository at this point in the history
fix(clean): clean_duplication issue 646

fix(clean): clean_duplication issue 646

fix(clean): clean_duplication issue 646

fix(clean): clean_duplication issue 646

fix(clean): clean_duplication issue 646

fix(clean): clean_duplication issue 646
  • Loading branch information
qidanrui committed Jun 5, 2021
1 parent 47eda68 commit ca9f708
Show file tree
Hide file tree
Showing 4 changed files with 161 additions and 38 deletions.
18 changes: 12 additions & 6 deletions dataprep/clean/clean_duplication.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from ipywidgets.widgets import Label, Dropdown, Checkbox, Button, HBox, VBox, Box, Layout, Text
import pandas as pd
import dask.dataframe as dd
from varname import argname

from .clean_duplication_utils import Clusterer

Expand All @@ -16,10 +17,13 @@


def clean_duplication(
df: Union[pd.DataFrame, dd.DataFrame], column: str, df_var_name: str = "df", page_size: int = 5
df: Union[pd.DataFrame, dd.DataFrame],
column: str,
df_var_name: str = "default",
page_size: int = 5,
) -> Box:
"""
Cleans and standardized duplicate values in a DataFrame.
Cleans and standardizes duplicate values in a DataFrame.
Read more in the :ref:`User Guide <duplication_userguide>`.
Expand All @@ -30,11 +34,11 @@ def clean_duplication(
column
The name of the column containing duplicate values.
df_var_name
The variable name of the DataFrame being cleaned.
This is only needed for creating exported code snippets with
the correct variable name.
Optional parameter containing the variable name of the DataFrame being cleaned.
This is only needed for legacy compatibility with the original version of this
function, which needed it to produce correct exported code.
(default: 'df')
(default: 'default')
page_size
The number of clusters to display on each page.
Expand All @@ -53,6 +57,8 @@ def clean_duplication(
0 New York
1 New York
"""
if df_var_name == "default":
df_var_name = argname(df)

return UserInterface(df, column, df_var_name, page_size).display()

Expand Down
8 changes: 4 additions & 4 deletions dataprep/clean/clean_duplication_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,18 +277,18 @@ def final_df(self) -> None:
Writes the final dataframe to a pickle file then reads the file from
inside the IPython kernel.
"""
code = "# dataframe with cleaned string values\ndf_clean"
code = f"# dataframe with cleaned string values\n{self._df_name}_clean"
encoded_code = (b64encode(str.encode(code))).decode()
final_df = self._df.compute()
# create a temporary directory for the dataframe file
tmp_dir = mkdtemp()
df_file = path.join(tmp_dir, "clean_duplication_output.pkl")
tmp_dir = mkdtemp().replace("\\", "/")
df_file = path.join(tmp_dir, "clean_duplication_output.pkl").replace("\\", "/")
final_df.to_pickle(df_file)
# code to read the file and delete the temporary directory afterwards
execute_code = (
"import pandas as pd\n"
"import shutil\n"
f"df_clean = pd.read_pickle('{df_file}')\n"
f"{self._df_name}_clean = pd.read_pickle('{df_file}')\n"
f"shutil.rmtree('{tmp_dir}')"
)
encoded_execute = (b64encode(str.encode(execute_code))).decode()
Expand Down
Loading

0 comments on commit ca9f708

Please sign in to comment.