Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

groupby operations returning pd.Series objects raise ValueError: cannot insert <column>, already exists #51

Closed
shouples opened this issue Sep 23, 2022 · 1 comment · Fixed by #57
Assignees
Labels
bug Something isn't working needs new test

Comments

@shouples
Copy link
Collaborator

shouples commented Sep 23, 2022

image

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
File /opt/conda/lib/python3.9/site-packages/dx/formatters/main.py:87, in handle_format(obj, ipython_shell)
     86 try:
---> 87     payload, metadata = datalink_processing(
     88         obj,
     89         default_index_used,
     90         ipython_shell=ipython,
     91     )
     92 except Exception as e:

File /opt/conda/lib/python3.9/site-packages/dx/formatters/main.py:50, in datalink_processing(df, default_index_used, ipython_shell)
     48     logger.debug(f"df is subset of existing {parent_display_id=}")
---> 50 payload, metadata = format_output(
     51     dxdf.df,
     52     update=parent_display_id,
     53     display_id=dxdf.display_id,
     54     has_default_index=default_index_used,
     55 )
     57 # this needs to happen after sending to the frontend
     58 # so the user doesn't wait as long for writing larger datasets

File /opt/conda/lib/python3.9/site-packages/dx/formatters/main.py:158, in format_output(df, update, display_id, has_default_index)
    156 sampled_df_dimensions = get_df_dimensions(df, prefix="truncated")
--> 158 payload = generate_body(df, display_id=display_id)
    160 dataframe_info = {
    161     "default_index_used": has_default_index,
    162     **orig_df_dimensions,
    163     **sampled_df_dimensions,
    164 }

File /opt/conda/lib/python3.9/site-packages/dx/formatters/main.py:133, in generate_body(df, display_id)
    132 if settings.DISPLAY_MODE == DXDisplayMode.simple:
--> 133     data = clean_df.reset_index().to_dict("records")
    134 elif settings.DISPLAY_MODE == DXDisplayMode.enhanced:

File /opt/conda/lib/python3.9/site-packages/pandas/util/_decorators.py:311, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
    306     warnings.warn(
    307         msg.format(arguments=arguments),
    308         FutureWarning,
    309         stacklevel=stacklevel,
    310     )
--> 311 return func(*args, **kwargs)

File /opt/conda/lib/python3.9/site-packages/pandas/core/frame.py:5848, in DataFrame.reset_index(self, level, drop, inplace, col_level, col_fill)
   5844             level_values = algorithms.take(
   5845                 level_values, lab, allow_fill=True, fill_value=lev._na_value
   5846             )
-> 5848         new_obj.insert(0, name, level_values)
   5850 new_obj.index = new_index

File /opt/conda/lib/python3.9/site-packages/pandas/core/frame.py:4443, in DataFrame.insert(self, loc, column, value, allow_duplicates)
   4441 if not allow_duplicates and column in self.columns:
   4442     # Should this be a different kind of error??
-> 4443     raise ValueError(f"cannot insert {column}, already exists")
   4444 if not isinstance(loc, int):

ValueError: cannot insert Location Description, already exists

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
Cell In [9], line 1
----> 1 df.groupby(['IUCR', 'Primary Type'])['Location Description'].value_counts()

File /opt/conda/lib/python3.9/site-packages/IPython/core/displayhook.py:262, in DisplayHook.__call__(self, result)
    260 self.start_displayhook()
    261 self.write_output_prompt()
--> 262 format_dict, md_dict = self.compute_format_data(result)
    263 self.update_user_ns(result)
    264 self.fill_exec_result(result)

File /opt/conda/lib/python3.9/site-packages/IPython/core/displayhook.py:151, in DisplayHook.compute_format_data(self, result)
    121 def compute_format_data(self, result):
    122     """Compute format data of the object to be displayed.
    123 
    124     The format data is a generalization of the :func:`repr` of an object.
   (...)
    149 
    150     """
--> 151     return self.shell.display_formatter.format(result)

File /opt/conda/lib/python3.9/site-packages/dx/formatters/main.py:106, in DXDisplayFormatter.format(self, obj, **kwargs)
    103 def format(self, obj, **kwargs):
    105     if IN_NOTEBOOK_ENV and isinstance(obj, tuple(settings.RENDERABLE_OBJECTS)):
--> 106         handle_format(obj)
    107         return ({}, {})
    109     return DEFAULT_IPYTHON_DISPLAY_FORMATTER.format(obj, **kwargs)

File /opt/conda/lib/python3.9/site-packages/dx/formatters/main.py:95, in handle_format(obj, ipython_shell)
     93     logger.debug(f"Error in datalink_processing: {e}")
     94     # fall back to default processing
---> 95     payload, metadata = format_output(obj, has_default_index=default_index_used)
     97 return payload, metadata

File /opt/conda/lib/python3.9/site-packages/dx/formatters/main.py:158, in format_output(df, update, display_id, has_default_index)
    155 df = sample_if_too_big(df, display_id=display_id)
    156 sampled_df_dimensions = get_df_dimensions(df, prefix="truncated")
--> 158 payload = generate_body(df, display_id=display_id)
    160 dataframe_info = {
    161     "default_index_used": has_default_index,
    162     **orig_df_dimensions,
    163     **sampled_df_dimensions,
    164 }
    165 metadata = generate_metadata(display_id=display_id, **dataframe_info)

File /opt/conda/lib/python3.9/site-packages/dx/formatters/main.py:133, in generate_body(df, display_id)
    130 clean_df = df.astype(object).where(df.notnull(), None)
    132 if settings.DISPLAY_MODE == DXDisplayMode.simple:
--> 133     data = clean_df.reset_index().to_dict("records")
    134 elif settings.DISPLAY_MODE == DXDisplayMode.enhanced:
    135     data = clean_df.reset_index().transpose().values.tolist()

File /opt/conda/lib/python3.9/site-packages/pandas/util/_decorators.py:311, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
    305 if len(args) > num_allow_args:
    306     warnings.warn(
    307         msg.format(arguments=arguments),
    308         FutureWarning,
    309         stacklevel=stacklevel,
    310     )
--> 311 return func(*args, **kwargs)

File /opt/conda/lib/python3.9/site-packages/pandas/core/frame.py:5848, in DataFrame.reset_index(self, level, drop, inplace, col_level, col_fill)
   5842         if lab is not None:
   5843             # if we have the codes, extract the values with a mask
   5844             level_values = algorithms.take(
   5845                 level_values, lab, allow_fill=True, fill_value=lev._na_value
   5846             )
-> 5848         new_obj.insert(0, name, level_values)
   5850 new_obj.index = new_index
   5851 if not inplace:

File /opt/conda/lib/python3.9/site-packages/pandas/core/frame.py:4443, in DataFrame.insert(self, loc, column, value, allow_duplicates)
   4437     raise ValueError(
   4438         "Cannot specify 'allow_duplicates=True' when "
   4439         "'self.flags.allows_duplicate_labels' is False."
   4440     )
   4441 if not allow_duplicates and column in self.columns:
   4442     # Should this be a different kind of error??
-> 4443     raise ValueError(f"cannot insert {column}, already exists")
   4444 if not isinstance(loc, int):
   4445     raise TypeError("loc must be int")

ValueError: cannot insert Location Description, already exists

Groupby operations that return DataFrames (rather than Series) do not raise this error:
image

@shouples shouples added the bug Something isn't working label Sep 23, 2022
@shouples shouples self-assigned this Sep 23, 2022
@shouples shouples changed the title groupby options returning pd.Series objects raise ValueError: cannot insert <column>, already exists groupby operations returning pd.Series objects raise ValueError: cannot insert <column>, already exists Sep 23, 2022
@shouples
Copy link
Collaborator Author

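TODO:
Check whether the original object is a pd.Series and whether its name also appears in its .index.names; when it does, reset_index() tries to insert an index level as a column that already exists, which raises the ValueError above.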
image

shouples added a commit that referenced this issue Sep 29, 2022
…ames` conflict (#57)

* add multiindex series fixtures

* add tests for conversions to dataframes

* add tests for groupby series objects

* handle groupby series objects whose name is included in the MultiIndex names
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
bug Something isn't working needs new test
Projects
None yet
Development

Successfully merging a pull request may close this issue.

1 participant