Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix reset_index when the default index is "distributed-sequence". #1193

Merged
merged 1 commit into from
Jan 15, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 14 additions & 18 deletions databricks/koalas/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2722,36 +2722,32 @@ def rename(index):
if drop:
new_index_map = []

internal = self._internal.copy(
sdf=sdf,
index_map=index_map,
column_index=None,
column_scols=([scol_for(sdf, name_like_string(name)) for _, name in new_index_map]
+ self._internal.column_scols))

if self._internal.column_index_level > 1:
column_depth = len(self._internal.column_index[0])
if col_level >= column_depth:
raise IndexError('Too many levels: Index has only {} levels, not {}'
.format(column_depth, col_level + 1))
if any(col_level + len(name) > column_depth for _, name in new_index_map):
raise ValueError('Item must have length equal to number of levels.')
columns = pd.MultiIndex.from_tuples(
[tuple(([col_fill] * col_level)
+ list(name)
+ ([col_fill] * (column_depth - (len(name) + col_level))))
for _, name in new_index_map]
+ self._internal.column_index)
column_index = ([tuple(([col_fill] * col_level)
+ list(name)
+ ([col_fill] * (column_depth - (len(name) + col_level))))
for _, name in new_index_map]
+ self._internal.column_index)
else:
columns = [name for _, name in new_index_map] + self._internal.column_index
column_index = [name for _, name in new_index_map] + self._internal.column_index

internal = self._internal.copy(
sdf=sdf,
index_map=index_map,
column_index=column_index,
column_scols=([scol_for(sdf, name_like_string(name)) for _, name in new_index_map]
+ [scol_for(sdf, col) for col in self._internal.data_columns]))
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Essentially this line is the fix.


if inplace:
self._internal = internal
self.columns = columns
else:
kdf = DataFrame(internal)
kdf.columns = columns
return kdf
return DataFrame(internal)

def isnull(self):
"""
Expand Down
18 changes: 18 additions & 0 deletions databricks/koalas/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,24 @@ def test_iterrows(self):
self.assert_eq(pdf_k, kdf_k)
self.assert_eq(pdf_v, kdf_v)

def test_reset_index_with_default_index_types(self):
    """Compare DataFrame ``reset_index`` with pandas for each default index type.

    The frame is built with a random float index, and ``reset_index`` is
    exercised under the 'sequence', 'distributed-sequence' and 'distributed'
    values of the ``compute.default_index_type`` option.
    """
    pdf = pd.DataFrame(
        {'a': [1, 2, 3], 'b': [4, 5, 6]},
        index=np.random.rand(3),
    )
    kdf = ks.from_pandas(pdf)

    # 'sequence': the result is compared with pandas as-is.
    with ks.option_context('compute.default_index_type', 'sequence'):
        actual = kdf.reset_index()
        expected = pdf.reset_index()
        self.assert_eq(actual, expected)

    # 'distributed-sequence': row order may differ, so sort by the new
    # index before comparing.
    with ks.option_context('compute.default_index_type', 'distributed-sequence'):
        actual = kdf.reset_index().sort_index()
        expected = pdf.reset_index()
        self.assert_eq(actual, expected)

    # 'distributed': the generated index values differ from pandas, so
    # drop them on the pandas side of the conversion and compare the rest.
    with ks.option_context('compute.default_index_type', 'distributed'):
        actual = kdf.reset_index().to_pandas().reset_index(drop=True)
        expected = pdf.reset_index()
        self.assert_eq(actual, expected)

def test_reset_index_with_multiindex_columns(self):
index = pd.MultiIndex.from_tuples([('bird', 'falcon'),
('bird', 'parrot'),
Expand Down
17 changes: 17 additions & 0 deletions databricks/koalas/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -469,6 +469,23 @@ def test_any(self):

self.assert_eq((kser % 2 == 0).any(), (pser % 2 == 0).any())

def test_reset_index_with_default_index_types(self):
    """Compare Series ``reset_index`` with pandas for each default index type.

    The series is named '0' and carries a random float index;
    ``reset_index`` is exercised under the 'sequence',
    'distributed-sequence' and 'distributed' values of the
    ``compute.default_index_type`` option.
    """
    pser = pd.Series([1, 2, 3], name='0', index=np.random.rand(3))
    kser = ks.from_pandas(pser)

    # 'sequence': the result is compared with pandas as-is.
    with ks.option_context('compute.default_index_type', 'sequence'):
        actual = kser.reset_index()
        expected = pser.reset_index()
        self.assert_eq(actual, expected)

    # 'distributed-sequence': row order may differ, so sort by the new
    # index before comparing.
    with ks.option_context('compute.default_index_type', 'distributed-sequence'):
        actual = kser.reset_index().sort_index()
        expected = pser.reset_index()
        self.assert_eq(actual, expected)

    # 'distributed': the generated index values differ from pandas, so
    # drop them on the pandas side of the conversion and compare the rest.
    with ks.option_context('compute.default_index_type', 'distributed'):
        actual = kser.reset_index().to_pandas().reset_index(drop=True)
        expected = pser.reset_index()
        self.assert_eq(actual, expected)

def test_sort_values(self):
pser = pd.Series([1, 2, 3, 4, 5, None, 7], name='0')
kser = ks.from_pandas(pser)
Expand Down