Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement Index.dropna & MultiIndex.dropna #938

Merged
merged 12 commits into from
Nov 25, 2019
58 changes: 57 additions & 1 deletion databricks/koalas/indexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
Wrappers for Indexes to behave similar to pandas Index, MultiIndex.
"""

from functools import partial
from functools import partial, reduce
from typing import Any, List, Optional, Tuple, Union

import pandas as pd
Expand All @@ -36,6 +36,7 @@
from databricks.koalas.missing.indexes import _MissingPandasLikeIndex, _MissingPandasLikeMultiIndex
from databricks.koalas.series import Series
from databricks.koalas.utils import name_like_string
from databricks.koalas.internal import _InternalFrame


class Index(IndexOpsMixin):
Expand Down Expand Up @@ -317,6 +318,61 @@ def is_object(self):
"""
return is_object_dtype(self.dtype)

def dropna(self):
"""
Return Index or MultiIndex without NA/NaN values

Examples
--------

>>> df = ks.DataFrame([[1, 2], [4, 5], [7, 8]],
... index=['cobra', 'viper', None],
... columns=['max_speed', 'shield'])
>>> df
max_speed shield
cobra 1 2
viper 4 5
NaN 7 8

>>> df.index.dropna()
Index(['cobra', 'viper'], dtype='object')

Also support for MultiIndex

>>> midx = pd.MultiIndex([['lama', 'cow', 'falcon'],
... [None, 'weight', 'length']],
... [[0, 1, 1, 1, 1, 1, 2, 2, 2],
... [0, 1, 1, 0, 1, 2, 1, 1, 2]])
>>> s = ks.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, None],
... index=midx)
>>> s
lama NaN 45.0
cow weight 200.0
weight 1.2
NaN 30.0
weight 250.0
length 1.5
falcon weight 320.0
weight 1.0
length NaN
Name: 0, dtype: float64

>>> s.index.dropna() # doctest: +SKIP
MultiIndex([( 'cow', 'weight'),
( 'cow', 'weight'),
( 'cow', 'weight'),
( 'cow', 'length'),
('falcon', 'weight'),
('falcon', 'weight'),
('falcon', 'length')],
)
"""
kdf = self._kdf.copy()
itholic marked this conversation as resolved.
Show resolved Hide resolved
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we can do it as follows:

sdf = kdf._internal.sdf.select(index_scols).dropna()
internal = kdf._internal.copy(sdf=sdf, column_index=[], ...)
kdf = DataFrame(internal)
return Index(kdf) if type(self) == Index else MultiIndex(kdf)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the comment!
I fixed it by constructing a new _InternalFrame rather than using copy, since I think we can't copy the internal frame here: sdf has only the index columns, while kdf also has the data columns.

sdf = kdf._internal.sdf.select(self._internal.index_scols).dropna()
internal = _InternalFrame(sdf=sdf, index_map=self._internal.index_map)
kdf = DataFrame(internal)
return Index(kdf) if type(self) == Index else MultiIndex(kdf)

def unique(self, level=None):
"""
Return unique values in the index.
Expand Down
2 changes: 0 additions & 2 deletions databricks/koalas/missing/indexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@ class _MissingPandasLikeIndex(object):
drop = unsupported_function('drop')
drop_duplicates = unsupported_function('drop_duplicates')
droplevel = unsupported_function('droplevel')
dropna = unsupported_function('dropna')
duplicated = unsupported_function('duplicated')
equals = unsupported_function('equals')
factorize = unsupported_function('factorize')
Expand Down Expand Up @@ -158,7 +157,6 @@ class _MissingPandasLikeMultiIndex(object):
drop = unsupported_function('drop')
drop_duplicates = unsupported_function('drop_duplicates')
droplevel = unsupported_function('droplevel')
dropna = unsupported_function('dropna')
duplicated = unsupported_function('duplicated')
equal_levels = unsupported_function('equal_levels')
equals = unsupported_function('equals')
Expand Down
8 changes: 8 additions & 0 deletions docs/source/reference/indexing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ Missing Values
.. autosummary::
:toctree: api/

Index.dropna
Index.isna
Index.notna

Expand Down Expand Up @@ -77,6 +78,13 @@ MultiIndex Properties

MultiIndex.names

MultiIndex Missing Values
~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/

MultiIndex.dropna

MultiIndex Modifying and computations
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
Expand Down