Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add type annotations to Frame (generic.py) #1894

Merged
merged 5 commits into from
Nov 7, 2020
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 78 additions & 35 deletions databricks/koalas/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from collections.abc import Iterable
from distutils.version import LooseVersion
from functools import reduce
from typing import List, Optional, Tuple, Union, TYPE_CHECKING
from typing import List, Optional, Tuple, Union, TYPE_CHECKING, cast
import warnings

import numpy as np # noqa: F401
Expand All @@ -37,6 +37,7 @@
from databricks.koalas.indexing import AtIndexer, iAtIndexer, iLocIndexer, LocIndexer
from databricks.koalas.internal import InternalFrame, NATURAL_ORDER_COLUMN_NAME
from databricks.koalas.spark import functions as SF
from databricks.koalas.typedef import Scalar
from databricks.koalas.utils import (
is_name_like_tuple,
is_name_like_value,
Expand Down Expand Up @@ -70,8 +71,34 @@ def _apply_series_op(self, op, should_resolve: bool = False):
def _reduce_for_stat_function(self, sfun, name, axis=None, numeric_only=True):
pass

@property
@abstractmethod
def dtypes(self):
    """Abstract: return the dtype(s) of the underlying data.

    Implementations may return either a single dtype or an iterable of
    dtypes — ``get_dtype_counts`` in this class explicitly handles both
    cases via an ``isinstance(self.dtypes, Iterable)`` check.
    """
    pass

@abstractmethod
def to_pandas(self):
    """Abstract: materialize this distributed object as a pandas object.

    Used by ``to_numpy``/``values`` in this class, which return
    ``self.to_pandas().values``.
    """
    pass

@property
@abstractmethod
def index(self):
    """Abstract property: return the index of this object.

    Implemented by the DataFrame/Series subclasses elsewhere in the
    package; declared here so shared methods can rely on it.
    """
    pass

@abstractmethod
def copy(self):
    """Abstract: return a copy of this object.

    Relied on by ``truncate`` in this class, which calls ``self.copy()``
    when ``copy=True``.
    """
    pass

@abstractmethod
def _to_internal_pandas(self):
    """Abstract (private): return the internal pandas representation.

    Used internally by shared methods such as ``to_excel``.
    """
    pass

@abstractmethod
def head(self, n: int = 5):
    """Abstract: return the first ``n`` rows/elements.

    Parameters
    ----------
    n : int, default 5
        Number of items to return.

    ``squeeze`` in this class calls ``self.head(2)`` and takes ``len()``
    of the result, so implementations must return a sized object.
    """
    pass

# TODO: add 'axis' parameter
def cummin(self, skipna: bool = True):
def cummin(self, skipna: bool = True) -> Union["ks.Series", "ks.DataFrame"]:
"""
Return cumulative minimum over a DataFrame or Series axis.

Expand Down Expand Up @@ -133,7 +160,7 @@ def cummin(self, skipna: bool = True):
) # type: ignore

# TODO: add 'axis' parameter
def cummax(self, skipna: bool = True):
def cummax(self, skipna: bool = True) -> Union["ks.Series", "ks.DataFrame"]:
"""
Return cumulative maximum over a DataFrame or Series axis.

Expand Down Expand Up @@ -196,7 +223,7 @@ def cummax(self, skipna: bool = True):
) # type: ignore

# TODO: add 'axis' parameter
def cumsum(self, skipna: bool = True):
def cumsum(self, skipna: bool = True) -> Union["ks.Series", "ks.DataFrame"]:
"""
Return cumulative sum over a DataFrame or Series axis.

Expand Down Expand Up @@ -261,7 +288,7 @@ def cumsum(self, skipna: bool = True):
# TODO: add 'axis' parameter
# TODO: use pandas_udf to support negative values and other options later
# other window except unbounded ones is supported as of Spark 3.0.
def cumprod(self, skipna: bool = True):
def cumprod(self, skipna: bool = True) -> Union["ks.Series", "ks.DataFrame"]:
"""
Return cumulative product over a DataFrame or Series axis.

Expand Down Expand Up @@ -332,7 +359,7 @@ def cumprod(self, skipna: bool = True):
# TODO: Although this has removed pandas >= 1.0.0, but we're keeping this as deprecated
# since we're using this for `DataFrame.info` internally.
# We can drop it once our minimal pandas version becomes 1.0.0.
def get_dtype_counts(self):
def get_dtype_counts(self) -> pd.Series:
"""
Return counts of unique dtypes in this object.

Expand Down Expand Up @@ -375,8 +402,8 @@ def get_dtype_counts(self):
if not isinstance(self.dtypes, Iterable):
dtypes = [self.dtypes]
else:
dtypes = self.dtypes
return pd.Series(dict(Counter([d.name for d in list(dtypes)])))
dtypes = list(self.dtypes)
return pd.Series(dict(Counter([d.name for d in dtypes])))

def pipe(self, func, *args, **kwargs):
r"""
Expand Down Expand Up @@ -471,7 +498,7 @@ def pipe(self, func, *args, **kwargs):
else:
return func(self, *args, **kwargs)

def to_numpy(self):
def to_numpy(self) -> np.ndarray:
"""
A NumPy ndarray representing the values in this DataFrame or Series.

Expand Down Expand Up @@ -509,7 +536,7 @@ def to_numpy(self):
return self.to_pandas().values

@property
def values(self):
def values(self) -> np.ndarray:
"""
Return a Numpy representation of the DataFrame or the Series.

Expand Down Expand Up @@ -586,7 +613,7 @@ def to_csv(
partition_cols: Optional[Union[str, List[str]]] = None,
index_col: Optional[Union[str, List[str]]] = None,
**options
):
) -> Optional[str]:
r"""
Write object to a comma-separated values (csv) file.

Expand Down Expand Up @@ -640,6 +667,10 @@ def to_csv(
It has higher priority and overwrites all other options.
This parameter only works when `path` is specified.

Returns
-------
str or None

See Also
--------
read_csv
Expand Down Expand Up @@ -801,6 +832,7 @@ def to_csv(
charToEscapeQuoteEscaping=escapechar,
)
builder.options(**options).format("csv").save(path)
return None

def to_json(
self,
Expand All @@ -811,7 +843,7 @@ def to_json(
partition_cols: Optional[Union[str, List[str]]] = None,
index_col: Optional[Union[str, List[str]]] = None,
**options
):
) -> Optional[str]:
"""
Convert the object to a JSON string.

Expand Down Expand Up @@ -860,6 +892,10 @@ def to_json(
It has a higher priority and overwrites all other options.
This parameter only works when `path` is specified.

Returns
-------
str or None

Examples
--------
>>> df = ks.DataFrame([['a', 'b'], ['c', 'd']],
Expand Down Expand Up @@ -913,6 +949,7 @@ def to_json(
builder.partitionBy(partition_cols)
builder._set_opts(compression=compression)
builder.options(**options).format("json").save(path)
return None

def to_excel(
self,
Expand All @@ -932,7 +969,7 @@ def to_excel(
inf_rep="inf",
verbose=True,
freeze_panes=None,
):
) -> None:
"""
Write object to an Excel sheet.

Expand Down Expand Up @@ -1046,7 +1083,7 @@ def to_excel(
kdf._to_internal_pandas(), self.to_excel, f, args
)

def mean(self, axis=None, numeric_only=True):
def mean(self, axis=None, numeric_only=True) -> Union[Scalar, "ks.Series"]:
"""
Return the mean of the values.

Expand Down Expand Up @@ -1091,7 +1128,7 @@ def mean(self, axis=None, numeric_only=True):
F.mean, name="mean", numeric_only=numeric_only, axis=axis
)

def sum(self, axis=None, numeric_only=True):
def sum(self, axis=None, numeric_only=True) -> Union[Scalar, "ks.Series"]:
"""
Return the sum of the values.

Expand Down Expand Up @@ -1136,7 +1173,7 @@ def sum(self, axis=None, numeric_only=True):
F.sum, name="sum", numeric_only=numeric_only, axis=axis
)

def skew(self, axis=None, numeric_only=True):
def skew(self, axis=None, numeric_only=True) -> Union[Scalar, "ks.Series"]:
"""
Return unbiased skew normalized by N-1.

Expand Down Expand Up @@ -1174,7 +1211,7 @@ def skew(self, axis=None, numeric_only=True):
F.skewness, name="skew", numeric_only=numeric_only, axis=axis
)

def kurtosis(self, axis=None, numeric_only=True):
def kurtosis(self, axis=None, numeric_only=True) -> Union[Scalar, "ks.Series"]:
"""
Return unbiased kurtosis using Fisher’s definition of kurtosis (kurtosis of normal == 0.0).
Normalized by N-1.
Expand Down Expand Up @@ -1215,7 +1252,7 @@ def kurtosis(self, axis=None, numeric_only=True):

kurt = kurtosis

def min(self, axis=None, numeric_only=None):
def min(self, axis=None, numeric_only=None) -> Union[Scalar, "ks.Series"]:
"""
Return the minimum of the values.

Expand Down Expand Up @@ -1261,7 +1298,7 @@ def min(self, axis=None, numeric_only=None):
F.min, name="min", numeric_only=numeric_only, axis=axis
)

def max(self, axis=None, numeric_only=None):
def max(self, axis=None, numeric_only=None) -> Union[Scalar, "ks.Series"]:
"""
Return the maximum of the values.

Expand Down Expand Up @@ -1307,7 +1344,7 @@ def max(self, axis=None, numeric_only=None):
F.max, name="max", numeric_only=numeric_only, axis=axis
)

def std(self, axis=None, numeric_only=True):
def std(self, axis=None, numeric_only=True) -> Union[Scalar, "ks.Series"]:
"""
Return sample standard deviation.

Expand Down Expand Up @@ -1352,7 +1389,7 @@ def std(self, axis=None, numeric_only=True):
F.stddev, name="std", numeric_only=numeric_only, axis=axis
)

def var(self, axis=None, numeric_only=True):
def var(self, axis=None, numeric_only=True) -> Union[Scalar, "ks.Series"]:
"""
Return unbiased variance.

Expand Down Expand Up @@ -1425,7 +1462,7 @@ def size(self) -> int:
else:
return len(self) * num_columns # type: ignore

def abs(self):
def abs(self) -> Union["ks.DataFrame", "ks.Series"]:
"""
Return a Series/DataFrame with absolute numeric value of each element.

Expand Down Expand Up @@ -1600,13 +1637,17 @@ def groupby(
"Constructor expects DataFrame or Series; however, " "got [%s]" % (self,)
)

def bool(self):
def bool(self) -> bool:
"""
Return the bool of a single element in the current object.

This must be a boolean scalar value, either True or False. Raise a ValueError if
the object does not have exactly 1 element, or that element is not boolean

Returns
-------
bool

Examples
--------
>>> ks.DataFrame({'a': [True]}).bool()
Expand Down Expand Up @@ -1838,7 +1879,7 @@ def last_valid_index(self):

return last_valid_idx

def median(self, axis=None, numeric_only=True, accuracy=10000):
def median(self, axis=None, numeric_only=True, accuracy=10000) -> Union[Scalar, "ks.Series"]:
"""
Return the median of the values for the requested axis.

Expand Down Expand Up @@ -1934,7 +1975,7 @@ def median(self, axis=None, numeric_only=True, accuracy=10000):
)

# TODO: 'center', 'win_type', 'on', 'axis' parameter should be implemented.
def rolling(self, window, min_periods=None):
def rolling(self, window, min_periods=None) -> Rolling:
"""
Provide rolling transformations.

Expand Down Expand Up @@ -1963,7 +2004,7 @@ def rolling(self, window, min_periods=None):

# TODO: 'center' and 'axis' parameter should be implemented.
# 'axis' implementation, refer https://github.com/databricks/koalas/pull/607
def expanding(self, min_periods=1):
def expanding(self, min_periods=1) -> Expanding:
"""
Provide expanding transformations.

Expand Down Expand Up @@ -2034,7 +2075,7 @@ def get(self, key, default=None):
except (KeyError, ValueError, IndexError):
return default

def squeeze(self, axis=None):
def squeeze(self, axis=None) -> Union[Scalar, "ks.DataFrame", "ks.Series"]:
"""
Squeeze 1 dimensional axis objects into scalars.

Expand Down Expand Up @@ -2158,15 +2199,17 @@ def squeeze(self, axis=None):
return self
else:
return series_from_column
elif isinstance(self, ks.Series):
else:
# The case of Series is simple.
# If Series has only a single value, just return it as a scalar.
# Otherwise, there is no change.
self_top_two = self.head(2)
has_single_value = len(self_top_two) == 1
return self_top_two[0] if has_single_value else self
return cast(Union[Scalar, ks.Series], self_top_two[0] if has_single_value else self)

def truncate(self, before=None, after=None, axis=None, copy=True):
def truncate(
self, before=None, after=None, axis=None, copy=True
) -> Union["ks.DataFrame", "ks.Series"]:
"""
Truncate a Series or DataFrame before and after some index value.

Expand Down Expand Up @@ -2285,7 +2328,7 @@ def truncate(self, before=None, after=None, axis=None, copy=True):
if not indexes_increasing and not indexes.is_monotonic_decreasing:
raise ValueError("truncate requires a sorted index")
if (before is None) and (after is None):
return self.copy() if copy else self
return cast(Union[ks.DataFrame, ks.Series], self.copy() if copy else self)
if (before is not None and after is not None) and before > after:
raise ValueError("Truncate: %s must be after %s" % (after, before))

Expand All @@ -2303,9 +2346,9 @@ def truncate(self, before=None, after=None, axis=None, copy=True):
elif axis == 1:
result = self.loc[:, before:after]

return result.copy() if copy else result
return cast(Union[ks.DataFrame, ks.Series], result.copy() if copy else result)

def to_markdown(self, buf=None, mode=None):
def to_markdown(self, buf=None, mode=None) -> str:
"""
Print Series or DataFrame in Markdown-friendly format.

Expand Down Expand Up @@ -2366,7 +2409,7 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, limit=None):
pass

# TODO: add 'downcast' when value parameter exists
def bfill(self, axis=None, inplace=False, limit=None):
def bfill(self, axis=None, inplace=False, limit=None) -> Union["ks.DataFrame", "ks.Series"]:
"""
Synonym for `DataFrame.fillna()` or `Series.fillna()` with ``method=`bfill```.

Expand Down Expand Up @@ -2440,7 +2483,7 @@ def bfill(self, axis=None, inplace=False, limit=None):
backfill = bfill

# TODO: add 'downcast' when value parameter exists
def ffill(self, axis=None, inplace=False, limit=None):
def ffill(self, axis=None, inplace=False, limit=None) -> Union["ks.DataFrame", "ks.Series"]:
"""
Synonym for `DataFrame.fillna()` or `Series.fillna()` with ``method=`ffill```.

Expand Down