Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Explicitly disallow empty list as index_spark_column_names and index_names. #1895

Merged
merged 1 commit into from
Nov 9, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 4 additions & 15 deletions databricks/koalas/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3495,21 +3495,10 @@ def rename(index):
scol_for(sdf, column).alias(name_like_string(name)) for column, name in new_index_map
]

if len(index_map) > 0: # type: ignore
index_scols = [scol_for(sdf, column) for column in index_map]
sdf = sdf.select(
index_scols
+ new_data_scols
+ self._internal.data_spark_columns
+ list(HIDDEN_COLUMNS)
)
else:
sdf = sdf.select(
new_data_scols + self._internal.data_spark_columns + list(HIDDEN_COLUMNS)
)

sdf = InternalFrame.attach_default_index(sdf)
index_map = OrderedDict({SPARK_DEFAULT_INDEX_NAME: None})
index_scols = [scol_for(sdf, column) for column in index_map]
sdf = sdf.select(
index_scols + new_data_scols + self._internal.data_spark_columns + list(HIDDEN_COLUMNS)
)

if self._internal.column_labels_level > 1:
column_depth = len(self._internal.column_labels[0])
Expand Down
24 changes: 11 additions & 13 deletions databricks/koalas/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@
from databricks.koalas.config import get_option
from databricks.koalas.utils import (
align_diff_frames,
column_labels_level,
is_name_like_tuple,
is_name_like_value,
name_like_string,
Expand Down Expand Up @@ -1318,29 +1317,28 @@ def _make_pandas_df_builder_func(kdf, func, return_schema, retain_index):
index_names = kdf._internal.index_names
data_columns = kdf._internal.data_spark_column_names
column_labels = kdf._internal.column_labels
column_labels_level = kdf._internal.column_labels_level

def rename_output(pdf):
# TODO: This logic below was borrowed from `DataFrame.to_pandas_frame` to set the index
# within each pdf properly. we might have to deduplicate it.
import pandas as pd

if len(index_columns) > 0:
append = False
for index_field in index_columns:
drop = index_field not in data_columns
pdf = pdf.set_index(index_field, drop=drop, append=append)
append = True
pdf = pdf[data_columns]
append = False
for index_field in index_columns:
drop = index_field not in data_columns
pdf = pdf.set_index(index_field, drop=drop, append=append)
append = True
pdf = pdf[data_columns]

if column_labels_level(column_labels) > 1:
if column_labels_level > 1:
pdf.columns = pd.MultiIndex.from_tuples(column_labels)
else:
pdf.columns = [None if label is None else label[0] for label in column_labels]

if len(index_names) > 0:
pdf.index.names = [
name if name is None or len(name) > 1 else name[0] for name in index_names
]
pdf.index.names = [
name if name is None or len(name) > 1 else name[0] for name in index_names
]

pdf = func(pdf)

Expand Down
13 changes: 6 additions & 7 deletions databricks/koalas/internal.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,7 +445,7 @@ def __init__(
assert isinstance(spark_frame, spark.DataFrame)
assert not spark_frame.isStreaming, "Koalas does not support Structured Streaming."

if index_spark_column_names is None:
if not index_spark_column_names:
assert not any(SPARK_INDEX_NAME_PATTERN.match(name) for name in spark_frame.columns), (
"Index columns should not appear in columns of the Spark DataFrame. Avoid "
"index column names [%s]." % SPARK_INDEX_NAME_PATTERN
Expand All @@ -470,7 +470,7 @@ def __init__(
NATURAL_ORDER_COLUMN_NAME, F.monotonically_increasing_id()
)

if index_names is None:
if not index_names:
index_names = [None] * len(index_spark_column_names)

assert len(index_spark_column_names) == len(index_names), (
Expand Down Expand Up @@ -857,11 +857,10 @@ def to_pandas_frame(self) -> pd.DataFrame:
name=names[0],
)

index_names = self.index_names
if len(index_names) > 0:
pdf.index.names = [
name if name is None or len(name) > 1 else name[0] for name in index_names
]
pdf.index.names = [
name if name is None or len(name) > 1 else name[0] for name in self.index_names
]

return pdf

@lazy_property
Expand Down
1 change: 1 addition & 0 deletions databricks/koalas/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,7 @@ def test_reset_index(self):
kdf = ks.from_pandas(pdf)

self.assert_eq(kdf.reset_index(), pdf.reset_index())
self.assert_eq(kdf.reset_index().index, pdf.reset_index().index)
self.assert_eq(kdf.reset_index(drop=True), pdf.reset_index(drop=True))

pdf.index.name = "a"
Expand Down