diff --git a/databricks/koalas/indexing.py b/databricks/koalas/indexing.py index c25120b985..5b079bd9ab 100644 --- a/databricks/koalas/indexing.py +++ b/databricks/koalas/indexing.py @@ -22,7 +22,7 @@ from pandas.api.types import is_list_like from pyspark import sql as spark from pyspark.sql import functions as F -from pyspark.sql.types import BooleanType +from pyspark.sql.types import BooleanType, StringType, LongType from pyspark.sql.utils import AnalysisException from databricks.koalas.internal import _InternalFrame, HIDDEN_COLUMNS, NATURAL_ORDER_COLUMN_NAME @@ -233,7 +233,6 @@ class LocIndexer(_LocIndexerLike): .. note:: Note that contrary to usual python slices, **both** the start and the stop are included, and the step of the slice is not allowed. - In addition, with a slice, Koalas works as a filter between the range. .. note:: With a list or array of labels for row selection, Koalas behaves as a filter without reordering by the labels. @@ -290,13 +289,9 @@ class LocIndexer(_LocIndexerLike): Slice with labels for row and single label for column. As mentioned above, note that both the start and stop of the slice are included. - Also note that the row for 'sidewinder' is included since 'sidewinder' - is between 'cobra' and 'viper'. - >>> df.loc['cobra':'viper', 'max_speed'] - cobra 1 - viper 4 - sidewinder 7 + cobra 1 + viper 4 Name: max_speed, dtype: int64 Conditional that returns a boolean Series @@ -400,16 +395,58 @@ def _select_rows(self, rows_sel): # If slice is None - select everything, so nothing to do return None, None elif len(self._internal.index_columns) == 1: + sdf = self._kdf_or_kser._internal.sdf + index = self._kdf_or_kser.index + index_column = index.to_series() + index_data_type = index_column.spark_type start = rows_sel.start stop = rows_sel.stop + start_order_column = sdf[NATURAL_ORDER_COLUMN_NAME] + stop_order_column = sdf[NATURAL_ORDER_COLUMN_NAME] + + # get natural order from '__natural_order__' from start to stop + # to keep natural order. + start_and_stop = ( + sdf.select(index_column._scol, NATURAL_ORDER_COLUMN_NAME) + .where((index_column._scol == start) | (index_column._scol == stop)) + .collect()) + + start = [row[1] for row in start_and_stop if row[0] == start] + start = start[0] if len(start) > 0 else None + + stop = [row[1] for row in start_and_stop if row[0] == stop] + stop = stop[-1] if len(stop) > 0 else None + + # Assume we use the natural order by default. + start_order_column_type = LongType() + stop_order_column_type = LongType() + + # if index order is not monotonic increasing or decreasing + # and specified values don't exist in index, raise KeyError + if start is None and rows_sel.start is not None: + if not (index.is_monotonic_increasing or index.is_monotonic_decreasing): + raise KeyError(rows_sel.start) + else: + start = rows_sel.start + start_order_column = index_column._scol + start_order_column_type = index_data_type + if stop is None and rows_sel.stop is not None: + if not (index.is_monotonic_increasing or index.is_monotonic_decreasing): + raise KeyError(rows_sel.stop) + else: + stop = rows_sel.stop + stop_order_column = index_column._scol + stop_order_column_type = index_data_type + + # if start and stop are same, just get all start(or stop) values + if start == stop: + return index_column._scol == F.lit(rows_sel.start).cast(index_data_type), None - index_column = self._kdf_or_kser.index.to_series() - index_data_type = index_column.spark_type cond = [] if start is not None: - cond.append(index_column._scol >= F.lit(start).cast(index_data_type)) + cond.append(start_order_column >= F.lit(start).cast(start_order_column_type)) if stop is not None: - cond.append(index_column._scol <= F.lit(stop).cast(index_data_type)) + cond.append(stop_order_column <= F.lit(stop).cast(stop_order_column_type)) if len(cond) > 0: return reduce(lambda x, y: x & y, cond), None diff --git a/databricks/koalas/series.py b/databricks/koalas/series.py index 1453b72db5..52b3836237 100644 --- a/databricks/koalas/series.py +++ b/databricks/koalas/series.py @@ -3441,10 +3441,7 @@ def truncate(self, before=None, after=None, copy=True): if before > after: raise ValueError("Truncate: %s must be after %s" % (after, before)) - if indexes_increasing: - result = _col(self.to_frame()[before:after]) - else: - result = _col(self.to_frame()[after:before]) + result = _col(self.to_frame()[before:after]) return result.copy() if copy else result diff --git a/databricks/koalas/tests/test_indexing.py b/databricks/koalas/tests/test_indexing.py index c2ed505699..85f5cd2196 100644 --- a/databricks/koalas/tests/test_indexing.py +++ b/databricks/koalas/tests/test_indexing.py @@ -223,6 +223,34 @@ def test_loc(self): self.assert_eq(kdf.loc[1000:], pdf.loc[1000:]) self.assert_eq(kdf.loc[-2000:-1000], pdf.loc[-2000:-1000]) + # duplicated index test + pdf = pd.DataFrame( + [1, 2, 3, 4, 5, 6, 7, 8, 9], + index=[0, 1, 1, 2, 2, 2, 3, 4, 5]) + kdf = ks.from_pandas(pdf) + + self.assert_eq(repr(kdf.loc[:2]), repr(pdf.loc[:2])) + + # test when type of key is string and given value is not included in key + pdf = pd.DataFrame([1, 2, 3], index=['a', 'b', 'd']).loc['a':'z'] + kdf = ks.from_pandas(pdf) + + self.assert_eq(repr(kdf.loc['a':'z']), repr(pdf.loc['a':'z'])) + + # KeyError when index is not monotonic increasing or decreasing + # and specified values don't exist in index + kdf = ks.DataFrame([[1, 2], [4, 5], [7, 8]], + index=['cobra', 'viper', 'sidewinder']) + + self.assertRaises(KeyError, lambda: kdf.loc['cobra':'koalas']) + self.assertRaises(KeyError, lambda: kdf.loc['koalas':'viper']) + + kdf = ks.DataFrame([[1, 2], [4, 5], [7, 8]], + index=[10, 30, 20]) + + self.assertRaises(KeyError, lambda: kdf.loc[0:30]) + self.assertRaises(KeyError, lambda: kdf.loc[10:100]) + def test_loc_non_informative_index(self): pdf = pd.DataFrame({'x': [1, 2, 3, 4]}, index=[10, 20, 30, 40]) kdf = ks.from_pandas(pdf)