Add an `Index.is_unique` property to Koalas.

* databricks/koalas/indexes.py: rewrite the `has_duplicates` doctests to build
  `ks.Index` objects directly and add a fourth example; add the `is_unique`
  property, which returns `not self.has_duplicates`.
* databricks/koalas/tests/test_indexes.py: add `test_index_is_unique` and
  `test_multiindex_is_unique`, which compare `kdf.index.is_unique` against
  expected booleans for frames built with `ks.from_pandas`.
* docs/source/reference/indexing.rst: list `Index.is_unique` under Properties.

NOTE(review): line breaks and indentation were reconstructed from a
whitespace-collapsed copy of this patch. The blank context line before
`return sdf.select(...)` in the first hunk is inferred from that hunk's 23/56
line counts; confirm against the base file before applying.

diff --git a/databricks/koalas/indexes.py b/databricks/koalas/indexes.py
index 644fca7f76..dc832fe066 100644
--- a/databricks/koalas/indexes.py
+++ b/databricks/koalas/indexes.py
@@ -456,23 +456,56 @@ def has_duplicates(self) -> bool:
 
         Examples
         --------
-        >>> kdf = ks.DataFrame({'a': [1, 2, 3]}, index=list('aac'))
-        >>> kdf.index.has_duplicates
+        >>> idx = ks.Index([1, 5, 7, 7])
+        >>> idx.has_duplicates
         True
 
-        >>> kdf = ks.DataFrame({'a': [1, 2, 3]}, index=[list('abc'), list('def')])
-        >>> kdf.index.has_duplicates
+        >>> idx = ks.Index([1, 5, 7])
+        >>> idx.has_duplicates
         False
 
-        >>> kdf = ks.DataFrame({'a': [1, 2, 3]}, index=[list('aac'), list('eef')])
-        >>> kdf.index.has_duplicates
+        >>> idx = ks.Index(["Watermelon", "Orange", "Apple",
+        ...                 "Watermelon"])
+        >>> idx.has_duplicates
         True
+
+        >>> idx = ks.Index(["Orange", "Apple",
+        ...                 "Watermelon"])
+        >>> idx.has_duplicates
+        False
         """
         sdf = self._internal.spark_frame.select(self.spark.column)
         scol = scol_for(sdf, sdf.columns[0])
 
         return sdf.select(F.count(scol) != F.countDistinct(scol)).first()[0]
 
+    @property
+    def is_unique(self) -> bool:
+        """
+        Return if the index has unique values.
+
+        Examples
+        --------
+        >>> idx = ks.Index([1, 5, 7, 7])
+        >>> idx.is_unique
+        False
+
+        >>> idx = ks.Index([1, 5, 7])
+        >>> idx.is_unique
+        True
+
+        >>> idx = ks.Index(["Watermelon", "Orange", "Apple",
+        ...                 "Watermelon"])
+        >>> idx.is_unique
+        False
+
+        >>> idx = ks.Index(["Orange", "Apple",
+        ...                 "Watermelon"])
+        >>> idx.is_unique
+        True
+        """
+        return not self.has_duplicates
+
     @property
     def name(self) -> Union[str, Tuple[str, ...]]:
         """Return name of the Index."""
diff --git a/databricks/koalas/tests/test_indexes.py b/databricks/koalas/tests/test_indexes.py
index a1b75caafe..b8339b9534 100644
--- a/databricks/koalas/tests/test_indexes.py
+++ b/databricks/koalas/tests/test_indexes.py
@@ -1445,3 +1445,29 @@ def test_inferred_type(self):
         pmidx = pd.MultiIndex.from_tuples([("a", "x")])
         kmidx = ks.from_pandas(pmidx)
         self.assert_eq(pmidx.inferred_type, kmidx.inferred_type)
+
+    def test_index_is_unique(self):
+        indexes = [("a", "b", "c"), ("a", "a", "c"), (1, 3, 3), (1, 2, 3)]
+        names = [None, "ks", "ks", None]
+        is_uniq = [True, False, False, True]
+
+        for idx, name, expected in zip(indexes, names, is_uniq):
+            pdf = pd.DataFrame({"a": [1, 2, 3]}, index=pd.Index(idx, name=name))
+            kdf = ks.from_pandas(pdf)
+
+            self.assertEqual(kdf.index.is_unique, expected)
+
+    def test_multiindex_is_unique(self):
+        indexes = [
+            [list("abc"), list("edf")],
+            [list("aac"), list("edf")],
+            [list("aac"), list("eef")],
+            [[1, 4, 4], [4, 6, 6]],
+        ]
+        is_uniq = [True, True, False, False]
+
+        for idx, expected in zip(indexes, is_uniq):
+            pdf = pd.DataFrame({"a": [1, 2, 3]}, index=idx)
+            kdf = ks.from_pandas(pdf)
+
+            self.assertEqual(kdf.index.is_unique, expected)
diff --git a/docs/source/reference/indexing.rst b/docs/source/reference/indexing.rst
index 2c122184db..e51547ca6b 100644
--- a/docs/source/reference/indexing.rst
+++ b/docs/source/reference/indexing.rst
@@ -21,6 +21,7 @@ Properties
    Index.is_monotonic
    Index.is_monotonic_increasing
    Index.is_monotonic_decreasing
+   Index.is_unique
    Index.has_duplicates
    Index.hasnans
    Index.dtype