From b84e2c908e88fbdcf963cdae946986e467e76818 Mon Sep 17 00:00:00 2001 From: itholic Date: Tue, 18 Feb 2020 11:03:53 +0900 Subject: [PATCH 1/9] Implement Series.combine_first --- databricks/koalas/missing/series.py | 1 - databricks/koalas/series.py | 54 +++++++++++++++++++ .../koalas/tests/test_ops_on_diff_frames.py | 38 +++++++++++++ docs/source/reference/series.rst | 1 + 4 files changed, 93 insertions(+), 1 deletion(-) diff --git a/databricks/koalas/missing/series.py b/databricks/koalas/missing/series.py index 677a66aac8..c6bdcef10b 100644 --- a/databricks/koalas/missing/series.py +++ b/databricks/koalas/missing/series.py @@ -56,7 +56,6 @@ class _MissingPandasLikeSeries(object): between_time = unsupported_function('between_time') bfill = unsupported_function('bfill') combine = unsupported_function('combine') - combine_first = unsupported_function('combine_first') cov = unsupported_function('cov') divmod = unsupported_function('divmod') dot = unsupported_function('dot') diff --git a/databricks/koalas/series.py b/databricks/koalas/series.py index b4772239ac..512d192a6e 100644 --- a/databricks/koalas/series.py +++ b/databricks/koalas/series.py @@ -4139,6 +4139,60 @@ def pct_change(self, periods=1): return self._with_new_scol((scol - prev_row) / prev_row) + def combine_first(self, other): + """ + Combine Series values, choosing the calling Series's values first. + + .. note:: This API internally performs a join operation which can be pretty expensive + in general. if you want to use though, set `compute.ops_on_diff_frames` to True. + + Parameters + ---------- + other : Series + The value(s) to be combined with the `Series`. + + Returns + ------- + Series + The result of combining the Series with the other object. + + See Also + -------- + Series.combine : Perform elementwise operation on two Series + using a given function. + + Notes + ----- + Result index will be the union of the two indexes. + + Examples + -------- + >>> from databricks.koalas.config import set_option, reset_option + >>> set_option("compute.ops_on_diff_frames", True) + >>> s1 = ks.Series([1, np.nan]) + >>> s2 = ks.Series([3, 4]) + >>> s1.combine_first(s2) + 0 1.0 + 1 4.0 + Name: 0, dtype: float64 + + >>> reset_option("compute.ops_on_diff_frames") + """ + if not isinstance(self, ks.Series): + raise ValueError("`combine_first` only allows `Series` for parameter `other`") + this = '__this_0' + that = '__that_0' + combined = combine_frames(self.to_frame(), other) + index_scols = combined._internal.index_scols + sdf = combined._sdf + # If `self` has missing value, use value of `other` + cond = F.when(sdf[this].isNull(), sdf[that]).otherwise(sdf[this]) + sdf = sdf.select(*index_scols, cond.alias(self.name)) + internal = _InternalFrame( + sdf=sdf, + index_map=self._internal.index_map) + return _col(ks.DataFrame(internal)) + def _cum(self, func, skipna, part_cols=()): # This is used to cummin, cummax, cumsum, etc. diff --git a/databricks/koalas/tests/test_ops_on_diff_frames.py b/databricks/koalas/tests/test_ops_on_diff_frames.py index a3eb8a773b..38be269f6b 100644 --- a/databricks/koalas/tests/test_ops_on_diff_frames.py +++ b/databricks/koalas/tests/test_ops_on_diff_frames.py @@ -637,6 +637,33 @@ def test_multi_index_column_assignment_frame(self): with self.assertRaisesRegex(KeyError, 'Key length \\(3\\) exceeds index depth \\(2\\)'): kdf[('1', '2', '3')] = ks.Series([100, 200, 300, 200]) + def test_combine_first(self): + # Series.combine_first + kser1 = ks.Series({'falcon': 330.0, 'eagle': 160.0}) + kser2 = ks.Series({'falcon': 345.0, 'eagle': 200.0, 'duck': 30.0}) + pser1 = kser1.to_pandas() + pser2 = kser2.to_pandas() + + self.assert_eq(repr(kser1.combine_first(kser2).sort_index()), + repr(pser1.combine_first(pser2).sort_index())) + + # MultiIndex + midx1 = pd.MultiIndex([['lama', 'cow', 'falcon', 'koala'], + ['speed', 'weight', 'length', 'power']], + [[0, 3, 1, 1, 1, 2, 2, 2], + [0, 2, 0, 3, 2, 0, 1, 3]]) + midx2 = pd.MultiIndex([['lama', 'cow', 'falcon'], + ['speed', 'weight', 'length']], + [[0, 0, 0, 1, 1, 1, 2, 2, 2], + [0, 1, 2, 0, 1, 2, 0, 1, 2]]) + kser1 = ks.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1], index=midx1) + kser2 = ks.Series([-45, 200, -1.2, 30, -250, 1.5, 320, 1, -0.3], index=midx2) + pser1 = kser1.to_pandas() + pser2 = kser2.to_pandas() + + self.assert_eq(repr(kser1.combine_first(kser2).sort_index()), + repr(pser1.combine_first(pser2).sort_index())) + class OpsOnDiffFramesDisabledTest(ReusedSQLTestCase, SQLTestUtils): @@ -738,3 +765,14 @@ def test_mask(self): with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"): self.assert_eq(repr(pdf1.mask(pdf2 > -250)), repr(kdf1.mask(kdf2 > -250).sort_index())) + + def test_combine_first(self): + # Series.combine_first + kser1 = ks.Series({'falcon': 330.0, 'eagle': 160.0}) + kser2 = ks.Series({'falcon': 345.0, 'eagle': 200.0, 'duck': 30.0}) + pser1 = kser1.to_pandas() + pser2 = kser2.to_pandas() + + with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"): + self.assert_eq(repr(pser1.combine_first(pser2)), + repr(kser1.combine_first(kser2).sort_index())) diff --git a/docs/source/reference/series.rst b/docs/source/reference/series.rst index bab3058a2c..e1a8687b82 100644 --- a/docs/source/reference/series.rst +++ b/docs/source/reference/series.rst @@ -78,6 +78,7 @@ Binary operator functions Series.rmod Series.floordiv Series.rfloordiv + Series.combine_first Series.lt Series.gt Series.le From 06ed257a29e0ab7b55b676e58342f60663119c10 Mon Sep 17 00:00:00 2001 From: itholic Date: Tue, 18 Feb 2020 11:07:49 +0900 Subject: [PATCH 2/9] Add failure test --- databricks/koalas/series.py | 2 +- databricks/koalas/tests/test_ops_on_diff_frames.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/databricks/koalas/series.py b/databricks/koalas/series.py index 512d192a6e..67b5c9b950 100644 --- a/databricks/koalas/series.py +++ b/databricks/koalas/series.py @@ -4178,7 +4178,7 @@ def combine_first(self, other): >>> reset_option("compute.ops_on_diff_frames") """ - if not isinstance(self, ks.Series): + if not isinstance(other, ks.Series): raise ValueError("`combine_first` only allows `Series` for parameter `other`") this = '__this_0' that = '__that_0' diff --git a/databricks/koalas/tests/test_ops_on_diff_frames.py b/databricks/koalas/tests/test_ops_on_diff_frames.py index 38be269f6b..561784bd97 100644 --- a/databricks/koalas/tests/test_ops_on_diff_frames.py +++ b/databricks/koalas/tests/test_ops_on_diff_frames.py @@ -646,6 +646,9 @@ def test_combine_first(self): self.assert_eq(repr(kser1.combine_first(kser2).sort_index()), repr(pser1.combine_first(pser2).sort_index())) + with self.assertRaisesRegex(ValueError, + "`combine_first` only allows `Series` for parameter `other`"): + kser1.combine_first(50) # MultiIndex midx1 = pd.MultiIndex([['lama', 'cow', 'falcon', 'koala'], From 910a8d288397abdeb54694609e1c9f7b9388363e Mon Sep 17 00:00:00 2001 From: itholic Date: Sun, 1 Mar 2020 16:28:49 +0900 Subject: [PATCH 3/9] Applying Black --- databricks/koalas/series.py | 8 ++- .../koalas/tests/test_ops_on_diff_frames.py | 49 ++++++++++--------- 2 files changed, 30 insertions(+), 27 deletions(-) diff --git a/databricks/koalas/series.py b/databricks/koalas/series.py index f3828fcfdd..550457aec8 100644 --- a/databricks/koalas/series.py +++ b/databricks/koalas/series.py @@ -4365,17 +4365,15 @@ def combine_first(self, other): """ if not isinstance(other, ks.Series): raise ValueError("`combine_first` only allows `Series` for parameter `other`") - this = '__this_0' - that = '__that_0' + this = "__this_0" + that = "__that_0" combined = combine_frames(self.to_frame(), other) index_scols = combined._internal.index_scols sdf = combined._sdf # If `self` has missing value, use value of `other` cond = F.when(sdf[this].isNull(), sdf[that]).otherwise(sdf[this]) sdf = sdf.select(*index_scols, cond.alias(self.name)) - internal = _InternalFrame( - sdf=sdf, - index_map=self._internal.index_map) + internal = _InternalFrame(sdf=sdf, index_map=self._internal.index_map) return _col(ks.DataFrame(internal)) def _cum(self, func, skipna, part_cols=()): diff --git a/databricks/koalas/tests/test_ops_on_diff_frames.py b/databricks/koalas/tests/test_ops_on_diff_frames.py index 0a57ac5cfd..ce348ad0d2 100644 --- a/databricks/koalas/tests/test_ops_on_diff_frames.py +++ b/databricks/koalas/tests/test_ops_on_diff_frames.py @@ -649,33 +649,38 @@ def test_multi_index_column_assignment_frame(self): def test_combine_first(self): # Series.combine_first - kser1 = ks.Series({'falcon': 330.0, 'eagle': 160.0}) - kser2 = ks.Series({'falcon': 345.0, 'eagle': 200.0, 'duck': 30.0}) + kser1 = ks.Series({"falcon": 330.0, "eagle": 160.0}) + kser2 = ks.Series({"falcon": 345.0, "eagle": 200.0, "duck": 30.0}) pser1 = kser1.to_pandas() pser2 = kser2.to_pandas() - self.assert_eq(repr(kser1.combine_first(kser2).sort_index()), - repr(pser1.combine_first(pser2).sort_index())) - with self.assertRaisesRegex(ValueError, - "`combine_first` only allows `Series` for parameter `other`"): + self.assert_eq( + repr(kser1.combine_first(kser2).sort_index()), + repr(pser1.combine_first(pser2).sort_index()), + ) + with self.assertRaisesRegex( + ValueError, "`combine_first` only allows `Series` for parameter `other`" + ): kser1.combine_first(50) # MultiIndex - midx1 = pd.MultiIndex([['lama', 'cow', 'falcon', 'koala'], - ['speed', 'weight', 'length', 'power']], - [[0, 3, 1, 1, 1, 2, 2, 2], - [0, 2, 0, 3, 2, 0, 1, 3]]) - midx2 = pd.MultiIndex([['lama', 'cow', 'falcon'], - ['speed', 'weight', 'length']], - [[0, 0, 0, 1, 1, 1, 2, 2, 2], - [0, 1, 2, 0, 1, 2, 0, 1, 2]]) + midx1 = pd.MultiIndex( + [["lama", "cow", "falcon", "koala"], ["speed", "weight", "length", "power"]], + [[0, 3, 1, 1, 1, 2, 2, 2], [0, 2, 0, 3, 2, 0, 1, 3]], + ) + midx2 = pd.MultiIndex( + [["lama", "cow", "falcon"], ["speed", "weight", "length"]], + [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]], + ) kser1 = ks.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1], index=midx1) kser2 = ks.Series([-45, 200, -1.2, 30, -250, 1.5, 320, 1, -0.3], index=midx2) pser1 = kser1.to_pandas() pser2 = kser2.to_pandas() - self.assert_eq(repr(kser1.combine_first(kser2).sort_index()), - repr(pser1.combine_first(pser2).sort_index())) + self.assert_eq( + repr(kser1.combine_first(kser2).sort_index()), + repr(pser1.combine_first(pser2).sort_index()), + ) class OpsOnDiffFramesDisabledTest(ReusedSQLTestCase, SQLTestUtils): @@ -775,16 +780,16 @@ def test_mask(self): kdf2 = ks.from_pandas(pdf2) with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"): - self.assert_eq(repr(pdf1.mask(pdf2 > -250)), - repr(kdf1.mask(kdf2 > -250).sort_index())) + self.assert_eq(repr(pdf1.mask(pdf2 > -250)), repr(kdf1.mask(kdf2 > -250).sort_index())) def test_combine_first(self): # Series.combine_first - kser1 = ks.Series({'falcon': 330.0, 'eagle': 160.0}) - kser2 = ks.Series({'falcon': 345.0, 'eagle': 200.0, 'duck': 30.0}) + kser1 = ks.Series({"falcon": 330.0, "eagle": 160.0}) + kser2 = ks.Series({"falcon": 345.0, "eagle": 200.0, "duck": 30.0}) pser1 = kser1.to_pandas() pser2 = kser2.to_pandas() with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"): - self.assert_eq(repr(pser1.combine_first(pser2)), - repr(kser1.combine_first(kser2).sort_index())) + self.assert_eq( + repr(pser1.combine_first(pser2)), repr(kser1.combine_first(kser2).sort_index()) + ) From baa87e8d37dba85686c56d1828c9cceac3dce934 Mon Sep 17 00:00:00 2001 From: itholic Date: Thu, 5 Mar 2020 21:44:22 +0900 Subject: [PATCH 4/9] Adding case when Series come from same DataFrame --- databricks/koalas/series.py | 21 ++++++++++----------- databricks/koalas/tests/test_series.py | 17 +++++++++++++++++ 2 files changed, 27 insertions(+), 11 deletions(-) diff --git a/databricks/koalas/series.py b/databricks/koalas/series.py index 550457aec8..911158eaaf 100644 --- a/databricks/koalas/series.py +++ b/databricks/koalas/series.py @@ -35,7 +35,7 @@ from pyspark.sql.window import Window from databricks import koalas as ks # For running doctests and reference resolution in PyCharm. -from databricks.koalas.config import get_option +from databricks.koalas.config import get_option, option_context from databricks.koalas.base import IndexOpsMixin from databricks.koalas.exceptions import SparkPandasIndexingError from databricks.koalas.frame import DataFrame @@ -4328,9 +4328,6 @@ def combine_first(self, other): """ Combine Series values, choosing the calling Series's values first. - .. note:: This API internally performs a join operation which can be pretty expensive - in general. if you want to use though, set `compute.ops_on_diff_frames` to True. - Parameters ---------- other : Series @@ -4352,22 +4349,24 @@ def combine_first(self, other): Examples -------- - >>> from databricks.koalas.config import set_option, reset_option - >>> set_option("compute.ops_on_diff_frames", True) >>> s1 = ks.Series([1, np.nan]) >>> s2 = ks.Series([3, 4]) >>> s1.combine_first(s2) 0 1.0 1 4.0 Name: 0, dtype: float64 - - >>> reset_option("compute.ops_on_diff_frames") """ if not isinstance(other, ks.Series): raise ValueError("`combine_first` only allows `Series` for parameter `other`") - this = "__this_0" - that = "__that_0" - combined = combine_frames(self.to_frame(), other) + if self._kdf is other._kdf: + this = self.name + that = other.name + combined = self._kdf + else: + this = "__this_{}".format(self.name) + that = "__that_{}".format(other.name) + with option_context("compute.ops_on_diff_frames", True): + combined = combine_frames(self.to_frame(), other) index_scols = combined._internal.index_scols sdf = combined._sdf # If `self` has missing value, use value of `other` diff --git a/databricks/koalas/tests/test_series.py b/databricks/koalas/tests/test_series.py index b41550c027..890fef40c5 100644 --- a/databricks/koalas/tests/test_series.py +++ b/databricks/koalas/tests/test_series.py @@ -1283,3 +1283,20 @@ def test_axes(self): kser = ks.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx) pser = kser.to_pandas() self.assert_list_eq(kser.axes, pser.axes) + + def test_combine_first(self): + kdf = ks.DataFrame( + { + "A": pd.Series({"falcon": 330.0, "eagle": 160.0}), + "B": pd.Series({"falcon": 345.0, "eagle": 200.0, "duck": 30.0}), + } + ) + kser1 = kdf.A + kser2 = kdf.B + pser1 = kser1.to_pandas() + pser2 = kser2.to_pandas() + + self.assert_eq( + repr(kser1.combine_first(kser2).sort_index()), + repr(pser1.combine_first(pser2).sort_index()), + ) From 53cb1686998be79c4f06c3c00fc554da22358d06 Mon Sep 17 00:00:00 2001 From: itholic Date: Fri, 6 Mar 2020 02:48:57 +0900 Subject: [PATCH 5/9] Move test from ops_on_diff to series --- .../koalas/tests/test_ops_on_diff_frames.py | 47 ------------------- databricks/koalas/tests/test_series.py | 38 ++++++++++++++- 2 files changed, 36 insertions(+), 49 deletions(-) diff --git a/databricks/koalas/tests/test_ops_on_diff_frames.py b/databricks/koalas/tests/test_ops_on_diff_frames.py index e11ad30543..6537246f6c 100644 --- a/databricks/koalas/tests/test_ops_on_diff_frames.py +++ b/databricks/koalas/tests/test_ops_on_diff_frames.py @@ -647,41 +647,6 @@ def test_multi_index_column_assignment_frame(self): with self.assertRaisesRegex(KeyError, "Key length \\(3\\) exceeds index depth \\(2\\)"): kdf[("1", "2", "3")] = ks.Series([100, 200, 300, 200]) - def test_combine_first(self): - # Series.combine_first - kser1 = ks.Series({"falcon": 330.0, "eagle": 160.0}) - kser2 = ks.Series({"falcon": 345.0, "eagle": 200.0, "duck": 30.0}) - pser1 = kser1.to_pandas() - pser2 = kser2.to_pandas() - - self.assert_eq( - repr(kser1.combine_first(kser2).sort_index()), - repr(pser1.combine_first(pser2).sort_index()), - ) - with self.assertRaisesRegex( - ValueError, "`combine_first` only allows `Series` for parameter `other`" - ): - kser1.combine_first(50) - - # MultiIndex - midx1 = pd.MultiIndex( - [["lama", "cow", "falcon", "koala"], ["speed", "weight", "length", "power"]], - [[0, 3, 1, 1, 1, 2, 2, 2], [0, 2, 0, 3, 2, 0, 1, 3]], - ) - midx2 = pd.MultiIndex( - [["lama", "cow", "falcon"], ["speed", "weight", "length"]], - [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]], - ) - kser1 = ks.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1], index=midx1) - kser2 = ks.Series([-45, 200, -1.2, 30, -250, 1.5, 320, 1, -0.3], index=midx2) - pser1 = kser1.to_pandas() - pser2 = kser2.to_pandas() - - self.assert_eq( - repr(kser1.combine_first(kser2).sort_index()), - repr(pser1.combine_first(pser2).sort_index()), - ) - def test_to_series_comparison(self): kidx1 = ks.Index([1, 2, 3, 4, 5]) kidx2 = ks.Index([1, 2, 3, 4, 5]) @@ -792,15 +757,3 @@ def test_mask(self): with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"): self.assert_eq(repr(pdf1.mask(pdf2 > -250)), repr(kdf1.mask(kdf2 > -250).sort_index())) - - def test_combine_first(self): - # Series.combine_first - kser1 = ks.Series({"falcon": 330.0, "eagle": 160.0}) - kser2 = ks.Series({"falcon": 345.0, "eagle": 200.0, "duck": 30.0}) - pser1 = kser1.to_pandas() - pser2 = kser2.to_pandas() - - with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"): - self.assert_eq( - repr(pser1.combine_first(pser2)), repr(kser1.combine_first(kser2).sort_index()) - ) diff --git a/databricks/koalas/tests/test_series.py b/databricks/koalas/tests/test_series.py index 8f4a304416..fc53b9ac18 100644 --- a/databricks/koalas/tests/test_series.py +++ b/databricks/koalas/tests/test_series.py @@ -1279,10 +1279,44 @@ def test_axes(self): self.assert_list_eq(kser.axes, pser.axes) def test_combine_first(self): + kser1 = ks.Series({"falcon": 330.0, "eagle": 160.0}) + kser2 = ks.Series({"falcon": 345.0, "eagle": 200.0, "duck": 30.0}) + pser1 = kser1.to_pandas() + pser2 = kser2.to_pandas() + + self.assert_eq( + repr(kser1.combine_first(kser2).sort_index()), + repr(pser1.combine_first(pser2).sort_index()), + ) + with self.assertRaisesRegex( + ValueError, "`combine_first` only allows `Series` for parameter `other`" + ): + kser1.combine_first(50) + + # MultiIndex + midx1 = pd.MultiIndex( + [["lama", "cow", "falcon", "koala"], ["speed", "weight", "length", "power"]], + [[0, 3, 1, 1, 1, 2, 2, 2], [0, 2, 0, 3, 2, 0, 1, 3]], + ) + midx2 = pd.MultiIndex( + [["lama", "cow", "falcon"], ["speed", "weight", "length"]], + [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]], + ) + kser1 = ks.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1], index=midx1) + kser2 = ks.Series([-45, 200, -1.2, 30, -250, 1.5, 320, 1, -0.3], index=midx2) + pser1 = kser1.to_pandas() + pser2 = kser2.to_pandas() + + self.assert_eq( + repr(kser1.combine_first(kser2).sort_index()), + repr(pser1.combine_first(pser2).sort_index()), + ) + + # Series come from same DataFrame kdf = ks.DataFrame( { - "A": pd.Series({"falcon": 330.0, "eagle": 160.0}), - "B": pd.Series({"falcon": 345.0, "eagle": 200.0, "duck": 30.0}), + "A": {"falcon": 330.0, "eagle": 160.0}, + "B": {"falcon": 345.0, "eagle": 200.0, "duck": 30.0}, } ) kser1 = kdf.A From f134976456c3d5970e043ef97d4b665b836d15d4 Mon Sep 17 00:00:00 2001 From: itholic Date: Fri, 6 Mar 2020 03:48:31 +0900 Subject: [PATCH 6/9] Empty commit for build test From 3551c678fbeaa1aa9854907cab0980c5f9014356 Mon Sep 17 00:00:00 2001 From: itholic Date: Mon, 16 Mar 2020 11:55:39 +0900 Subject: [PATCH 7/9] scols -> spark_columns --- databricks/koalas/series.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/databricks/koalas/series.py b/databricks/koalas/series.py index 4988f974ce..859c79cc08 100644 --- a/databricks/koalas/series.py +++ b/databricks/koalas/series.py @@ -4378,11 +4378,11 @@ def combine_first(self, other): that = "__that_{}".format(other.name) with option_context("compute.ops_on_diff_frames", True): combined = combine_frames(self.to_frame(), other) - index_scols = combined._internal.index_scols + index_spark_columns = combined._internal.index_spark_columns sdf = combined._sdf # If `self` has missing value, use value of `other` cond = F.when(sdf[this].isNull(), sdf[that]).otherwise(sdf[this]) - sdf = sdf.select(*index_scols, cond.alias(self.name)) + sdf = sdf.select(*index_spark_columns, cond.alias(self.name)) internal = _InternalFrame(sdf=sdf, index_map=self._internal.index_map) return _col(ks.DataFrame(internal)) From d156f367f189a59e39ffcb293b696083a1e383fb Mon Sep 17 00:00:00 2001 From: itholic Date: Mon, 16 Mar 2020 13:19:53 +0900 Subject: [PATCH 8/9] Rebase to Master --- databricks/koalas/series.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/databricks/koalas/series.py b/databricks/koalas/series.py index 859c79cc08..93fe8eb5f4 100644 --- a/databricks/koalas/series.py +++ b/databricks/koalas/series.py @@ -4378,12 +4378,12 @@ def combine_first(self, other): that = "__that_{}".format(other.name) with option_context("compute.ops_on_diff_frames", True): combined = combine_frames(self.to_frame(), other) - index_spark_columns = combined._internal.index_spark_columns + index_scols = combined._internal.index_spark_columns sdf = combined._sdf # If `self` has missing value, use value of `other` cond = F.when(sdf[this].isNull(), sdf[that]).otherwise(sdf[this]) - sdf = sdf.select(*index_spark_columns, cond.alias(self.name)) - internal = _InternalFrame(sdf=sdf, index_map=self._internal.index_map) + sdf = sdf.select(*index_scols, cond.alias(self.name)) + internal = _InternalFrame(spark_frame=sdf, index_map=self._internal.index_map) return _col(ks.DataFrame(internal)) def dot(self, other): From c4fb5d0eb920dcf3702dc014423c9f9783696101 Mon Sep 17 00:00:00 2001 From: itholic Date: Mon, 16 Mar 2020 15:28:38 +0900 Subject: [PATCH 9/9] fix all comments --- databricks/koalas/series.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/databricks/koalas/series.py b/databricks/koalas/series.py index 93fe8eb5f4..3ef7219d39 100644 --- a/databricks/koalas/series.py +++ b/databricks/koalas/series.py @@ -4378,12 +4378,20 @@ def combine_first(self, other): that = "__that_{}".format(other.name) with option_context("compute.ops_on_diff_frames", True): combined = combine_frames(self.to_frame(), other) - index_scols = combined._internal.index_spark_columns sdf = combined._sdf # If `self` has missing value, use value of `other` cond = F.when(sdf[this].isNull(), sdf[that]).otherwise(sdf[this]) - sdf = sdf.select(*index_scols, cond.alias(self.name)) - internal = _InternalFrame(spark_frame=sdf, index_map=self._internal.index_map) + # If `self` and `other` come from same frame, the anchor should be kept + if self._kdf is other._kdf: + return self._with_new_scol(cond) + index_scols = combined._internal.index_spark_columns + sdf = sdf.select(*index_scols, cond.alias(self.name)).distinct() + internal = _InternalFrame( + spark_frame=sdf, + index_map=self._internal.index_map, + column_labels=self._internal.column_labels, + column_label_names=self._internal.column_label_names, + ) return _col(ks.DataFrame(internal)) def dot(self, other):