From f09e94305812ede7953d66363dfe5a76bda953ea Mon Sep 17 00:00:00 2001 From: thoo Date: Sun, 22 Sep 2019 16:26:03 -0400 Subject: [PATCH 1/6] fix on ks.merge for series --- databricks/koalas/frame.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py index 9f62019257..a97ab91018 100644 --- a/databricks/koalas/frame.py +++ b/databricks/koalas/frame.py @@ -5216,7 +5216,10 @@ def merge(self, right: 'DataFrame', how: str = 'inner', if right_keys and not left_keys: raise ValueError('Must pass left_on or left_index=True') if not left_keys and not right_keys: - common = list(self.columns.intersection(right.columns)) + if isinstance(right, ks.Series): + common = list(self.columns.intersection([right.name])) + else: + common = list(self.columns.intersection(right.columns)) if len(common) == 0: raise ValueError( 'No common columns to perform merge on. Merge options: ' From 377847e1ba0f3c4f703bf1c7407acd3f998cce4f Mon Sep 17 00:00:00 2001 From: thoo Date: Sun, 22 Sep 2019 18:44:49 -0400 Subject: [PATCH 2/6] add merge tests with series --- databricks/koalas/tests/test_dataframe.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/databricks/koalas/tests/test_dataframe.py b/databricks/koalas/tests/test_dataframe.py index d7b902944f..f86f553679 100644 --- a/databricks/koalas/tests/test_dataframe.py +++ b/databricks/koalas/tests/test_dataframe.py @@ -719,11 +719,13 @@ def test_merge(self): 'value': [4, 5, 6, 7, 8, 9], 'y': list('efghij')}, columns=['rkey', 'value', 'y']) + right_ps = pd.Series(list('defghi'), name='x', index=[5, 6, 7, 8, 9, 10]) left_kdf = ks.from_pandas(left_pdf) right_kdf = ks.from_pandas(right_pdf) + right_ks = ks.from_pandas(right_ps) - def check(op): + def check(op, right_kdf=right_kdf, right_pdf=right_pdf): k_res = op(left_kdf, right_kdf) k_res = k_res.to_pandas() k_res = k_res.sort_values(by=list(k_res.columns)) @@ -764,6 +766,23 @@ def check(op): check(lambda left, right: left.merge(right, left_on='lkey', right_on='rkey', suffixes=['_left', '_right'])) + # Test Series on the right + check(lambda left, right: left.merge(right), right_ks, right_ps) + check(lambda left, right: left.merge(right, left_on='x', right_on='x'), + right_ks, right_ps) + check(lambda left, right: left.set_index('x').merge(right, left_index=True, right_on='x'), + right_ks, right_ps) + + # Test join types with Series + for how in ['inner', 'left', 'right', 'outer']: + check(lambda left, right: left.merge(right, how=how), right_ks, right_ps) + check(lambda left, right: left.merge(right, left_on='x', right_on='x', how=how), + right_ks, right_ps) + + # suffix + check(lambda left, right: left.merge(right, suffixes=['_left', '_right'], how='outer', + left_index=True, right_index=True), right_ks, right_ps) + def test_merge_retains_indices(self): left_pdf = pd.DataFrame({'A': [0, 1]}) right_pdf = pd.DataFrame({'B': [1, 2]}, index=[1, 2]) From b06359329a77ec8dafbbedb616edd8c72b89e92e Mon Sep 17 00:00:00 2001 From: thoo Date: Mon, 23 Sep 2019 21:42:41 -0400 Subject: [PATCH 3/6] fix pytest and html format for DataFrame.merge --- databricks/koalas/frame.py | 43 ++++++++++++----------- databricks/koalas/tests/test_dataframe.py | 35 ++++++++++-------- 2 files changed, 43 insertions(+), 35 deletions(-) diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py index 0df458f69a..bda4da55f4 100644 --- a/databricks/koalas/frame.py +++ b/databricks/koalas/frame.py @@ -5093,33 +5093,34 @@ def merge(self, right: 'DataFrame', how: str = 'inner', Parameters ---------- right: Object to merge with. + DataFrame or named Series how: Type of merge to be performed. {'left', 'right', 'outer', 'inner'}, default 'inner' - left: use only keys from left frame, similar to a SQL left outer join; preserve key - order. - right: use only keys from right frame, similar to a SQL right outer join; preserve key - order. - outer: use union of keys from both frames, similar to a SQL full outer join; sort keys - lexicographically. - inner: use intersection of keys from both frames, similar to a SQL inner join; - preserve the order of the left keys. - on: Column or index level names to join on. These must be found in both DataFrames. If on - is None and not merging on indexes then this defaults to the intersection of the + left: use only keys from left frame, similar to a SQL left outer join; + preserve key order. + right: use only keys from right frame, similar to a SQL right outer join; + preserve key order. + outer: use union of keys from both frames, similar to a SQL full outer + join; sort keys lexicographically. + inner: use intersection of keys from both frames, similar to a SQL inner + join; preserve the order of the left keys. + on: Column or index level names to join on. + These must be found in both DataFrames. If on is None and not merging on + indexes then this defaults to the intersection of the columns in both DataFrames. - left_on: Column or index level names to join on in the left DataFrame. Can also - be an array or list of arrays of the length of the left DataFrame. + left_on: Column or index level names to join on in the left DataFrame. + Can also be an array or list of arrays of the length of the left DataFrame. These arrays are treated as if they are columns. - right_on: Column or index level names to join on in the right DataFrame. Can also - be an array or list of arrays of the length of the right DataFrame. + right_on: Column or index level names to join on in the right DataFrame. + Can also be an array or list of arrays of the length of the right DataFrame. These arrays are treated as if they are columns. - left_index: Use the index from the left DataFrame as the join key(s). If it is a - MultiIndex, the number of keys in the other DataFrame (either the index or a number of - columns) must match the number of levels. - right_index: Use the index from the right DataFrame as the join key. Same caveats as - left_index. - suffixes: Suffix to apply to overlapping column names in the left and right side, - respectively. + left_index: Use the index from the left DataFrame as the join key(s). + If it is a MultiIndex, the number of keys in the other DataFrame (either the + index or a number of columns) must match the number of levels. + right_index: Use the index from the right DataFrame as the join key. + Same caveats as left_index. + suffixes: Suffix to apply to overlapping column names in the left and right side, respectively. Returns ------- diff --git a/databricks/koalas/tests/test_dataframe.py b/databricks/koalas/tests/test_dataframe.py index dcbc0ae72b..5a6e38aa09 100644 --- a/databricks/koalas/tests/test_dataframe.py +++ b/databricks/koalas/tests/test_dataframe.py @@ -16,6 +16,7 @@ from datetime import date, datetime import inspect +from distutils.version import LooseVersion import numpy as np import pandas as pd @@ -767,21 +768,23 @@ def check(op, right_kdf=right_kdf, right_pdf=right_pdf): suffixes=['_left', '_right'])) # Test Series on the right - check(lambda left, right: left.merge(right), right_ks, right_ps) - check(lambda left, right: left.merge(right, left_on='x', right_on='x'), - right_ks, right_ps) - check(lambda left, right: left.set_index('x').merge(right, left_index=True, right_on='x'), - right_ks, right_ps) - - # Test join types with Series - for how in ['inner', 'left', 'right', 'outer']: - check(lambda left, right: left.merge(right, how=how), right_ks, right_ps) - check(lambda left, right: left.merge(right, left_on='x', right_on='x', how=how), + # pd.DataFrame.merge with Series is implemented since version 0.24.0 + if LooseVersion(pd.__version__) > LooseVersion("0.24.2"): + check(lambda left, right: left.merge(right), right_ks, right_ps) + check(lambda left, right: left.merge(right, left_on='x', right_on='x'), + right_ks, right_ps) + check(lambda left, right: left.set_index('x').merge(right, left_index=True, right_on='x'), right_ks, right_ps) - # suffix - check(lambda left, right: left.merge(right, suffixes=['_left', '_right'], how='outer', - left_index=True, right_index=True), right_ks, right_ps) + # Test join types with Series + for how in ['inner', 'left', 'right', 'outer']: + check(lambda left, right: left.merge(right, how=how), right_ks, right_ps) + check(lambda left, right: left.merge(right, left_on='x', right_on='x', how=how), + right_ks, right_ps) + + # suffix with Series + check(lambda left, right: left.merge(right, suffixes=['_left', '_right'], how='outer', + left_index=True, right_index=True), right_ks, right_ps) def test_merge_retains_indices(self): left_pdf = pd.DataFrame({'A': [0, 1]}) @@ -1029,11 +1032,15 @@ def test_join(self): self.assert_eq(join_pdf, join_kdf) - # join with duplicated columns in Series and DataFrame + # join with duplicated columns in Series with self.assertRaisesRegex(ValueError, "columns overlap but no suffix specified"): kdf1.join(ks1, how='outer') + # join with duplicated columns in DataFrame + with self.assertRaisesRegex(ValueError, + "columns overlap but no suffix specified"): kdf1.join(kdf2, how='outer') + # check `on` parameter join_pdf = pdf1.join(pdf2.set_index('key'), on='key', lsuffix='_left', rsuffix='_right') join_pdf.sort_values(by=list(join_pdf.columns), inplace=True) From 742575d2ff6e3add6414d8ffd2289a1c2bee97d7 Mon Sep 17 00:00:00 2001 From: thoo Date: Mon, 23 Sep 2019 22:02:37 -0400 Subject: [PATCH 4/6] reset html format-will do later --- databricks/koalas/frame.py | 43 +++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py index bda4da55f4..0df458f69a 100644 --- a/databricks/koalas/frame.py +++ b/databricks/koalas/frame.py @@ -5093,34 +5093,33 @@ def merge(self, right: 'DataFrame', how: str = 'inner', Parameters ---------- right: Object to merge with. - DataFrame or named Series how: Type of merge to be performed. {'left', 'right', 'outer', 'inner'}, default 'inner' - left: use only keys from left frame, similar to a SQL left outer join; - preserve key order. - right: use only keys from right frame, similar to a SQL right outer join; - preserve key order. - outer: use union of keys from both frames, similar to a SQL full outer - join; sort keys lexicographically. - inner: use intersection of keys from both frames, similar to a SQL inner - join; preserve the order of the left keys. - on: Column or index level names to join on. - These must be found in both DataFrames. If on is None and not merging on - indexes then this defaults to the intersection of the + left: use only keys from left frame, similar to a SQL left outer join; preserve key + order. + right: use only keys from right frame, similar to a SQL right outer join; preserve key + order. + outer: use union of keys from both frames, similar to a SQL full outer join; sort keys + lexicographically. + inner: use intersection of keys from both frames, similar to a SQL inner join; + preserve the order of the left keys. + on: Column or index level names to join on. These must be found in both DataFrames. If on + is None and not merging on indexes then this defaults to the intersection of the columns in both DataFrames. - left_on: Column or index level names to join on in the left DataFrame. - Can also be an array or list of arrays of the length of the left DataFrame. + left_on: Column or index level names to join on in the left DataFrame. Can also + be an array or list of arrays of the length of the left DataFrame. These arrays are treated as if they are columns. - right_on: Column or index level names to join on in the right DataFrame. - Can also be an array or list of arrays of the length of the right DataFrame. + right_on: Column or index level names to join on in the right DataFrame. Can also + be an array or list of arrays of the length of the right DataFrame. These arrays are treated as if they are columns. - left_index: Use the index from the left DataFrame as the join key(s). - If it is a MultiIndex, the number of keys in the other DataFrame (either the - index or a number of columns) must match the number of levels. - right_index: Use the index from the right DataFrame as the join key. - Same caveats as left_index. - suffixes: Suffix to apply to overlapping column names in the left and right side, respectively. + left_index: Use the index from the left DataFrame as the join key(s). If it is a + MultiIndex, the number of keys in the other DataFrame (either the index or a number of + columns) must match the number of levels. + right_index: Use the index from the right DataFrame as the join key. Same caveats as + left_index. + suffixes: Suffix to apply to overlapping column names in the left and right side, + respectively. Returns ------- From f696e5d780777962190246b3eab440ccf469c96d Mon Sep 17 00:00:00 2001 From: thoo Date: Mon, 23 Sep 2019 22:06:37 -0400 Subject: [PATCH 5/6] switch pandas version to check --- databricks/koalas/tests/test_dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/databricks/koalas/tests/test_dataframe.py b/databricks/koalas/tests/test_dataframe.py index 5a6e38aa09..ff0a1f5976 100644 --- a/databricks/koalas/tests/test_dataframe.py +++ b/databricks/koalas/tests/test_dataframe.py @@ -769,7 +769,7 @@ def check(op, right_kdf=right_kdf, right_pdf=right_pdf): # Test Series on the right # pd.DataFrame.merge with Series is implemented since version 0.24.0 - if LooseVersion(pd.__version__) > LooseVersion("0.24.2"): + if LooseVersion(pd.__version__) >= LooseVersion("0.24.0"): check(lambda left, right: left.merge(right), right_ks, right_ps) check(lambda left, right: left.merge(right, left_on='x', right_on='x'), right_ks, right_ps) From a1f093799f7a6f49c785085ad129643f9b43b959 Mon Sep 17 00:00:00 2001 From: thoo Date: Mon, 23 Sep 2019 22:47:10 -0400 Subject: [PATCH 6/6] fix pylint --- databricks/koalas/tests/test_dataframe.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/databricks/koalas/tests/test_dataframe.py b/databricks/koalas/tests/test_dataframe.py index ff0a1f5976..a0c227b74b 100644 --- a/databricks/koalas/tests/test_dataframe.py +++ b/databricks/koalas/tests/test_dataframe.py @@ -773,8 +773,8 @@ def check(op, right_kdf=right_kdf, right_pdf=right_pdf): check(lambda left, right: left.merge(right), right_ks, right_ps) check(lambda left, right: left.merge(right, left_on='x', right_on='x'), right_ks, right_ps) - check(lambda left, right: left.set_index('x').merge(right, left_index=True, right_on='x'), - right_ks, right_ps) + check(lambda left, right: left.set_index('x').merge(right, left_index=True, + right_on='x'), right_ks, right_ps) # Test join types with Series for how in ['inner', 'left', 'right', 'outer']: @@ -784,7 +784,8 @@ def check(op, right_kdf=right_kdf, right_pdf=right_pdf): # suffix with Series check(lambda left, right: left.merge(right, suffixes=['_left', '_right'], how='outer', - left_index=True, right_index=True), right_ks, right_ps) + left_index=True, right_index=True), + right_ks, right_ps) def test_merge_retains_indices(self): left_pdf = pd.DataFrame({'A': [0, 1]})