Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue with ks.merge to Series #818

Merged
merged 7 commits into from
Sep 24, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion databricks/koalas/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -5212,7 +5212,10 @@ def merge(self, right: 'DataFrame', how: str = 'inner',
if right_keys and not left_keys:
raise ValueError('Must pass left_on or left_index=True')
if not left_keys and not right_keys:
common = list(self.columns.intersection(right.columns))
if isinstance(right, ks.Series):
common = list(self.columns.intersection([right.name]))
else:
common = list(self.columns.intersection(right.columns))
if len(common) == 0:
raise ValueError(
'No common columns to perform merge on. Merge options: '
Expand Down
31 changes: 29 additions & 2 deletions databricks/koalas/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

from datetime import date, datetime
import inspect
from distutils.version import LooseVersion

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -719,11 +720,13 @@ def test_merge(self):
'value': [4, 5, 6, 7, 8, 9],
'y': list('efghij')},
columns=['rkey', 'value', 'y'])
right_ps = pd.Series(list('defghi'), name='x', index=[5, 6, 7, 8, 9, 10])

left_kdf = ks.from_pandas(left_pdf)
right_kdf = ks.from_pandas(right_pdf)
right_ks = ks.from_pandas(right_ps)

def check(op):
def check(op, right_kdf=right_kdf, right_pdf=right_pdf):
k_res = op(left_kdf, right_kdf)
k_res = k_res.to_pandas()
k_res = k_res.sort_values(by=list(k_res.columns))
Expand Down Expand Up @@ -764,6 +767,26 @@ def check(op):
check(lambda left, right: left.merge(right, left_on='lkey', right_on='rkey',
suffixes=['_left', '_right']))

# Test Series on the right
# pd.DataFrame.merge has accepted a named Series as the right operand since pandas 0.24.0
if LooseVersion(pd.__version__) >= LooseVersion("0.24.0"):
check(lambda left, right: left.merge(right), right_ks, right_ps)
check(lambda left, right: left.merge(right, left_on='x', right_on='x'),
right_ks, right_ps)
check(lambda left, right: left.set_index('x').merge(right, left_index=True,
right_on='x'), right_ks, right_ps)

# Test join types with Series
for how in ['inner', 'left', 'right', 'outer']:
check(lambda left, right: left.merge(right, how=how), right_ks, right_ps)
check(lambda left, right: left.merge(right, left_on='x', right_on='x', how=how),
right_ks, right_ps)

# suffix with Series
check(lambda left, right: left.merge(right, suffixes=['_left', '_right'], how='outer',
left_index=True, right_index=True),
right_ks, right_ps)

def test_merge_retains_indices(self):
left_pdf = pd.DataFrame({'A': [0, 1]})
right_pdf = pd.DataFrame({'B': [1, 2]}, index=[1, 2])
Expand Down Expand Up @@ -1010,11 +1033,15 @@ def test_join(self):

self.assert_eq(join_pdf, join_kdf)

# join with duplicated columns in Series and DataFrame
# join with duplicated columns in Series
with self.assertRaisesRegex(ValueError,
"columns overlap but no suffix specified"):
kdf1.join(ks1, how='outer')
# join with duplicated columns in DataFrame
with self.assertRaisesRegex(ValueError,
"columns overlap but no suffix specified"):
kdf1.join(kdf2, how='outer')

# check `on` parameter
join_pdf = pdf1.join(pdf2.set_index('key'), on='key', lsuffix='_left', rsuffix='_right')
join_pdf.sort_values(by=list(join_pdf.columns), inplace=True)
Expand Down