Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support None to unlimit 'display.max_rows' #742

Merged
merged 2 commits into from
Sep 4, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 8 additions & 7 deletions databricks/koalas/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,21 +106,23 @@ def validate(self, v: Any) -> None:

# Available options.
_options = [
# TODO: None should support unlimited.
Option(
key='display.max_rows',
doc=(
"This sets the maximum number of rows koalas should output when printing out "
"various output. For example, this value determines whether the repr() for a "
"dataframe prints out fully or just a truncated repr."),
"various output. For example, this value determines the number of rows to be "
"shown at the repr() in a dataframe. Set `None` to unlimit the input length. "
"Default is 1000."),
default=1000,
types=int,
check_func=(lambda v: v >= 0, "'display.max_rows' should be greater than or equal to 0.")),
types=(int, type(None)),
check_func=(
lambda v: v is None or v >= 0,
"'display.max_rows' should be greater than or equal to 0.")),

Option(
key='compute.max_rows',
doc=(
"'compute.max_rows sets the limit of the current DataFrame. Set `None` to unlimit "
"'compute.max_rows' sets the limit of the current DataFrame. Set `None` to unlimit "
"the input length. When the limit is set, it is executed by the shortcut by "
"collecting the data into driver side, and then using pandas API. If the limit is "
"unset, the operation is executed by PySpark. Default is 1000."),
Expand All @@ -130,7 +132,6 @@ def validate(self, v: Any) -> None:
lambda v: v is None or v >= 0,
"'compute.max_rows' should be greater than or equal to 0.")),

# TODO: None should support unlimited.
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`shortcut_limit` cannot support `None`, because it needs at least an empty dataframe to infer its type when it's needed.

Option(
key='compute.shortcut_limit',
doc=(
Expand Down
6 changes: 6 additions & 0 deletions databricks/koalas/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -6601,6 +6601,9 @@ def _to_internal_pandas(self):

def __repr__(self):
max_display_count = get_option("display.max_rows")
if max_display_count is None:
return repr(self._to_internal_pandas())

pdf = self.head(max_display_count + 1)._to_internal_pandas()
pdf_length = len(pdf)
repr_string = repr(pdf.iloc[:max_display_count])
Expand All @@ -6616,6 +6619,9 @@ def __repr__(self):

def _repr_html_(self):
max_display_count = get_option("display.max_rows")
if max_display_count is None:
return self._to_internal_pandas()._repr_html_()

pdf = self.head(max_display_count + 1)._to_internal_pandas()
pdf_length = len(pdf)
repr_html = pdf[:max_display_count]._repr_html_()
Expand Down
12 changes: 11 additions & 1 deletion databricks/koalas/indexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,14 +240,24 @@ def __getattr__(self, item: str) -> Any:

def __repr__(self):
max_display_count = get_option("display.max_rows")
sdf = self._kdf._sdf.select(self._scol).limit(max_display_count + 1)
sdf = self._kdf._sdf.select(self._scol)

if max_display_count is None:
return repr(DataFrame(self._kdf._internal.copy(
sdf=sdf,
index_map=[(sdf.schema[0].name, self._kdf._internal.index_names[0])],
data_columns=[], column_index=[], column_index_names=None)).index.to_pandas())

sdf = sdf.limit(max_display_count + 1)
internal = self._kdf._internal.copy(
sdf=sdf,
index_map=[(sdf.schema[0].name, self._kdf._internal.index_names[0])],
data_columns=[], column_index=[], column_index_names=None)
pindex = DataFrame(internal).index.to_pandas()

pindex_length = len(pindex)
repr_string = repr(pindex[:max_display_count])

if pindex_length > max_display_count:
footer = '\nShowing only the first {}'.format(max_display_count)
return repr_string + footer
Expand Down
3 changes: 3 additions & 0 deletions databricks/koalas/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3027,6 +3027,9 @@ def _to_internal_pandas(self):

def __repr__(self):
max_display_count = get_option("display.max_rows")
if max_display_count is None:
return repr(self._to_internal_pandas())

pser = self.head(max_display_count + 1)._to_internal_pandas()
pser_length = len(pser)
repr_string = repr(pser.iloc[:max_display_count])
Expand Down
3 changes: 2 additions & 1 deletion databricks/koalas/tests/test_frame_plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@


class DataFramePlotTest(ReusedSQLTestCase, TestUtils):
sample_ratio_default = None

@classmethod
def setUpClass(cls):
Expand Down Expand Up @@ -233,7 +234,7 @@ def test_sampled_plot_with_ratio(self):
data = SampledPlot().get_sampled(kdf)
self.assertEqual(round(len(data) / 2500, 1), 0.5)
finally:
reset_option('plotting.sample_ratio')
set_option('plotting.sample_ratio', DataFramePlotTest.sample_ratio_default)

def test_sampled_plot_with_max_rows(self):
# 'plotting.max_rows' is 2000
Expand Down
29 changes: 29 additions & 0 deletions databricks/koalas/tests/test_repr.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,13 @@ def test_repr_dataframe(self):
kdf = ks.range(ReprTests.max_display_count + 1)
self.assertTrue("Showing only the first" in repr(kdf))

set_option("display.max_rows", None)
try:
kdf = ks.range(ReprTests.max_display_count + 1)
self.assert_eq(repr(kdf), repr(kdf.to_pandas()))
finally:
set_option("display.max_rows", ReprTests.max_display_count)

def test_repr_series(self):
kser = ks.range(ReprTests.max_display_count).id
self.assertTrue("Showing only the first" not in repr(kser))
Expand All @@ -45,6 +52,13 @@ def test_repr_series(self):
kser = ks.range(ReprTests.max_display_count + 1).id
self.assertTrue("Showing only the first" in repr(kser))

set_option("display.max_rows", None)
try:
kser = ks.range(ReprTests.max_display_count + 1).id
self.assert_eq(repr(kser), repr(kser.to_pandas()))
finally:
set_option("display.max_rows", ReprTests.max_display_count)

def test_repr_indexes(self):
kdf = ks.range(ReprTests.max_display_count)
kidx = kdf.index
Expand All @@ -55,10 +69,25 @@ def test_repr_indexes(self):
kidx = kdf.index
self.assertTrue("Showing only the first" in repr(kidx))

set_option("display.max_rows", None)
try:
kdf = ks.range(ReprTests.max_display_count + 1)
kidx = kdf.index
self.assert_eq(repr(kidx), repr(kidx.to_pandas()))
finally:
set_option("display.max_rows", ReprTests.max_display_count)

def test_html_repr(self):
kdf = ks.range(ReprTests.max_display_count)
self.assertTrue("Showing only the first" not in kdf._repr_html_())
self.assertEqual(kdf._repr_html_(), kdf.to_pandas()._repr_html_())

kdf = ks.range(ReprTests.max_display_count + 1)
self.assertTrue("Showing only the first" in kdf._repr_html_())

set_option("display.max_rows", None)
try:
kdf = ks.range(ReprTests.max_display_count + 1)
self.assertEqual(kdf._repr_html_(), kdf.to_pandas()._repr_html_())
finally:
set_option("display.max_rows", ReprTests.max_display_count)