Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat[python] support units for estimated_size ("mb","gb", etc) #4499

Merged
merged 2 commits into from
Aug 19, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 27 additions & 3 deletions py-polars/polars/internals/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
is_int_sequence,
is_str_sequence,
range_to_slice,
scale_bytes,
)

try:
Expand Down Expand Up @@ -113,6 +114,7 @@
ParallelStrategy,
ParquetCompression,
PivotAgg,
SizeUnit,
UniqueKeepStrategy,
)

Expand Down Expand Up @@ -300,10 +302,10 @@ def __init__(
else:
raise ValueError("DataFrame constructor not called properly.")

def estimated_size(self) -> int:
def estimated_size(self, unit: SizeUnit = "b") -> int | float:
"""
Return an estimation of the total (heap) allocated size of the `DataFrame` in
bytes.
bytes (pass `unit` to return estimated size in kilobytes, megabytes, etc).

This estimation is the sum of the size of its buffers, validity, including
nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the
Expand All @@ -315,8 +317,30 @@ def estimated_size(self) -> int:
this function returns the visible size of the buffer, not its total capacity.

FFI buffers are included in this estimation.

Parameters
----------
unit : str, optional
Scale returned size to the given unit ("b", "kb", "mb", "gb", or "tb").
alexander-beedie marked this conversation as resolved.
Show resolved Hide resolved

Examples
--------
>>> df = pl.DataFrame(
... {
... "x": list(reversed(range(1_000_000))),
... "y": [v / 1000 for v in range(1_000_000)],
... "z": [str(v) for v in range(1_000_000)],
... },
... columns=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)],
... )
>>> df.estimated_size()
25888898
>>> df.estimated_size("mb")
24.689577102661133

"""
return self._df.estimated_size()
sz = self._df.estimated_size()
return scale_bytes(sz, to=unit)

@classmethod
def _from_pydf(cls: type[DF], py_df: PyDataFrame) -> DF:
Expand Down
22 changes: 19 additions & 3 deletions py-polars/polars/internals/series/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
is_bool_sequence,
is_int_sequence,
range_to_slice,
scale_bytes,
)

try:
Expand Down Expand Up @@ -96,6 +97,7 @@
InterpolationMethod,
NullBehavior,
RankMethod,
SizeUnit,
TimeUnit,
)

Expand Down Expand Up @@ -651,10 +653,10 @@ def flags(self) -> dict[str, bool]:
"SORTED_DESC": self._s.is_sorted_reverse_flag(),
}

def estimated_size(self) -> int:
def estimated_size(self, unit: SizeUnit = "b") -> int | float:
"""
Return an estimation of the total (heap) allocated size of the `Series` in
bytes.
bytes (pass `unit` to return estimated size in kilobytes, megabytes, etc).

This estimation is the sum of the size of its buffers, validity, including
nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the
Expand All @@ -667,8 +669,22 @@ def estimated_size(self) -> int:

FFI buffers are included in this estimation.

Parameters
----------
unit : str, optional
Scale returned size to the given unit ("b", "kb", "mb", "gb", or "tb").

Examples
--------
>>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32)
>>> s.estimated_size()
4000000
>>> s.estimated_size("mb")
3.814697265625

"""
return self._s.estimated_size()
sz = self._s.estimated_size()
return scale_bytes(sz, to=unit)

def sqrt(self) -> Series:
"""
Expand Down
12 changes: 12 additions & 0 deletions py-polars/polars/internals/type_aliases.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,18 @@
RankMethod: TypeAlias = Literal["average", "min", "max", "dense", "ordinal", "random"]
TimeUnit: TypeAlias = Literal["ns", "us", "ms"]
UniqueKeepStrategy: TypeAlias = Literal["first", "last"]
SizeUnit: TypeAlias = Literal[
"b",
"kb",
"mb",
"gb",
"tb",
"bytes",
"kilobytes",
"megabytes",
"gigabytes",
"terabytes",
]

# The following have a Rust enum equivalent with a different name
AsofJoinStrategy: TypeAlias = Literal["backward", "forward"] # AsofStrategy
Expand Down
16 changes: 15 additions & 1 deletion py-polars/polars/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
from typing_extensions import ParamSpec, TypeGuard

if TYPE_CHECKING:
from polars.internals.type_aliases import TimeUnit
from polars.internals.type_aliases import SizeUnit, TimeUnit


def _process_null_values(
Expand Down Expand Up @@ -344,3 +344,17 @@ def _rename_kwargs(
stacklevel=3,
)
kwargs[new] = kwargs.pop(alias)


def scale_bytes(sz: int, to: SizeUnit) -> int | float:
    """
    Scale a size given in bytes to another size unit.

    Parameters
    ----------
    sz
        Size in bytes.
    to
        Target unit: one of "b", "kb", "mb", "gb", "tb", or the long-form
        equivalents "bytes", "kilobytes", "megabytes", "gigabytes",
        "terabytes". Scaling uses binary multiples (1 kb == 1024 bytes).

    Returns
    -------
    The size expressed in `to` units: the original ``int`` for byte units,
    otherwise a ``float``.

    Raises
    ------
    ValueError
        If `to` is not a recognised size unit.
    """
    scaling_factors = {
        "b": 1,
        "bytes": 1,
        "kb": 1024,
        "kilobytes": 1024,
        "mb": 1024**2,
        "megabytes": 1024**2,
        "gb": 1024**3,
        "gigabytes": 1024**3,
        "tb": 1024**4,
        "terabytes": 1024**4,
    }
    try:
        # validate the full unit name, not just its first character, so that
        # typos/unknown units (eg: "bits") fail loudly instead of silently
        # scaling by the wrong factor.
        scaling_factor = scaling_factors[to]
    except KeyError:
        raise ValueError(
            f"`to` must be one of {sorted(scaling_factors)}, got {to!r}"
        ) from None
    # keep the exact int for byte units; only divide when actually scaling
    if scaling_factor > 1:
        return sz / scaling_factor
    return sz
12 changes: 10 additions & 2 deletions py-polars/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,5 +53,13 @@ def test_timedelta_to_pl_timedelta() -> None:


def test_estimated_size() -> None:
a = pl.Series([1, 2, 3])
assert a.estimated_size() == a.to_frame().estimated_size()
s = pl.Series("n", list(range(10_000)))
alexander-beedie marked this conversation as resolved.
Show resolved Hide resolved
df = s.to_frame()

for sz in (s.estimated_size(), s.estimated_size("b"), s.estimated_size("bytes")):
assert sz == df.estimated_size()

assert s.estimated_size("kb") == (df.estimated_size("b") / 1024)
assert s.estimated_size("mb") == (df.estimated_size("kb") / 1024)
assert s.estimated_size("gb") == (df.estimated_size("mb") / 1024)
assert s.estimated_size("tb") == (df.estimated_size("gb") / 1024)