feat[python] support units for estimated_size ("mb","gb", etc) (#4499)
alexander-beedie authored Aug 19, 2022
1 parent 3c7302f commit de93b31
Showing 5 changed files with 83 additions and 9 deletions.
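
A minimal usage sketch of the new `unit` argument introduced by this commit (the values in the comments are illustrative, not measured; assumes the polars version from this commit):

import polars as pl

df = pl.DataFrame({"a": list(range(1_000_000))})

size_b = df.estimated_size()              # default "b": returns an int, as before
size_kb = df.estimated_size("kb")         # returns a float, scaled by 1024
size_mb = df.estimated_size("megabytes")  # long-form unit names are also accepted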
30 changes: 27 additions & 3 deletions py-polars/polars/internals/dataframe/frame.py
@@ -53,6 +53,7 @@
is_int_sequence,
is_str_sequence,
range_to_slice,
scale_bytes,
)

try:
@@ -113,6 +114,7 @@
ParallelStrategy,
ParquetCompression,
PivotAgg,
SizeUnit,
UniqueKeepStrategy,
)

@@ -300,10 +302,10 @@ def __init__(
else:
raise ValueError("DataFrame constructor not called properly.")

def estimated_size(self) -> int:
def estimated_size(self, unit: SizeUnit = "b") -> int | float:
"""
Return an estimation of the total (heap) allocated size of the `DataFrame` in
bytes.
bytes (pass `unit` to return estimated size in kilobytes, megabytes, etc).
This estimation is the sum of the size of its buffers, validity, including
nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the
@@ -315,8 +317,30 @@ def estimated_size(self) -> int:
this function returns the visible size of the buffer, not its total capacity.
FFI buffers are included in this estimation.
Parameters
----------
unit : {'b', 'kb', 'mb', 'gb', 'tb'}
Scale the returned size to the given unit.
Examples
--------
>>> df = pl.DataFrame(
... {
... "x": list(reversed(range(1_000_000))),
... "y": [v / 1000 for v in range(1_000_000)],
... "z": [str(v) for v in range(1_000_000)],
... },
... columns=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)],
... )
>>> df.estimated_size()
25888898
>>> df.estimated_size("mb")
24.689577102661133
"""
return self._df.estimated_size()
sz = self._df.estimated_size()
return scale_bytes(sz, to=unit)

@classmethod
def _from_pydf(cls: type[DF], py_df: PyDataFrame) -> DF:
22 changes: 19 additions & 3 deletions py-polars/polars/internals/series/series.py
@@ -54,6 +54,7 @@
is_bool_sequence,
is_int_sequence,
range_to_slice,
scale_bytes,
)

try:
@@ -96,6 +97,7 @@
InterpolationMethod,
NullBehavior,
RankMethod,
SizeUnit,
TimeUnit,
)

@@ -651,10 +653,10 @@ def flags(self) -> dict[str, bool]:
"SORTED_DESC": self._s.is_sorted_reverse_flag(),
}

def estimated_size(self) -> int:
def estimated_size(self, unit: SizeUnit = "b") -> int | float:
"""
Return an estimation of the total (heap) allocated size of the `Series` in
bytes.
bytes (pass `unit` to return estimated size in kilobytes, megabytes, etc).
This estimation is the sum of the size of its buffers, validity, including
nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the
@@ -667,8 +669,22 @@ def estimated_size(self) -> int:
FFI buffers are included in this estimation.
Parameters
----------
unit : {'b', 'kb', 'mb', 'gb', 'tb'}
Scale the returned size to the given unit.
Examples
--------
>>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32)
>>> s.estimated_size()
4000000
>>> s.estimated_size("mb")
3.814697265625
"""
return self._s.estimated_size()
sz = self._s.estimated_size()
return scale_bytes(sz, to=unit)

def sqrt(self) -> Series:
"""
12 changes: 12 additions & 0 deletions py-polars/polars/internals/type_aliases.py
@@ -39,6 +39,18 @@
RankMethod: TypeAlias = Literal["average", "min", "max", "dense", "ordinal", "random"]
TimeUnit: TypeAlias = Literal["ns", "us", "ms"]
UniqueKeepStrategy: TypeAlias = Literal["first", "last"]
SizeUnit: TypeAlias = Literal[
"b",
"kb",
"mb",
"gb",
"tb",
"bytes",
"kilobytes",
"megabytes",
"gigabytes",
"terabytes",
]

# The following have a Rust enum equivalent with a different name
AsofJoinStrategy: TypeAlias = Literal["backward", "forward"] # AsofStrategy
16 changes: 15 additions & 1 deletion py-polars/polars/utils.py
@@ -34,7 +34,7 @@
from typing_extensions import ParamSpec, TypeGuard

if TYPE_CHECKING:
from polars.internals.type_aliases import TimeUnit
from polars.internals.type_aliases import SizeUnit, TimeUnit


def _process_null_values(
@@ -344,3 +344,17 @@ def _rename_kwargs(
stacklevel=3,
)
kwargs[new] = kwargs.pop(alias)


def scale_bytes(sz: int, to: SizeUnit) -> int | float:
"""Scale size in bytes to other size units (eg: "kb", "mb", "gb", "tb")."""
scaling_factor = {
"b": 1,
"k": 1024,
"m": 1024**2,
"g": 1024**3,
"t": 1024**4,
}[to[0]]
if scaling_factor > 1:
return sz / scaling_factor
return sz
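
The lookup above keys on `to[0]`, which is why the short forms ("kb", "mb", ...) and the long forms ("kilobytes", "megabytes", ...) from `SizeUnit` resolve to the same scaling factor, and why a factor of 1 returns the original integer unchanged. A small sketch (assuming `scale_bytes` is imported from `polars.utils` as added above):

from polars.utils import scale_bytes

scale_bytes(2048, to="kb")         # 2.0   (2048 / 1024)
scale_bytes(2048, to="kilobytes")  # 2.0   (only to[0] is inspected)
scale_bytes(2048, to="b")          # 2048  (int returned unscaled)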
12 changes: 10 additions & 2 deletions py-polars/tests/test_utils.py
@@ -53,5 +53,13 @@ def test_timedelta_to_pl_timedelta() -> None:


def test_estimated_size() -> None:
a = pl.Series([1, 2, 3])
assert a.estimated_size() == a.to_frame().estimated_size()
s = pl.Series("n", list(range(100)))
df = s.to_frame()

for sz in (s.estimated_size(), s.estimated_size("b"), s.estimated_size("bytes")):
assert sz == df.estimated_size()

assert s.estimated_size("kb") == (df.estimated_size("b") / 1024)
assert s.estimated_size("mb") == (df.estimated_size("kb") / 1024)
assert s.estimated_size("gb") == (df.estimated_size("mb") / 1024)
assert s.estimated_size("tb") == (df.estimated_size("gb") / 1024)
