def estimated_size(self, unit: SizeUnit = "b") -> int | float:
    """
    Return an estimation of the total (heap) allocated size of the `DataFrame`.

    The size is expressed in the given `unit` — bytes by default; pass
    e.g. "kb" or "mb" to scale the result to kilobytes, megabytes, etc.

    This estimation is the sum of the size of its buffers, validity,
    including nested arrays. Multiple arrays may share buffers and bitmaps.
    Therefore, the size of 2 arrays is not the sum of the sizes computed
    from this function.

    When an array is sliced, its allocated size remains constant because the
    underlying buffer is unchanged; this function returns the visible size
    of the buffer, not its total capacity. FFI buffers are included in this
    estimation.

    Parameters
    ----------
    unit : {'b', 'kb', 'mb', 'gb', 'tb'}
        Scale the returned size to the given unit.

    Returns
    -------
    int | float
        Estimated size: an ``int`` for bytes, a ``float`` for scaled units.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "x": list(reversed(range(1_000_000))),
    ...         "y": [v / 1000 for v in range(1_000_000)],
    ...         "z": [str(v) for v in range(1_000_000)],
    ...     },
    ...     columns=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)],
    ... )
    >>> df.estimated_size()
    25888898
    >>> df.estimated_size("mb")
    24.689577102661133
    """
    sz = self._df.estimated_size()
    return scale_bytes(sz, to=unit)
+ + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + """ - return self._s.estimated_size() + sz = self._s.estimated_size() + return scale_bytes(sz, to=unit) def sqrt(self) -> Series: """ diff --git a/py-polars/polars/internals/type_aliases.py b/py-polars/polars/internals/type_aliases.py index a1e1a1bb6f41..48fedc9e58e2 100644 --- a/py-polars/polars/internals/type_aliases.py +++ b/py-polars/polars/internals/type_aliases.py @@ -39,6 +39,18 @@ RankMethod: TypeAlias = Literal["average", "min", "max", "dense", "ordinal", "random"] TimeUnit: TypeAlias = Literal["ns", "us", "ms"] UniqueKeepStrategy: TypeAlias = Literal["first", "last"] +SizeUnit: TypeAlias = Literal[ + "b", + "kb", + "mb", + "gb", + "tb", + "bytes", + "kilobytes", + "megabytes", + "gigabytes", + "terabytes", +] # The following have a Rust enum equivalent with a different name AsofJoinStrategy: TypeAlias = Literal["backward", "forward"] # AsofStrategy diff --git a/py-polars/polars/utils.py b/py-polars/polars/utils.py index 94d7b370d52d..82c7e6474a4e 100644 --- a/py-polars/polars/utils.py +++ b/py-polars/polars/utils.py @@ -34,7 +34,7 @@ from typing_extensions import ParamSpec, TypeGuard if TYPE_CHECKING: - from polars.internals.type_aliases import TimeUnit + from polars.internals.type_aliases import SizeUnit, TimeUnit def _process_null_values( @@ -344,3 +344,17 @@ def _rename_kwargs( stacklevel=3, ) kwargs[new] = kwargs.pop(alias) + + +def scale_bytes(sz: int, to: SizeUnit) -> int | float: + """Scale size in bytes to other size units (eg: "kb", "mb", "gb", "tb").""" + scaling_factor = { + "b": 1, + "k": 1024, + "m": 1024**2, + "g": 1024**3, + "t": 1024**4, + }[to[0]] + if scaling_factor > 1: + return sz / scaling_factor + return sz diff --git a/py-polars/tests/test_utils.py b/py-polars/tests/test_utils.py index 155c9e8e0539..fcaa7d509462 100644 --- a/py-polars/tests/test_utils.py +++ 
def test_estimated_size() -> None:
    # A Series and the single-column frame built from it occupy the same
    # heap space, whatever unit the size is reported in.
    s = pl.Series("n", list(range(100)))
    df = s.to_frame()

    base_size = df.estimated_size()
    assert s.estimated_size() == base_size
    assert s.estimated_size("b") == base_size
    assert s.estimated_size("bytes") == base_size

    # Each unit step up is exactly a factor of 1024.
    for smaller_unit, larger_unit in (
        ("kb", "b"),
        ("mb", "kb"),
        ("gb", "mb"),
        ("tb", "gb"),
    ):
        assert s.estimated_size(smaller_unit) == (
            df.estimated_size(larger_unit) / 1024
        )