From e8f1498299ac36a4a469097fb144616314c712a2 Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Fri, 19 Aug 2022 10:33:24 +0000 Subject: [PATCH 1/2] feat[python] support units for estimated_size ('mb','gb', etc) --- py-polars/polars/internals/dataframe/frame.py | 30 +++++++++++++++++-- py-polars/polars/internals/series/series.py | 22 ++++++++++++-- py-polars/polars/internals/type_aliases.py | 12 ++++++++ py-polars/polars/utils.py | 16 +++++++++- py-polars/tests/test_utils.py | 12 ++++++-- 5 files changed, 83 insertions(+), 9 deletions(-) diff --git a/py-polars/polars/internals/dataframe/frame.py b/py-polars/polars/internals/dataframe/frame.py index eafd5d3137d4..c549a6c30936 100644 --- a/py-polars/polars/internals/dataframe/frame.py +++ b/py-polars/polars/internals/dataframe/frame.py @@ -53,6 +53,7 @@ is_int_sequence, is_str_sequence, range_to_slice, + scale_bytes, ) try: @@ -113,6 +114,7 @@ ParallelStrategy, ParquetCompression, PivotAgg, + SizeUnit, UniqueKeepStrategy, ) @@ -300,10 +302,10 @@ def __init__( else: raise ValueError("DataFrame constructor not called properly.") - def estimated_size(self) -> int: + def estimated_size(self, unit: SizeUnit = "b") -> int | float: """ Return an estimation of the total (heap) allocated size of the `DataFrame` in - bytes. + bytes (pass `unit` to return estimated size in kilobytes, megabytes, etc). This estimation is the sum of the size of its buffers, validity, including nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the @@ -315,8 +317,30 @@ def estimated_size(self) -> int: this function returns the visible size of the buffer, not its total capacity. FFI buffers are included in this estimation. + + Parameters + ---------- + unit : str, optional + Scale returned size to the given unit ("b", "kb", "mb", "gb", or "tb"). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": list(reversed(range(1_000_000))), + ... "y": [v / 1000 for v in range(1_000_000)], + ... 
"z": [str(v) for v in range(1_000_000)], + ... }, + ... columns=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], + ... ) + >>> df.estimated_size() + 25888898 + >>> df.estimated_size("mb") + 24.689577102661133 + """ - return self._df.estimated_size() + sz = self._df.estimated_size() + return scale_bytes(sz, to=unit) @classmethod def _from_pydf(cls: type[DF], py_df: PyDataFrame) -> DF: diff --git a/py-polars/polars/internals/series/series.py b/py-polars/polars/internals/series/series.py index 23fe87970cb5..14aaba5a723d 100644 --- a/py-polars/polars/internals/series/series.py +++ b/py-polars/polars/internals/series/series.py @@ -54,6 +54,7 @@ is_bool_sequence, is_int_sequence, range_to_slice, + scale_bytes, ) try: @@ -96,6 +97,7 @@ InterpolationMethod, NullBehavior, RankMethod, + SizeUnit, TimeUnit, ) @@ -651,10 +653,10 @@ def flags(self) -> dict[str, bool]: "SORTED_DESC": self._s.is_sorted_reverse_flag(), } - def estimated_size(self) -> int: + def estimated_size(self, unit: SizeUnit = "b") -> int | float: """ Return an estimation of the total (heap) allocated size of the `Series` in - bytes. + bytes (pass `unit` to return estimated size in kilobytes, megabytes, etc). This estimation is the sum of the size of its buffers, validity, including nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the @@ -667,8 +669,22 @@ def estimated_size(self) -> int: FFI buffers are included in this estimation. + Parameters + ---------- + unit : str, optional + Scale returned size to the given unit ("b", "kb", "mb", "gb", or "tb"). 
+ + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + """ - return self._s.estimated_size() + sz = self._s.estimated_size() + return scale_bytes(sz, to=unit) def sqrt(self) -> Series: """ diff --git a/py-polars/polars/internals/type_aliases.py b/py-polars/polars/internals/type_aliases.py index a1e1a1bb6f41..48fedc9e58e2 100644 --- a/py-polars/polars/internals/type_aliases.py +++ b/py-polars/polars/internals/type_aliases.py @@ -39,6 +39,18 @@ RankMethod: TypeAlias = Literal["average", "min", "max", "dense", "ordinal", "random"] TimeUnit: TypeAlias = Literal["ns", "us", "ms"] UniqueKeepStrategy: TypeAlias = Literal["first", "last"] +SizeUnit: TypeAlias = Literal[ + "b", + "kb", + "mb", + "gb", + "tb", + "bytes", + "kilobytes", + "megabytes", + "gigabytes", + "terabytes", +] # The following have a Rust enum equivalent with a different name AsofJoinStrategy: TypeAlias = Literal["backward", "forward"] # AsofStrategy diff --git a/py-polars/polars/utils.py b/py-polars/polars/utils.py index 94d7b370d52d..82c7e6474a4e 100644 --- a/py-polars/polars/utils.py +++ b/py-polars/polars/utils.py @@ -34,7 +34,7 @@ from typing_extensions import ParamSpec, TypeGuard if TYPE_CHECKING: - from polars.internals.type_aliases import TimeUnit + from polars.internals.type_aliases import SizeUnit, TimeUnit def _process_null_values( @@ -344,3 +344,17 @@ def _rename_kwargs( stacklevel=3, ) kwargs[new] = kwargs.pop(alias) + + +def scale_bytes(sz: int, to: SizeUnit) -> int | float: + """Scale size in bytes to other size units (eg: "kb", "mb", "gb", "tb").""" + scaling_factor = { + "b": 1, + "k": 1024, + "m": 1024**2, + "g": 1024**3, + "t": 1024**4, + }[to[0]] + if scaling_factor > 1: + return sz / scaling_factor + return sz diff --git a/py-polars/tests/test_utils.py b/py-polars/tests/test_utils.py index 155c9e8e0539..b4c757384429 100644 --- a/py-polars/tests/test_utils.py +++ 
b/py-polars/tests/test_utils.py @@ -53,5 +53,13 @@ def test_timedelta_to_pl_timedelta() -> None: def test_estimated_size() -> None: - a = pl.Series([1, 2, 3]) - assert a.estimated_size() == a.to_frame().estimated_size() + s = pl.Series("n", list(range(10_000))) + df = s.to_frame() + + for sz in (s.estimated_size(), s.estimated_size("b"), s.estimated_size("bytes")): + assert sz == df.estimated_size() + + assert s.estimated_size("kb") == (df.estimated_size("b") / 1024) + assert s.estimated_size("mb") == (df.estimated_size("kb") / 1024) + assert s.estimated_size("gb") == (df.estimated_size("mb") / 1024) + assert s.estimated_size("tb") == (df.estimated_size("gb") / 1024) From 7d793dfc2df1bd49cdca28e95bc869a0fc911f2b Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Fri, 19 Aug 2022 12:04:31 +0000 Subject: [PATCH 2/2] improve units docstring, minimise test impact --- py-polars/polars/internals/dataframe/frame.py | 4 ++-- py-polars/polars/internals/series/series.py | 4 ++-- py-polars/tests/test_utils.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/py-polars/polars/internals/dataframe/frame.py b/py-polars/polars/internals/dataframe/frame.py index c549a6c30936..b91d77ada892 100644 --- a/py-polars/polars/internals/dataframe/frame.py +++ b/py-polars/polars/internals/dataframe/frame.py @@ -320,8 +320,8 @@ def estimated_size(self, unit: SizeUnit = "b") -> int | float: Parameters ---------- - unit : str, optional - Scale returned size to the given unit ("b", "kb", "mb", "gb", or "tb"). + unit : {'b', 'kb', 'mb', 'gb', 'tb'} + Scale the returned size to the given unit. 
Examples -------- diff --git a/py-polars/polars/internals/series/series.py b/py-polars/polars/internals/series/series.py index 14aaba5a723d..9cba16f18952 100644 --- a/py-polars/polars/internals/series/series.py +++ b/py-polars/polars/internals/series/series.py @@ -671,8 +671,8 @@ def estimated_size(self, unit: SizeUnit = "b") -> int | float: Parameters ---------- - unit : str, optional - Scale returned size to the given unit ("b", "kb", "mb", "gb", or "tb"). + unit : {'b', 'kb', 'mb', 'gb', 'tb'} + Scale the returned size to the given unit. Examples -------- diff --git a/py-polars/tests/test_utils.py b/py-polars/tests/test_utils.py index b4c757384429..fcaa7d509462 100644 --- a/py-polars/tests/test_utils.py +++ b/py-polars/tests/test_utils.py @@ -53,7 +53,7 @@ def test_timedelta_to_pl_timedelta() -> None: def test_estimated_size() -> None: - s = pl.Series("n", list(range(10_000))) + s = pl.Series("n", list(range(100))) df = s.to_frame() for sz in (s.estimated_size(), s.estimated_size("b"), s.estimated_size("bytes")):