diff --git a/CHANGELOG.md b/CHANGELOG.md
index 56e6ff7..7f36f86 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
 
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
+## [0.8.6] - 2023-08-26
+
+- Made ``zstandard`` a required (as opposed to optional) dependency, because ``Biglist.DEFAULT_STORAGE_FORMAT`` defaults to ``pickle-zstd``.
+- Decreased the default value of ``Biglist._n_write_threads`` from 8 to 4.
+
+
 ## [0.8.5] - 2023-08-25
 
 - ``BiglistBase`` now has custom support for pickling.
diff --git a/pyproject.toml b/pyproject.toml
index cc2ad8b..94b9d30 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,6 +13,7 @@ dependencies = [
     "deprecation",
     "pyarrow >= 12.0.1",
     "typing-extensions",
+    "zstandard",
 ]
 requires-python = ">=3.8"
 readme = "README.rst"
@@ -30,7 +31,7 @@ Source = "https://github.com/zpz/biglist"
 
 
 [project.optional-dependencies]
-zstandard = ["zstandard"]
+zstandard = []
 lz4 = ["lz4"]
 doc = [
     "sphinx < 7.2.0",
@@ -49,6 +50,9 @@ test = [
 ]
 
 
+# `zstandard` has become a required dependency.
+# The `zstandard` extra will be removed later.
+
 # See https://beta.ruff.rs/docs/rules/
 [tool.ruff]
 target-version = "py38"
diff --git a/src/biglist/__init__.py b/src/biglist/__init__.py
index a789fa5..3fccb15 100644
--- a/src/biglist/__init__.py
+++ b/src/biglist/__init__.py
@@ -51,4 +51,4 @@
     Slicer,
 )
 
-__version__ = "0.8.5"
+__version__ = "0.8.6"
diff --git a/src/biglist/_base.py b/src/biglist/_base.py
index 1f040f4..3d4cc15 100644
--- a/src/biglist/_base.py
+++ b/src/biglist/_base.py
@@ -69,6 +69,7 @@ def load(self) -> None:
 
 
 FileReaderType = TypeVar("FileReaderType", bound=FileReader)
+'''This type variable indicates the class :class:`FileReader` or a subclass thereof.'''
 
 
 class FileSeq(Seq[FileReaderType]):
diff --git a/src/biglist/_biglist.py b/src/biglist/_biglist.py
index 2dc31ab..ac6d9bd 100644
--- a/src/biglist/_biglist.py
+++ b/src/biglist/_biglist.py
@@ -226,6 +226,7 @@ def new(
 
         ``serialize_kwargs`` and ``deserialize_kwargs`` are rarely needed.
         One use case is ``schema`` when storage format is "parquet".
+        See :class:`~biglist._biglist.ParquetSerializer`.
 
         ``serialize_kwargs`` and ``deserialize_kwargs``, if not ``None``,
         will be saved in the "info.json" file, hence they must be JSON
@@ -298,7 +299,15 @@ def __init__(
         self._append_buffer: list = []
         self._append_files_buffer: list = []
         self._file_dumper = None
-        self._n_write_threads = 8
+
+        self._n_write_threads = 4
+        '''This value affects memory usage during rapid "appending" (while flushing/dumping runs in background threads).
+        If each batch consumes a lot of memory, you could manually set this to a lower value, like::
+
+            lst = Biglist(path)
+            lst._n_write_threads = 2
+        '''
+
         self._serialize_kwargs = self.info.get("serialize_kwargs", {})
         self._deserialize_kwargs = self.info.get("deserialize_kwargs", {})
         if self.storage_format == "parquet" and "schema_spec" in self._serialize_kwargs:
@@ -496,11 +505,13 @@ def extend(self, x: Iterable[Element]) -> None:
 
     def make_file_name(self, buffer_len: int, extra: str = '') -> str:
         '''
-        If you need to customize the data file name for any reason, you should do that via ``extra``
+        This method constructs the file name of a data file.
+        If you need to customize this method for any reason, you should do it via ``extra``
         and keep the other patterns unchanged.
+        The string ``extra`` will appear between other fixed patterns in the file name.
         One possible use case is this: in distributed writing, you want files written by different workers
-        to be distinguishable by the file names. Do something like this:
+        to be distinguishable by the file names. Do something like this::
 
             def worker(datapath: str, worker_id: str, ...):
                 out = Biglist(datapath)
@@ -909,10 +920,11 @@ class Multiplexer:
     The usage consists of two main parts:
 
     1. In "controller" code, call :meth:`start` to start a new "session".
-    Different sessions (at the same time or otherwise) are independent consumers of the data.
+       Different sessions (at the same time or otherwise) are independent consumers of the data.
+
     2. In "worker" code, use the session ID that was returned by :meth:`start` to instantiate
-    a Multiplexer and iterate over it. In so doing, multiple workers will obtain the data elements
-    collectively, i.e., each element is obtained by exactly one worker.
+       a Multiplexer and iterate over it. In so doing, multiple workers will obtain the data elements
+       collectively, i.e., each element is obtained by exactly one worker.
     """
 
     @classmethod
diff --git a/src/biglist/_parquet.py b/src/biglist/_parquet.py
index 82e4468..a092699 100644
--- a/src/biglist/_parquet.py
+++ b/src/biglist/_parquet.py
@@ -71,8 +71,9 @@ def get_gcsfs(cls, *, good_for_seconds=600) -> GcsFileSystem:
     @classmethod
     def load_file(cls, path: Upath) -> ParquetFile:
         '''
-        Depending on the implementation, this may read *meta* info only, or
-        load in the entire file eagerly.
+        This reads *meta* info and constructs a ``pyarrow.parquet.ParquetFile`` object.
+        It does not load the entire file.
+        See :meth:`load` for eager loading.
 
         Parameters
         ----------
@@ -156,12 +157,12 @@ def __len__(self) -> int:
         return self.num_rows
 
     def load(self) -> None:
-        """Eagerly read in the whole file as a table."""
+        """Eagerly read the whole file into memory as a table."""
         if self._data is None:
             self._data = ParquetBatchData(
                 self.file.read(columns=self._column_names, use_threads=True),
             )
-            self._data.scalar_as_py = (self.scalar_as_py,)
+            self._data.scalar_as_py = self.scalar_as_py
         if self.num_row_groups == 1:
             assert self._row_groups is None
             self._row_groups = [self._data]
diff --git a/src/biglist/_util.py b/src/biglist/_util.py
index 4ac1b4c..23b09d0 100644
--- a/src/biglist/_util.py
+++ b/src/biglist/_util.py
@@ -145,6 +145,7 @@ def __iter__(self) -> Iterator[Element]:
 
 
 SeqType = TypeVar("SeqType", bound=Seq)
+'''This type variable indicates the class :class:`Seq` or a subclass thereof.'''
 
 
 class Slicer(Generic[SeqType]):
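
Note on the ``zstandard`` change above: the default storage format ``pickle-zstd`` pickles each batch of elements and compresses the bytes with ``zstandard``, which is why the package is now a hard requirement. The following round-trip sketch illustrates the idea only; it is not biglist's actual serializer code, and the helper names are made up::

    import pickle
    import zstandard

    def dumps_pickle_zstd(obj) -> bytes:
        # Pickle first, then compress with zstd -- the gist of "pickle-zstd".
        return zstandard.ZstdCompressor().compress(pickle.dumps(obj))

    def loads_pickle_zstd(blob: bytes):
        # Reverse order: decompress, then unpickle.
        return pickle.loads(zstandard.ZstdDecompressor().decompress(blob))

    data = list(range(1000))
    assert loads_pickle_zstd(dumps_pickle_zstd(data)) == data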
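
The two user-facing changes of this release can be exercised together: ``pickle-zstd`` now works out of the box, and the write-thread default dropped from 8 to 4. A small usage sketch; ``new`` and ``extend`` appear in this diff, ``flush`` is existing biglist API, and the path is illustrative::

    from biglist import Biglist

    # No extras needed for the default storage format anymore.
    assert Biglist.DEFAULT_STORAGE_FORMAT == 'pickle-zstd'

    lst = Biglist.new('/tmp/biglist-demo')

    # The default is now 4 background write threads; lower it further
    # if each flushed batch consumes a lot of memory.
    lst._n_write_threads = 2

    lst.extend(range(100_000))  # data files are dumped by background threads
    lst.flush()                 # ensure everything is persisted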
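
The reformatted ``Multiplexer`` docstring above describes a controller/worker split. A sketch of that flow follows; :meth:`start` returning a session ID and worker-side iteration are confirmed by the docstring, but the construction calls (``Multiplexer.new(...)`` and the worker-side constructor argument) are assumptions to be checked against the class itself::

    from biglist import Multiplexer

    # Controller: publish the data and open a new "session".
    mux = Multiplexer.new(range(1000), '/tmp/mux-demo')  # assumed setup call
    session_id = mux.start()  # per the docstring, returns a session ID

    # Worker (typically another process or machine); across all workers
    # iterating the same session, each element is obtained exactly once.
    def worker(session_id: str) -> None:
        for item in Multiplexer(session_id):  # assumed constructor signature
            print(item)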