From a6849c41bd74858f4beae536f71bb13437ae5336 Mon Sep 17 00:00:00 2001
From: Richard Harris
Date: Mon, 6 Nov 2023 10:55:15 +0000
Subject: [PATCH] groupby to group_by following deprecation in polars 0.19

---
 CHANGELOG.md                              | 1 +
 docs/docs/tools.rst                       | 2 +-
 python/pod5/README.md                     | 2 +-
 python/pod5/src/pod5/tools/pod5_filter.py | 6 +++---
 python/pod5/src/pod5/tools/pod5_subset.py | 8 +++-----
 python/pod5/src/pod5/tools/pod5_view.py   | 2 +-
 6 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8511cea..1f64f35 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Transfers dataframes used in subsetting / filter use categorical fields to reduce memory consumption
 - Polars version increased to `~=0.19`
+- Renamed deprecated `polars.groupby` to `polars.group_by`
 
 ### Fixed
 
diff --git a/docs/docs/tools.rst b/docs/docs/tools.rst
index 4c78639..50c5be7 100644
--- a/docs/docs/tools.rst
+++ b/docs/docs/tools.rst
@@ -276,7 +276,7 @@ based on a text file containing a table (csv or tsv) parsible by ``polars``.
 This table file could be the output from ``pod5 view`` or from a sequencing
 summary. The table must contain a header row and a series of columns on which
 to group unique collections of values. Internally this process uses the
-`polars.Dataframe.groupby `_
+`polars.Dataframe.group_by `_
 function where the ``by`` parameter is the sequence of column names specified with the
 ``--columns`` argument.
 
diff --git a/python/pod5/README.md b/python/pod5/README.md
index ef5526b..7f221a7 100644
--- a/python/pod5/README.md
+++ b/python/pod5/README.md
@@ -364,7 +364,7 @@ based on a text file containing a table (csv or tsv) parsible by ``polars``.
 This table file could be the output from ``pod5 view`` or from a sequencing
 summary. The table must contain a header row and a series of columns on which
 to group unique collections of values. Internally this process uses the
-`polars.Dataframe.groupby `_
+`polars.Dataframe.group_by `_
 function where the ``by`` parameter is the sequence of column names specified with the
 ``--columns`` argument.
 
diff --git a/python/pod5/src/pod5/tools/pod5_filter.py b/python/pod5/src/pod5/tools/pod5_filter.py
index ca85108..0d4e8c2 100644
--- a/python/pod5/src/pod5/tools/pod5_filter.py
+++ b/python/pod5/src/pod5/tools/pod5_filter.py
@@ -75,7 +75,7 @@ def filter_reads(dest: Path, sources: pl.DataFrame, duplicate_ok: bool) -> None:
 
     # Count the total number of reads expected
     total_reads = 0
-    for source, reads in sources.groupby(PL_SRC_FNAME):
+    for source, reads in sources.group_by(PL_SRC_FNAME):
         total_reads += len(reads.get_column(PL_READ_ID))
 
     pbar = tqdm(
@@ -89,7 +89,7 @@ def filter_reads(dest: Path, sources: pl.DataFrame, duplicate_ok: bool) -> None:
     active_limit = 5
 
     # Copy selected reads from one file at a time
-    for source, reads in sources.groupby(PL_SRC_FNAME):
+    for source, reads in sources.group_by(PL_SRC_FNAME):
         src = Path(source)
         read_ids = reads.get_column(PL_READ_ID).unique().to_list()
         logger.debug(f"Filtering: {src} - n_reads: {len(read_ids)}")
@@ -160,7 +160,7 @@ def filter_pod5(
     print(f"Calculated {len(transfers.collect())} transfers")
 
     # There will only one output from this
-    groupby_dest = transfers.collect().groupby(PL_DEST_FNAME)
+    groupby_dest = transfers.collect().group_by(PL_DEST_FNAME)
 
     for dest, sources in groupby_dest:
         filter_reads(dest=dest, sources=sources, duplicate_ok=duplicate_ok)
diff --git a/python/pod5/src/pod5/tools/pod5_subset.py b/python/pod5/src/pod5/tools/pod5_subset.py
index c760941..d19222c 100644
--- a/python/pod5/src/pod5/tools/pod5_subset.py
+++ b/python/pod5/src/pod5/tools/pod5_subset.py
@@ -378,11 +378,9 @@ def __init__(
         context: SpawnContext,
         transfers: pl.LazyFrame,
     ) -> None:
-        pass
-
         self.work: mp.JoinableQueue = context.JoinableQueue()
         self.size = 0
-        groupby_dest = transfers.collect().groupby(PL_DEST_FNAME)
+        groupby_dest = transfers.collect().group_by(PL_DEST_FNAME)
         for dest, sources in groupby_dest:
             self.work.put((Path(dest), sources))
             self.size += 1
@@ -518,7 +516,7 @@ def subset_reads(
     """Copy the reads in `sources` into a new pod5 file at `dest`"""
     # Count the total number of reads expected
     total_reads = 0
-    for source, reads in sources.groupby(PL_SRC_FNAME):
+    for source, reads in sources.group_by(PL_SRC_FNAME):
         total_reads += len(reads.get_column(PL_READ_ID))
 
     pbar = tqdm(
@@ -537,7 +535,7 @@ def subset_reads(
     active_limit = 5
 
     # Copy selected reads from one file at a time
-    for source, reads in sources.groupby(PL_SRC_FNAME):
+    for source, reads in sources.group_by(PL_SRC_FNAME):
         while repacker.currently_open_file_reader_count >= active_limit:
             pbar.update(repacker.reads_completed - pbar.n)
             sleep(0.05)
diff --git a/python/pod5/src/pod5/tools/pod5_view.py b/python/pod5/src/pod5/tools/pod5_view.py
index 902ed7b..1f7c153 100644
--- a/python/pod5/src/pod5/tools/pod5_view.py
+++ b/python/pod5/src/pod5/tools/pod5_view.py
@@ -265,7 +265,7 @@ def assert_unique_acquisition_id(run_info: pl.LazyFrame, path: Path) -> None:
     """
     Perform a check that the acquisition ids are unique raising AssertionError otherwise
     """
-    groups = run_info.collect().groupby(pl.col("acquisition_id"))
+    groups = run_info.collect().group_by(pl.col("acquisition_id"))
     common_acq_ids = [acq_id for acq_id, frame in groups if frame.n_unique() != 1]
     if common_acq_ids:
         raise AssertionError(
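
The rename is mechanical: polars 0.19 keeps `groupby` as a deprecated alias of
the new `group_by`, and iterating over the result still yields one
`(group_key, sub-DataFrame)` pair per group, so every call site keeps its
shape. A minimal sketch of the iteration pattern used throughout this patch,
assuming polars `~=0.19` and hypothetical `filename` / `read_id` columns
standing in for the `PL_SRC_FNAME` / `PL_READ_ID` constants:

    import polars as pl

    # Hypothetical stand-ins for the PL_SRC_FNAME / PL_READ_ID constants
    sources = pl.DataFrame(
        {
            "filename": ["a.pod5", "a.pod5", "b.pod5"],
            "read_id": ["r1", "r2", "r3"],
        }
    )

    # polars ~=0.19: group_by replaces the deprecated groupby; iterating
    # yields one (group_key, sub-DataFrame) pair per unique "filename"
    total_reads = 0
    for source, reads in sources.group_by("filename"):
        total_reads += len(reads.get_column("read_id"))

    assert total_reads == 3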