Merge branch 'rh/polars_groupby' into 'master'
groupby to group_by following deprecation in polars 0.19

See merge request minknow/pod5-file-format!309
0x55555555 committed Nov 7, 2023
2 parents ec25354 + a6849c4 commit c68c43a
Showing 6 changed files with 10 additions and 11 deletions.
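Polars 0.19 deprecated `DataFrame.groupby` in favour of `DataFrame.group_by`; the old name still works but emits a `DeprecationWarning`. A minimal sketch of the rename this commit applies throughout the tools (the toy frame and column names below are illustrative, not taken from the pod5 codebase):

```python
import polars as pl

# Illustrative frame; the real tools group on columns such as the source filename.
df = pl.DataFrame({"file": ["a.pod5", "a.pod5", "b.pod5"], "read": [1, 2, 3]})

# Before (deprecated since polars 0.19, emits DeprecationWarning):
# for key, frame in df.groupby("file"):
#     ...

# After: identical semantics under the new name.
for key, frame in df.group_by("file"):
    print(key, frame.height)
```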
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Transfers dataframes used in subsetting / filter use categorical fields to reduce memory consumption
 - Polars version increased to `~=0.19`
+- Renamed deprecated `polars.groupby` to `polars.group_by`
 
 ### Fixed
 
2 changes: 1 addition & 1 deletion docs/docs/tools.rst
@@ -276,7 +276,7 @@ based on a text file containing a table (csv or tsv) parsable by ``polars``.
 This table file could be the output from ``pod5 view`` or from a sequencing summary.
 The table must contain a header row and a series of columns on which to group unique
 collections of values. Internally this process uses the
-`polars.Dataframe.groupby <https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.groupby.html>`_
+`polars.Dataframe.group_by <https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.group_by.html>`_
 function where the ``by`` parameter is the sequence of column names specified with
 the ``--columns`` argument.

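As a rough sketch of the grouping described above, assuming a hypothetical summary table with `barcode` and `read_id` columns (the real column names come from the user's table file and `--columns` argument):

```python
import polars as pl

# Hypothetical inputs: "summary.tsv", "barcode" and "read_id" are illustrative
# stand-ins for a real table file and the names passed via --columns.
table = pl.read_csv("summary.tsv", separator="\t")

# `by` is the sequence of column names to group on.
for key, group in table.group_by(["barcode"]):
    read_ids = group.get_column("read_id").to_list()
    print(key, len(read_ids))
```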
2 changes: 1 addition & 1 deletion python/pod5/README.md
@@ -364,7 +364,7 @@ based on a text file containing a table (csv or tsv) parsable by ``polars``.
 This table file could be the output from ``pod5 view`` or from a sequencing summary.
 The table must contain a header row and a series of columns on which to group unique
 collections of values. Internally this process uses the
-`polars.Dataframe.groupby <https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.groupby.html>`_
+`polars.Dataframe.group_by <https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.group_by.html>`_
 function where the ``by`` parameter is the sequence of column names specified with
 the ``--columns`` argument.
6 changes: 3 additions & 3 deletions python/pod5/src/pod5/tools/pod5_filter.py
@@ -75,7 +75,7 @@ def filter_reads(dest: Path, sources: pl.DataFrame, duplicate_ok: bool) -> None:
 
     # Count the total number of reads expected
     total_reads = 0
-    for source, reads in sources.groupby(PL_SRC_FNAME):
+    for source, reads in sources.group_by(PL_SRC_FNAME):
         total_reads += len(reads.get_column(PL_READ_ID))
 
     pbar = tqdm(
@@ -89,7 +89,7 @@ def filter_reads(dest: Path, sources: pl.DataFrame, duplicate_ok: bool) -> None:
     active_limit = 5
 
     # Copy selected reads from one file at a time
-    for source, reads in sources.groupby(PL_SRC_FNAME):
+    for source, reads in sources.group_by(PL_SRC_FNAME):
         src = Path(source)
         read_ids = reads.get_column(PL_READ_ID).unique().to_list()
         logger.debug(f"Filtering: {src} - n_reads: {len(read_ids)}")
@@ -160,7 +160,7 @@ def filter_pod5(
     print(f"Calculated {len(transfers.collect())} transfers")
 
     # There will only be one output from this
-    groupby_dest = transfers.collect().groupby(PL_DEST_FNAME)
+    groupby_dest = transfers.collect().group_by(PL_DEST_FNAME)
     for dest, sources in groupby_dest:
         filter_reads(dest=dest, sources=sources, duplicate_ok=duplicate_ok)
 
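The loops above rely on `group_by` iteration yielding `(key, DataFrame)` pairs, one per group. A self-contained sketch of that contract with toy data (not from the repository); note that the key's shape varies across polars releases, a scalar for a single grouping column around 0.19 but a tuple in later versions:

```python
import polars as pl

transfers = pl.DataFrame(
    {"dest": ["out1.pod5", "out1.pod5", "out2.pod5"], "read_id": ["r1", "r2", "r3"]}
)

# Each iteration yields one group key plus the rows belonging to that group.
# Depending on the polars version the key is a scalar or a tuple, so code like
# `Path(source)` above assumes the 0.19-era scalar behaviour.
for dest, sources in transfers.group_by("dest"):
    print(dest, sources.get_column("read_id").to_list())
```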
8 changes: 3 additions & 5 deletions python/pod5/src/pod5/tools/pod5_subset.py
@@ -378,11 +378,9 @@ def __init__(
         context: SpawnContext,
         transfers: pl.LazyFrame,
     ) -> None:
-        pass
-
         self.work: mp.JoinableQueue = context.JoinableQueue()
         self.size = 0
-        groupby_dest = transfers.collect().groupby(PL_DEST_FNAME)
+        groupby_dest = transfers.collect().group_by(PL_DEST_FNAME)
         for dest, sources in groupby_dest:
             self.work.put((Path(dest), sources))
             self.size += 1
@@ -518,7 +516,7 @@ def subset_reads(
     """Copy the reads in `sources` into a new pod5 file at `dest`"""
     # Count the total number of reads expected
     total_reads = 0
-    for source, reads in sources.groupby(PL_SRC_FNAME):
+    for source, reads in sources.group_by(PL_SRC_FNAME):
         total_reads += len(reads.get_column(PL_READ_ID))
 
     pbar = tqdm(
@@ -537,7 +535,7 @@
 
     active_limit = 5
     # Copy selected reads from one file at a time
-    for source, reads in sources.groupby(PL_SRC_FNAME):
+    for source, reads in sources.group_by(PL_SRC_FNAME):
         while repacker.currently_open_file_reader_count >= active_limit:
             pbar.update(repacker.reads_completed - pbar.n)
             sleep(0.05)
2 changes: 1 addition & 1 deletion python/pod5/src/pod5/tools/pod5_view.py
@@ -265,7 +265,7 @@ def assert_unique_acquisition_id(run_info: pl.LazyFrame, path: Path) -> None:
     """
    Perform a check that the acquisition ids are unique, raising AssertionError otherwise
     """
-    groups = run_info.collect().groupby(pl.col("acquisition_id"))
+    groups = run_info.collect().group_by(pl.col("acquisition_id"))
     common_acq_ids = [acq_id for acq_id, frame in groups if frame.n_unique() != 1]
     if common_acq_ids:
         raise AssertionError(
