Merge branch 'rh/polars_groupby' into 'master'
groupby to group_by following deprecation in polars 0.19

See merge request minknow/pod5-file-format!309
0x55555555 committed Nov 7, 2023
2 parents ec25354 + a6849c4 commit c68c43a
Showing 6 changed files with 10 additions and 11 deletions.
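Polars 0.19 deprecated `DataFrame.groupby` in favour of `DataFrame.group_by`; the old name still works but emits a `DeprecationWarning`. A minimal sketch of the rename this commit applies throughout the tools (the toy frame and column names below are illustrative, not taken from the pod5 codebase):

```python
import polars as pl

# Illustrative frame; the real tools group on columns such as the source filename.
df = pl.DataFrame({"file": ["a.pod5", "a.pod5", "b.pod5"], "read": [1, 2, 3]})

# Before (deprecated since polars 0.19, emits DeprecationWarning):
# for key, frame in df.groupby("file"):
#     ...

# After: identical semantics under the new name.
for key, frame in df.group_by("file"):
    print(key, frame.height)
```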
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Transfers dataframes used in subsetting / filter use categorical fields to reduce memory consumption
 - Polars version increased to `~=0.19`
+- Renamed deprecated `polars.groupby` to `polars.group_by`
 
 ### Fixed
 
2 changes: 1 addition & 1 deletion docs/docs/tools.rst
@@ -276,7 +276,7 @@ based on a text file containing a table (csv or tsv) parsable by ``polars``.
 This table file could be the output from ``pod5 view`` or from a sequencing summary.
 The table must contain a header row and a series of columns on which to group unique
 collections of values. Internally this process uses the
-`polars.Dataframe.groupby <https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.groupby.html>`_
+`polars.Dataframe.group_by <https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.group_by.html>`_
 function where the ``by`` parameter is the sequence of column names specified with
 the ``--columns`` argument.

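As a rough sketch of the grouping described above, assuming a hypothetical summary table with `barcode` and `read_id` columns (the real column names come from the user's table file and `--columns` argument):

```python
import polars as pl

# Hypothetical inputs: "summary.tsv", "barcode" and "read_id" are illustrative
# stand-ins for a real table file and the names passed via --columns.
table = pl.read_csv("summary.tsv", separator="\t")

# `by` is the sequence of column names to group on.
for key, group in table.group_by(["barcode"]):
    read_ids = group.get_column("read_id").to_list()
    print(key, len(read_ids))
```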
2 changes: 1 addition & 1 deletion python/pod5/README.md
@@ -364,7 +364,7 @@ based on a text file containing a table (csv or tsv) parsable by ``polars``.
 This table file could be the output from ``pod5 view`` or from a sequencing summary.
 The table must contain a header row and a series of columns on which to group unique
 collections of values. Internally this process uses the
-`polars.Dataframe.groupby <https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.groupby.html>`_
+`polars.Dataframe.group_by <https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.group_by.html>`_
 function where the ``by`` parameter is the sequence of column names specified with
 the ``--columns`` argument.
6 changes: 3 additions & 3 deletions python/pod5/src/pod5/tools/pod5_filter.py
@@ -75,7 +75,7 @@ def filter_reads(dest: Path, sources: pl.DataFrame, duplicate_ok: bool) -> None:
 
     # Count the total number of reads expected
     total_reads = 0
-    for source, reads in sources.groupby(PL_SRC_FNAME):
+    for source, reads in sources.group_by(PL_SRC_FNAME):
         total_reads += len(reads.get_column(PL_READ_ID))
 
     pbar = tqdm(
@@ -89,7 +89,7 @@ def filter_reads(dest: Path, sources: pl.DataFrame, duplicate_ok: bool) -> None:
     active_limit = 5
 
     # Copy selected reads from one file at a time
-    for source, reads in sources.groupby(PL_SRC_FNAME):
+    for source, reads in sources.group_by(PL_SRC_FNAME):
         src = Path(source)
         read_ids = reads.get_column(PL_READ_ID).unique().to_list()
         logger.debug(f"Filtering: {src} - n_reads: {len(read_ids)}")
@@ -160,7 +160,7 @@ def filter_pod5(
     print(f"Calculated {len(transfers.collect())} transfers")
 
     # There will only be one output from this
-    groupby_dest = transfers.collect().groupby(PL_DEST_FNAME)
+    groupby_dest = transfers.collect().group_by(PL_DEST_FNAME)
     for dest, sources in groupby_dest:
         filter_reads(dest=dest, sources=sources, duplicate_ok=duplicate_ok)
 
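The loops above rely on `group_by` iteration yielding `(key, DataFrame)` pairs, one per group. A self-contained sketch of that contract with toy data (not from the repository); note that the key's shape varies across polars releases, a scalar for a single grouping column around 0.19 but a tuple in later versions:

```python
import polars as pl

transfers = pl.DataFrame(
    {"dest": ["out1.pod5", "out1.pod5", "out2.pod5"], "read_id": ["r1", "r2", "r3"]}
)

# Each iteration yields one group key plus the rows belonging to that group.
# Depending on the polars version the key is a scalar or a tuple, so code like
# `Path(source)` above assumes the 0.19-era scalar behaviour.
for dest, sources in transfers.group_by("dest"):
    print(dest, sources.get_column("read_id").to_list())
```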
8 changes: 3 additions & 5 deletions python/pod5/src/pod5/tools/pod5_subset.py
@@ -378,11 +378,9 @@ def __init__(
         context: SpawnContext,
         transfers: pl.LazyFrame,
     ) -> None:
-        pass
-
         self.work: mp.JoinableQueue = context.JoinableQueue()
         self.size = 0
-        groupby_dest = transfers.collect().groupby(PL_DEST_FNAME)
+        groupby_dest = transfers.collect().group_by(PL_DEST_FNAME)
         for dest, sources in groupby_dest:
             self.work.put((Path(dest), sources))
             self.size += 1
@@ -518,7 +516,7 @@ def subset_reads(
     """Copy the reads in `sources` into a new pod5 file at `dest`"""
     # Count the total number of reads expected
     total_reads = 0
-    for source, reads in sources.groupby(PL_SRC_FNAME):
+    for source, reads in sources.group_by(PL_SRC_FNAME):
         total_reads += len(reads.get_column(PL_READ_ID))
 
     pbar = tqdm(
@@ -537,7 +535,7 @@
 
     active_limit = 5
     # Copy selected reads from one file at a time
-    for source, reads in sources.groupby(PL_SRC_FNAME):
+    for source, reads in sources.group_by(PL_SRC_FNAME):
         while repacker.currently_open_file_reader_count >= active_limit:
             pbar.update(repacker.reads_completed - pbar.n)
             sleep(0.05)
2 changes: 1 addition & 1 deletion python/pod5/src/pod5/tools/pod5_view.py
@@ -265,7 +265,7 @@ def assert_unique_acquisition_id(run_info: pl.LazyFrame, path: Path) -> None:
     """
    Perform a check that the acquisition ids are unique, raising AssertionError otherwise
     """
-    groups = run_info.collect().groupby(pl.col("acquisition_id"))
+    groups = run_info.collect().group_by(pl.col("acquisition_id"))
     common_acq_ids = [acq_id for acq_id, frame in groups if frame.n_unique() != 1]
     if common_acq_ids:
         raise AssertionError(
