From 92e70f1a7e9685b3d1f52b50f39e6fb04d27b11b Mon Sep 17 00:00:00 2001 From: George Pimm Date: Mon, 8 Jul 2024 08:57:57 +0000 Subject: [PATCH] Fix uncompressed signal python bindings --- CHANGELOG.md | 7 ++++++ c++/pod5_format_pybind/bindings.cpp | 8 ++++--- ci/setup_python_osx.sh | 4 ++-- docs/requirements.txt | 2 +- python/lib_pod5/src/lib_pod5/__init__.py | 2 ++ python/pod5/pyproject.toml | 8 ++++--- python/pod5/src/pod5/__init__.py | 3 ++- python/pod5/src/pod5/reader.py | 13 +++++++++--- python/pod5/src/pod5/writer.py | 27 ++++++++++++++++++++++-- python/pod5/src/tests/test_api.py | 25 +++++++++++++++++----- 10 files changed, 79 insertions(+), 20 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cd9b874..eeae940 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,13 @@ All notable changes, updates, and fixes to pod5 will be documented here The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.3.12] + +## Fixed + +- Fixed issues reading signal from uncompressed pod5 files. + + ## [0.3.11] ## Added diff --git a/c++/pod5_format_pybind/bindings.cpp b/c++/pod5_format_pybind/bindings.cpp index bdb7582..803c7ea 100644 --- a/c++/pod5_format_pybind/bindings.cpp +++ b/c++/pod5_format_pybind/bindings.cpp @@ -10,12 +10,14 @@ PYBIND11_MODULE(pod5_format_pybind, m) m.doc() = "POD5 Format Raw Bindings"; - auto thread_pool = pod5::make_thread_pool(std::thread::hardware_concurrency()); + py::enum_(m, "SignalType", py::arithmetic(), "SignalType enum") + .value("UncompressedSignal", SignalType::UncompressedSignal, "Signal is not compressed") + .value("VbzSignal", SignalType::VbzSignal, "Signal is compressed using vbz") + .export_values(); py::class_(m, "FileWriterOptions") - .def(py::init([thread_pool]() { + .def(py::init([]() { FileWriterOptions options; - options.set_thread_pool(thread_pool); return options; })) .def_property( diff --git a/ci/setup_python_osx.sh b/ci/setup_python_osx.sh index dc9b35c..e3126e7 100755 --- a/ci/setup_python_osx.sh +++ b/ci/setup_python_osx.sh @@ -22,11 +22,11 @@ fi # relocatable-python doesn't like this dir existing, it exits with error: tmp_python_dir="/Users/cirunner/Library/Python_" function cleanup { - mv $tmp_python_dir /Users/cirunner/Library/Python + mv $tmp_python_dir /Users/cirunner/Library/Python || true } trap cleanup EXIT -mv /Users/cirunner/Library/Python $tmp_python_dir +mv /Users/cirunner/Library/Python $tmp_python_dir || true relocatable-python/make_relocatable_python_framework.py --python-version "${version}" --destination "${destination}" --upgrade-pip --os-version "${os_version}" diff --git a/docs/requirements.txt b/docs/requirements.txt index 155a39a..faec2dc 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -2,4 +2,4 @@ sphinx-rtd-theme sphinx==v5.3.0 myst-parser # Paths are relative to project root for ReadTheDocs and docs/Makefile -pod5==0.3.11 +pod5==0.3.12 diff --git a/python/lib_pod5/src/lib_pod5/__init__.py b/python/lib_pod5/src/lib_pod5/__init__.py index 990a7f7..ee99494 100644 --- a/python/lib_pod5/src/lib_pod5/__init__.py +++ b/python/lib_pod5/src/lib_pod5/__init__.py @@ -10,6 +10,7 @@ Pod5RepackerOutput, Pod5SignalCacheBatch, Repacker, + SignalType, compress_signal, create_file, recover_file, @@ -33,6 +34,7 @@ "Pod5RepackerOutput", "Pod5SignalCacheBatch", "Repacker", + "SignalType", "compress_signal", "create_file", "recover_file", diff --git a/python/pod5/pyproject.toml b/python/pod5/pyproject.toml index 552c4af..96723cd 100644 --- a/python/pod5/pyproject.toml +++ b/python/pod5/pyproject.toml @@ -22,13 +22,15 @@ classifiers=[ ] dependencies = [ - "lib_pod5 == 0.3.11", + "lib_pod5 == 0.3.12", "iso8601", 'importlib-metadata; python_version<"3.8"', "more_itertools", "numpy >= 1.21.0", - 'typing-extensions; python_version<"3.8"', - "pyarrow ~= 16.1.0", + 'typing-extensions; python_version<"3.10"', + # Avoid issues with pyarrow 16.1.0 on x64 Macos: https://github.com/apache/arrow/issues/41696 + 'pyarrow ~= 16.1.0; platform_system!="Darwin" or platform_machine!="x86_64" or python_version<"3.12"', + 'pyarrow ~= 16.0.0; platform_system=="Darwin" and platform_machine=="x86_64" and python_version>="3.12"', "pytz", "packaging", "polars~=0.19", diff --git a/python/pod5/src/pod5/__init__.py b/python/pod5/src/pod5/__init__.py index 7dc67aa..14d58e6 100644 --- a/python/pod5/src/pod5/__init__.py +++ b/python/pod5/src/pod5/__init__.py @@ -37,7 +37,7 @@ vbz_decompress_signal_chunked, vbz_decompress_signal_into, ) -from .writer import Writer +from .writer import SignalType, Writer __all__ = ( "__version__", @@ -56,6 +56,7 @@ "Reader", "ReadRecord", "ReadRecordBatch", + "SignalType", "vbz_compress_signal", "vbz_decompress_signal", "vbz_decompress_signal_chunked", diff --git a/python/pod5/src/pod5/reader.py b/python/pod5/src/pod5/reader.py index 8039bf1..78a856a 100644 --- a/python/pod5/src/pod5/reader.py +++ b/python/pod5/src/pod5/reader.py @@ -303,7 +303,7 @@ def signal(self) -> npt.NDArray[np.int16]: memoryview(signal[batch_row_index].as_buffer()), output_slice ) else: - output_slice[:] = signal.to_numpy() + output_slice[:] = signal[batch_row_index].values current_sample_index += current_row_count return output @@ -347,11 +347,16 @@ def map_signal_row(sig_row) -> SignalRowInfo: sig_row = sig_row.as_py() batch, batch_index, batch_row_index = self._find_signal_row_index(sig_row) + batch_length = 0 + if isinstance(batch.signal, pa.lib.LargeListArray): + batch_length = len(batch.signal[batch_row_index]) + else: + batch_length = len(batch.signal[batch_row_index].as_buffer()) return SignalRowInfo( batch_index, batch_row_index, batch.samples[batch_row_index].as_py(), - len(batch.signal[batch_row_index].as_buffer()), + batch_length, ) return [map_signal_row(r) for r in self._batch.columns.signal[self._row]] @@ -402,7 +407,9 @@ def _get_signal_for_row(self, signal_row: int) -> npt.NDArray[np.int16]: memoryview(signal[batch_row_index].as_buffer()), sample_count ) - return signal.to_numpy() + return signal.to_numpy() + else: + return np.array(signal[batch_row_index].values, dtype="int16") def to_read(self) -> Read: """ diff --git a/python/pod5/src/pod5/writer.py b/python/pod5/src/pod5/writer.py index 477864b..b3f9298 100644 --- a/python/pod5/src/pod5/writer.py +++ b/python/pod5/src/pod5/writer.py @@ -17,12 +17,18 @@ TypeVar, Union, ) +import sys import lib_pod5 as p5b import numpy as np from pod5.reader import ReadRecord import pytz +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias + from pod5.api_utils import Pod5ApiException, safe_close from pod5.pod5_types import ( BaseRead, @@ -35,6 +41,7 @@ DEFAULT_SOFTWARE_NAME = "Python API" +SignalType: TypeAlias = p5b.SignalType PoreType = str T = TypeVar("T", bound=Union[EndReason, PoreType, RunInfo]) @@ -68,7 +75,12 @@ def timestamp_to_int(time_stamp: Union[datetime.datetime, int]) -> int: class Writer: """Pod5 File Writer""" - def __init__(self, path: PathOrStr, software_name: str = DEFAULT_SOFTWARE_NAME): + def __init__( + self, + path: PathOrStr, + software_name: str = DEFAULT_SOFTWARE_NAME, + signal_compression_type: SignalType = SignalType.VbzSignal, + ): """ Open a pod5 file for Writing. @@ -78,17 +90,23 @@ def __init__(self, path: PathOrStr, software_name: str = DEFAULT_SOFTWARE_NAME): The path to the pod5 file to create software_name : str The name of the application used to create this pod5 file + signal_compression_type : SignalType + The type of compression to use in the file. Defaults to Vbz. """ self._path = Path(path).absolute() self._software_name = software_name + self._signal_compression_type = signal_compression_type if self._path.is_file(): raise FileExistsError( f"Input path already exists. Refusing to overwrite: {self._path}" ) + options = p5b.FileWriterOptions() + options.signal_compression_type = signal_compression_type + self._writer: Optional[p5b.FileWriter] = p5b.create_file( - str(self._path), software_name, None + str(self._path), software_name, options ) if not self._writer: raise Pod5ApiException( @@ -134,6 +152,11 @@ def software_name(self) -> str: """Return the software name used to open this file""" return self._software_name + @property + def signal_compression_type(self) -> SignalType: + """Return the signal compression type used by this file""" + return self._signal_compression_type + def add(self, obj: Union[EndReason, PoreType, RunInfo]) -> int: """ Add a :py:class:`EndReason`, :py:class:`PoreType`, or diff --git a/python/pod5/src/tests/test_api.py b/python/pod5/src/tests/test_api.py index 173239f..f56d84e 100644 --- a/python/pod5/src/tests/test_api.py +++ b/python/pod5/src/tests/test_api.py @@ -85,11 +85,13 @@ def get_random_str(prefix: str) -> str: def run_writer_test(f: Writer): + writer_supports_compressed = f.signal_compression_type == p5.SignalType.VbzSignal + test_read = gen_test_read(0, compressed=False) print("read", test_read.read_id, test_read.run_info.adc_max) f.add_read(test_read) - test_read = gen_test_read(1, compressed=True) + test_read = gen_test_read(1, compressed=writer_supports_compressed) print("read", test_read.read_id, test_read.run_info.adc_max) f.add_read(test_read) @@ -103,10 +105,10 @@ def run_writer_test(f: Writer): f.add_reads(test_reads) test_reads = [ - gen_test_read(6, compressed=True), - gen_test_read(7, compressed=True), - gen_test_read(8, compressed=True), - gen_test_read(9, compressed=True), + gen_test_read(6, compressed=writer_supports_compressed), + gen_test_read(7, compressed=writer_supports_compressed), + gen_test_read(8, compressed=writer_supports_compressed), + gen_test_read(9, compressed=writer_supports_compressed), ] f.add_reads(test_reads) assert test_reads[0].sample_count > 0 @@ -247,6 +249,19 @@ def test_pyarrow_from_str(): run_reader_test(_fh) +@pytest.mark.filterwarnings("ignore: pod5.") +def test_pyarrow_from_pathlib_uncompressed(): + with tempfile.TemporaryDirectory() as temp: + path = Path(temp) / "example.pod5" + with p5.Writer( + path, signal_compression_type=p5.SignalType.UncompressedSignal + ) as _fh: + run_writer_test(_fh) + + with p5.Reader(path) as _fh: + run_reader_test(_fh) + + def test_read_id_packing(): """ Assert pack_read_ids repacks and format_read_ids correctly unpacks collections