Skip to content

Commit

Permalink
Merge branch 'merge_perf' into 'master'
Browse files Browse the repository at this point in the history
Merge perf

See merge request minknow/pod5-file-format!285
  • Loading branch information
HalfPhoton committed Nov 6, 2023
2 parents 1ed94d8 + 9375e53 commit 744e4a1
Show file tree
Hide file tree
Showing 37 changed files with 1,768 additions and 1,281 deletions.
21 changes: 9 additions & 12 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,7 @@ pre-commit checks:
tags:
- linux
script:
# Need v2.21.0 until we drop python3.7 support
- pip install pre-commit==v2.21.0
- pip install pre-commit
# if any of the hooks wanted to modify files, this will have a non-zero exit
- if ! pre-commit run --all-files --verbose --color always ; then
- rm -rf .cache
Expand Down Expand Up @@ -244,7 +243,7 @@ linux-x64-gcc9-release-build:
PYTHON_ENABLED: "ON"
parallel:
matrix:
- PYTHON_VERSION: ["3.7", "3.8", "3.9", "3.10", "3.11"]
- PYTHON_VERSION: ["3.8", "3.9", "3.10", "3.11"]

linux-aarch64-gcc9-release-build:
image: quay.io/pypa/manylinux2014_aarch64
Expand All @@ -259,7 +258,7 @@ linux-aarch64-gcc9-release-build:
PYTHON_ENABLED: "ON"
parallel:
matrix:
- PYTHON_VERSION: ["3.7", "3.8", "3.9", "3.10", "3.11"]
- PYTHON_VERSION: ["3.8", "3.9", "3.10", "3.11"]


# ======================================
Expand Down Expand Up @@ -316,7 +315,7 @@ osx-x64-clang12-release-build:
BUILD_PYTHON_WHEEL: "ON"
parallel:
matrix:
- PYTHON_VERSION: ["3.7.4", "3.8.9", "3.9.13"]
- PYTHON_VERSION: ["3.8.9", "3.9.13"]
MACOSX_DEPLOYMENT_TARGET: "10.9"
OUTPUT_SKU: "osx-10.9-x64"
- PYTHON_VERSION: ["3.10.10", "3.11.2"]
Expand All @@ -336,7 +335,7 @@ osx-arm64-clang13-release-build:
BUILD_PYTHON_WHEEL: "ON"
parallel:
matrix:
- PYTHON_VERSION: ["3.7.4", "3.8.9", "3.9.13"]
- PYTHON_VERSION: ["3.8.9", "3.9.13"]
MACOSX_DEPLOYMENT_TARGET: "10.9"
OUTPUT_SKU: "osx-10.9-arm64"
FORCE_PYTHON_PLATFORM: macosx_11_0_arm64
Expand Down Expand Up @@ -397,7 +396,7 @@ win-x64-msvc2017-release-build:
OUTPUT_SKU: "win-x64"
parallel:
matrix:
- PYTHON_VERSION: ["3.7.5", "3.8.0", "3.9.13", "3.10.10", "3.11.2"]
- PYTHON_VERSION: ["3.8.0", "3.9.13", "3.10.10", "3.11.2"]
artifacts:
name: "${CI_JOB_NAME}-artifacts"
paths:
Expand Down Expand Up @@ -455,7 +454,7 @@ build-python-api:
.parallel-py-minor-ver: &parallel-py-minor-ver
parallel:
matrix:
- PY_MINOR_VER: ["7", "8", "9", "10", "11"]
- PY_MINOR_VER: ["8", "9", "10", "11"]

tools-linux-x64:
stage: test
Expand Down Expand Up @@ -524,8 +523,6 @@ pytest-linux-aarch64:
.versions-matrix-pyenv-and-venv: &versions-matrix-pyenv-and-venv
parallel:
matrix:
- PY_MINOR_VER: "7"
PYTHON_VERSION: "3.7.4"
- PY_MINOR_VER: "8"
PYTHON_VERSION: "3.8.9"
- PY_MINOR_VER: "9"
Expand Down Expand Up @@ -945,7 +942,7 @@ mlhub:
parallel:
matrix:
- PYTHON_VERSION:
- "3.7"
- "3.8"
- "3.11"


Expand All @@ -958,7 +955,7 @@ mlhub:

pages:
stage: deploy
image: ${CI_REGISTRY}/traque/ont-docker-base/ont-base-python:3.7
image: ${CI_REGISTRY}/traque/ont-docker-base/ont-base-python:3.10
needs: []
tags:
- linux
Expand Down
12 changes: 11 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,20 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## Unreleased

### Added
### Changed

- Transfer dataframes used in subsetting / filtering now use categorical fields to reduce memory consumption
- Polars version increased to `~=0.19`

### Fixed

- Remove exposed artifactory URL env var from gitlab ci config.
- `convert to_fast5` writes byte-encoded read_ids to match MinKNOW (was `str`)

### Removed

- Removed python3.7 support

## [0.2.10] 2023-11-03

### Fixed
Expand Down
2 changes: 2 additions & 0 deletions c++/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ add_library(pod5_format ${pod5_library_type}
pod5_format/signal_table_writer.cpp
pod5_format/signal_table_writer.h
pod5_format/signal_table_utils.h
pod5_format/signal_builder.h

pod5_format/c_api.cpp
pod5_format/c_api.h
Expand Down Expand Up @@ -145,6 +146,7 @@ list(APPEND public_headers
pod5_format/signal_table_schema.h
pod5_format/signal_table_writer.h
pod5_format/signal_table_utils.h
pod5_format/signal_builder.h

pod5_format/c_api.h

Expand Down
32 changes: 31 additions & 1 deletion c++/pod5_format/file_writer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ class FileWriterImpl {
return signal_rows;
}

pod5::Result<std::uint64_t> add_pre_compressed_signal(
pod5::Result<SignalTableRowIndex> add_pre_compressed_signal(
boost::uuids::uuid const & read_id,
gsl::span<std::uint8_t const> const & signal_bytes,
std::uint32_t sample_count)
Expand All @@ -223,8 +223,25 @@ class FileWriterImpl {
read_id, signal_bytes, sample_count);
}

// Hands a pre-built batch of signal columns to the signal table writer and returns its result.
//
// Returns an Invalid status, without touching `columns`, when either the signal table writer
// or the read table writer is no longer available.
pod5::Result<std::pair<SignalTableRowIndex, SignalTableRowIndex>> add_signal_batch(
    std::size_t row_count,
    std::vector<std::shared_ptr<arrow::Array>> && columns,
    bool final_batch)
{
    // Single place that builds the "closed" error so both guards report the same message.
    auto const writer_closed = [] {
        return arrow::Status::Invalid("File writer closed, cannot write further data");
    };

    if (!m_signal_table_writer) {
        return writer_closed();
    }
    if (!m_read_table_writer) {
        return writer_closed();
    }

    return m_signal_table_writer->add_signal_batch(row_count, std::move(columns), final_batch);
}

// Returns the signal type reported by the signal table writer.
SignalType signal_type() const { return m_signal_table_writer->signal_type(); }

// Returns the table batch size reported by the signal table writer.
// NOTE(review): this dereferences m_signal_table_writer without the closed-writer check that
// add_signal_batch performs - confirm callers never invoke it after the writer has been closed.
std::size_t signal_table_batch_size() const
{
return m_signal_table_writer->table_batch_size();
}

pod5::Status close_run_info_table_writer()
{
if (m_run_info_table_writer) {
Expand Down Expand Up @@ -438,6 +455,14 @@ pod5::Result<SignalTableRowIndex> FileWriter::add_pre_compressed_signal(
return m_impl->add_pre_compressed_signal(read_id, signal_bytes, sample_count);
}

// Public entry point for adding a pre-built batch of signal columns.
// Forwards all arguments unchanged to the implementation object, moving `columns` into the call,
// and returns whatever the implementation returns.
pod5::Result<std::pair<SignalTableRowIndex, SignalTableRowIndex>> FileWriter::add_signal_batch(
std::size_t row_count,
std::vector<std::shared_ptr<arrow::Array>> && columns,
bool final_batch)
{
return m_impl->add_signal_batch(row_count, std::move(columns), final_batch);
}

pod5::Result<EndReasonDictionaryIndex> FileWriter::lookup_end_reason(ReadEndReason end_reason) const
{
return m_impl->lookup_end_reason(end_reason);
Expand All @@ -455,6 +480,11 @@ pod5::Result<RunInfoDictionaryIndex> FileWriter::add_run_info(RunInfoData const

// Returns the signal type reported by the implementation object.
SignalType FileWriter::signal_type() const { return m_impl->signal_type(); }

// Returns the signal table batch size reported by the implementation object.
std::size_t FileWriter::signal_table_batch_size() const
{
return m_impl->signal_table_batch_size();
}

pod5::Result<FileWriterImpl::DictionaryWriters> make_dictionary_writers(arrow::MemoryPool * pool)
{
FileWriterImpl::DictionaryWriters writers;
Expand Down
9 changes: 8 additions & 1 deletion c++/pod5_format/file_writer.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,9 @@
#include <memory>

namespace arrow {
class Array;
class MemoryPool;
}
} // namespace arrow

namespace pod5 {

Expand Down Expand Up @@ -111,12 +112,18 @@ class POD5_FORMAT_EXPORT FileWriter {
gsl::span<std::uint8_t const> const & signal_bytes,
std::uint32_t sample_count);

pod5::Result<std::pair<SignalTableRowIndex, SignalTableRowIndex>> add_signal_batch(
std::size_t row_count,
std::vector<std::shared_ptr<arrow::Array>> && columns,
bool final_batch);

// Find or create an end reason index representing this read end reason.
pod5::Result<EndReasonDictionaryIndex> lookup_end_reason(ReadEndReason end_reason) const;
pod5::Result<PoreDictionaryIndex> add_pore_type(std::string const & pore_type_data);
pod5::Result<RunInfoDictionaryIndex> add_run_info(RunInfoData const & run_info_data);

SignalType signal_type() const;
std::size_t signal_table_batch_size() const;

FileWriterImpl * impl() const { return m_impl.get(); };

Expand Down
5 changes: 5 additions & 0 deletions c++/pod5_format/internal/async_output_stream.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#pragma once

#include "pod5_format/internal/tracing/tracing.h"
#include "pod5_format/thread_pool.h"

#include <arrow/buffer.h>
Expand Down Expand Up @@ -82,6 +83,7 @@ class AsyncOutputStream : public arrow::io::OutputStream {

arrow::Status Write(void const * data, int64_t nbytes) override
{
POD5_TRACE_FUNCTION();
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Buffer> buffer, arrow::AllocateBuffer(nbytes));
auto const char_data = static_cast<std::uint8_t const *>(data);
std::copy(char_data, char_data + nbytes, buffer->mutable_data());
Expand All @@ -90,6 +92,7 @@ class AsyncOutputStream : public arrow::io::OutputStream {

arrow::Status Write(std::shared_ptr<arrow::Buffer> const & data) override
{
POD5_TRACE_FUNCTION();
if (m_has_error) {
return *m_error;
}
Expand All @@ -104,6 +107,7 @@ class AsyncOutputStream : public arrow::io::OutputStream {

m_submitted_writes += 1;
m_strand->post([&, data] {
POD5_TRACE_FUNCTION();
if (m_has_error) {
return;
}
Expand All @@ -126,6 +130,7 @@ class AsyncOutputStream : public arrow::io::OutputStream {

arrow::Status Flush() override
{
POD5_TRACE_FUNCTION();
// Wait for our completed writes to match our submitted writes,
// this guarantees our async operations are finished.
auto wait_for_write_count = m_submitted_writes.load();
Expand Down
2 changes: 0 additions & 2 deletions c++/pod5_format/read_table_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,6 @@

namespace pod5 {

using SignalTableRowIndex = std::uint64_t;

using PoreDictionaryIndex = std::int16_t;
using EndReasonDictionaryIndex = std::int16_t;
using RunInfoDictionaryIndex = std::int16_t;
Expand Down
1 change: 1 addition & 0 deletions c++/pod5_format/read_table_writer.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include "pod5_format/read_table_writer_utils.h"
#include "pod5_format/result.h"
#include "pod5_format/schema_field_builder.h"
#include "pod5_format/signal_table_utils.h"

#include <arrow/array/builder_dict.h>
#include <arrow/io/type_fwd.h>
Expand Down
Loading

0 comments on commit 744e4a1

Please sign in to comment.