Commit a92312d

Merge branch 'feature/improve_ci' into 'master'

Python CI

See merge request minknow/mkr-file-format!3

0x55555555 committed May 4, 2022
2 parents d65421a + 347509a

Showing 52 changed files with 3,702 additions and 435 deletions.
76 changes: 72 additions & 4 deletions .gitlab-ci.yml
@@ -3,10 +3,25 @@ stages:
- build
- build-python
- test-python
- deploy

variables:
CONAN_CONFIG_URL: https://artifactory.oxfordnanolabs.local/artifactory/ONT-Conan/configs/config-v24.zip

tag_version_check:
stage: pre-flight
only:
- tags
image: ${PYTHON_IMAGE}
script:
- mkr_version="$(cmake -P ci/get_tag_version.cmake 2>&1)"
- tag_version="${CI_COMMIT_TAG/#v/}"
- if [[ "${mkr_version}" != "${tag_version}" ]]; then
echo "Tag is for release ${tag_version}, but MKR version is $mkr_version";
exit 1;
fi


pre-commit checks:
image: python:3.9
stage: pre-flight
@@ -22,18 +37,35 @@ pre-commit checks:
paths:
- ${PRE_COMMIT_HOME}

build-standalone-ubu20:
stage: build
image: ubuntu:20.04
script:
- export DEBIAN_FRONTEND=noninteractive
- apt update
- apt install -y -V ca-certificates lsb-release wget
- wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
- apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
- apt update
- apt install -y cmake build-essential libzstd-dev libboost-dev libboost-filesystem-dev libflatbuffers-dev libarrow-dev
- mkdir build
- cd build
- cmake ..
- make -j

.build:
stage: build
script:
- mkr_version="$(cmake -P ci/get_tag_version.cmake 2>&1)"
- mkdir build
- cd build
- conan install --profile ${CONAN_PROFILE} ..
- cmake -DCMAKE_BUILD_TYPE=Release ..
- cmake ${CMAKE_ARGS} -DCMAKE_BUILD_TYPE=Release -DUSE_CONAN=ON ..
- cmake --build . --config Release
- ctest -C Release -VV
- cmake -DCMAKE_INSTALL_PREFIX="archive" -DBUILD_TYPE="Release" -P "cmake_install.cmake"
- cd ./archive
- tar -czf ../mkr-file-format-${OUTPUT_SKU}.tar.gz .
- tar -czf ../mkr-file-format-${mkr_version}-${OUTPUT_SKU}.tar.gz .
artifacts:
paths:
- build/mkr-file-format*.tar.gz
@@ -75,6 +107,17 @@ win-x64-msvc2017-release-build:
- cmake
- VS2017
- conan
script:
- mkr_version="$(cmake -P ci/get_tag_version.cmake 2>&1)"
- mkdir build
- cd build
- conan install --profile ${CONAN_PROFILE} ..
- cmake -A x64 -G "Visual Studio 15 2017" -DCMAKE_BUILD_TYPE=Release -DUSE_CONAN=ON ..
- cmake --build . --config Release
- ctest -C Release -VV
- cmake -DCMAKE_INSTALL_PREFIX="archive" -DBUILD_TYPE="Release" -P "cmake_install.cmake"
- cd ./archive
- tar -czf ../mkr-file-format-${mkr_version}-${OUTPUT_SKU}.tar.gz .
variables:
CONAN_PROFILE: "windows-x86_64-vs2017-release"
OUTPUT_SKU: "win-x64"
@@ -97,7 +140,7 @@ wheel-build:
- conan install --profile linux-aarch64-gcc9-release ..
- apt update
- apt install -y cmake
- cmake ..
- cmake -DUSE_CONAN=ON ..
- cd ..
- ci/unpack_libs_for_python.sh ./build ./python/mkr_format/libs
- mkdir wheels
@@ -107,6 +150,19 @@ wheel-build:
paths:
- wheels/*.whl

pytest:
stage: test-python
image: git.oxfordnanolabs.local:4567/traque/ont-docker-base/ont-base-python:3.9
tags:
- linux
needs:
- wheel-build
script:
- pip install ./wheels/*
- cd python
- pip install pytest pytest-cov
- pytest

wheel-test:
stage: test-python
image: git.oxfordnanolabs.local:4567/traque/ont-docker-base/ont-base-python:3.9
@@ -116,4 +172,16 @@ wheel-test:
- wheel-build
script:
- pip install ./wheels/*
- mkr-convert-fast5 ./test_files/ ./output_files --output-one-to-one
- mkr-convert-fast5 ./test_data/ ./output_files --output-one-to-one
- python3 python/tests/tools/check_mkr_files_equal.py ./output_files/multi_fast5_zip.mkr ./test_data/multi_fast5_zip.mkr

package_upload:
stage: deploy
image: ${UPLOAD_PYTHON_IMAGE}
needs:
- wheel-build
script:
- ls -lh wheels
- pip install twine
- twine upload wheels/*
only: ["tags"]
5 changes: 3 additions & 2 deletions CMakeLists.txt
@@ -8,8 +8,9 @@ project(MKR
VERSION ${MKR_NUMERIC_VERSION}
)

option(FORCE_CI_COMPATIBILITY "Change compiler flags to ensure compatibility with CI builds" ON)
if (FORCE_CI_COMPATIBILITY AND CMAKE_COMPILER_IS_GNUCXX AND
option(USE_CONAN "Use conan for dependency management" OFF)

if (USE_CONAN AND CMAKE_COMPILER_IS_GNUCXX AND
CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "9.0" AND
CMAKE_CXX_COMPILER_VERSION VERSION_LESS "10.0")
# We build MKR on CentOS 7 in CI, where we have GCC 9 but only the pre-C++11 ABI
102 changes: 102 additions & 0 deletions DESIGN.md
@@ -0,0 +1,102 @@
MKR File Format Design Details
==============================

The MKR file format has been designed specifically to suit Nanopore read data, guided by a set of explicit design goals.

Design Goals
------------

The primary purpose of this file format is to store reads produced by Oxford Nanopore sequencing, and in particular the signal data from those reads (which can then be basecalled or processed in other ways).

This file format has the following design goals (roughly in priority order):

- Good write performance for MinKNOW
- Recoverable if the writing process crashes
- Good read performance for downstream tools, including basecall model generation
- Efficient use of space
- Straightforward to implement and maintain
- Extensibility

Note that trade-offs have been made between these goals, but where possible we have aimed to defer those trade-offs to run-time decisions.

We have also chosen not to optimise for editing existing files.


### Write performance

The aspects of this format that are designed to maximise write performance are:

- Data can be written sequentially
- The sequential access pattern makes it easy to use efficient operating system APIs (such as io_uring on Linux)
- The sequential access pattern helps the operating system's I/O scheduler maximise throughput
- Signal data from different reads can be interleaved, and data streams can be safely abandoned (at the cost of using more space than necessary)
- This allows MinKNOW to write out data as it arrives, potentially avoiding the need for an intermediate caching format (this file format can be used both for the cache and for the final output); see the sketch after this list
- Support for space- and CPU-efficient compression routines (VBZ)
- This reduces the amount of data that needs to be written, which reduces I/O load
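
As a non-authoritative illustration of this write path, here is a minimal pyarrow sketch of sequential, append-only batch writing. The schema, column names, and filename are invented for the example; they are not the real MKR schema:

```python
# Sketch only: sequential batch writing with pyarrow's IPC file writer.
# The schema and filename are illustrative, not the real MKR layout.
import pyarrow as pa

schema = pa.schema([
    ("read_id", pa.string()),
    ("signal", pa.list_(pa.int16())),
])

with pa.OSFile("example_signal.arrow", "wb") as sink:
    with pa.ipc.new_file(sink, schema) as writer:
        # Batches are appended strictly in arrival order; batches from
        # different reads can be freely interleaved.
        batch = pa.record_batch(
            [
                pa.array(["read-1"]),
                pa.array([[10, 12, 11]], type=pa.list_(pa.int16())),
            ],
            schema=schema,
        )
        writer.write_batch(batch)
```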

### Recovery

The aspects of this format that are designed to allow for recovery if the writing process crashes are:

- A way to indicate that a file is actually complete as intended (complete files end with a recognisable footer)
- The Apache Feather format can be assembled by reading it sequentially, without using the footer
- The data file format is append-only, which means that once data is recorded it cannot be corrupted by later updates
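
Because a finished Arrow IPC file ends with the magic bytes `ARROW1` after its footer, a crude completeness probe can be written without any Arrow library at all. A sketch (not the actual recovery logic):

```python
# Sketch: a complete Arrow IPC (Feather 2) file ends with b"ARROW1";
# a file left behind by a crashed writer will normally lack this footer.
import os

def looks_complete(path: str) -> bool:
    if os.path.getsize(path) < 6:
        return False
    with open(path, "rb") as f:
        f.seek(-6, os.SEEK_END)
        return f.read(6) == b"ARROW1"
```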

### Read performance

The aspects of this format that are designed to maximise read performance are:

- The Apache Feather format can be memory mapped and used directly
- Apache Arrow has significant existing engineering work geared around efficient access to data, from the layout of the data itself to the library tooling
- Storing direct information about signal data locations with the row table
- This allows quick access to a read's data without scanning the data file
- It is possible to only decode part of a long read, due to read data being stored in chunks
- This is useful for model training
- Read access does not require locking or otherwise modifying the file
- This allows multi-threaded and multi-process access to a file for reading
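
A sketch of memory-mapped, partial access using pyarrow (the filename is illustrative); only the requested record batch is decoded, not the whole file:

```python
# Sketch: memory-map a Feather 2 file and decode a single record batch.
import pyarrow as pa

with pa.memory_map("example_signal.arrow", "r") as source:
    reader = pa.ipc.open_file(source)
    # Decode only one chunk of signal, not the entire table.
    first_batch = reader.get_batch(0)
    print(reader.num_record_batches, first_batch.num_rows)
```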

### Efficient use of space

The aspects of this format that are designed to make efficient use of space are:

- Support for efficient compression routines (VBZ)
- Apache Arrow's support for dictionary encoding
- Apache Arrow's support for compressing buffers with standard compression routines
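
Both of the Arrow features mentioned above are available through pyarrow; a small sketch (the column name and values are invented for the example):

```python
# Sketch: dictionary encoding plus zstd buffer compression via pyarrow.
import pyarrow as pa

# A repetitive column stores each distinct value once, plus small indices.
pore_type = pa.array(["r10.4.1"] * 1000).dictionary_encode()

schema = pa.schema([("pore_type", pore_type.type)])
options = pa.ipc.IpcWriteOptions(compression="zstd")
with pa.OSFile("example_reads.arrow", "wb") as sink:
    with pa.ipc.new_file(sink, schema, options=options) as writer:
        writer.write_batch(pa.record_batch([pore_type], schema=schema))
```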

### Ease of implementation

The aspects of this format that are designed to make the format easy to implement are:

- Relying on an existing, widely-used format (Apache Arrow)

### Extensibility

The aspects of this format that are designed to make the format extensible are:

- Apache Arrow uses a self-describing schema with named columns, so it is straightforward to write code that is resilient in the face of changes like additional columns being added (see the sketch below).
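
For example, a reader that looks columns up by name keeps working when new columns appear. A sketch (the column name is illustrative):

```python
# Sketch: schema-evolution-tolerant reading. Unknown extra columns are
# simply ignored; only the columns we rely on are checked for.
import pyarrow as pa

def read_ids(table: pa.Table) -> list:
    if "read_id" not in table.schema.names:
        raise ValueError("not a reads table: missing read_id column")
    return table.column("read_id").to_pylist()
```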


Format Specification
--------------------

### Overview

The file format is, at its core, a collection of Apache Arrow tables, stored in the Apache Feather 2 (also known as Apache Arrow IPC File) format. These can be stored separately, linked by having a common filename component, or bundled into a single file for ease of file management.

In its unbundled format, there are two required files:

```
<prefix>_signal.arrow
<prefix>_reads.arrow
```

Optionally, index files can also be provided:

```
<prefix>_index_read_id.arrow (index by read_id)
<prefix>_index_*.arrow (optional, extension point for other indexes)
```

Each of these is an Apache Feather 2 file, and can be opened directly using the Apache Arrow library's IPC routines. The schema of the tables is described below. The naming scheme above is recommended (and should be the default when creating these files), but tooling should provide a way for users to specify every filename explicitly when opening files (in case the user has renamed them to a different scheme).

These can also be stored in a bundled file, named `<prefix>.mkr` and described below.
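
For the unbundled layout, the table pair for a given prefix can be opened directly with Arrow's IPC routines; a minimal sketch:

```python
# Sketch: open the unbundled reads/signal pair for a given prefix using
# Apache Arrow's IPC file reader.
import pyarrow as pa

def open_tables(prefix: str) -> tuple:
    reads = pa.ipc.open_file(pa.memory_map(f"{prefix}_reads.arrow", "r")).read_all()
    signal = pa.ipc.open_file(pa.memory_map(f"{prefix}_signal.arrow", "r")).read_all()
    return reads, signal
```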
48 changes: 44 additions & 4 deletions README.md
@@ -14,8 +14,41 @@ What does this project contain
This project contains a core library for reading and writing MKR data, and a toolkit for
accessing this data in other languages.

Getting Started
---------------

Usage
-----

MKR is bundled as a Python module for easy use in scripts; it can be installed with:

```bash
> pip install mkr_format
```

The Python module comes with several tools for working with mkr files, and a Python library to write custom scripts against.

Please see [examples](./python/mkr_format/examples) for documentation on using the library.
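
As a flavour of scripting against the library, a hedged sketch follows; the function and attribute names here (`open_combined_file`, `reads`, `signal`) are illustrative rather than the confirmed API, so consult the examples above for real usage:

```python
# Illustrative only: the names below are hypothetical; see the bundled
# examples for the actual mkr_format API.
from pathlib import Path

import mkr_format

with mkr_format.open_combined_file(Path("output.mkr")) as mkr_file:
    for read in mkr_file.reads():
        print(read.read_id, len(read.signal))
```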

Tools
-----

### mkr-convert-fast5

Generate an mkr file from a set of input fast5 files:

```bash
> mkr-convert-fast5 input_fast5_1.fast5 input_fast5_2.fast5 output_mkr_file.mkr
```

### mkr-inspect

Inspect an mkr file to extract details about the contents:

```bash
> mkr-inspect mkr_file.mkr
```

Development
-----------

### Developing

@@ -28,9 +61,16 @@ Building the project requires that several tools and libraries are available:
- Flatbuffers

```bash
> pip install -r ./requirements.txt
# Docs on installing arrow from here: https://arrow.apache.org/install/
> sudo apt install -y -V ca-certificates lsb-release wget
> wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
> sudo apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
> sudo apt update
# Now install the rest of the dependencies:
> sudo apt install cmake libzstd-dev libboost-dev libboost-filesystem-dev libflatbuffers-dev
# Finally start build of MKR:
> mkdir build
> cd build
> conan install .. # Optional, but the build requires that the libraries are available on the system in a way cmake can find them.
> cmake ..
> make -j
```
2 changes: 2 additions & 0 deletions benchmarks/.gitignore
@@ -0,0 +1,2 @@
*/outputs/
image/*.whl
72 changes: 72 additions & 0 deletions benchmarks/README.md
@@ -0,0 +1,72 @@
MKR Benchmarks
==============

Building the benchmark environment
----------------------------------

To run benchmarks you first have to build the docker environment to run them:

```bash
> ./build.sh
```


Running a benchmark
-------------------

To run a benchmark, use the helper script to start the docker image:

```bash
> ./run_benchmark.sh convert ./path-to-source-files/
```


Benchmarking Results
--------------------

On a PCR Zymo dataset, PAM50264, with 10.4.1 e8.2 data (`pcr_zymo/20220419_1706_2E_PAM50264_3c6f33f1`):

```
Note: preliminary results.

More work is needed on splitting one blow5 file into batches for threading, and thought is needed on whether an index could be added to mkr files... or at least the benchmark could be improved.
```
### Convert
| | Fast5 | MKR | blow5 |
|------|-------|----------|----------|
| Time | N/A | 227 secs | 781 secs |
| Size | 52 GB | 37 GB | 38 GB |
### Find all read ids
| | Fast5 | MKR | blow5 |
|------|----------|----------|----------|
| Time | 6.1 secs | 4.9 secs | 275 secs |
### Find all samples
| | Fast5 | MKR | blow5 |
|------|----------|---------|----------|
| Time | 524 secs | 31 secs | 317 secs |
### Find selected read ids + extract read number
| | Fast5 | MKR | blow5 |
|------|----------|---------|--------|
| Time | 412 secs | 10 secs | 8 secs |
### Find selected read ids + extract sample count
| | Fast5 | MKR | blow5 |
|------|----------|---------|--------|
| Time | 414 secs | 14 secs | 9 secs |
### Find selected read ids + samples
| | Fast5 | MKR | blow5 |
|------|----------|---------|---------|
| Time | 476 secs | 16 secs | 10 secs |
7 changes: 7 additions & 0 deletions benchmarks/build.sh
@@ -0,0 +1,7 @@
#!/bin/bash

script_dir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd "${script_dir}"

cd image/
docker build -t mkr-benchmark-base -f Dockerfile.base .