Merge branch 'feature/improve_ci' into 'master'

Python CI

See merge request minknow/mkr-file-format!3
Showing 52 changed files with 3,702 additions and 435 deletions.
MKR File Format Design Details
==============================

The MKR file format has been designed specifically for Nanopore read data, guided by the design goals below.

Design Goals
------------

The primary purpose of this file format is to store reads produced by Oxford Nanopore sequencing, and in particular the signal data from those reads (which can then be basecalled or otherwise processed).

This file format has the following design goals (roughly in priority order):

- Good write performance for MinKNOW
- Recoverable if the writing process crashes
- Good read performance for downstream tools, including basecall model generation
- Efficient use of space
- Straightforward to implement and maintain
- Extensibility

Note that trade-offs have been made between these goals, but where possible we have aimed to defer those decisions to run time rather than fix them in the format.

We have also chosen not to optimise for editing existing files.

### Write performance

The aspects of this format that are designed to maximise write performance are:

- Data can be written sequentially
  - The sequential access pattern makes it easy to use efficient operating system APIs (such as io_uring on Linux)
  - The sequential access pattern helps the operating system's I/O scheduler maximise throughput
- Signal data from different reads can be interleaved, and data streams can be safely abandoned (at the cost of using more space than necessary)
  - This allows MinKNOW to write out data as it arrives, potentially avoiding the need to have an intermediate caching format (this file format can be used for both the cache and the final output)
- Support for space- and CPU-efficient compression routines (VBZ)
  - This reduces the amount of data that needs to be written, which reduces I/O load

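As a rough illustration of this sequential, append-only write pattern, here is a minimal sketch using pyarrow's IPC file writer (the schema, column names, and file name are simplified placeholders, not the real MKR schema):

```python
import pyarrow as pa
import pyarrow.ipc as ipc

# Placeholder schema -- the real MKR signal table has more columns.
schema = pa.schema([
    ("read_id", pa.binary(16)),
    ("signal", pa.large_list(pa.int16())),
])

with pa.OSFile("example_signal.arrow", "wb") as sink:
    with ipc.new_file(sink, schema) as writer:
        # Record batches are appended strictly sequentially; batches from
        # different reads can be interleaved as data arrives from the device.
        batch = pa.record_batch(
            [
                pa.array([b"\x00" * 16], type=pa.binary(16)),
                pa.array([[10, 12, 11]], type=pa.large_list(pa.int16())),
            ],
            schema=schema,
        )
        writer.write_batch(batch)
```
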
### Recovery

The aspects of this format that are designed to allow for recovery if the writing process crashes are:

- A way to indicate that a file is complete as intended (complete files end with a recognisable footer)
- The Apache Feather format can be assembled by reading the file sequentially, without using the footer
- The data file format is append-only, which means that once data is recorded it cannot be corrupted by later updates

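To make the recovery property concrete, here is a hedged sketch. It assumes the Arrow IPC file layout (an 8-byte magic prefix, then the stream-format body, then the footer): if the footer was never written because the writer crashed, the batches written so far can still be read back sequentially.

```python
import pyarrow as pa
import pyarrow.ipc as ipc

def recover_batches(path):
    """Recover record batches from an IPC file with a missing footer."""
    batches = []
    with pa.OSFile(path, "rb") as f:
        f.seek(8)  # skip the 8-byte "ARROW1\0\0" magic prefix
        reader = ipc.open_stream(f)  # read the body as a plain IPC stream
        try:
            for batch in reader:
                batches.append(batch)
        except pa.ArrowInvalid:
            # Truncated tail: keep everything read before the damage.
            pass
    return batches
```
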
### Read performance

The aspects of this format that are designed to maximise read performance are:

- The Apache Feather format can be memory mapped and used directly
- Apache Arrow has significant existing engineering work geared around efficient access to data, from the layout of the data itself to the library tooling
- Storing direct information about signal data locations with the read table
  - This allows quick access to a read's data without scanning the data file
- Because signal data is stored in chunks, it is possible to decode only part of a long read
  - This is useful for model training
- Read access does not require locking or otherwise modifying the file
  - This allows multi-threaded and multi-process access to a file for reading

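A minimal sketch of this access pattern using pyarrow (the file name is a placeholder; the real chunk lookup for a read goes through the read table and is not shown):

```python
import pyarrow as pa
import pyarrow.ipc as ipc

# Memory-map the file: no locks are taken and the file is never modified,
# so multiple threads or processes can read it concurrently.
with pa.memory_map("example_signal.arrow", "r") as source:
    reader = ipc.open_file(source)
    # Record batches (chunks) can be decoded individually, so only the
    # chunks covering the region of interest need to be touched.
    first_chunk = reader.get_batch(0)
    print(reader.num_record_batches, first_chunk.num_rows)
```
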
### Efficient use of space

The aspects of this format that are designed to make efficient use of space are:

- Support for efficient compression routines (VBZ)
- Apache Arrow's support for dictionary encoding
- Apache Arrow's support for compressing buffers with standard compression routines

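Both of the Arrow mechanisms are available through stock APIs. A small sketch (VBZ is a custom codec, so zstd stands in for it here purely for illustration; the values are made up):

```python
import pyarrow as pa
import pyarrow.ipc as ipc

# Dictionary encoding: repeated values collapse to small integer indices.
pore_type = pa.array(["r10.4.1"] * 4 + ["r9.4.1"] * 2)
encoded = pore_type.dictionary_encode()
print(encoded.type)  # dictionary<values=string, indices=int32>

# Standard buffer compression can be enabled when writing IPC files.
options = ipc.IpcWriteOptions(compression="zstd")
```
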
### Ease of implementation

The aspects of this format that are designed to make the format easy to implement are:

- Relying on an existing, widely-used format (Apache Arrow)

### Extensibility

The aspects of this format that are designed to make the format extensible are:

- Apache Arrow uses a self-describing schema with named columns, so it is straightforward to write code that is resilient to changes such as additional columns being added.

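For example, a reader that looks columns up by name (a sketch with placeholder file and column names) keeps working when new columns appear:

```python
import pyarrow as pa
import pyarrow.ipc as ipc

# The schema travels with the file, so columns are found by name rather
# than by position, and unknown columns can simply be ignored.
table = ipc.open_file(pa.memory_map("example_reads.arrow", "r")).read_all()
read_ids = table.column("read_id")
if "a_future_column" in table.schema.names:  # hypothetical added column
    extra = table.column("a_future_column")
```
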
Format Specification
--------------------

### Overview

The file format is, at its core, a collection of Apache Arrow tables, stored in the Apache Feather 2 (also known as Apache Arrow IPC File) format. These can be stored separately, linked by a common filename component, or bundled into a single file for ease of file management.

In its unbundled format, there are two required files:

```
<prefix>_signal.arrow
<prefix>_reads.arrow
```

Optionally, index files can also be provided:

```
<prefix>_index_read_id.arrow (index by read_id)
<prefix>_index_*.arrow (optional, extension point for other indexes)
```

Each of these is an Apache Feather 2 file, and can be opened directly using the Apache Arrow library's IPC routines. The schemas of the tables are described below. The naming scheme above is recommended (and should be the default when creating these files), but tooling should provide a way for users to explicitly specify every filename when opening files (in case the user has renamed them to a different scheme).

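As a sketch, opening an unbundled file set with pyarrow might look like this (the prefix is a placeholder):

```python
import pyarrow as pa
import pyarrow.ipc as ipc

prefix = "my_run"  # placeholder
# Each file is a standard Feather 2 / Arrow IPC file.
signal_table = ipc.open_file(pa.memory_map(f"{prefix}_signal.arrow", "r")).read_all()
reads_table = ipc.open_file(pa.memory_map(f"{prefix}_reads.arrow", "r")).read_all()
```
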
These can be stored in a bundled file, named `<prefix>.mkr` and described below.
```
*/outputs/
image/*.whl
```
MKR Benchmarks
==============

Building the benchmark environment
----------------------------------

To run benchmarks, you first have to build the docker environment used to run them:

```bash
> ./build.sh
```

Running a benchmark
-------------------

To run a benchmark, use the helper script to start the docker image:

```bash
> ./run_benchmark.sh convert ./path-to-source-files/
```

Benchmarking Results
--------------------

Benchmarks were run on a PCR Zymo dataset, PAM50264, with 10.4.1 e8.2 data (`pcr_zymo/20220419_1706_2E_PAM50264_3c6f33f1`):

```
Note: these are preliminary results.

More work is needed on splitting one blow5 file into batches for threading, and thought is needed on whether an index could be added to mkr files... or at least the benchmark could be improved.
```

### Convert

|      | Fast5 | MKR      | blow5    |
|------|-------|----------|----------|
| Time | N/A   | 227 secs | 781 secs |
| Size | 52 GB | 37 GB    | 38 GB    |

### Find all read ids

|      | Fast5    | MKR      | blow5    |
|------|----------|----------|----------|
| Time | 6.1 secs | 4.9 secs | 275 secs |

### Find all samples

|      | Fast5    | MKR     | blow5    |
|------|----------|---------|----------|
| Time | 524 secs | 31 secs | 317 secs |

### Find selected read ids + extract read number

|      | Fast5    | MKR     | blow5  |
|------|----------|---------|--------|
| Time | 412 secs | 10 secs | 8 secs |

### Find selected read ids + extract sample count

|      | Fast5    | MKR     | blow5  |
|------|----------|---------|--------|
| Time | 414 secs | 14 secs | 9 secs |

### Find selected read ids + samples

|      | Fast5    | MKR     | blow5   |
|------|----------|---------|---------|
| Time | 476 secs | 16 secs | 10 secs |
```bash
#!/bin/bash

# Resolve the directory containing this script, so it can be run from anywhere.
script_dir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd "${script_dir}"

# Build the base benchmark image.
cd image/
docker build -t mkr-benchmark-base -f Dockerfile.base .
```