Merge branch 'blawrence/build-speed' into 'master'
Reduce compilation times

See merge request machine-learning/dorado!668
blawrence-ont committed Nov 1, 2023
2 parents b630567 + f0d43ee commit 92b5a67
Showing 65 changed files with 487 additions and 450 deletions.
9 changes: 8 additions & 1 deletion CMakeLists.txt
@@ -100,6 +100,14 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
endif()
endif()

# Bring in spdlog
add_subdirectory(${DORADO_3RD_PARTY_SOURCE}/spdlog)
# Avoid namespace clashes with static torch.
target_compile_definitions(spdlog PUBLIC
"FMT_BEGIN_NAMESPACE=namespace fmt { inline namespace ont {"
"FMT_END_NAMESPACE=}}"
)

# ELZIP_DECOMPRESS_ONLY stops minizip from adding OpenSSL as a target, preventing use of three dylibs on osx.
set(ELZIP_DECOMPRESS_ONLY ON)
add_subdirectory(${DORADO_3RD_PARTY_SOURCE}/elzip)
@@ -261,7 +269,6 @@ target_include_directories(dorado_lib
${DORADO_3RD_PARTY_SOURCE}/hdf_plugins/vbz_plugin
${DORADO_3RD_PARTY_SOURCE}/cxxpool/src
${DORADO_3RD_PARTY_SOURCE}/NVTX/c/include
${DORADO_3RD_PARTY_SOURCE}/spdlog/include
${DORADO_3RD_PARTY_SOURCE}/indicators/include
)

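For context on the FMT_BEGIN_NAMESPACE/FMT_END_NAMESPACE definitions added above: redefining those {fmt} macros wraps spdlog's bundled {fmt} in an extra inline namespace, so its symbols get distinct mangled names and no longer clash with the copy of {fmt} baked into a static libtorch. A minimal standalone sketch of the mechanism, with a toy function standing in for {fmt}'s internals (not spdlog/{fmt} code):

```cpp
// Same macro bodies as the CMake definitions above; the function is illustrative only.
#define FMT_BEGIN_NAMESPACE namespace fmt { inline namespace ont {
#define FMT_END_NAMESPACE }}

FMT_BEGIN_NAMESPACE
// Compiles to the symbol fmt::ont::format_an_int, distinct from any fmt::format_an_int
// carried by a statically linked libtorch.
int format_an_int(int v) { return v; }
FMT_END_NAMESPACE

int main() {
    // The inline namespace keeps the plain fmt:: spelling working for callers.
    return fmt::format_an_int(0);
}
```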
5 changes: 4 additions & 1 deletion cmake/Torch.cmake
@@ -350,7 +350,10 @@ if (USING_STATIC_TORCH_LIB)
)
target_link_libraries(dorado_torch_lib PRIVATE
${TORCH_LIBRARIES}
${ont_cuda_internal_linkage_libs}
# Some of the CUDA libs have inter-dependencies, so group them together
-Wl,--start-group
${ont_cuda_internal_linkage_libs}
-Wl,--end-group
)
target_include_directories(dorado_torch_lib PUBLIC
${TORCH_INCLUDE_DIRS}
10 changes: 5 additions & 5 deletions dorado/cli/benchmark.cpp
@@ -1,8 +1,8 @@
#include "../utils/tensor_utils.h"
#include "Version.h"

#include <ATen/ATen.h>
#include <argparse.hpp>
#include <torch/torch.h>

#include <chrono>
#include <iostream>
@@ -26,12 +26,12 @@ int benchmark(int argc, char* argv[]) {
std::cerr << "samples : " << n << std::endl;

// generate some input
auto x = torch::randint(0, 2047, n);
auto q = torch::tensor({0.2, 0.9}, {torch::kFloat32});
auto x = at::randint(0, 2047, n);
auto q = at::tensor({0.2, 0.9}, {at::ScalarType::Float});

// torch::quantile
auto start = std::chrono::system_clock::now();
auto res = torch::quantile(x, q);
auto res = at::quantile(x, q);
auto end = std::chrono::system_clock::now();

auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
@@ -51,7 +51,7 @@ int benchmark(int argc, char* argv[]) {
<< " q20=" << res[0].item<int>() << " q90=" << res[1].item<int>() << " "
<< duration << "us" << std::endl;

x = x.to(torch::kInt16);
x = x.to(at::ScalarType::Short);

// counting
start = std::chrono::system_clock::now();
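The benchmark.cpp hunks above show the pattern repeated throughout this commit: drop `<torch/torch.h>`, which transitively pulls in most of the C++ frontend, include the much lighter `<ATen/ATen.h>`, and call the `at::` free functions that the `torch::` wrappers forward to. A small self-contained sketch of the post-change style, assuming a standard libtorch/ATen build (not dorado code):

```cpp
#include <ATen/ATen.h>

int main() {
    // at::randint / at::tensor / at::quantile are the functions the torch:: wrappers
    // delegate to, so swapping namespaces changes headers, not behaviour.
    auto x = at::randint(0, 2047, {1000}, at::TensorOptions().dtype(at::kFloat));
    auto q = at::tensor({0.2, 0.9}, at::TensorOptions().dtype(at::kFloat));
    auto res = at::quantile(x, q);  // two-element tensor: the 20th and 90th percentiles
    return res[0].item<float>() <= res[1].item<float>() ? 0 : 1;
}
```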
1 change: 0 additions & 1 deletion dorado/cli/cli_utils.h
@@ -6,7 +6,6 @@

#include <argparse.hpp>
#include <htslib/sam.h>
#include <torch/torch.h>

#include <algorithm>
#include <cctype>
9 changes: 5 additions & 4 deletions dorado/data_loader/DataLoader.cpp
@@ -6,6 +6,7 @@
#include "utils/types.h"
#include "vbz_plugin_user_utils.h"

#include <ATen/ATen.h>
#include <cxxpool.h>
#include <highfive/H5Easy.hpp>
#include <highfive/H5File.hpp>
@@ -132,8 +133,8 @@ SimplexReadPtr process_pod5_read(
pod5_error_t err = pod5_format_read_id(read_data.read_id, read_id_tmp);
std::string read_id_str(read_id_tmp);

auto options = torch::TensorOptions().dtype(torch::kInt16);
auto samples = torch::empty(read_data.num_samples, options);
auto options = at::TensorOptions().dtype(at::kShort);
auto samples = at::empty(read_data.num_samples, options);

if (pod5_get_read_complete_signal(file, batch, row, read_data.num_samples,
samples.data_ptr<int16_t>()) != POD5_OK) {
@@ -795,8 +796,8 @@ void DataLoader::load_fast5_reads_from_file(const std::string& path) {
throw std::runtime_error("Invalid FAST5 Signal data type of " +
ds.getDataType().string());

auto options = torch::TensorOptions().dtype(torch::kInt16);
auto samples = torch::empty(ds.getElementCount(), options);
auto options = at::TensorOptions().dtype(at::kShort);
auto samples = at::empty(ds.getElementCount(), options);
ds.read(samples.data_ptr<int16_t>());

HighFive::Attribute mux_attr = raw.getAttribute("start_mux");
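Both loaders above keep the same allocate-then-fill pattern, now spelled with `at::` factories: create an int16 tensor and let pod5/HighFive write directly into `data_ptr<int16_t>()`. A toy sketch of that pattern, where a plain vector stands in for the pod5/HDF5 readers (not dorado code):

```cpp
#include <ATen/ATen.h>

#include <algorithm>
#include <cstdint>
#include <vector>

int main() {
    const int64_t num_samples = 4096;
    auto options = at::TensorOptions().dtype(at::kShort);
    auto samples = at::empty({num_samples}, options);

    // Stand-in for pod5_get_read_complete_signal() / HighFive's ds.read(): the reader
    // writes straight into the tensor's int16 storage, no intermediate buffer copy.
    std::vector<int16_t> fake_signal(num_samples, 7);
    std::copy(fake_signal.begin(), fake_signal.end(), samples.data_ptr<int16_t>());

    return samples[0].item<int16_t>() == 7 ? 0 : 1;
}
```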
62 changes: 28 additions & 34 deletions dorado/decode/CPUDecoder.cpp
@@ -2,38 +2,38 @@

#include "beam_search.h"

#include <ATen/ATen.h>
#include <math.h>
#include <spdlog/spdlog.h>
#include <torch/torch.h>

#include <vector>

namespace {

at::Tensor scan(const torch::Tensor& Ms,
at::Tensor scan(const at::Tensor& Ms,
const float fixed_stay_score,
const torch::Tensor& idx,
const torch::Tensor& v0) {
const at::Tensor& idx,
const at::Tensor& v0) {
const int T = Ms.size(0);
const int N = Ms.size(1);
const int C = Ms.size(2);

torch::Tensor alpha = Ms.new_full({T + 1, N, C}, -1E38);
at::Tensor alpha = Ms.new_full({T + 1, N, C}, -1E38);
alpha[0] = v0;

for (int t = 0; t < T; t++) {
auto scored_steps = torch::add(alpha.index({t, torch::indexing::Slice(), idx}), Ms[t]);
auto scored_stay = torch::add(alpha.index({t, torch::indexing::Slice()}), fixed_stay_score)
.unsqueeze(-1);
auto scored_transitions = torch::cat({scored_stay, scored_steps}, -1);
auto scored_steps = at::add(alpha.index({t, at::indexing::Slice(), idx}), Ms[t]);
auto scored_stay =
at::add(alpha.index({t, at::indexing::Slice()}), fixed_stay_score).unsqueeze(-1);
auto scored_transitions = at::cat({scored_stay, scored_steps}, -1);

alpha[t + 1] = torch::logsumexp(scored_transitions, -1);
alpha[t + 1] = at::logsumexp(scored_transitions, -1);
}

return alpha;
}

torch::Tensor forward_scores(const torch::Tensor& scores, const float fixed_stay_score) {
at::Tensor forward_scores(const at::Tensor& scores, const float fixed_stay_score) {
const int T = scores.size(0); // Signal len
const int N = scores.size(1); // Num batches
const int C = scores.size(2); // 4^state_len * 4 = 4^(state_len + 1)
@@ -43,7 +43,7 @@ torch::Tensor forward_scores(const torch::Tensor& scores, const float fixed_stay

// Transition scores reshaped so that the 4 scores for each predecessor state are arranged along the
// innermost dimension.
const torch::Tensor Ms = scores.reshape({T, N, -1, n_base});
const at::Tensor Ms = scores.reshape({T, N, -1, n_base});

// Number of states per timestep.
const int num_states = pow(n_base, state_len);
@@ -52,16 +52,13 @@ torch::Tensor forward_scores(const torch::Tensor& scores, const float fixed_stay
const auto v0 = Ms.new_full({{N, num_states}}, 0.0f);

// For each state, the indices of the 4 states that could precede it via a step transition.
const auto idx = torch::arange(num_states)
.repeat_interleave(n_base)
.reshape({n_base, -1})
.t()
.contiguous();
const auto idx =
at::arange(num_states).repeat_interleave(n_base).reshape({n_base, -1}).t().contiguous();

return scan(Ms, fixed_stay_score, idx, v0);
}

torch::Tensor backward_scores(const torch::Tensor& scores, const float fixed_stay_score) {
at::Tensor backward_scores(const at::Tensor& scores, const float fixed_stay_score) {
const int N = scores.size(1); // Num batches
const int C = scores.size(2); // 4^state_len * 4 = 4^(state_len + 1)

@@ -73,31 +70,28 @@ torch::Tensor backward_scores(const torch::Tensor& scores, const float fixed_sta
const int num_states = pow(n_base, state_len);

// Guide values at last timestep.
const torch::Tensor vT = scores.new_full({N, num_states}, 0.0f);
const at::Tensor vT = scores.new_full({N, num_states}, 0.0f);

const auto idx = torch::arange(num_states)
.repeat_interleave(n_base)
.reshape({n_base, -1})
.t()
.contiguous();
const auto idx =
at::arange(num_states).repeat_interleave(n_base).reshape({n_base, -1}).t().contiguous();
auto idx_T = idx.flatten().argsort().reshape(idx.sizes());

const auto Ms_T = scores.index({torch::indexing::Slice(), torch::indexing::Slice(), idx_T});
const auto Ms_T = scores.index({at::indexing::Slice(), at::indexing::Slice(), idx_T});

// For each state, the indices of the 4 states that could succeed it via a step transition.
idx_T = torch::bitwise_right_shift(idx_T, 2);
idx_T = at::bitwise_right_shift(idx_T, 2);

return scan(Ms_T.flip(0), fixed_stay_score, idx_T.to(torch::kInt64), vT).flip(0);
return scan(Ms_T.flip(0), fixed_stay_score, idx_T.to(at::kLong), vT).flip(0);
}

} // namespace

namespace dorado {

std::vector<DecodedChunk> CPUDecoder::beam_search(const torch::Tensor& scores,
std::vector<DecodedChunk> CPUDecoder::beam_search(const at::Tensor& scores,
const int num_chunks,
const DecoderOptions& options) {
const auto scores_cpu = scores.to(torch::kCPU);
const auto scores_cpu = scores.to(at::kCPU);
int num_threads = std::min(num_chunks, 4);
int chunks_per_thread = num_chunks / num_threads;
int num_threads_with_one_more_chunk = num_chunks % num_threads;
@@ -109,20 +103,20 @@ std::vector<DecodedChunk> CPUDecoder::beam_search(const torch::Tensor& scores,
for (int i = 0; i < num_threads; ++i) {
threads.emplace_back(new std::thread(
[&](int i) {
torch::InferenceMode inference_mode_guard;
at::InferenceMode inference_mode_guard;

int t_first_chunk =
i * chunks_per_thread + std::min(i, num_threads_with_one_more_chunk);
int t_num_chunks = chunks_per_thread + int(i < num_threads_with_one_more_chunk);

using Slice = torch::indexing::Slice;
using Slice = at::indexing::Slice;
auto t_scores = scores_cpu.index(
{Slice(), Slice(t_first_chunk, t_first_chunk + t_num_chunks)});

torch::Tensor fwd = forward_scores(t_scores, options.blank_score);
torch::Tensor bwd = backward_scores(t_scores, options.blank_score);
at::Tensor fwd = forward_scores(t_scores, options.blank_score);
at::Tensor bwd = backward_scores(t_scores, options.blank_score);

torch::Tensor posts = torch::softmax(fwd + bwd, -1);
at::Tensor posts = at::softmax(fwd + bwd, -1);

t_scores = t_scores.transpose(0, 1);
bwd = bwd.transpose(0, 1).contiguous();
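One detail of CPUDecoder::beam_search worth spelling out is how it splits num_chunks across at most four worker threads: the first (num_chunks % num_threads) threads each take one extra chunk so the split stays balanced. A standalone sketch of just that arithmetic:

```cpp
#include <algorithm>
#include <cstdio>

int main() {
    const int num_chunks = 10;
    const int num_threads = std::min(num_chunks, 4);
    const int chunks_per_thread = num_chunks / num_threads;
    const int threads_with_one_more = num_chunks % num_threads;

    for (int i = 0; i < num_threads; ++i) {
        // Same indexing as the lambda above: threads before the remainder boundary
        // are shifted by their own index, later ones by the full remainder.
        const int first = i * chunks_per_thread + std::min(i, threads_with_one_more);
        const int count = chunks_per_thread + int(i < threads_with_one_more);
        std::printf("thread %d: chunks [%d, %d)\n", i, first, first + count);
    }
    return 0;
}
```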
6 changes: 3 additions & 3 deletions dorado/decode/CPUDecoder.h
@@ -2,16 +2,16 @@

#include "Decoder.h"

#include <torch/torch.h>
#include <ATen/core/TensorBody.h>

namespace dorado {

class CPUDecoder final : Decoder {
public:
std::vector<DecodedChunk> beam_search(const torch::Tensor& scores,
std::vector<DecodedChunk> beam_search(const at::Tensor& scores,
int num_chunks,
const DecoderOptions& options) final;
constexpr static torch::ScalarType dtype = torch::kF32;
constexpr static at::ScalarType dtype = at::ScalarType::Float;
};

} // namespace dorado
4 changes: 2 additions & 2 deletions dorado/decode/Decoder.h
@@ -1,6 +1,6 @@
#pragma once

#include <torch/torch.h>
#include <ATen/core/TensorBody.h>

#include <string>
#include <vector>
@@ -25,7 +25,7 @@ struct DecoderOptions {

class Decoder {
public:
virtual std::vector<DecodedChunk> beam_search(const torch::Tensor& scores,
virtual std::vector<DecodedChunk> beam_search(const at::Tensor& scores,
int num_chunks,
const DecoderOptions& options) = 0;
};
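The two header changes above (CPUDecoder.h and Decoder.h) rely on `<ATen/core/TensorBody.h>` declaring the at::Tensor class on its own, so interfaces that only pass tensors by reference no longer drag `<torch/torch.h>` into every including translation unit. A hypothetical interface written in that style, with invented names; the implementation side still includes the full ATen header:

```cpp
#include <ATen/core/TensorBody.h>  // declares at::Tensor; far cheaper than <torch/torch.h>

#include <cstddef>

namespace example {

// The declaration only needs the at::Tensor type, not the ATen functional API.
std::size_t num_scores(const at::Tensor& scores);

}  // namespace example

// The implementation (normally a separate .cpp) pulls in the heavier header itself.
#include <ATen/ATen.h>

namespace example {

std::size_t num_scores(const at::Tensor& scores) {
    return static_cast<std::size_t>(scores.numel());
}

}  // namespace example

int main() { return example::num_scores(at::zeros({4, 8})) == 32 ? 0 : 1; }
```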
37 changes: 17 additions & 20 deletions dorado/decode/GPUDecoder.cpp
@@ -6,45 +6,42 @@

#include <c10/cuda/CUDAGuard.h>
#include <nvtx3/nvtx3.hpp>
#include <torch/torch.h>

extern "C" {
#include "koi.h"
}

namespace dorado {

torch::Tensor GPUDecoder::gpu_part(torch::Tensor scores, int num_chunks, DecoderOptions options) {
at::Tensor GPUDecoder::gpu_part(at::Tensor scores, int num_chunks, DecoderOptions options) {
c10::cuda::CUDAGuard device_guard(scores.device());
utils::ScopedProfileRange loop{"gpu_decode", 1};
long int N = scores.sizes()[0];
long int T = scores.sizes()[1];
long int C = scores.sizes()[2];

auto tensor_options_int32 = torch::TensorOptions()
.dtype(torch::kInt32)
.device(scores.device())
.requires_grad(false);
auto tensor_options_int32 =
at::TensorOptions().dtype(at::kInt).device(scores.device()).requires_grad(false);

auto tensor_options_int8 =
torch::TensorOptions().dtype(torch::kInt8).device(scores.device()).requires_grad(false);
at::TensorOptions().dtype(at::kChar).device(scores.device()).requires_grad(false);

auto chunks = torch::empty({N, 4}, tensor_options_int32);
chunks.index({torch::indexing::Slice(), 0}) = torch::arange(0, int(T * N), int(T));
chunks.index({torch::indexing::Slice(), 2}) = torch::arange(0, int(T * N), int(T));
chunks.index({torch::indexing::Slice(), 1}) = int(T);
chunks.index({torch::indexing::Slice(), 3}) = 0;
auto chunks = at::empty({N, 4}, tensor_options_int32);
chunks.index({at::indexing::Slice(), 0}) = at::arange(0, int(T * N), int(T));
chunks.index({at::indexing::Slice(), 2}) = at::arange(0, int(T * N), int(T));
chunks.index({at::indexing::Slice(), 1}) = int(T);
chunks.index({at::indexing::Slice(), 3}) = 0;

auto chunk_results = torch::empty({N, 8}, tensor_options_int32);
auto chunk_results = at::empty({N, 8}, tensor_options_int32);

chunk_results = chunk_results.contiguous();

auto aux = torch::empty(N * (T + 1) * (C + 4 * options.beam_width), tensor_options_int8);
auto path = torch::zeros(N * (T + 1), tensor_options_int32);
auto aux = at::empty(N * (T + 1) * (C + 4 * options.beam_width), tensor_options_int8);
auto path = at::zeros(N * (T + 1), tensor_options_int32);

auto moves_sequence_qstring = torch::zeros({3, N * T}, tensor_options_int8);
auto moves_sequence_qstring = at::zeros({3, N * T}, tensor_options_int8);

moves_sequence_qstring.index({torch::indexing::Slice()}) = 0.0;
moves_sequence_qstring.index({at::indexing::Slice()}) = 0.0;
auto moves = moves_sequence_qstring[0];
auto sequence = moves_sequence_qstring[1];
auto qstring = moves_sequence_qstring[2];
@@ -84,9 +81,9 @@ torch::Tensor GPUDecoder::gpu_part(torch::Tensor scores, int num_chunks, Decoder
return moves_sequence_qstring.reshape({3, N, -1});
}

std::vector<DecodedChunk> GPUDecoder::cpu_part(torch::Tensor moves_sequence_qstring_cpu) {
std::vector<DecodedChunk> GPUDecoder::cpu_part(at::Tensor moves_sequence_qstring_cpu) {
nvtx3::scoped_range loop{"cpu_decode"};
assert(moves_sequence_qstring_cpu.device() == torch::kCPU);
assert(moves_sequence_qstring_cpu.device() == at::kCPU);
auto moves_cpu = moves_sequence_qstring_cpu[0];
auto sequence_cpu = moves_sequence_qstring_cpu[1];
auto qstring_cpu = moves_sequence_qstring_cpu[2];
@@ -110,7 +107,7 @@ std::vector<DecodedChunk> GPUDecoder::cpu_part(torch::Tensor moves_sequence_qstr
return called_chunks;
}

std::vector<DecodedChunk> GPUDecoder::beam_search(const torch::Tensor &scores,
std::vector<DecodedChunk> GPUDecoder::beam_search(const at::Tensor &scores,
int num_chunks,
const DecoderOptions &options) {
return cpu_part(gpu_part(scores, num_chunks, options));
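The chunk-table setup in GPUDecoder::gpu_part above uses at::indexing::Slice for column-wise assignment, the ATen spelling of Python's `chunks[:, i] = ...`. A reduced sketch of that indexing pattern on the CPU (values illustrative, no koi/CUDA involved):

```cpp
#include <ATen/ATen.h>

int main() {
    const long N = 8;    // number of chunks
    const long T = 100;  // timesteps per chunk

    auto opts = at::TensorOptions().dtype(at::kInt).requires_grad(false);
    auto chunks = at::empty({N, 4}, opts);
    // Column-wise writes through Slice(): per-chunk start offsets, lengths and flags.
    chunks.index({at::indexing::Slice(), 0}) = at::arange(0, int(T * N), int(T));
    chunks.index({at::indexing::Slice(), 1}) = int(T);
    chunks.index({at::indexing::Slice(), 2}) = at::arange(0, int(T * N), int(T));
    chunks.index({at::indexing::Slice(), 3}) = 0;

    return chunks.index({0, 1}).item<int>() == int(T) ? 0 : 1;
}
```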
