Merge branch 'blawrence/build-speed' into 'master'
Reduce compilation times

See merge request machine-learning/dorado!668
blawrence-ont committed Nov 1, 2023
2 parents b630567 + f0d43ee commit 92b5a67
Showing 65 changed files with 487 additions and 450 deletions.
9 changes: 8 additions & 1 deletion CMakeLists.txt
@@ -100,6 +100,14 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
endif()
endif()

# Bring in spdlog
add_subdirectory(${DORADO_3RD_PARTY_SOURCE}/spdlog)
# Avoid namespace clashes with static torch.
target_compile_definitions(spdlog PUBLIC
"FMT_BEGIN_NAMESPACE=namespace fmt { inline namespace ont {"
"FMT_END_NAMESPACE=}}"
)

# ELZIP_DECOMPRESS_ONLY stops minizip from adding OpenSSL as a target, preventing use of three dylibs on osx.
set(ELZIP_DECOMPRESS_ONLY ON)
add_subdirectory(${DORADO_3RD_PARTY_SOURCE}/elzip)
@@ -261,7 +269,6 @@ target_include_directories(dorado_lib
${DORADO_3RD_PARTY_SOURCE}/hdf_plugins/vbz_plugin
${DORADO_3RD_PARTY_SOURCE}/cxxpool/src
${DORADO_3RD_PARTY_SOURCE}/NVTX/c/include
${DORADO_3RD_PARTY_SOURCE}/spdlog/include
${DORADO_3RD_PARTY_SOURCE}/indicators/include
)

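For context on the FMT_BEGIN_NAMESPACE/FMT_END_NAMESPACE definitions added above: redefining those {fmt} macros wraps spdlog's bundled {fmt} in an extra inline namespace, so its symbols get distinct mangled names and no longer clash with the copy of {fmt} baked into a static libtorch. A minimal standalone sketch of the mechanism, with a toy function standing in for {fmt}'s internals (not spdlog/{fmt} code):

```cpp
// Same macro bodies as the CMake definitions above; the function is illustrative only.
#define FMT_BEGIN_NAMESPACE namespace fmt { inline namespace ont {
#define FMT_END_NAMESPACE }}

FMT_BEGIN_NAMESPACE
// Compiles to the symbol fmt::ont::format_an_int, distinct from any fmt::format_an_int
// carried by a statically linked libtorch.
int format_an_int(int v) { return v; }
FMT_END_NAMESPACE

int main() {
    // The inline namespace keeps the plain fmt:: spelling working for callers.
    return fmt::format_an_int(0);
}
```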
5 changes: 4 additions & 1 deletion cmake/Torch.cmake
@@ -350,7 +350,10 @@ if (USING_STATIC_TORCH_LIB)
)
target_link_libraries(dorado_torch_lib PRIVATE
${TORCH_LIBRARIES}
${ont_cuda_internal_linkage_libs}
# Some of the CUDA libs have inter-dependencies, so group them together
-Wl,--start-group
${ont_cuda_internal_linkage_libs}
-Wl,--end-group
)
target_include_directories(dorado_torch_lib PUBLIC
${TORCH_INCLUDE_DIRS}
10 changes: 5 additions & 5 deletions dorado/cli/benchmark.cpp
@@ -1,8 +1,8 @@
#include "../utils/tensor_utils.h"
#include "Version.h"

#include <ATen/ATen.h>
#include <argparse.hpp>
#include <torch/torch.h>

#include <chrono>
#include <iostream>
@@ -26,12 +26,12 @@ int benchmark(int argc, char* argv[]) {
std::cerr << "samples : " << n << std::endl;

// generate some input
auto x = torch::randint(0, 2047, n);
auto q = torch::tensor({0.2, 0.9}, {torch::kFloat32});
auto x = at::randint(0, 2047, n);
auto q = at::tensor({0.2, 0.9}, {at::ScalarType::Float});

// torch::quantile
auto start = std::chrono::system_clock::now();
auto res = torch::quantile(x, q);
auto res = at::quantile(x, q);
auto end = std::chrono::system_clock::now();

auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
@@ -51,7 +51,7 @@ int benchmark(int argc, char* argv[]) {
<< " q20=" << res[0].item<int>() << " q90=" << res[1].item<int>() << " "
<< duration << "us" << std::endl;

x = x.to(torch::kInt16);
x = x.to(at::ScalarType::Short);

// counting
start = std::chrono::system_clock::now();
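The benchmark.cpp hunks above show the pattern repeated throughout this commit: drop `<torch/torch.h>`, which transitively pulls in most of the C++ frontend, include the much lighter `<ATen/ATen.h>`, and call the `at::` free functions that the `torch::` wrappers forward to. A small self-contained sketch of the post-change style, assuming a standard libtorch/ATen build (not dorado code):

```cpp
#include <ATen/ATen.h>

int main() {
    // at::randint / at::tensor / at::quantile are the functions the torch:: wrappers
    // delegate to, so swapping namespaces changes headers, not behaviour.
    auto x = at::randint(0, 2047, {1000}, at::TensorOptions().dtype(at::kFloat));
    auto q = at::tensor({0.2, 0.9}, at::TensorOptions().dtype(at::kFloat));
    auto res = at::quantile(x, q);  // two-element tensor: the 20th and 90th percentiles
    return res[0].item<float>() <= res[1].item<float>() ? 0 : 1;
}
```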
1 change: 0 additions & 1 deletion dorado/cli/cli_utils.h
@@ -6,7 +6,6 @@

#include <argparse.hpp>
#include <htslib/sam.h>
#include <torch/torch.h>

#include <algorithm>
#include <cctype>
9 changes: 5 additions & 4 deletions dorado/data_loader/DataLoader.cpp
@@ -6,6 +6,7 @@
#include "utils/types.h"
#include "vbz_plugin_user_utils.h"

#include <ATen/ATen.h>
#include <cxxpool.h>
#include <highfive/H5Easy.hpp>
#include <highfive/H5File.hpp>
@@ -132,8 +133,8 @@ SimplexReadPtr process_pod5_read(
pod5_error_t err = pod5_format_read_id(read_data.read_id, read_id_tmp);
std::string read_id_str(read_id_tmp);

auto options = torch::TensorOptions().dtype(torch::kInt16);
auto samples = torch::empty(read_data.num_samples, options);
auto options = at::TensorOptions().dtype(at::kShort);
auto samples = at::empty(read_data.num_samples, options);

if (pod5_get_read_complete_signal(file, batch, row, read_data.num_samples,
samples.data_ptr<int16_t>()) != POD5_OK) {
@@ -795,8 +796,8 @@ void DataLoader::load_fast5_reads_from_file(const std::string& path) {
throw std::runtime_error("Invalid FAST5 Signal data type of " +
ds.getDataType().string());

auto options = torch::TensorOptions().dtype(torch::kInt16);
auto samples = torch::empty(ds.getElementCount(), options);
auto options = at::TensorOptions().dtype(at::kShort);
auto samples = at::empty(ds.getElementCount(), options);
ds.read(samples.data_ptr<int16_t>());

HighFive::Attribute mux_attr = raw.getAttribute("start_mux");
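Both loaders above keep the same allocate-then-fill pattern, now spelled with `at::` factories: create an int16 tensor and let pod5/HighFive write directly into `data_ptr<int16_t>()`. A toy sketch of that pattern, where a plain vector stands in for the pod5/HDF5 readers (not dorado code):

```cpp
#include <ATen/ATen.h>

#include <algorithm>
#include <cstdint>
#include <vector>

int main() {
    const int64_t num_samples = 4096;
    auto options = at::TensorOptions().dtype(at::kShort);
    auto samples = at::empty({num_samples}, options);

    // Stand-in for pod5_get_read_complete_signal() / HighFive's ds.read(): the reader
    // writes straight into the tensor's int16 storage, no intermediate buffer copy.
    std::vector<int16_t> fake_signal(num_samples, 7);
    std::copy(fake_signal.begin(), fake_signal.end(), samples.data_ptr<int16_t>());

    return samples[0].item<int16_t>() == 7 ? 0 : 1;
}
```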
62 changes: 28 additions & 34 deletions dorado/decode/CPUDecoder.cpp
@@ -2,38 +2,38 @@

#include "beam_search.h"

#include <ATen/ATen.h>
#include <math.h>
#include <spdlog/spdlog.h>
#include <torch/torch.h>

#include <vector>

namespace {

at::Tensor scan(const torch::Tensor& Ms,
at::Tensor scan(const at::Tensor& Ms,
const float fixed_stay_score,
const torch::Tensor& idx,
const torch::Tensor& v0) {
const at::Tensor& idx,
const at::Tensor& v0) {
const int T = Ms.size(0);
const int N = Ms.size(1);
const int C = Ms.size(2);

torch::Tensor alpha = Ms.new_full({T + 1, N, C}, -1E38);
at::Tensor alpha = Ms.new_full({T + 1, N, C}, -1E38);
alpha[0] = v0;

for (int t = 0; t < T; t++) {
auto scored_steps = torch::add(alpha.index({t, torch::indexing::Slice(), idx}), Ms[t]);
auto scored_stay = torch::add(alpha.index({t, torch::indexing::Slice()}), fixed_stay_score)
.unsqueeze(-1);
auto scored_transitions = torch::cat({scored_stay, scored_steps}, -1);
auto scored_steps = at::add(alpha.index({t, at::indexing::Slice(), idx}), Ms[t]);
auto scored_stay =
at::add(alpha.index({t, at::indexing::Slice()}), fixed_stay_score).unsqueeze(-1);
auto scored_transitions = at::cat({scored_stay, scored_steps}, -1);

alpha[t + 1] = torch::logsumexp(scored_transitions, -1);
alpha[t + 1] = at::logsumexp(scored_transitions, -1);
}

return alpha;
}

torch::Tensor forward_scores(const torch::Tensor& scores, const float fixed_stay_score) {
at::Tensor forward_scores(const at::Tensor& scores, const float fixed_stay_score) {
const int T = scores.size(0); // Signal len
const int N = scores.size(1); // Num batches
const int C = scores.size(2); // 4^state_len * 4 = 4^(state_len + 1)
@@ -43,7 +43,7 @@ torch::Tensor forward_scores(const torch::Tensor& scores, const float fixed_stay

// Transition scores reshaped so that the 4 scores for each predecessor state are arranged along the
// innermost dimension.
const torch::Tensor Ms = scores.reshape({T, N, -1, n_base});
const at::Tensor Ms = scores.reshape({T, N, -1, n_base});

// Number of states per timestep.
const int num_states = pow(n_base, state_len);
@@ -52,16 +52,13 @@ torch::Tensor forward_scores(const torch::Tensor& scores, const float fixed_stay
const auto v0 = Ms.new_full({{N, num_states}}, 0.0f);

// For each state, the indices of the 4 states that could precede it via a step transition.
const auto idx = torch::arange(num_states)
.repeat_interleave(n_base)
.reshape({n_base, -1})
.t()
.contiguous();
const auto idx =
at::arange(num_states).repeat_interleave(n_base).reshape({n_base, -1}).t().contiguous();

return scan(Ms, fixed_stay_score, idx, v0);
}

torch::Tensor backward_scores(const torch::Tensor& scores, const float fixed_stay_score) {
at::Tensor backward_scores(const at::Tensor& scores, const float fixed_stay_score) {
const int N = scores.size(1); // Num batches
const int C = scores.size(2); // 4^state_len * 4 = 4^(state_len + 1)

@@ -73,31 +70,28 @@ torch::Tensor backward_scores(const torch::Tensor& scores, const float fixed_sta
const int num_states = pow(n_base, state_len);

// Guide values at last timestep.
const torch::Tensor vT = scores.new_full({N, num_states}, 0.0f);
const at::Tensor vT = scores.new_full({N, num_states}, 0.0f);

const auto idx = torch::arange(num_states)
.repeat_interleave(n_base)
.reshape({n_base, -1})
.t()
.contiguous();
const auto idx =
at::arange(num_states).repeat_interleave(n_base).reshape({n_base, -1}).t().contiguous();
auto idx_T = idx.flatten().argsort().reshape(idx.sizes());

const auto Ms_T = scores.index({torch::indexing::Slice(), torch::indexing::Slice(), idx_T});
const auto Ms_T = scores.index({at::indexing::Slice(), at::indexing::Slice(), idx_T});

// For each state, the indices of the 4 states that could succeed it via a step transition.
idx_T = torch::bitwise_right_shift(idx_T, 2);
idx_T = at::bitwise_right_shift(idx_T, 2);

return scan(Ms_T.flip(0), fixed_stay_score, idx_T.to(torch::kInt64), vT).flip(0);
return scan(Ms_T.flip(0), fixed_stay_score, idx_T.to(at::kLong), vT).flip(0);
}

} // namespace

namespace dorado {

std::vector<DecodedChunk> CPUDecoder::beam_search(const torch::Tensor& scores,
std::vector<DecodedChunk> CPUDecoder::beam_search(const at::Tensor& scores,
const int num_chunks,
const DecoderOptions& options) {
const auto scores_cpu = scores.to(torch::kCPU);
const auto scores_cpu = scores.to(at::kCPU);
int num_threads = std::min(num_chunks, 4);
int chunks_per_thread = num_chunks / num_threads;
int num_threads_with_one_more_chunk = num_chunks % num_threads;
@@ -109,20 +103,20 @@ std::vector<DecodedChunk> CPUDecoder::beam_search(const torch::Tensor& scores,
for (int i = 0; i < num_threads; ++i) {
threads.emplace_back(new std::thread(
[&](int i) {
torch::InferenceMode inference_mode_guard;
at::InferenceMode inference_mode_guard;

int t_first_chunk =
i * chunks_per_thread + std::min(i, num_threads_with_one_more_chunk);
int t_num_chunks = chunks_per_thread + int(i < num_threads_with_one_more_chunk);

using Slice = torch::indexing::Slice;
using Slice = at::indexing::Slice;
auto t_scores = scores_cpu.index(
{Slice(), Slice(t_first_chunk, t_first_chunk + t_num_chunks)});

torch::Tensor fwd = forward_scores(t_scores, options.blank_score);
torch::Tensor bwd = backward_scores(t_scores, options.blank_score);
at::Tensor fwd = forward_scores(t_scores, options.blank_score);
at::Tensor bwd = backward_scores(t_scores, options.blank_score);

torch::Tensor posts = torch::softmax(fwd + bwd, -1);
at::Tensor posts = at::softmax(fwd + bwd, -1);

t_scores = t_scores.transpose(0, 1);
bwd = bwd.transpose(0, 1).contiguous();
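One detail of CPUDecoder::beam_search worth spelling out is how it splits num_chunks across at most four worker threads: the first (num_chunks % num_threads) threads each take one extra chunk so the split stays balanced. A standalone sketch of just that arithmetic:

```cpp
#include <algorithm>
#include <cstdio>

int main() {
    const int num_chunks = 10;
    const int num_threads = std::min(num_chunks, 4);
    const int chunks_per_thread = num_chunks / num_threads;
    const int threads_with_one_more = num_chunks % num_threads;

    for (int i = 0; i < num_threads; ++i) {
        // Same indexing as the lambda above: threads before the remainder boundary
        // are shifted by their own index, later ones by the full remainder.
        const int first = i * chunks_per_thread + std::min(i, threads_with_one_more);
        const int count = chunks_per_thread + int(i < threads_with_one_more);
        std::printf("thread %d: chunks [%d, %d)\n", i, first, first + count);
    }
    return 0;
}
```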
6 changes: 3 additions & 3 deletions dorado/decode/CPUDecoder.h
@@ -2,16 +2,16 @@

#include "Decoder.h"

#include <torch/torch.h>
#include <ATen/core/TensorBody.h>

namespace dorado {

class CPUDecoder final : Decoder {
public:
std::vector<DecodedChunk> beam_search(const torch::Tensor& scores,
std::vector<DecodedChunk> beam_search(const at::Tensor& scores,
int num_chunks,
const DecoderOptions& options) final;
constexpr static torch::ScalarType dtype = torch::kF32;
constexpr static at::ScalarType dtype = at::ScalarType::Float;
};

} // namespace dorado
4 changes: 2 additions & 2 deletions dorado/decode/Decoder.h
@@ -1,6 +1,6 @@
#pragma once

#include <torch/torch.h>
#include <ATen/core/TensorBody.h>

#include <string>
#include <vector>
@@ -25,7 +25,7 @@ struct DecoderOptions {

class Decoder {
public:
virtual std::vector<DecodedChunk> beam_search(const torch::Tensor& scores,
virtual std::vector<DecodedChunk> beam_search(const at::Tensor& scores,
int num_chunks,
const DecoderOptions& options) = 0;
};
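The two header changes above (CPUDecoder.h and Decoder.h) rely on `<ATen/core/TensorBody.h>` declaring the at::Tensor class on its own, so interfaces that only pass tensors by reference no longer drag `<torch/torch.h>` into every including translation unit. A hypothetical interface written in that style, with invented names; the implementation side still includes the full ATen header:

```cpp
#include <ATen/core/TensorBody.h>  // declares at::Tensor; far cheaper than <torch/torch.h>

#include <cstddef>

namespace example {

// The declaration only needs the at::Tensor type, not the ATen functional API.
std::size_t num_scores(const at::Tensor& scores);

}  // namespace example

// The implementation (normally a separate .cpp) pulls in the heavier header itself.
#include <ATen/ATen.h>

namespace example {

std::size_t num_scores(const at::Tensor& scores) {
    return static_cast<std::size_t>(scores.numel());
}

}  // namespace example

int main() { return example::num_scores(at::zeros({4, 8})) == 32 ? 0 : 1; }
```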
37 changes: 17 additions & 20 deletions dorado/decode/GPUDecoder.cpp
@@ -6,45 +6,42 @@

#include <c10/cuda/CUDAGuard.h>
#include <nvtx3/nvtx3.hpp>
#include <torch/torch.h>

extern "C" {
#include "koi.h"
}

namespace dorado {

torch::Tensor GPUDecoder::gpu_part(torch::Tensor scores, int num_chunks, DecoderOptions options) {
at::Tensor GPUDecoder::gpu_part(at::Tensor scores, int num_chunks, DecoderOptions options) {
c10::cuda::CUDAGuard device_guard(scores.device());
utils::ScopedProfileRange loop{"gpu_decode", 1};
long int N = scores.sizes()[0];
long int T = scores.sizes()[1];
long int C = scores.sizes()[2];

auto tensor_options_int32 = torch::TensorOptions()
.dtype(torch::kInt32)
.device(scores.device())
.requires_grad(false);
auto tensor_options_int32 =
at::TensorOptions().dtype(at::kInt).device(scores.device()).requires_grad(false);

auto tensor_options_int8 =
torch::TensorOptions().dtype(torch::kInt8).device(scores.device()).requires_grad(false);
at::TensorOptions().dtype(at::kChar).device(scores.device()).requires_grad(false);

auto chunks = torch::empty({N, 4}, tensor_options_int32);
chunks.index({torch::indexing::Slice(), 0}) = torch::arange(0, int(T * N), int(T));
chunks.index({torch::indexing::Slice(), 2}) = torch::arange(0, int(T * N), int(T));
chunks.index({torch::indexing::Slice(), 1}) = int(T);
chunks.index({torch::indexing::Slice(), 3}) = 0;
auto chunks = at::empty({N, 4}, tensor_options_int32);
chunks.index({at::indexing::Slice(), 0}) = at::arange(0, int(T * N), int(T));
chunks.index({at::indexing::Slice(), 2}) = at::arange(0, int(T * N), int(T));
chunks.index({at::indexing::Slice(), 1}) = int(T);
chunks.index({at::indexing::Slice(), 3}) = 0;

auto chunk_results = torch::empty({N, 8}, tensor_options_int32);
auto chunk_results = at::empty({N, 8}, tensor_options_int32);

chunk_results = chunk_results.contiguous();

auto aux = torch::empty(N * (T + 1) * (C + 4 * options.beam_width), tensor_options_int8);
auto path = torch::zeros(N * (T + 1), tensor_options_int32);
auto aux = at::empty(N * (T + 1) * (C + 4 * options.beam_width), tensor_options_int8);
auto path = at::zeros(N * (T + 1), tensor_options_int32);

auto moves_sequence_qstring = torch::zeros({3, N * T}, tensor_options_int8);
auto moves_sequence_qstring = at::zeros({3, N * T}, tensor_options_int8);

moves_sequence_qstring.index({torch::indexing::Slice()}) = 0.0;
moves_sequence_qstring.index({at::indexing::Slice()}) = 0.0;
auto moves = moves_sequence_qstring[0];
auto sequence = moves_sequence_qstring[1];
auto qstring = moves_sequence_qstring[2];
@@ -84,9 +81,9 @@ torch::Tensor GPUDecoder::gpu_part(torch::Tensor scores, int num_chunks, Decoder
return moves_sequence_qstring.reshape({3, N, -1});
}

std::vector<DecodedChunk> GPUDecoder::cpu_part(torch::Tensor moves_sequence_qstring_cpu) {
std::vector<DecodedChunk> GPUDecoder::cpu_part(at::Tensor moves_sequence_qstring_cpu) {
nvtx3::scoped_range loop{"cpu_decode"};
assert(moves_sequence_qstring_cpu.device() == torch::kCPU);
assert(moves_sequence_qstring_cpu.device() == at::kCPU);
auto moves_cpu = moves_sequence_qstring_cpu[0];
auto sequence_cpu = moves_sequence_qstring_cpu[1];
auto qstring_cpu = moves_sequence_qstring_cpu[2];
@@ -110,7 +107,7 @@ std::vector<DecodedChunk> GPUDecoder::cpu_part(torch::Tensor moves_sequence_qstr
return called_chunks;
}

std::vector<DecodedChunk> GPUDecoder::beam_search(const torch::Tensor &scores,
std::vector<DecodedChunk> GPUDecoder::beam_search(const at::Tensor &scores,
int num_chunks,
const DecoderOptions &options) {
return cpu_part(gpu_part(scores, num_chunks, options));
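The chunk-table setup in GPUDecoder::gpu_part above uses at::indexing::Slice for column-wise assignment, the ATen spelling of Python's `chunks[:, i] = ...`. A reduced sketch of that indexing pattern on the CPU (values illustrative, no koi/CUDA involved):

```cpp
#include <ATen/ATen.h>

int main() {
    const long N = 8;    // number of chunks
    const long T = 100;  // timesteps per chunk

    auto opts = at::TensorOptions().dtype(at::kInt).requires_grad(false);
    auto chunks = at::empty({N, 4}, opts);
    // Column-wise writes through Slice(): per-chunk start offsets, lengths and flags.
    chunks.index({at::indexing::Slice(), 0}) = at::arange(0, int(T * N), int(T));
    chunks.index({at::indexing::Slice(), 1}) = int(T);
    chunks.index({at::indexing::Slice(), 2}) = at::arange(0, int(T * N), int(T));
    chunks.index({at::indexing::Slice(), 3}) = 0;

    return chunks.index({0, 1}).item<int>() == int(T) ? 0 : 1;
}
```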
