forked from voutcn/megahit
-
Notifications
You must be signed in to change notification settings - Fork 0
/
lv2_gpu_functions.cu
98 lines (83 loc) · 4.56 KB
/
lv2_gpu_functions.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
/*
* MEGAHIT
* Copyright (C) 2014 The University of Hong Kong
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "lv2_gpu_functions.h"
#include <stdio.h>
#include <assert.h>
#include "cub/util_allocator.cuh"
#include "cub/device/device_radix_sort.cuh"
#include "helper_functions-inl.h"
using namespace cub;
static CachingDeviceAllocator g_allocator(true);
const int kGPUThreadPerBlock = 256;
void cuda_init() {
cudaFree(0);
}
void get_cuda_memory(size_t &free_mem, size_t &total_mem) {
assert(cudaMemGetInfo(&free_mem, &total_mem) == cudaSuccess);
}
// device function for permuting an array
__global__ void permutation_kernel(edge_word_t *index, edge_word_t *val, edge_word_t *new_val, uint32_t num_items) {
int tid = blockIdx.x * kGPUThreadPerBlock + threadIdx.x;
if (tid < num_items)
new_val[tid] = val[index[tid]];
}
// device function for reset permutation
__global__ void reset_permutation_kernel(uint32_t *permutation, uint32_t num_items) {
int tid = blockIdx.x * kGPUThreadPerBlock + threadIdx.x;
if (tid < num_items)
permutation[tid] = tid;
}
// single thread
void lv2_gpu_sort(edge_word_t *lv2_substrings, uint32_t *permutation, int words_per_substring, int64_t lv2_num_items) {
DoubleBuffer<edge_word_t> d_keys;
DoubleBuffer<uint32_t> d_values;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[0], sizeof(edge_word_t) * lv2_num_items));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[1], sizeof(edge_word_t) * lv2_num_items));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[0], sizeof(uint32_t) * lv2_num_items));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[1], sizeof(uint32_t) * lv2_num_items));
// Allocate temporary storage
size_t temp_storage_bytes = 0;
void *d_temp_storage = NULL;
CubDebugExit(DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, lv2_num_items));
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
// Initialize permutation array
int num_gpu_blocks = DivCeiling(lv2_num_items, kGPUThreadPerBlock);
reset_permutation_kernel<<<num_gpu_blocks, kGPUThreadPerBlock>>>(d_values.d_buffers[d_values.selector], lv2_num_items);
for (int64_t iteration = words_per_substring - 1; iteration >= 0; --iteration) {
if (iteration == words_per_substring - 1) { // first iteration
CubDebugExit(cudaMemcpy(d_keys.d_buffers[d_keys.selector], lv2_substrings + (iteration * lv2_num_items),
sizeof(edge_word_t) * lv2_num_items, cudaMemcpyHostToDevice));
} else {
CubDebugExit(cudaMemcpy(d_keys.d_buffers[1 - d_keys.selector], lv2_substrings + (iteration * lv2_num_items),
sizeof(edge_word_t) * lv2_num_items, cudaMemcpyHostToDevice));
permutation_kernel<<<num_gpu_blocks, kGPUThreadPerBlock>>>(d_values.d_buffers[d_values.selector],
d_keys.d_buffers[1 - d_keys.selector], d_keys.d_buffers[d_keys.selector],
lv2_num_items);
}
// Run
CubDebugExit(DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, lv2_num_items));
}
// copy answer back to host
CubDebugExit(cudaMemcpy(permutation, d_values.d_buffers[d_values.selector], sizeof(uint32_t) * lv2_num_items, cudaMemcpyDeviceToHost));
// free device memory
if (d_keys.d_buffers[0]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[0]));
if (d_keys.d_buffers[1]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[1]));
if (d_values.d_buffers[0]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[0]));
if (d_values.d_buffers[1]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[1]));
if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
}