-
Notifications
You must be signed in to change notification settings - Fork 0
/
kernel.cu
190 lines (167 loc) · 9.82 KB
/
kernel.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <cmath>
typedef unsigned char U8;
typedef unsigned short U16;
typedef unsigned int U32;
typedef unsigned long long int U64;
// TODO: lower threads count could perform better overall? Test later
const int DEFAULT_THREADS_COUNT = 256; // 1024 <-- limit (can be changed, but that's probably a bad idea)
const int DEFAULT_BLOCKS_COUNT = 1024; // 4096 * 256 <-- stable. AMOUNT OF MEMORY IS THE ONLY LIMIT
// Bruteforcer params
constexpr auto MAX_SKIP = 26;
constexpr auto L2A_SAFE_CHARS_COUNT = 14; // 14 for DATA5 using key of length 27 with maximum skips
constexpr auto ITERATIONS_PER_INVOCATION = 32;
constexpr auto KEY_LEN = 7;
constexpr auto PASSING_SCORE = 1; // used with >
// DATA params
constexpr auto ACTUAL_DATA_SIZE = 511;
constexpr auto ALIGNED_DATA_LEN = 512;
const __constant__ U8 DATA4[ALIGNED_DATA_LEN] = { 73, 118, 84, 117, 105, 116, 110, 32, 116, 97, 116, 32, 32, 101, 72, 101, 110, 111, 69, 105, 32, 112, 103, 118, 105, 32, 73, 65, 84, 102, 32, 110, 65, 79, 105, 77, 32, 112, 76, 108, 32, 115, 115, 121, 71, 110, 101, 108, 110, 117, 32, 108, 40, 69, 101, 111, 32, 115, 101, 73, 116, 104, 72, 78, 99, 59, 89, 65, 82, 115, 76, 32, 111, 101, 101, 111, 69, 105, 108, 110, 66, 115, 116, 40, 32, 101, 108, 116, 104, 32, 100, 121, 108, 97, 114, 120, 116, 108, 83, 65, 80, 97, 32, 89, 98, 83, 112, 78, 82, 39, 85, 114, 115, 108, 67, 80, 116, 104, 59, 66, 32, 76, 105, 121, 82, 79, 115, 111, 77, 112, 115, 70, 32, 83, 114, 97, 115, 67, 73, 115, 115, 110, 32, 75, 78, 69, 65, 76, 104, 65, 84, 105, 104, 32, 111, 32, 32, 97, 116, 111, 119, 114, 115, 116, 122, 67, 110, 105, 115, 117, 104, 99, 65, 72, 75, 65, 109, 32, 32, 101, 99, 73, 82, 69, 102, 116, 78, 115, 85, 77, 105, 78, 110, 97, 110, 32, 72, 101, 111, 115, 105, 114, 104, 72, 83, 115, 59, 65, 100, 116, 117, 117, 78, 112, 111, 100, 112, 69, 111, 59, 112, 101, 121, 32, 114, 54, 84, 32, 41, 114, 115, 40, 105, 115, 103, 32, 116, 121, 65, 105, 59, 115, 115, 101, 59, 116, 70, 101, 67, 116, 117, 116, 69, 106, 32, 83, 111, 101, 119, 117, 111, 101, 32, 116, 77, 78, 82, 32, 66, 32, 83, 101, 100, 99, 108, 32, 67, 32, 75, 120, 101, 101, 67, 79, 120, 97, 105, 59, 32, 116, 110, 105, 116, 76, 108, 76, 121, 32, 116, 101, 116, 110, 99, 85, 32, 79, 79, 32, 105, 106, 119, 69, 105, 67, 58, 98, 55, 79, 59, 101, 85, 105, 116, 104, 120, 32, 86, 79, 111, 100, 65, 114, 32, 41, 84, 117, 32, 120, 98, 115, 99, 58, 111, 110, 68, 32, 78, 110, 32, 116, 112, 115, 111, 116, 32, 75, 83, 101, 111, 59, 97, 101, 41, 68, 77, 69, 119, 50, 122, 32, 116, 102, 32, 69, 121, 102, 111, 116, 100, 97, 110, 114, 65, 101, 101, 116, 119, 117, 55, 100, 73, 101, 66, 32, 59, 109, 110, 104, 58, 98, 65, 32, 115, 108, 117, 32, 116, 32, 87, 76, 32, 107, 101, 116, 65, 116, 108, 68, 110, 107, 59, 114, 78, 108, 85, 32, 73, 76, 87, 101, 87, 55, 32, 105, 105, 68, 119, 85, 108, 110, 114, 84, 101, 69, 105, 111, 108, 116, 116, 100, 89, 84, 32, 79, 116, 119, 69, 101, 68, 98, 85, 101, 32, 105, 32, 104, 32, 72, 78, 102, 114, 65, 102, 99, 32, 100, 86, 81, 100, 107, 32, 65, 114, 122, 84, 79, 76, 97, 114, 65, 32, 105, 102, 32, 107, 121, 101, 59, 32, 116, 78, 32, 115, 84, 76, 121, 115, 32, 111, 84, 100, 0};
const __constant__ U8 INITIAL_KEY[KEY_LEN] = { 24, 4, 25, 15, 25, 15, 25 };
// TODO: generate later at compile time by using KEY_LEN param
const __constant__ U8 KEY_MODULO_LOOKUP_TABLE[L2A_SAFE_CHARS_COUNT] = {
0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6
};
// TODO: generate later at compile time by using ITERATIONS_PER_INVOCATION param
const __constant__ U32 BITS_SHIFTING_LOOKUP_TABLE[ITERATIONS_PER_INVOCATION] = {
1 << 0, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7, 1 << 8, 1 << 9, 1 << 10, 1 << 11,
1 << 12, 1 << 13, 1 << 14, 1 << 15, 1 << 16, 1 << 17, 1 << 18, 1 << 19, 1 << 20, 1 << 21, 1 << 22,
1 << 23, 1 << 24, 1 << 25, 1 << 26, 1 << 27, 1 << 28, 1 << 29, 1 << 30, 1 << 31
};
__global__ void l2aKernel(U32 *outputBuffer) {
int invocation_id = blockIdx.x * blockDim.x + threadIdx.x;
U32 result = 0;
U8 current_score;
int data_index;
int previous_printable_data_index;
#pragma unroll (ITERATIONS_PER_INVOCATION)
for (int j = 0; j < ITERATIONS_PER_INVOCATION; j++) {
current_score = 0;
data_index = INITIAL_KEY[0];
previous_printable_data_index = data_index;
/*
data_removed_count = [current iterations count]
printable_data_index = data_index + data_removed_count
key_index = [current iteration count] % [key length]
*/
#pragma unroll
for (int i = ACTUAL_DATA_SIZE - 1; i > ACTUAL_DATA_SIZE - L2A_SAFE_CHARS_COUNT; i--) {
data_index = (data_index + INITIAL_KEY[KEY_MODULO_LOOKUP_TABLE[(ACTUAL_DATA_SIZE - i)]]) % i;
current_score += (DATA4[previous_printable_data_index] > 96) & (DATA4[data_index + ACTUAL_DATA_SIZE - i] == 32);
previous_printable_data_index = data_index + ACTUAL_DATA_SIZE - i;
}
result |= (current_score > PASSING_SCORE) * BITS_SHIFTING_LOOKUP_TABLE[j];
}
outputBuffer[invocation_id] = result;
}
// Handles error in Debug mode, but ignores them in Release
inline cudaError_t cudaUnwrap(cudaError_t result) {
#if defined(DEBUG) || defined(_DEBUG)
if (result != cudaSuccess) {
fprintf(stderr, "CUDA Runtime Error: %sn\n", cudaGetErrorString(result));
exit(1);
}
#endif
return result;
}
// Helper function for calling kernel
void l2aWithCuda(U32 *outputBuffer, unsigned long threads, unsigned long blocks) {
// Allocate initial key array
// cudaUnwrap(cudaMalloc((void**)&initialKeyDevicePtr, KEY_LEN * sizeof(U8)));
// cudaUnwrap(cudaMemcpy(initialKeyDevicePtr, initialKey, KEY_LEN * sizeof(U8), cudaMemcpyHostToDevice));
// Launch kernel
l2aKernel<<<blocks, threads>>>(outputBuffer);
// Check for any errors after launch
cudaError_t cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "l2aKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
// free(initialKeyDevicePtr);
exit(1);
}
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaUnwrap(cudaDeviceSynchronize());
}
//
//
//
//
// NOTE: THIS GUY IS GENIUS; USE HIS TIPS: http://impact.crhc.illinois.edu/shared/report/phd-thesis-shane-ryoo.pdf
// TODO: open intal-D music in the backround at the beggining xD
//
//
//
int main(int argc, char **argv) {
unsigned long threads_count = DEFAULT_THREADS_COUNT;
unsigned long blocks_count = DEFAULT_BLOCKS_COUNT;
if (argc == 3) {
// argv[0] is executable path (not sure if it's always the case)
printf("Detected 2 args; `%s` threads and `%s` blocks\n", argv[1], argv[2]);
// There is no such thing as safety, only hope...
threads_count = strtoul(argv[1], &argv[1], 10);
blocks_count = strtoul(argv[2], &argv[2], 10);
} else {
printf("No args for threads count and blocks count were detected; Using default values...\n");
}
U32 output_buffer_len = threads_count * blocks_count;
// Select GPU
cudaUnwrap(cudaSetDevice(0));
// Allocating output buffer
U32* outputBufferHost;
// TODO: benchmark different variants
cudaUnwrap(cudaMallocManaged((void**)&outputBufferHost, threads_count * blocks_count * sizeof(U32)));
// TODO: async prefetching + double buffering
#pragma unroll
for (U32 i = 0; i < output_buffer_len; i++) {
outputBufferHost[i] = i;
}
// FIXME: THIS NUMBER IS WRONG; smth wrong with the types
long double cudaCallsCount = ceil(powl(MAX_SKIP, KEY_LEN) / (output_buffer_len * ITERATIONS_PER_INVOCATION));
printf("Starting bruteforcer, pray to all gods that you know...\n");
// printf("(NOT WORKING )Expected amount of calls to cuda kernel = %Lf\n", cudaCallsCount);
// Messuaring execution time using cuda events
// (See: https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/#using-cuda-gpu-timers)
float exectuionTimeMs = 0.0;
cudaEvent_t executionStart, executionEnd;
cudaUnwrap(cudaEventCreate(&executionStart));
cudaUnwrap(cudaEventCreate(&executionEnd));
cudaUnwrap(cudaEventRecord(executionStart, 0));
// TODO: manual unrolling? (this tag doesn't seem to work here)
#pragma unroll
for (int cudaCallIndex = 0; cudaCallIndex < 1; cudaCallIndex++) {
l2aWithCuda(outputBufferHost, threads_count, blocks_count);
// Step 2. Check keys that produce something that doesn't classified as giberish
#pragma unroll
for (U32 i = 0; i < output_buffer_len; i++) {
if (outputBufferHost[i] != 0) {
// TODO: find a way to turn that into a switch statement (since there are only 32 bits)
for (int bitPosition = 0; bitPosition < 32; bitPosition++) {
if ((outputBufferHost[i] >> bitPosition) & 1) {
// printf("Found a key: TODO:");
}
}
// printf("\n");
break;
}
}
}
cudaUnwrap(cudaEventRecord(executionEnd, 0));
// Execution time
cudaUnwrap(cudaEventSynchronize(executionEnd));
cudaUnwrap(cudaEventElapsedTime(&exectuionTimeMs, executionStart, executionEnd));
printf("Elapsed: %fms\n", exectuionTimeMs);
cudaUnwrap(cudaEventDestroy(executionStart));
cudaUnwrap(cudaEventDestroy(executionEnd));
// LEVEL 4 OUTPUT: ifo llszu uyjx just kiddi
for (U32 i = output_buffer_len - 1; i < output_buffer_len; i++) {
printf("%u ", outputBufferHost[i]);
}
cudaUnwrap(cudaFree(outputBufferHost));
// cudaDeviceReset must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
cudaUnwrap(cudaDeviceReset());
return 0;
}