// sw_cuda_windowed.cu
#include <iostream> // std::cout
#include <string>
#include <vector>
#include <utility>
#include <chrono>   // std::chrono::steady_clock timers
#include <cstdint>  // int16_t, int64_t
#include "utils.h"  // WINDOW_SIZE, SIZE, QUANTITY, CUDA_XBLOCK_SIZE, cudaCheckErrors, to_byte_arr

// One WINDOW_SIZE x WINDOW_SIZE scoring tile per sequence pair.
using window_t = int16_t[ WINDOW_SIZE ][ WINDOW_SIZE ];
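// Memory footprint note: dev_matrices holds one tile per pair, i.e.
// QUANTITY * WINDOW_SIZE * WINDOW_SIZE * sizeof(int16_t) bytes, instead of a
// full SIZE x SIZE scoring matrix per pair. For example (illustrative values,
// the real ones come from utils.h), with SIZE = 1024 and WINDOW_SIZE = 32 each
// pair needs 32 * 32 * 2 B = 2 KiB of tile storage versus
// 1024 * 1024 * 2 B = 2 MiB for the full matrix.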
__global__
void align(int16_t *scores, window_t *windows, char *sequences) {
    // Global thread index: one thread aligns one sequence pair.
    const int t = threadIdx.x + blockIdx.x * blockDim.x;
    // Guard with >= so thread QUANTITY does not run off the end of the arrays.
    if (t >= QUANTITY)
        return;
    // Scoring parameters
    auto const gap = -2;
    auto const mismatch = -2;
    auto const match = 3;
    // Each SIZE-long sequence is split into batches_no tiles of WINDOW_SIZE.
    const int16_t batches_no = SIZE / WINDOW_SIZE;
    // Boundary scores carried from the previously computed tile into the next
    // one; reusing a single pair of arrays across all tiles is the windowed
    // approximation (exact Smith-Waterman would keep one boundary row per
    // tile column).
    int16_t top_leftover[WINDOW_SIZE] = { };
    int16_t side_leftover[WINDOW_SIZE] = { };
    // Running maximum and dynamic-programming cell placeholders
    int16_t max_element(0),
            diagonal_value,
            top_value,
            left_value;
    // Per-tile slices of the two sequences
    char s1_piece[WINDOW_SIZE];
    char s2_piece[WINDOW_SIZE];
    for (int16_t bi = 0; bi < batches_no; ++bi) {
        int16_t offset_i = bi * WINDOW_SIZE;
        // Slice of the first sequence; each pair occupies 2 * SIZE bytes in
        // the flattened buffer, first sequence then second.
        for (int16_t i = 0; i != WINDOW_SIZE; ++i)
            s1_piece[ i ] = sequences[ t * SIZE * 2 + offset_i + i ];
        for (int16_t bj = 0; bj < batches_no; ++bj) {
            int16_t offset_j = bj * WINDOW_SIZE;
            // Slice of the second sequence
            for (int16_t i = 0; i != WINDOW_SIZE; ++i)
                s2_piece[ i ] = sequences[ t * SIZE * 2 + SIZE + offset_j + i ];
            // Seed row 0 and column 0 of the tile with the carried-over
            // boundary scores; only cells with i >= 1 and j >= 1 are computed.
            for (int16_t i = 0; i < WINDOW_SIZE; ++i) {
                windows[ t ][ 0 ][ i ] = top_leftover[ i ];
                windows[ t ][ i ][ 0 ] = side_leftover[ i ];
            }
            for (int16_t i = 1; i < WINDOW_SIZE; ++i) {
                for (int16_t j = 1; j < WINDOW_SIZE; ++j) {
                    diagonal_value = windows[ t ][ i - 1 ][ j - 1 ] + (s1_piece[i - 1] == s2_piece[j - 1] ? match : mismatch);
                    top_value = windows[ t ][ i - 1 ][ j ] + gap;
                    left_value = windows[ t ][ i ][ j - 1 ] + gap;
                    // Branchless maximum: (x - y) >> 15 is -1 (all ones) when
                    // x < y and 0 otherwise (valid while the difference fits
                    // in int16_t), so x - ((x - y) & mask) selects the larger
                    // operand. Equivalent to:
                    // target_value = std::max(diagonal_value, std::max(top_value, left_value));
                    int16_t temp =
                        top_value - ((top_value - left_value) & ((top_value - left_value) >> (sizeof(int16_t) * 8 - 1)));
                    int16_t target_value =
                        diagonal_value - ((diagonal_value - temp) & ((diagonal_value - temp) >> (sizeof(int16_t) * 8 - 1)));
                    // The same trick against zero is the Smith-Waterman clamp:
                    // windows[ t ][ i ][ j ] = (target_value > 0) ? target_value : 0;
                    windows[ t ][ i ][ j ] =
                        target_value - (target_value & (target_value >> (sizeof(int16_t) * 8 - 1)));
                    if (target_value > max_element)
                        max_element = target_value;
                }
            }
            // Carry this tile's bottom row and right column into the next tile.
            for (int16_t i = 0; i < WINDOW_SIZE; ++i) {
                top_leftover[ i ] = windows[ t ][ WINDOW_SIZE - 1 ][ i ];
                side_leftover[ i ] = windows[ t ][ i ][ WINDOW_SIZE - 1 ];
            }
        }
    }
    scores[ t ] = max_element;
    // // Traceback
    // traceback(matrix, max_element_i, max_element_j);
}
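// Host-side sanity check for the branchless max used in the kernel above.
// This helper is illustrative only (nothing in the pipeline calls it) and
// mirrors the kernel's expression for one operand pair; it is subject to the
// same caveat that the difference must fit in int16_t.
static inline int16_t branchless_max16(int16_t x, int16_t y) {
    // After integer promotion, (x - y) >> 15 is -1 when x < y and 0 otherwise,
    // so the mask subtracts (x - y) exactly when y is the larger value.
    const int16_t d = x - y;
    return static_cast<int16_t>(x - (d & (d >> (sizeof(int16_t) * 8 - 1))));
}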
void sw_cuda_windowed(std::vector<std::pair<std::string, std::string>> const sequences){
    // Quantities: round up so every pair gets a thread even when QUANTITY is
    // not a multiple of the block size.
    int const num_blocks = (QUANTITY + CUDA_XBLOCK_SIZE - 1) / CUDA_XBLOCK_SIZE;
    // Instantiate host variables
    std::vector<int16_t> scores(QUANTITY);
    // Instantiate device variables
    window_t *dev_matrices;
    char *dev_input;
    int16_t *dev_output;
    int64_t matrices_size = QUANTITY * sizeof(window_t);
    int64_t output_size = QUANTITY * sizeof(int16_t);
    int64_t input_size = QUANTITY * SIZE * 2;
    // Allocate memory on device
    auto const start_time_1 = std::chrono::steady_clock::now();
    cudaMalloc( (void**) &dev_output, output_size );
    cudaMalloc( (void**) &dev_matrices, matrices_size );
    cudaMalloc( (void**) &dev_input, input_size );
    auto const end_time_1 = std::chrono::steady_clock::now();
    std::cout << "Device malloc time: (μs) "
              << std::chrono::duration_cast<std::chrono::microseconds>( end_time_1 - start_time_1 ).count()
              << std::endl;
    cudaCheckErrors("Failed to allocate device buffer");
    // Preprocessing: flatten the pairs into one contiguous buffer. Each pair
    // occupies 2 * SIZE bytes, first sequence then second, matching the
    // indexing in align().
    auto const start_time_0 = std::chrono::steady_clock::now();
    const char *sequences_bytes = to_byte_arr(sequences);
    auto const end_time_0 = std::chrono::steady_clock::now();
    std::cout << "Preprocessing time: (μs) "
              << std::chrono::duration_cast<std::chrono::microseconds>( end_time_0 - start_time_0 ).count()
              << std::endl;
    // Send the data to device
    auto const start_time_2 = std::chrono::steady_clock::now();
    cudaMemcpy( dev_input, sequences_bytes, input_size, cudaMemcpyHostToDevice );
    auto const end_time_2 = std::chrono::steady_clock::now();
    std::cout << "To device transfer time: (μs) "
              << std::chrono::duration_cast<std::chrono::microseconds>( end_time_2 - start_time_2 ).count()
              << std::endl;
    cudaCheckErrors("CUDA memcpy failure");
    // Kernel
    auto const start_time_3 = std::chrono::steady_clock::now();
    align<<< num_blocks, CUDA_XBLOCK_SIZE >>>( dev_output, dev_matrices, dev_input );
    // The launch is asynchronous; synchronize so the timer captures kernel
    // execution rather than just the launch overhead.
    cudaDeviceSynchronize();
    auto const end_time_3 = std::chrono::steady_clock::now();
    std::cout << "Exec time: (μs) "
              << std::chrono::duration_cast<std::chrono::microseconds>( end_time_3 - start_time_3 ).count()
              << std::endl;
    cudaCheckErrors("Kernel launch failure");
    // Retrieve results from device
    auto const start_time_4 = std::chrono::steady_clock::now();
    cudaMemcpy( scores.data(), dev_output, output_size, cudaMemcpyDeviceToHost );
    auto const end_time_4 = std::chrono::steady_clock::now();
    std::cout << "From device transfer time: (μs) "
              << std::chrono::duration_cast<std::chrono::microseconds>( end_time_4 - start_time_4 ).count()
              << std::endl;
    cudaCheckErrors("CUDA memcpy failure");
    // Free the memory on device
    cudaFree( dev_input );
    cudaFree( dev_matrices );
    cudaFree( dev_output );
    cudaCheckErrors("cudaFree fail");
    // Report the mean best score; the accumulator must start at zero, and a
    // 64-bit total avoids overflow for large QUANTITY.
    int64_t total_scores = 0;
    for (auto e : scores) {
        total_scores += e;
    }
    std::cout << "Average score: " << total_scores / QUANTITY << std::endl;
}
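// A minimal driver sketch (illustrative; assumes utils.h defines QUANTITY and
// SIZE as compile-time constants, and real inputs would replace the
// placeholder all-'A' pairs):
//
//     int main() {
//         std::vector<std::pair<std::string, std::string>> pairs;
//         pairs.reserve(QUANTITY);
//         for (int i = 0; i < QUANTITY; ++i)
//             pairs.emplace_back(std::string(SIZE, 'A'), std::string(SIZE, 'A'));
//         sw_cuda_windowed(pairs); // prints timings and the average score
//         return 0;
//     }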