#pragma once
/** Main interface for integer matrix multiplication followed by addition of bias for wasm.
*
* C = A * B + Bias
*
* Input matrix A:
* - is a 2-D matrix that typically represents activations as floating point values
* - no restriction on the no. of rows
* - no. of columns should be a multiple of 64
* - is represented as array (contiguous memory locations) in row-major format
*
* Input matrix B:
* - is a 2-D matrix that typically represents fixed model parameters as floating point values
* - no. of rows should be:
* -- equal to no. of columns of Input matrix A
* -- a multiple of 64
* - no. of columns should be a multiple of 8
* - is represented as array (contiguous memory locations) in row-major format
*
* Please note that Input matrix B can also be passed in 2 other forms:
* - One that is already a quantized and transposed version of Input matrix B
* - Another that is already a transposed version of Input matrix B
*
* Input Bias:
* - is an array (contiguous memory locations) that represents bias
* - size of the array should be equal to the no. of columns of Input matrix B
*
* Output matrix C:
* - is a 2-D matrix that represents the result (= A * B + Bias)
* - no. of rows will be equal to no. of rows of Input matrix A
* - no. of columns will be equal to no. of columns of Input matrix B (in untransposed form)
* - is represented as array (contiguous memory locations) in row-major format
*
* Please note that most of the functions in this interface might have architecture-specific
* implementations.
*
* Conventions followed throughout this file:
* - Unless explicitly mentioned, Input matrix B always means an unquantized (i.e. float values)
* and non-transposed version
* - no. of rows of Input matrix A = `rows_A`
* - no. of columns of Input matrix A = no. of rows of Input matrix B = `width`
* - no. of columns of Input matrix B = `cols_B`
*/
#include <cstdint>
using Index = uint32_t;
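/**
 * Illustrative shape example (assumed values, not part of the interface): with
 * rows_A = 1, width = 64 and cols_B = 8, A has shape (1, 64), B has shape (64, 8)
 * and Bias has 8 elements, so C = A * B + Bias has shape (1, 8) and is stored as
 * 8 contiguous floats in row-major order. These are the smallest dimensions that
 * satisfy the multiple-of-64 and multiple-of-8 restrictions above (rows_A itself
 * is unrestricted).
 */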
/**
* Prepare B for the Matrix Multiply function from Input matrix B.
*
* Quantization is performed on the input.
* The final prepared B is in CPU-dependent format and can be used as an input to matrix multiply
* function (`int8MultiplyAndAddBias`).
*
* Please note that this interface might have an architecture-specific implementation.
*
* @param[in]   input_B      An array representing the Input matrix B in row-major format.
*                           Size of the array = `width` * `cols_B`.
*                           Shape of the matrix: (`width`, `cols_B`)
* @param[in]   scale        The scaling factor (for quantization)
* @param[in]   zero_point   The zero point (for quantization)
* @param[in]   width        No. of rows of Input matrix B. It should be a multiple of 64.
* @param[in]   cols_B       No. of columns of Input matrix B. It should be a multiple of 8.
* @param[out]  output       An array representing the prepared B matrix.
*                           Size of the array = `width` * `cols_B`.
*/
extern "C" void __attribute__((import_module("wasm_gemm"), import_name("int8_prepare_b")))
int8PrepareB(const float* input_B,
             float scale,
             float zero_point,
             Index width,
             Index cols_B,
             int8_t* output);
/**
* Prepare B for the Matrix Multiply function from a transposed version of Input matrix B.
*
* Quantization is performed on the floating point values of the input.
* The final prepared B is in CPU-dependent format and can be used as an input to matrix multiply
* function (`int8MultiplyAndAddBias`).
*
* Please note that this interface might have an architecture-specific implementation.
*
* @param[in]   input_B_transposed   An array representing a transposed version of Input
*                                   matrix B. It is in column-major format.
*                                   Size of the array = `width` * `cols_B`.
*                                   Shape of the matrix: (`cols_B`, `width`)
* @param[in]   scale                The scaling factor (for quantization)
* @param[in]   zero_point           The zero point (for quantization)
* @param[in]   width                No. of rows of Input matrix B. It should be a multiple of 64.
* @param[in]   cols_B               No. of columns of Input matrix B. It should be a multiple of 8.
* @param[out]  output               An array representing the prepared B matrix.
*                                   Size of the array = `width` * `cols_B`.
*/
extern "C" void
__attribute__((import_module("wasm_gemm"), import_name("int8_prepare_b_from_transposed")))
int8PrepareBFromTransposed(const float* input_B_transposed,
                           float scale,
                           float zero_point,
                           Index width,
                           Index cols_B,
                           int8_t* output);
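/**
 * Worked layout example (assumed values, for clarity): for a 2x3 matrix
 * B = [[1, 2, 3], [4, 5, 6]], `int8PrepareB` would receive {1, 2, 3, 4, 5, 6}
 * (row-major B), whereas `int8PrepareBFromTransposed` would receive
 * {1, 4, 2, 5, 3, 6}, i.e. the same values with the roles of rows and columns
 * swapped (column-major B). Real calls additionally require `width` to be a
 * multiple of 64 and `cols_B` a multiple of 8.
 */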
/**
* Prepare B for the Matrix Multiply function from a quantized and transposed version of Input
* matrix B which is also in a CPU-independent format.
*
* The final prepared B is in CPU-dependent format and can be used as an input to matrix multiply
* function (`int8MultiplyAndAddBias`).
*
* This function is useful while using the quantized models that are stored in a CPU-independent
* format on the disk.
*
* @param[in]   input_B_quant_transposed   An array representing the quantized and transposed
*                                         version of Input matrix B. It is in column-major format.
*                                         Size of the array = `width` * `cols_B`.
*                                         Shape of the matrix: (`cols_B`, `width`)
* @param[in]   width                      No. of rows of Input matrix B. It should be a multiple of 64.
* @param[in]   cols_B                     No. of columns of Input matrix B. It should be a multiple of 8.
* @param[out]  output                     An array representing the prepared B matrix.
*                                         Size of the array = `width` * `cols_B`.
*/
extern "C" void __attribute__((import_module("wasm_gemm"),
import_name("int8_prepare_b_from_quantized_transposed")))
int8PrepareBFromQuantizedTransposed(const int8_t* input_B_quant_transposed,
                                    Index width,
                                    Index cols_B,
                                    int8_t* output);
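/**
 * Example (illustrative only, not part of the interface): preparing B from weights
 * stored on disk in the quantized, transposed, CPU-independent format. The file
 * path argument and the raw-blob file layout are assumptions for illustration.
 *
 *   #include <cstddef>
 *   #include <cstdio>
 *   #include <vector>
 *
 *   bool loadPreparedB(const char* path, Index width, Index cols_B, int8_t* B_prepared) {
 *     std::vector<int8_t> blob(static_cast<std::size_t>(width) * cols_B);
 *     std::FILE* f = std::fopen(path, "rb");
 *     if (!f) return false;
 *     const std::size_t read = std::fread(blob.data(), 1, blob.size(), f);
 *     std::fclose(f);
 *     if (read != blob.size()) return false;
 *     // Convert from the CPU-independent on-disk format to the CPU-dependent
 *     // format expected by `int8MultiplyAndAddBias`.
 *     int8PrepareBFromQuantizedTransposed(blob.data(), width, cols_B, B_prepared);
 *     return true;
 *   }
 */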
/**
* Prepare A for the Matrix Multiply function from Input matrix A.
*
* Quantization is performed on the floating point values of the input.
* The final prepared A might be architecture dependent. e.g. On some architectures like x86, it
* might be unsigned (achieved by adding 127 to quantized values) while on others like Arm, it might
* be signed.
* The final prepared A can be used as an input to matrix multiply function
* (`int8MultiplyAndAddBias`).
*
* Please note that this interface might have an architecture-specific implementation.
*
* @param[in]   input_A      An array representing the Input matrix A in row-major format.
*                           Size of the array = `rows_A` * `width`.
*                           Shape of the matrix: (`rows_A`, `width`)
* @param[in]   scale        The scaling factor (for quantization)
* @param[in]   zero_point   The zero point (for quantization)
* @param[in]   rows_A       No. of rows of Input matrix A. No restriction on its size.
* @param[in]   width        No. of columns of Input matrix A. It should be a multiple of 64.
* @param[out]  output       An array representing the prepared A matrix.
*                           Size of the array = `rows_A` * `width`.
*/
extern "C" void __attribute__((import_module("wasm_gemm"), import_name("int8_prepare_a")))
int8PrepareA(const float* input_A,
             float scale,
             float zero_point,
             Index rows_A,
             Index width,
             int8_t* output);
/**
* Prepares bias for the Matrix Multiply function.
*
* It uses the prepared B (which must be obtained by using any of the `int8PrepareB*` functions) and
* a bias input to prepare the final bias.
*
* The final bias can be used as an input to matrix multiply function (`int8MultiplyAndAddBias`).
*
* @param[in]   input_B_prepared   An array representing the prepared B matrix.
*                                 Size of the array = `width` * `cols_B`.
* @param[in]   scale_A            The scaling factor (for quantization) of A
* @param[in]   zero_point_A       The zero point (for quantization) of A
* @param[in]   scale_B            The scaling factor (for quantization) of B
* @param[in]   zero_point_B       The zero point (for quantization) of B
* @param[in]   width              No. of rows of Input matrix B (unquantized & non-transposed).
*                                 It should be a multiple of 64.
* @param[in]   cols_B             No. of columns of Input matrix B (unquantized & non-transposed).
*                                 It should be a multiple of 8.
* @param[in]   input_bias         An array representing the input bias. Size of the array = `cols_B`.
* @param[out]  output             An array representing the final prepared bias.
*                                 Size of the array = `cols_B`.
*/
extern "C" void __attribute__((import_module("wasm_gemm"), import_name("int8_prepare_bias")))
int8PrepareBias(const int8_t* input_B_prepared,
                float scale_A,
                float zero_point_A,
                float scale_B,
                float zero_point_B,
                Index width,
                Index cols_B,
                const float* input_bias,
                float* output);
/**
* Perform multiplication of 2 matrices followed by adding a bias.
*
* i.e. Output = A_prepared * B_prepared + Bias_prepared
*
* The inputs A_prepared, B_prepared and Bias_prepared of this function must be
* obtained by using `int8PrepareA`, one of the `int8PrepareB*` and `int8PrepareBias`
* functions respectively.
*
* Please note that this interface might have an architecture-specific implementation.
*
* @param[in]   input_A_prepared      An array representing the prepared A matrix.
*                                    This must be obtained by using the `int8PrepareA` function.
*                                    Size of the array = `rows_A` * `width`.
* @param[in]   scale_A               The scaling factor (for quantization) of A
* @param[in]   zero_point_A          The zero point (for quantization) of A
* @param[in]   input_B_prepared      An array representing the prepared B matrix.
*                                    This must be obtained by using one of the `int8PrepareB*`
*                                    functions. Size of the array = `width` * `cols_B`.
* @param[in]   scale_B               The scaling factor (for quantization) of B
* @param[in]   zero_point_B          The zero point (for quantization) of B
* @param[in]   input_bias_prepared   An array representing the prepared bias.
*                                    This must be obtained by using the `int8PrepareBias` function.
*                                    Size of the array = `cols_B`.
* @param[in]   unquant_multiplier    A value by which the final unquantization factor (prepared
*                                    from `scale_A` and `scale_B`) is multiplied.
* @param[in]   rows_A                No. of rows of Input matrix A. No restriction on its size.
* @param[in]   width                 No. of columns of Input matrix A (same as no. of rows of
*                                    Input matrix B). It should be a multiple of 64.
* @param[in]   cols_B                No. of columns of Input matrix B. It should be a multiple of 8.
* @param[out]  output                An array representing the result matrix in row-major format.
*                                    Size of the array = `rows_A` * `cols_B`.
*/
extern "C" void
__attribute__((import_module("wasm_gemm"), import_name("int8_multiply_and_add_bias")))
int8MultiplyAndAddBias(const int8_t* input_A_prepared,
                       float scale_A,
                       float zero_point_A,
                       const int8_t* input_B_prepared,
                       float scale_B,
                       float zero_point_B,
                       const float* input_bias_prepared,
                       float unquant_multiplier,
                       Index rows_A,
                       Index width,
                       Index cols_B,
                       float* output);
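/**
 * Example (illustrative only, not part of the interface): a minimal sketch of the
 * expected calling sequence, assuming the output buffer is already allocated, that
 * scales of the form 127.0 / max-absolute-value with a zero point of 0.0 suit the
 * model, and that an unquant_multiplier of 1.0 leaves the unquantization factor
 * unchanged. All concrete values below are assumptions.
 *
 *   #include <cstddef>
 *   #include <cstdlib>
 *
 *   void exampleGemm(const float* A, const float* B, const float* bias, float* C,
 *                    Index rows_A, Index width, Index cols_B) {
 *     // width must be a multiple of 64 and cols_B a multiple of 8.
 *     int8_t* A_prepared =
 *         static_cast<int8_t*>(std::malloc(static_cast<std::size_t>(rows_A) * width));
 *     int8_t* B_prepared =
 *         static_cast<int8_t*>(std::malloc(static_cast<std::size_t>(width) * cols_B));
 *     float* bias_prepared = static_cast<float*>(std::malloc(cols_B * sizeof(float)));
 *
 *     const float scale_A = 127.0f / 2.0f;  // assumes max-abs of A is 2.0
 *     const float scale_B = 127.0f / 0.5f;  // assumes max-abs of B is 0.5
 *
 *     int8PrepareA(A, scale_A, 0.0f, rows_A, width, A_prepared);
 *     int8PrepareB(B, scale_B, 0.0f, width, cols_B, B_prepared);
 *     int8PrepareBias(B_prepared, scale_A, 0.0f, scale_B, 0.0f, width, cols_B,
 *                     bias, bias_prepared);
 *     int8MultiplyAndAddBias(A_prepared, scale_A, 0.0f, B_prepared, scale_B, 0.0f,
 *                            bias_prepared, 1.0f, rows_A, width, cols_B, C);
 *
 *     std::free(A_prepared);
 *     std::free(B_prepared);
 *     std::free(bias_prepared);
 *   }
 */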
/**
* Select a subset of columns of prepared B.
*
* Indices of the columns to be selected are specified by an array.
*
* @param[in]   input_B_prepared   An array representing the prepared B matrix.
*                                 This must be obtained by using one of the `int8PrepareB*`
*                                 functions. Size of the array = `width` * `cols_B`.
* @param[in]   width              No. of rows of Input matrix B. It should be a multiple of 64.
* @param[in]   cols_B             No. of columns of Input matrix B. It should be a multiple of 8.
* @param[in]   cols               An array of column indices to be selected from prepared B.
*                                 All indices of the array should be valid, i.e.
*                                 0 <= cols[N] < cols_B where N = 0, 1, 2, ..., (`num_cols` - 1)
* @param[in]   num_cols           Size of the `cols` array. It should be a multiple of 8.
* @param[out]  output             An array representing the selected columns of prepared B.
*                                 Size of the array = `width` * `num_cols`.
*/
extern "C" void __attribute__((import_module("wasm_gemm"), import_name("int8_select_columns_of_b")))
int8SelectColumnsOfB(const int8_t* input_B_prepared,
                     Index width,
                     Index cols_B,
                     const Index* cols,
                     const Index num_cols,
                     int8_t* output);
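/**
 * Example (illustrative only, not part of the interface): selecting the first 8
 * columns of prepared B, e.g. when only a shortlist of output columns is needed.
 * The shortlisting use case is an assumption; only the call shape comes from the
 * declaration above.
 *
 *   void selectFirst8Columns(const int8_t* B_prepared, Index width, Index cols_B,
 *                            int8_t* B_selected) {  // B_selected: width * 8 bytes
 *     const Index cols[8] = {0, 1, 2, 3, 4, 5, 6, 7};  // each index must be < cols_B
 *     int8SelectColumnsOfB(B_prepared, width, cols_B, cols, 8, B_selected);
 *   }
 */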