-
Notifications
You must be signed in to change notification settings - Fork 1
/
params.hpp
301 lines (267 loc) · 10.3 KB
/
params.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
#ifndef PARAMS_DEFINED
#define PARAMS_DEFINED
// Build-time feature switches.
// A switch prefixed with "//" is currently disabled.
//#define HOST_DEBUG
//#define SPW_SYSTEM
//#define HW_SYNC
#define DENSE_SYSTEM
#define OA_PING_PONG
#define NOOP

// When SPW_TEST is defined before this point, the DENSE_SYSTEM selection
// made above is dropped and SPW_SYSTEM is selected instead.
#ifdef SPW_TEST
#undef DENSE_SYSTEM
#define SPW_SYSTEM
#endif
/**
 * Global memory settings.
 *
 * Upper bounds for the DRAM-side buffers. Every limit is a power of two and
 * is written as a shift so the magnitudes are easy to compare.
 */
// Input activations and weights, plus their "SB_COUNT" companions and the bias.
#define MAX_DRAM_BYTE_INPUT_ACTIVATION                    (1 << 29)
#define MAX_DRAM_BYTE_INPUT_ACTIVATION_SB_COUNT           (1 << 22)
#define MAX_DRAM_BYTE_INPUT_WEIGHT                        (1 << 28)
#define MAX_DRAM_BYTE_INPUT_WEIGHT_SB_COUNT               (1 << 18)
#define MAX_DRAM_BYTE_INPUT_BIAS                          (1 << 17)
// Output activation limits are currently disabled.
//#define MAX_DRAM_BYTE_OUTPUT_ACTIVATION 0x400000
//#define MAX_DRAM_BYTE_OUTPUT_ACTIVATION_SB_COUNT 0x40000

// Instruction buffers, one limit per mover / controller.
#define MAX_DRAM_BYTE_INPUT_MOVER_INSTRUCTION             (1 << 23)
#define MAX_DRAM_BYTE_INPUT_TILE_CONTROLLER_INSTRUCTION   (1 << 23)
#define MAX_DRAM_BYTE_OUTPUT_MOVER_INSTRUCTION            (1 << 23)
#define MAX_DRAM_BYTE_WEIGHT_MOVER_INSTRUCTION            (1 << 25)
#define MAX_DRAM_BYTE_OUTPUT_TILE_CONTROLLER_INSTRUCTION  (1 << 23)
#define MAX_DRAM_BYTE_MISC_CONTROLLER_INSTRUCTION         (1 << 23)
// #define PACKET_SIZE 1
// Assumed memory bandwidth on the Arria 10 Dev Kit, in whole GB/s,
// and the clock frequency in MHz.
#define DDR_BANDWIDTH_GBS_INT  17
#define FMAX_MHZ               221
// Bandwidth (scaled by 1000) divided by the clock frequency, rounded up.
#define DDR_BYTES_PER_CYCLE \
    ((DDR_BANDWIDTH_GBS_INT * 1000 - 1) / FMAX_MHZ + 1)
/**
 * PE array geometry.
 *
 * PE_COLS, PE_ROWS_PER_GROUP, PE_ROW_GROUPS and MISC_COLS are selected by the
 * FULL_SYSTEM switch. PE_ROWS and the divide-by-PE_ROWS_PER_GROUP helpers are
 * derived from them and must not be edited independently.
 */
#if defined(FULL_SYSTEM)
#define PE_COLS 7
#define PE_ROWS_PER_GROUP 8
#define PE_ROW_GROUPS 3
#define MISC_COLS 1
// Alternative FULL_SYSTEM geometries, kept for reference:
// #define PE_COLS 2
// #define PE_ROWS_PER_GROUP 4
// #define PE_ROW_GROUPS 2
// #define MISC_COLS 1
//
// #define PE_COLS 1
// #define PE_ROWS_PER_GROUP 1
// #define PE_ROW_GROUPS 1
// #define MISC_COLS 1
//
// #define PE_COLS 7
// #define PE_ROWS_PER_GROUP 8
// #define PE_ROW_GROUPS 6
// #define MISC_COLS 1
#else
#define PE_COLS 1
#define PE_ROWS_PER_GROUP 16
#define PE_ROW_GROUPS 1
#define MISC_COLS 1
#endif

// Total number of PE rows across all row groups.
#define PE_ROWS (PE_ROWS_PER_GROUP * PE_ROW_GROUPS)

// Derived parameters: PE_ROWS_PER_GROUP has to be a power of two so that a
// division by it is a right shift (log2 of the value) and the remainder is a
// bit mask (value - 1).
#if (PE_ROWS_PER_GROUP == 1)
#define DIVIDE_BY_PE_ROWS_PER_GROUP_SHIFT 0
#define DIVIDE_BY_PE_ROWS_PER_GROUP_REMAINDER_MASK 0x0
#elif (PE_ROWS_PER_GROUP == 2)
#define DIVIDE_BY_PE_ROWS_PER_GROUP_SHIFT 1
#define DIVIDE_BY_PE_ROWS_PER_GROUP_REMAINDER_MASK 0x1
#elif (PE_ROWS_PER_GROUP == 4)
#define DIVIDE_BY_PE_ROWS_PER_GROUP_SHIFT 2
#define DIVIDE_BY_PE_ROWS_PER_GROUP_REMAINDER_MASK 0x3
#elif (PE_ROWS_PER_GROUP == 8)
#define DIVIDE_BY_PE_ROWS_PER_GROUP_SHIFT 3
#define DIVIDE_BY_PE_ROWS_PER_GROUP_REMAINDER_MASK 0x7
#elif (PE_ROWS_PER_GROUP == 16)
#define DIVIDE_BY_PE_ROWS_PER_GROUP_SHIFT 4
#define DIVIDE_BY_PE_ROWS_PER_GROUP_REMAINDER_MASK 0xF
#else
// The offending input is PE_ROWS_PER_GROUP (the shift is only derived from it).
#error PE_ROWS_PER_GROUP should be chosen from {1, 2, 4, 8, 16}
#endif

// Sanity check on the column configuration.
#if (MISC_COLS > PE_COLS)
#error Configuration MISC_COLS should be less or equal to PE_COLS
#endif
// Channel depths.
#define CHANNEL_DEPTH           1
#define OA_DRAIN_CHANNEL_DEPTH  1

// Activation memory regions.
//  - Activation region offsets are expressed in DRAM blocks.
//  - TB-count region offsets are expressed in TB counts (shorts).
#define NUM_ACTIVATION_REGIONS                   3
#define MEM_ACTIVATION_REGION_SIZE_PER_SLICE     0x100000
#define MEM_ACTIVATION_TB_REGION_SIZE_PER_SLICE  0x40000
// Cluster size, in values. Must be a power of two no larger than 16.
#define CLUSTER_SIZE 2

// VALUE_TO_CLUSTER_SHIFT is log2(CLUSTER_SIZE).
#if (CLUSTER_SIZE == 16)
#define VALUE_TO_CLUSTER_SHIFT 4
#elif (CLUSTER_SIZE == 8)
#define VALUE_TO_CLUSTER_SHIFT 3
#elif (CLUSTER_SIZE == 4)
#define VALUE_TO_CLUSTER_SHIFT 2
#elif (CLUSTER_SIZE == 2)
#define VALUE_TO_CLUSTER_SHIFT 1
#elif (CLUSTER_SIZE == 1)
#define VALUE_TO_CLUSTER_SHIFT 0
#else
#error CLUSTER_SIZE should be chosen from {1, 2, 4, 8, 16}
#endif

// Mask that keeps the remainder of a value count divided by CLUSTER_SIZE.
#define VALUE_DIVIDED_BY_CLUSTER_SIZE_REMAINDER_MASK ((1 << VALUE_TO_CLUSTER_SHIFT) - 1)

// NOTE(review): the two macros below expand to CLUSTER_TO_TRANSFER_SIZE_SHIFT,
// which is not defined in this part of the header. Confirm it is provided
// elsewhere before using either of them.
#define VALUE_DIVIDED_BY_SIMD_SIZE_REMAINDER_MASK ((1 << (VALUE_TO_CLUSTER_SHIFT + CLUSTER_TO_TRANSFER_SIZE_SHIFT)) - 1)
// Amount of right shift required to convert a cluster count into a transfer block count.
#define CLUSTER_TO_TRANSFER_BLOCK_SHIFT CLUSTER_TO_TRANSFER_SIZE_SHIFT
/**
 * Parameters relevant for balanced-block sparsity.
 *
 * PRUNE_RANGE_IN_CLUSTER and PE_SIMD_SIZE are the inputs; both must be powers
 * of two no larger than 16. The *_OFFSET macros are their log2 values and the
 * *_MASK macros are (value - 1).
 */
// Prune range of balanced sparsity, in clusters.
#define PRUNE_RANGE_IN_CLUSTER 4
// Number of prune ranges processed in parallel.
// Equal to the SIMD size (in terms of clusters) in a SpW PE.
#define PE_SIMD_SIZE 8

// log2(PE_SIMD_SIZE)
#if (PE_SIMD_SIZE == 1)
#define PE_SIMD_SIZE_CLUSTER_OFFSET 0x0
#elif (PE_SIMD_SIZE == 2)
#define PE_SIMD_SIZE_CLUSTER_OFFSET 0x01
#elif (PE_SIMD_SIZE == 4)
#define PE_SIMD_SIZE_CLUSTER_OFFSET 0x02
#elif (PE_SIMD_SIZE == 8)
#define PE_SIMD_SIZE_CLUSTER_OFFSET 0x03
#elif (PE_SIMD_SIZE == 16)
#define PE_SIMD_SIZE_CLUSTER_OFFSET 0x04
#else
#error PE_SIMD_SIZE should be chosen from {1, 2, 4, 8, 16}
#endif
#define PE_SIMD_SIZE_CLUSTER_MASK ((1 << (PE_SIMD_SIZE_CLUSTER_OFFSET)) - 1)

// log2(PRUNE_RANGE_IN_CLUSTER).
// The historical macro name is misspelled ("RNAGE"); it is kept unchanged so
// existing users keep compiling.
#if (PRUNE_RANGE_IN_CLUSTER == 1)
#define PRUNE_RNAGE_IN_CLUSTER_OFFSET 0x0
#elif (PRUNE_RANGE_IN_CLUSTER == 2)
#define PRUNE_RNAGE_IN_CLUSTER_OFFSET 0x01
#elif (PRUNE_RANGE_IN_CLUSTER == 4)
#define PRUNE_RNAGE_IN_CLUSTER_OFFSET 0x02
#elif (PRUNE_RANGE_IN_CLUSTER == 8)
#define PRUNE_RNAGE_IN_CLUSTER_OFFSET 0x03
#elif (PRUNE_RANGE_IN_CLUSTER == 16)
#define PRUNE_RNAGE_IN_CLUSTER_OFFSET 0x04
#else
#error PRUNE_RANGE_IN_CLUSTER should be chosen from {1, 2, 4, 8, 16}
#endif
// Correctly spelled alias for new code. Guarded so that a definition supplied
// elsewhere is not redefined here.
#ifndef PRUNE_RANGE_IN_CLUSTER_OFFSET
#define PRUNE_RANGE_IN_CLUSTER_OFFSET PRUNE_RNAGE_IN_CLUSTER_OFFSET
#endif
#define PRUNE_RANGE_IN_CLUSTER_MASK ((1 << (PRUNE_RNAGE_IN_CLUSTER_OFFSET)) - 1)
// Size of the char array in the host weight blocks that holds the indices of
// the non-zero clusters. Each char is split into two 4-bit halves and each
// half holds one index. INDEX_CHAR_ARRAY_SIZE_OFFSET is log2 of the size.
// Only defined when both SPW_SYSTEM and PE_SIMD_SIZE are defined.
#ifdef SPW_SYSTEM
#ifdef PE_SIMD_SIZE
#if (PE_SIMD_SIZE > 16)
#error PE_SIMD_SIZE should be chosen from {1, 2, 4, 8, 16}
#elif (PE_SIMD_SIZE > 8)
#define INDEX_CHAR_ARRAY_SIZE 8
#define INDEX_CHAR_ARRAY_SIZE_OFFSET 3
#elif (PE_SIMD_SIZE > 4)
#define INDEX_CHAR_ARRAY_SIZE 4
#define INDEX_CHAR_ARRAY_SIZE_OFFSET 2
#elif (PE_SIMD_SIZE > 2)
#define INDEX_CHAR_ARRAY_SIZE 2
#define INDEX_CHAR_ARRAY_SIZE_OFFSET 1
#else
#define INDEX_CHAR_ARRAY_SIZE 1
#define INDEX_CHAR_ARRAY_SIZE_OFFSET 0
#endif
#endif
#endif
// Activation block size seen by a PE, in words, and its log2 ("_OFFSET").
#ifndef SPW_SYSTEM
// Without SPW_SYSTEM: one cluster per SIMD lane.
#define PE_ACTIVATION_BLOCK_SIZE_IN_WORD (CLUSTER_SIZE * PE_SIMD_SIZE)
#define PE_ACTIVATION_BLOCK_SIZE_IN_WORD_OFFSET (VALUE_TO_CLUSTER_SHIFT + PE_SIMD_SIZE_CLUSTER_OFFSET)
#else
// With SPW_SYSTEM: the block additionally spans PRUNE_RANGE_IN_CLUSTER clusters per lane.
#define PE_ACTIVATION_BLOCK_SIZE_IN_WORD (CLUSTER_SIZE * PE_SIMD_SIZE * PRUNE_RANGE_IN_CLUSTER)
#define PE_ACTIVATION_BLOCK_SIZE_IN_WORD_OFFSET (VALUE_TO_CLUSTER_SHIFT + PE_SIMD_SIZE_CLUSTER_OFFSET + PRUNE_RNAGE_IN_CLUSTER_OFFSET)
#endif
// Remainder mask for a division by the activation block size.
#define PE_ACTIVATION_BLOCK_SIZE_IN_WORD_MASK ((1 << PE_ACTIVATION_BLOCK_SIZE_IN_WORD_OFFSET) - 1)
#define SURVIVING_COUNT_CLUSTER_INDEX         0x1
#define SURVIVING_COUNT_TRANSFER_BLOCK_INDEX  0x1

// Wide activation access: size, its log2, and the matching remainder mask.
#define ACTIVATION_WIDE_SIZE                 4
// Number of bits to shift the transfer block index to the right in order to
// recover the wide offset.
#define ACTIVATION_WIDE_SIZE_OFFSET          0x2
#define ACTIVATION_WIDE_SIZE_REMAINDER_MASK  0x3

// Activation burst: ACTIVATION_WIDE_SIZE PE activation blocks.
#define ACTIVATION_BURST_SIZE_BYTE         (PE_ACTIVATION_BLOCK_SIZE_IN_WORD * ACTIVATION_WIDE_SIZE)
#define ACTIVATION_BURST_SIZE_BYTE_OFFSET  (ACTIVATION_WIDE_SIZE_OFFSET + PE_ACTIVATION_BLOCK_SIZE_IN_WORD_OFFSET)
#define ACTIVATION_WIDE_SIZE_BYTE_MASK     ((1 << ACTIVATION_BURST_SIZE_BYTE_OFFSET) - 1)

// The same burst, divided evenly over the rows of one PE row group.
#define ACTIVATION_BURST_SIZE_IN_PE_ROW_GROUP         (ACTIVATION_BURST_SIZE_BYTE / PE_ROWS_PER_GROUP)
#define ACTIVATION_BURST_SIZE_IN_PE_ROW_GROUP_OFFSET  (ACTIVATION_BURST_SIZE_BYTE_OFFSET - DIVIDE_BY_PE_ROWS_PER_GROUP_SHIFT)
// A negative shift means the row group has more rows than the burst has bytes.
#if (ACTIVATION_BURST_SIZE_IN_PE_ROW_GROUP_OFFSET < 0)
#error The number of rows in a PE group must be smaller or equal to the activation burst size
#endif
#define ACTIVATION_WIDE_SIZE_IN_PE_ROW_GROUP_MASK     ((1 << ACTIVATION_BURST_SIZE_IN_PE_ROW_GROUP_OFFSET) - 1)
// Wide weight access: size, its log2, and the matching remainder mask.
#define WEIGHT_WIDE_SIZE                 4
// Number of bits to shift the transfer block index to the right in order to
// recover the wide offset.
#define WEIGHT_WIDE_SIZE_OFFSET          0x2
#define WEIGHT_WIDE_SIZE_REMAINDER_MASK  0x3

// Bytes of weight values in one burst.
#define WEIGHT_BURST_SIZE_VALUE_BYTE (WEIGHT_WIDE_SIZE * PE_SIMD_SIZE * CLUSTER_SIZE)
// Bytes of index data in one burst; only defined with SPW_SYSTEM.
#ifdef SPW_SYSTEM
#define WEIGHT_BURST_SIZE_INDEX_BYTE (WEIGHT_WIDE_SIZE*INDEX_CHAR_ARRAY_SIZE)
#endif
#define WMOVER_FILTER_DRAM_BLOCK_ACCESS_UNROLL_FACTOR 4

// Kernel (weight) cache: size in bytes, and depth in weight bursts.
#define KERNEL_CACHE_SIZE_VALUE_BYTE  (32 * 1024)
#define KERNEL_CACHE_DEPTH            (KERNEL_CACHE_SIZE_VALUE_BYTE / WEIGHT_BURST_SIZE_VALUE_BYTE)

// Tile dimension limits.
#define MAX_OUTPUT_TILE_WIDTH_PER_COL  8
#define MAX_OUTPUT_TILE_HEIGHT         31
#define MAX_INPUT_TILE_WIDTH_PER_COL   8
#define MAX_INPUT_TILE_HEIGHT          31

// Input activation cache: size in bytes, and depth in activation bursts.
// #define IA_CACHE_SIZE_BYTE 16384
#define IA_CACHE_SIZE_BYTE  (32 * 1024)
#define IA_CACHE_DEPTH      (IA_CACHE_SIZE_BYTE/ACTIVATION_BURST_SIZE_BYTE)

#define WEIGHT_MOVER_BIAS_CACHE_SIZE 2048

// Output activation cache: size in bytes, and depth in activation bursts.
//TODO: Change this back to the commented line
//#define OA_CACHE_SIZE_BYTE 8192
#define OA_CACHE_SIZE_BYTE  (32 * 1024)
#define OA_CACHE_DEPTH      (OA_CACHE_SIZE_BYTE/ACTIVATION_BURST_SIZE_BYTE)
// Accumulator width in bits, with the matching constants:
//   ACCUM_MASK / MULT_MASK : the low ACCUMULATOR_WIDTH bits set
//   ACCUM_MIN              : only bit (ACCUMULATOR_WIDTH - 1) set
// With EMULATOR defined the 32-bit constants are always selected; note that
// ACCUMULATOR_WIDTH itself still expands to the value given here.
#define ACCUMULATOR_WIDTH 28

#ifdef EMULATOR
#pragma message("WARNING: IN EMULATOR MODE, ACCUMULATOR_WIDTH IS FIXED TO 32")
#endif

#if defined(EMULATOR) || (ACCUMULATOR_WIDTH == 32)
#define ACCUM_MASK  0xFFFFFFFF
#define MULT_MASK   0xFFFFFFFF
#define ACCUM_MIN   0x80000000
#elif (ACCUMULATOR_WIDTH == 28)
#define ACCUM_MASK  0x0FFFFFFF
#define MULT_MASK   0x0FFFFFFF
#define ACCUM_MIN   0x08000000
#elif (ACCUMULATOR_WIDTH == 24)
#define ACCUM_MASK  0x00FFFFFF
#define MULT_MASK   0x00FFFFFF
#define ACCUM_MIN   0x00800000
#elif (ACCUMULATOR_WIDTH == 20)
#define ACCUM_MASK  0x000FFFFF
#define MULT_MASK   0x000FFFFF
#define ACCUM_MIN   0x00080000
#elif (ACCUMULATOR_WIDTH == 16)
#define ACCUM_MASK  0x0000FFFF
#define MULT_MASK   0x0000FFFF
#define ACCUM_MIN   0x00008000
#else
#error Accumulator width should be from 32-bit, 28-bit, 24-bit, 20-bit, and 16-bit
#endif
// Misc accumulator width in bits, with the matching constants:
//   MISC_ACCUM_MASK / MISC_MULT_MASK : the low MISC_ACCUMULATOR_WIDTH bits set
//   MISC_ACCUM_MIN                   : only bit (MISC_ACCUMULATOR_WIDTH - 1) set
// With EMULATOR defined the 32-bit constants are always selected; note that
// MISC_ACCUMULATOR_WIDTH itself still expands to the value given here.
#define MISC_ACCUMULATOR_WIDTH 16

#ifdef EMULATOR
#pragma message("WARNING: IN EMULATOR MODE, MISC_ACCUMULATOR_WIDTH IS FIXED TO 32")
#endif

#if defined(EMULATOR) || (MISC_ACCUMULATOR_WIDTH == 32)
#define MISC_ACCUM_MASK  0xFFFFFFFF
#define MISC_MULT_MASK   0xFFFFFFFF
#define MISC_ACCUM_MIN   0x80000000
#elif (MISC_ACCUMULATOR_WIDTH == 28)
#define MISC_ACCUM_MASK  0x0FFFFFFF
#define MISC_MULT_MASK   0x0FFFFFFF
#define MISC_ACCUM_MIN   0x08000000
#elif (MISC_ACCUMULATOR_WIDTH == 24)
#define MISC_ACCUM_MASK  0x00FFFFFF
#define MISC_MULT_MASK   0x00FFFFFF
#define MISC_ACCUM_MIN   0x00800000
#elif (MISC_ACCUMULATOR_WIDTH == 20)
#define MISC_ACCUM_MASK  0x000FFFFF
#define MISC_MULT_MASK   0x000FFFFF
#define MISC_ACCUM_MIN   0x00080000
#elif (MISC_ACCUMULATOR_WIDTH == 16)
#define MISC_ACCUM_MASK  0x0000FFFF
#define MISC_MULT_MASK   0x0000FFFF
#define MISC_ACCUM_MIN   0x00008000
#else
#error Misc accumulator width should be from 32-bit, 28-bit, 24-bit, 20-bit, and 16-bit
#endif
// Timeout limit (2^25 - 1). Its unit and its users are not visible in this header.
#define TIMEOUT 0x1FFFFFF
#endif