diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation.h b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation.h
index f89711328..23109343d 100755
--- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation.h
+++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation.h
@@ -130,7 +130,17 @@ void sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in])
 enum class softmax_implementation {latency=0, legacy=1, stable=2};
 
 template <class data_T, typename CONFIG_T>
-inline unsigned softmax_idx_from_real_val(const data_T x){
+inline unsigned softmax_stable_idx_from_real_val(const data_T x){
+    // Number of address bits for table
+    static constexpr int N = ceillog2(CONFIG_T::table_size);
+
+    // Slice the top N bits of the input
+    hls_register ac_int<N, false> y = x.template slc<N>(x.width-N-1);
+    return y.to_uint();
+}
+
+template <class data_T, typename CONFIG_T>
+inline unsigned softmax_latency_idx_from_real_val(const data_T x){
     // Number of address bits for table
     static constexpr int N = ceillog2(CONFIG_T::table_size);
@@ -148,19 +158,12 @@ void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){
     // Find maximum
     Op_max<data_T> op_max;
     hls_register data_T x_max = reduce<data_T, CONFIG_T::n_in, Op_max<data_T>>(data, op_max);
-
-    // Calculate differences from the maximum, forcing rounding and saturation for better accuracy
-    hls_register ac_fixed<data_T::width, data_T::i_width, true, AC_RND, AC_SAT> d_xi_xmax[CONFIG_T::n_in];
-    #pragma unroll
-    for(unsigned i = 0; i < CONFIG_T::n_in; i++) {
-        d_xi_xmax[i] = data[i] - x_max;
-    }
 
     // Calculate all the e^x's
     hls_register typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in];
     #pragma unroll
     for(unsigned i = 0; i < CONFIG_T::n_in; i++) {
-        exp_res[i] = exp_table[softmax_idx_from_real_val<data_T, CONFIG_T>(d_xi_xmax[i])];
+        exp_res[i] = exp_table[softmax_stable_idx_from_real_val<data_T, CONFIG_T>(data[i] - x_max)];
     }
 
     // Explicitly sum previously calculated exponentials with an adder tree
@@ -168,7 +171,7 @@ void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){
     hls_register typename CONFIG_T::exp_table_t exp_sum = reduce<typename CONFIG_T::exp_table_t, CONFIG_T::n_in, Op_add<typename CONFIG_T::exp_table_t>>(exp_res, op_add);
 
     // Multiply previously calculated exponetials with the reciprocal of the sum
-    hls_register typename CONFIG_T::inv_table_t inv_exp_sum = invert_table[softmax_idx_from_real_val<typename CONFIG_T::exp_table_t, CONFIG_T>(exp_sum)];
+    hls_register typename CONFIG_T::inv_table_t inv_exp_sum = invert_table[softmax_stable_idx_from_real_val<typename CONFIG_T::exp_table_t, CONFIG_T>(exp_sum)];
     #pragma unroll
     for(unsigned i = 0; i < CONFIG_T::n_in; i++) {
         res[i] = exp_res[i] * inv_exp_sum;
@@ -178,15 +181,6 @@ void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){
 // TODO - Improve accuracy
 template <class data_T, typename res_T, typename CONFIG_T>
 void softmax_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){
-    /*
-    * Note: The latency tables are equivalent to stable tables
-    * However, the compiler cannot include the same table twice
-    * Therefore, an out-of-scope exception is thrown in one of the functions
-    * Temporary solution - Create the same table twice in quartus_writer.py
-    * Long-term solution - Only create tables needed by the network;
-    * Currently, quartus-writer.py generates LUTs for all activations,
-    * Regardless if they are present in the network or not
-    */
     #include "activation_tables/exp_table_latency.tb"
     #include "activation_tables/invert_table_latency.tb"
 
@@ -194,7 +188,7 @@ void softmax_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){
     hls_register typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in];
     #pragma unroll
     for(unsigned i = 0; i < CONFIG_T::n_in; i++) {
-        exp_res[i] = exp_table_latency[softmax_idx_from_real_val<data_T, CONFIG_T>(data[i])];
+        exp_res[i] = exp_table_latency[softmax_latency_idx_from_real_val<data_T, CONFIG_T>(data[i])];
     }
 
     // Explicitly sum the results with an adder tree.
@@ -202,7 +196,7 @@ void softmax_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){
     hls_register typename CONFIG_T::exp_table_t exp_sum = reduce<typename CONFIG_T::exp_table_t, CONFIG_T::n_in, Op_add<typename CONFIG_T::exp_table_t>>(exp_res, op_add);
 
     // Multiply previously calculated exponetials with the reciprocal of the sum
-    hls_register typename CONFIG_T::inv_table_t inv_exp_sum = invert_table_latency[softmax_idx_from_real_val<typename CONFIG_T::exp_table_t, CONFIG_T>(exp_sum)];
+    hls_register typename CONFIG_T::inv_table_t inv_exp_sum = invert_table_latency[softmax_latency_idx_from_real_val<typename CONFIG_T::exp_table_t, CONFIG_T>(exp_sum)];
     #pragma unroll
     for(unsigned i = 0; i < CONFIG_T::n_in; i++){
         res[i] = exp_res[i] * inv_exp_sum;
diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation_stream.h b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation_stream.h
index 03e0dc1e5..c5d040000 100644
--- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation_stream.h
+++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation_stream.h
@@ -283,7 +283,7 @@ void softmax_stable(stream<data_T> &data, stream<res_T> &res) {
     hls_register typename CONFIG_T::exp_table_t exp_res[data_T::size];
     #pragma unroll
     for(unsigned j = 0; j < data_T::size; j++) {
-        exp_res[j] = exp_table[softmax_idx_from_real_val<typename data_T::value_type, CONFIG_T>(d_xi_xmax[j])];
+        exp_res[j] = exp_table[softmax_stable_idx_from_real_val<typename data_T::value_type, CONFIG_T>(d_xi_xmax[j])];
     }
 
     // Explicitly sum the results with an adder tree.
@@ -291,7 +291,7 @@ void softmax_stable(stream<data_T> &data, stream<res_T> &res) {
     Op_add<typename CONFIG_T::exp_table_t> op_add;
     hls_register typename CONFIG_T::exp_table_t exp_sum = reduce<typename CONFIG_T::exp_table_t, data_T::size, Op_add<typename CONFIG_T::exp_table_t>>(exp_res, op_add);
 
-    hls_register typename CONFIG_T::inv_table_t inv_exp_sum = invert_table[softmax_idx_from_real_val<typename CONFIG_T::exp_table_t, CONFIG_T>(exp_sum)];
+    hls_register typename CONFIG_T::inv_table_t inv_exp_sum = invert_table[softmax_stable_idx_from_real_val<typename CONFIG_T::exp_table_t, CONFIG_T>(exp_sum)];
 
     res_T out_pack;
     SoftmaxInvPackLoop:
@@ -327,7 +327,7 @@ void softmax_latency(stream<data_T> &data, stream<res_T> &res){
     SoftmaxExpPackLoop:
     #pragma unroll
     for(unsigned j = 0; j < data_T::size; j++) {
-        exp_res[j] = exp_table_latency[softmax_idx_from_real_val<typename data_T::value_type, CONFIG_T>(in_pack[j])];
+        exp_res[j] = exp_table_latency[softmax_latency_idx_from_real_val<typename data_T::value_type, CONFIG_T>(in_pack[j])];
     }
 
     // Explicitly sum the results with an adder tree.
@@ -336,7 +336,7 @@ void softmax_latency(stream<data_T> &data, stream<res_T> &res){
     hls_register typename CONFIG_T::exp_table_t exp_sum = reduce<typename CONFIG_T::exp_table_t, data_T::size, Op_add<typename CONFIG_T::exp_table_t>>(exp_res, op_add);
 
     // Multiply previously calculated exponetials with the reciprocal of the sum
-    hls_register typename CONFIG_T::inv_table_t inv_exp_sum = invert_table_latency[softmax_idx_from_real_val<typename CONFIG_T::exp_table_t, CONFIG_T>(exp_sum)];
+    hls_register typename CONFIG_T::inv_table_t inv_exp_sum = invert_table_latency[softmax_latency_idx_from_real_val<typename CONFIG_T::exp_table_t, CONFIG_T>(exp_sum)];
 
     res_T out_pack;
     SoftmaxInvPackLoop:
diff --git a/hls4ml/writer/quartus_writer.py b/hls4ml/writer/quartus_writer.py
index 67cbee6f9..e0debd6d6 100644
--- a/hls4ml/writer/quartus_writer.py
+++ b/hls4ml/writer/quartus_writer.py
@@ -918,12 +918,19 @@ def __write_exp_table(self, model, path):
         except:
             # FixedPrecisionType wasn't correctly stored in layer attributes, use default values
             pass
+        if fp_signed is False:
+            raise Exception('Softmax types need to be signed')
 
         sep = ''
         N = ceil_log2(table_size)
         for i in range(table_size):
             f = FixedPointEmulator(fp_bits, fp_integer, signed=fp_signed)
-            f.set_msb_bits(uint_to_binary(i, N))
+            b = uint_to_binary(i, N)
+            if i == 0:
+                b.insert(0, 0)
+            else:
+                b.insert(0, 1)
+            f.set_msb_bits(b)
             real_val = f.exp_float()
             h_file.write(sep + str(real_val))
             sep = ", "
@@ -957,19 +964,23 @@ def __write_invert_table(self, model, path):
         except:
             # FixedPrecisionType wasn't correctly stored in layer attributes, use default values
             pass
+        if fp_signed is False:
+            raise Exception('Softmax types need to be signed')
 
         sep = ''
         N = ceil_log2(table_size)
         for i in range(table_size):
             f = FixedPointEmulator(fp_bits, fp_integer, signed=fp_signed)
-            f.set_msb_bits(uint_to_binary(i, N))
+            b = uint_to_binary(i, N)
+            b.insert(0, 0)
+            f.set_msb_bits(b)
             real_val = f.inv_float()
             h_file.write(sep + str(real_val))
             sep = ", "
         h_file.write('};\n')
         h_file.close()
-    
+
     def __write_exp_table_latency(self, model, path):
         table_name = 'exp_table_latency'
         table_size = self.__get_table_size(model, 'softmax')
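
Note on the split index helpers (illustrative, not part of the patch): softmax_stable_idx_from_real_val slices the N address bits from just below the sign bit (x.width-N-1). Its inputs, x - x_max <= 0, have a known sign, so dropping the sign bit from the index doubles the usable table resolution. softmax_latency_idx_from_real_val keeps the original function body, which (an assumption, since the unchanged body is outside these hunks) slices the top N bits sign included, matching the untouched _latency table generation. A minimal Python stand-in for the two slices on a raw two's-complement word, with hypothetical helper names:

    def stable_idx(raw: int, width: int, n_addr: int) -> int:
        # slc<N>(x.width - N - 1): the N bits directly below the sign bit
        return (raw >> (width - n_addr - 1)) & ((1 << n_addr) - 1)

    def latency_idx(raw: int, width: int, n_addr: int) -> int:
        # Assumed slc<N>(x.width - N): the top N bits, sign bit included
        return (raw >> (width - n_addr)) & ((1 << n_addr) - 1)

    # 8-bit word, 8-entry table: +4.0 and -4.0 in an ac_fixed<8,4,true>-like format
    pos, neg = 0x40, 0xC0
    print(latency_idx(pos, 8, 3), latency_idx(neg, 8, 3))  # 2 6 - the sign separates them
    print(stable_idx(neg, 8, 3))                           # 4 - sign dropped, finer step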
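
On the writer side, the stable tables are now generated with the sign bit reconstructed explicitly, which is why the new guard rejects unsigned softmax types. For exp_table the prepended sign is 1 for every index except 0 (x - x_max is negative everywhere except at the maximum itself, where it is exactly 0, so index 0 must map to exp(0) = 1); for invert_table it is always 0, since its input exp_sum is non-negative. A rough plain-float sketch of what FixedPointEmulator evaluates per exp_table entry, assuming the usual two's-complement MSB weights (sign bit worth -2**(integer-1), then descending powers of two):

    import math

    def stable_exp_entry(i: int, n_addr: int, integer: int) -> float:
        sign = 0 if i == 0 else 1                 # mirrors b.insert(0, 0) / b.insert(0, 1)
        val = -sign * 2.0 ** (integer - 1)        # two's-complement sign weight
        for k in range(n_addr):                   # the N index bits, MSB first
            bit = (i >> (n_addr - 1 - k)) & 1
            val += bit * 2.0 ** (integer - 2 - k)
        return math.exp(val)

    # Toy 8-entry table for a type with 2 integer bits:
    for i in (0, 4, 7):
        print(i, round(stable_exp_entry(i, 3, 2), 4))
    # 0 -> 1.0 (exp(0), the d == 0 bucket); 4 -> 0.3679 (exp(-1)); 7 -> 0.7788 (exp(-0.25))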