Add missing MVE support for int16 convolution (ARM-software#124)
- Removes arm_convolve_s16
- Updates arm_nn_mat_mul_kernel_s16.c so that it is used by both the scalar and DSP variants
- Adds arm_nn_mat_mult_nt_t_s16.c with MVE support
- Refactors and renames arm_convolve_fast_s16 to arm_convolve_s16 so
  that all variants use the previously named fast s16 version (see the
  migration sketch below)
- Updates unit tests to increase test coverage for convolution s16
- Keras-based unit tests use a null bias as opposed to an array of zeros
- Corrects the PDSC file

BUGS=ARM-software#101
ARM-software#119
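
Since this commit folds the optimized kernel into arm_convolve_s16, callers of the removed fast variant only need a rename; a minimal migration sketch (the surrounding ctx/params/data variables are illustrative):

/* Before this commit: */
ctx.size = arm_convolve_fast_s16_get_buffer_size(&input_dims, &filter_dims);
status = arm_convolve_fast_s16(&ctx, &conv_params, &quant_params, &input_dims, input_data,
                               &filter_dims, filter_data, &bias_dims, bias_data,
                               &output_dims, output_data);

/* After this commit: */
ctx.size = arm_convolve_s16_get_buffer_size(&input_dims, &filter_dims);
status = arm_convolve_s16(&ctx, &conv_params, &quant_params, &input_dims, input_data,
                          &filter_dims, filter_data, &bias_dims, bias_data,
                          &output_dims, output_data);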
mansnils authored Apr 2, 2024
1 parent d9cfc00 commit 1af62f8
Showing 52 changed files with 3,109 additions and 1,015 deletions.
7 changes: 4 additions & 3 deletions ARM.CMSIS-NN.pdsc
@@ -38,6 +38,7 @@
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_1_x_n_s8.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s4_s16.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s16.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_depthwise_conv_wrapper_s8.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_1x1_s8_fast.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_1x1_s4_fast.c"/>
@@ -53,7 +54,6 @@
<file category="source" name="Source/ConvolutionFunctions/arm_depthwise_conv_wrapper_s4.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_depthwise_conv_s4_opt.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_depthwise_conv_s4.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_fast_s16.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_s4.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_s8.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_s16.c"/>
@@ -69,13 +69,13 @@
<file category="source" name="Source/ConvolutionFunctions/arm_nn_depthwise_conv_s8_core.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_transpose_conv_get_buffer_sizes_s8.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_transpose_conv_s8.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_get_buffer_sizes_s8.c"/>
<file category="source" name="Source/ConcatenationFunctions/arm_concatenation_s8_x.c"/>
<file category="source" name="Source/ConcatenationFunctions/arm_concatenation_s8_w.c"/>
<file category="source" name="Source/ConcatenationFunctions/arm_concatenation_s8_y.c"/>
<file category="source" name="Source/ConcatenationFunctions/arm_concatenation_s8_z.c"/>
<file category="source" name="Source/SVDFunctions/arm_svdf_s8.c"/>
<file category="source" name="Source/SVDFunctions/arm_svdf_state_s16_s8.c"/>
<file category="source" name="Source/SVDFunctions/arm_svdf_get_buffer_sizes_s8.c"/>
<file category="source" name="Source/PoolingFunctions/arm_max_pool_s8.c"/>
<file category="source" name="Source/PoolingFunctions/arm_max_pool_s16.c"/>
<file category="source" name="Source/PoolingFunctions/arm_avgpool_s8.c"/>
@@ -101,9 +101,9 @@
<file category="source" name="Source/NNSupportFunctions/arm_s8_to_s16_unordered_with_offset.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_mat_mult_nt_t_s4.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_mat_mult_nt_t_s8.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_mat_mult_nt_t_s16.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_mat_mult_nt_t_s8_s32.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_s16.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_mat_mul_kernel_s16.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_padded_s8.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_mat_mul_core_4x_s8.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nntables.c"/>
@@ -112,6 +112,7 @@
<file category="source" name="Source/NNSupportFunctions/arm_nn_lstm_step_s8.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_lstm_calculate_gate_s8_s16.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_vec_mat_mul_result_acc_s8_s16.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_row_offset_s8_s16.c"/>
<file category="source" name="Source/FullyConnectedFunctions/arm_fully_connected_s4.c"/>
<file category="source" name="Source/FullyConnectedFunctions/arm_fully_connected_s8.c"/>
<file category="source" name="Source/FullyConnectedFunctions/arm_fully_connected_s16.c"/>
68 changes: 10 additions & 58 deletions Include/arm_nnfunctions.h
@@ -21,8 +21,8 @@
* Title: arm_nnfunctions.h
* Description: Public header file for CMSIS NN Library
*
* $Date: 20 February 2024
* $Revision: V.14.0.0
* $Date: 11 March 2024
* $Revision: V.15.0.0
*
* Target : Arm(R) M-Profile Architecture
@@ -375,8 +375,10 @@ arm_cmsis_nn_status arm_convolve_s4(const cmsis_nn_context *ctx,
* @param[in] bias_data Optional bias data pointer. Data type: int32
* @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT]
* @param[out] output_data Output data pointer. Data type: int8
* @return The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
*
* @return The function returns <code>ARM_CMSIS_NN_SUCCESS</code> if successful,
* <code>ARM_CMSIS_NN_ARG_ERROR</code> if the arguments are invalid, or
* <code>ARM_CMSIS_NN_NO_IMPL_ERROR</code> if no implementation is available
*
* @details
* 1. Supported framework: TensorFlow Lite micro
@@ -522,8 +524,10 @@ int32_t arm_transpose_conv_s8_get_buffer_size_mve(const cmsis_nn_dims *input_dim
* @param[in] bias_data Optional bias data pointer. Data type: int64
* @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT]
* @param[out] output_data Output data pointer. Data type: int16
* @return The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
*
* @return The function returns <code>ARM_CMSIS_NN_SUCCESS</code> if successful,
* <code>ARM_CMSIS_NN_ARG_ERROR</code> if the arguments are invalid, or
* <code>ARM_CMSIS_NN_NO_IMPL_ERROR</code> if no implementation is available
*
* @details
* 1. Supported framework: TensorFlow Lite micro
@@ -541,47 +545,6 @@ arm_cmsis_nn_status arm_convolve_s16(const cmsis_nn_context *ctx,
const int64_t *bias_data,
const cmsis_nn_dims *output_dims,
int16_t *output_data);
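
A minimal usage sketch against this declaration and arm_convolve_s16_get_buffer_size below (the helper name and malloc-based scratch handling are illustrative; applications often use a static buffer instead):

#include "arm_nnfunctions.h"
#include <stdlib.h>

static arm_cmsis_nn_status run_convolve_s16(const cmsis_nn_conv_params *conv_params,
                                            const cmsis_nn_per_channel_quant_params *quant_params,
                                            const cmsis_nn_dims *input_dims, const int16_t *input_data,
                                            const cmsis_nn_dims *filter_dims, const int8_t *filter_data,
                                            const cmsis_nn_dims *bias_dims, const int64_t *bias_data,
                                            const cmsis_nn_dims *output_dims, int16_t *output_data)
{
    cmsis_nn_context ctx = {NULL, 0};
    ctx.size = arm_convolve_s16_get_buffer_size(input_dims, filter_dims);
    if (ctx.size > 0)
    {
        ctx.buf = malloc((size_t)ctx.size); /* clear before use if security requires it */
    }
    const arm_cmsis_nn_status status =
        arm_convolve_s16(&ctx, conv_params, quant_params, input_dims, input_data,
                         filter_dims, filter_data, bias_dims, bias_data, /* bias_data may be NULL */
                         output_dims, output_data);
    free(ctx.buf); /* free(NULL) is a no-op */
    return status;
}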
/**
* @brief Optimized s16 convolution function
* @param[in, out] ctx Function context that contains the additional buffer if required by the function.
* arm_convolve_fast_s16_get_buffer_size will return the buffer_size if required.
* The caller is expected to clear the buffer, if applicable, for security reasons.
* @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...).
* conv_params->input_offset : Not used
* conv_params->output_offset : Not used
* @param[in] quant_params Per-channel quantization info.
* It contains the multiplier and shift values to be applied to each output channel
* @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
* @param[in] input_data Input (activation) data pointer. Data type: int16
* @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the
* spatial filter dimensions. (filter_dims->w * filter_dims->h * input_dims->c) must not
exceed 512
* @param[in] filter_data Filter data pointer. Data type: int8
* @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT]
* @param[in] bias_data Optional bias data pointer. Data type: int64
* @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT]
* @param[out] output_data Output data pointer. Data type: int16
* @return The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
*
* @details
* 1. Supported framework: TensorFlow Lite micro
* 2. Additional memory is required for optimization. Refer to argument 'ctx' for details.
* 3. Implementation supports kernel volumes (filter width * filter height * input channels) < 512.
*
*/

arm_cmsis_nn_status arm_convolve_fast_s16(const cmsis_nn_context *ctx,
const cmsis_nn_conv_params *conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const int16_t *input_data,
const cmsis_nn_dims *filter_dims,
const int8_t *filter_data,
const cmsis_nn_dims *bias_dims,
const int64_t *bias_data,
const cmsis_nn_dims *output_dims,
int16_t *output_data);

/**
* @brief Get the required buffer size for s16 convolution function
@@ -594,17 +557,6 @@ arm_cmsis_nn_status arm_convolve_fast_s16(const cmsis_nn_context *ctx,
*/
int32_t arm_convolve_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);

/**
* @brief Get the required buffer size for fast s16 convolution function
*
* @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
* @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK
* are the spatial filter dimensions
* @return The function returns required buffer size(bytes)
*
*/
int32_t arm_convolve_fast_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);

/**
* @brief Fast s4 version for 1x1 convolution (non-square shape)
*
54 changes: 50 additions & 4 deletions Include/arm_nnsupportfunctions.h
@@ -21,8 +21,8 @@
* Title: arm_nnsupportfunctions.h
* Description: Public header file of support functions for CMSIS NN Library
*
* $Date: 14 February 2024
* $Revision: V.19.0.0
* $Date: 22 March 2024
* $Revision: V.20.0.0
*
* Target : Arm(R) M-Profile Architecture
* -------------------------------------------------------------------- */
@@ -63,6 +63,10 @@ extern "C" {
// will result in lower scratch buffer usage.
#define CH_IN_BLOCK_MVE (124)

// For int16 input, int64 accumulation is needed when the number of columns is above this
// limit in order not to lose precision.
#define MAX_COL_COUNT (512)
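
Assuming full-range operands, the worst-case magnitude of a single int16 * int8 product is 32768 * 128 = 2^22, and a signed 32-bit accumulator holds values below 2^31, so only about 2^31 / 2^22 = 512 such products can be accumulated in int32 before overflow becomes possible; a compile-time sketch of that arithmetic:

#include <stdint.h>
/* Worst-case magnitude of one int16 * int8 product: 2^15 * 2^7 = 2^22. */
#define WORST_CASE_TERM (32768LL * 128LL)
/* Worst-case terms an int32 accumulator can absorb: 2^31 / 2^22 = 512. */
_Static_assert((INT32_MAX + 1LL) / WORST_CASE_TERM == 512,
               "int32 accumulation is safe up to ~512 columns");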

/**
* @brief definition to pack four 8 bit values.
*/
@@ -281,8 +285,8 @@ int16_t *arm_nn_mat_mult_kernel_s16(const int8_t *input_a,
const int32_t output_ch,
const int32_t *out_shift,
const int32_t *out_mult,
const int16_t activation_min,
const int16_t activation_max,
const int32_t activation_min,
const int32_t activation_max,
const int32_t num_col_a,
const int64_t *const output_bias,
int16_t *out_0);
@@ -445,6 +449,48 @@ arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8(const int8_t *lhs,
const int32_t row_address_offset,
const int32_t lhs_cols_offset);

/**
* @brief General Matrix-multiplication function with per-channel requantization and int16 input (LHS) and output.
* This function assumes:
* - LHS input matrix NOT transposed (nt)
* - RHS input matrix transposed (t)
*
* @note This operation also performs the broadcast bias addition before the requantization
*
* @param[in] lhs Pointer to the LHS input matrix
* @param[in] rhs Pointer to the RHS input matrix
* @param[in] bias Pointer to the bias vector. The length of this vector is equal to the number of
* output columns (or RHS input rows)
* @param[out] dst Pointer to the output matrix with "m" rows and "n" columns
* @param[in] dst_multipliers Pointer to the multipliers vector needed for the per-channel requantization.
* The length of this vector is equal to the number of output columns (or RHS input
* rows)
* @param[in] dst_shifts Pointer to the shifts vector needed for the per-channel requantization. The length
* of this vector is equal to the number of output columns (or RHS input rows)
* @param[in] lhs_rows Number of LHS input rows
* @param[in] rhs_rows Number of RHS input rows
* @param[in] rhs_cols Number of LHS/RHS input columns
* @param[in] activation_min Minimum value to clamp down the output. Range : int16
* @param[in] activation_max Maximum value to clamp up the output. Range : int16
*
* @details MVE implementation only.
*
* @return The function returns <code>ARM_CMSIS_NN_SUCCESS</code> on success, or
* <code>ARM_CMSIS_NN_NO_IMPL_ERROR</code> on targets without MVE
*
*/
arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s16(const int16_t *lhs,
const int8_t *rhs,
const int64_t *bias,
int16_t *dst,
const int32_t *dst_multipliers,
const int32_t *dst_shifts,
const int32_t lhs_rows,
const int32_t rhs_rows,
const int32_t rhs_cols,
const int32_t activation_min,
const int32_t activation_max);
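
A minimal call sketch for the new function, with illustrative shapes and requantization values; the status comment mirrors the documented MVE-only behaviour:

#include "arm_nnsupportfunctions.h"
#include <stdint.h>

static arm_cmsis_nn_status example_mat_mult_nt_t_s16(void)
{
    /* lhs: 2x4 (not transposed), rhs: 2x4 (transposed), dst: 2x2. */
    static const int16_t lhs[2 * 4] = {1, 2, 3, 4, 5, 6, 7, 8};
    static const int8_t rhs[2 * 4] = {1, 1, 1, 1, 2, 2, 2, 2};
    static const int64_t bias[2] = {0, 0};
    static const int32_t multipliers[2] = {1073741824, 1073741824}; /* 0.5 in Q31, illustrative */
    static const int32_t shifts[2] = {0, 0};
    static int16_t dst[2 * 2];

    const arm_cmsis_nn_status status =
        arm_nn_mat_mult_nt_t_s16(lhs, rhs, bias, dst, multipliers, shifts,
                                 2 /* lhs_rows */, 2 /* rhs_rows */, 4 /* rhs_cols */,
                                 INT16_MIN, INT16_MAX);
    /* Returns ARM_CMSIS_NN_NO_IMPL_ERROR on targets without MVE. */
    return status;
}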

/**
* @brief General Matrix-multiplication function with int8 input and int32 output.
* This function assumes: