CMSIS-NN  Version 1.2.0
CMSIS NN Software Library
 All Data Structures Namespaces Files Functions Variables Enumerations Enumerator Macros Groups Pages
Neural Network Convolution Functions

Macros

#define DIM_KER_X
 
#define DIM_KER_Y
 

Functions

arm_status arm_convolve_1x1_HWC_q7_fast_nonsquare (const q7_t *Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel_x, const uint16_t dim_kernel_y, const uint16_t padding_x, const uint16_t padding_y, const uint16_t stride_x, const uint16_t stride_y, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out_x, const uint16_t dim_im_out_y, q15_t *bufferA, q7_t *bufferB)
 Fast Q7 version of 1x1 convolution (non-square shape) More...
 
arm_status arm_convolve_1x1_s8_fast (const q7_t *input, const uint16_t input_x, const uint16_t input_y, const uint16_t input_ch, const uint16_t input_batches, const q7_t *kernel, const uint16_t output_ch, const uint16_t pad_x, const uint16_t pad_y, const uint16_t stride_x, const uint16_t stride_y, const int32_t *bias, q7_t *output, const int32_t *output_shift, const int32_t *output_mult, const int32_t out_offset, const int32_t input_offset, const int32_t out_activation_min, const int32_t out_activation_max, const uint16_t output_x, const uint16_t output_y, q15_t *buffer_a)
 Fast s8 version for 1x1 convolution (non-square shape) More...
 
int32_t arm_convolve_1x1_s8_fast_get_buffer_size (const uint16_t input_ch)
 Get the required buffer size for the fast 1x1 convolution (non-square shape) s8 convolution function. More...
 
arm_status arm_convolve_HWC_q15_basic (const q15_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q15_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q15_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q15_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA, q7_t *bufferB)
 Basic Q15 convolution function. More...
 
arm_status arm_convolve_HWC_q15_fast (const q15_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q15_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q15_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q15_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA, q7_t *bufferB)
 Fast Q15 convolution function. More...
 
arm_status arm_convolve_HWC_q15_fast_nonsquare (const q15_t *Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, const uint16_t ch_im_in, const q15_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel_x, const uint16_t dim_kernel_y, const uint16_t padding_x, const uint16_t padding_y, const uint16_t stride_x, const uint16_t stride_y, const q15_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q15_t *Im_out, const uint16_t dim_im_out_x, const uint16_t dim_im_out_y, q15_t *bufferA, q7_t *bufferB)
 Fast Q15 convolution function (non-square shape) More...
 
arm_status arm_convolve_HWC_q7_basic (const q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA, q7_t *bufferB)
 Basic Q7 convolution function. More...
 
arm_status arm_convolve_HWC_q7_basic_nonsquare (const q7_t *Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel_x, const uint16_t dim_kernel_y, const uint16_t padding_x, const uint16_t padding_y, const uint16_t stride_x, const uint16_t stride_y, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out_x, const uint16_t dim_im_out_y, q15_t *bufferA, q7_t *bufferB)
 Basic Q7 convolution function (non-square shape) More...
 
arm_status arm_convolve_HWC_q7_fast (const q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA, q7_t *bufferB)
 Fast Q7 convolution function. More...
 
arm_status arm_convolve_HWC_q7_fast_nonsquare (const q7_t *Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel_x, const uint16_t dim_kernel_y, const uint16_t padding_x, const uint16_t padding_y, const uint16_t stride_x, const uint16_t stride_y, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out_x, const uint16_t dim_im_out_y, q15_t *bufferA, q7_t *bufferB)
 Fast Q7 convolution function (non-square shape) More...
 
arm_status arm_convolve_HWC_q7_RGB (const q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA, q7_t *bufferB)
 Q7 convolution function for RGB image. More...
 
arm_status arm_convolve_s8 (const q7_t *input, const uint16_t input_x, const uint16_t input_y, const uint16_t input_ch, const uint16_t input_batches, const q7_t *kernel, const uint16_t output_ch, const uint16_t kernel_x, const uint16_t kernel_y, const uint16_t pad_x, const uint16_t pad_y, const uint16_t stride_x, const uint16_t stride_y, const int32_t *bias, q7_t *output, const int32_t *output_shift, const int32_t *output_mult, const int32_t out_offset, const int32_t input_offset, const int32_t out_activation_min, const int32_t out_activation_max, const uint16_t output_x, const uint16_t output_y, q15_t *buffer_a)
 Basic s8 convolution function. More...
 
int32_t arm_convolve_s8_get_buffer_size (const uint16_t input_ch, const uint16_t kernel_x, const uint16_t kernel_y)
 Get the required buffer size for s8 convolution function. More...
 
arm_status arm_depthwise_conv_s8 (const q7_t *input, const uint16_t input_x, const uint16_t input_y, const uint16_t input_ch, const q7_t *kernel, const uint16_t output_ch, const uint16_t ch_mult, const uint16_t kernel_x, const uint16_t kernel_y, const uint16_t pad_x, const uint16_t pad_y, const uint16_t stride_x, const uint16_t stride_y, const int32_t *bias, q7_t *output, const int32_t *output_shift, const int32_t *output_mult, const uint16_t output_x, const uint16_t output_y, const int32_t output_offset, const int32_t input_offset, const int32_t output_activation_min, const int32_t output_activation_max, const uint16_t dilation_x, const uint16_t dilation_y, q15_t *buffer_a)
 Basic s8 depthwise convolution function. More...
 
arm_status arm_depthwise_conv_s8_opt (const q7_t *input, const uint16_t input_x, const uint16_t input_y, const uint16_t input_ch, const q7_t *kernel, const uint16_t output_ch, const uint16_t kernel_x, const uint16_t kernel_y, const uint16_t pad_x, const uint16_t pad_y, const uint16_t stride_x, const uint16_t stride_y, const int32_t *bias, q7_t *output, const int32_t *output_shift, const int32_t *output_mult, const uint16_t output_x, const uint16_t output_y, const int32_t output_offset, const int32_t input_offset, const int32_t output_activation_min, const int32_t output_activation_max, const uint16_t dilation_x, const uint16_t dilation_y, q15_t *buffer_a)
 Optimized s8 depthwise convolution function with constraint that in_channel equals out_channel. More...
 
int32_t arm_depthwise_conv_s8_opt_get_buffer_size (const uint16_t input_ch, const uint16_t kernel_x, const uint16_t kernel_y)
 Get the required buffer size for optimized s8 depthwise convolution function with constraint that in_channel equals out_channel. More...
 
arm_status arm_depthwise_conv_u8_basic_ver1 (const uint8_t *input, const uint16_t input_x, const uint16_t input_y, const uint16_t input_ch, const uint8_t *kernel, const uint16_t kernel_x, const uint16_t kernel_y, const int16_t ch_mult, const int16_t pad_x, const int16_t pad_y, const int16_t stride_x, const int16_t stride_y, const int16_t dilation_x, const int16_t dilation_y, const int32_t *bias, const int32_t input_offset, const int32_t filter_offset, const int32_t output_offset, uint8_t *output, const uint16_t output_x, const uint16_t output_y, const int32_t output_activation_min, const int32_t output_activation_max, const int32_t out_shift, const int32_t out_mult)
 uint8 depthwise convolution function with asymmetric quantization for even number of channel multiplier and input channels. Unless specified otherwise, arguments are mandatory. Both square and non-square inputs are accepted. More...
 
arm_status arm_depthwise_separable_conv_HWC_q7 (const q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA, q7_t *bufferB)
 Q7 depthwise separable convolution function. More...
 
arm_status arm_depthwise_separable_conv_HWC_q7_nonsquare (const q7_t *Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel_x, const uint16_t dim_kernel_y, const uint16_t padding_x, const uint16_t padding_y, const uint16_t stride_x, const uint16_t stride_y, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out_x, const uint16_t dim_im_out_y, q15_t *bufferA, q7_t *bufferB)
 Q7 depthwise separable convolution function (non-square shape) More...
 

Description

Perform convolution layer

The convolution is implemented in 2 steps: im2col and GEMM

im2col is a process of converting each patch of image data into a column. After im2col, the convolution is computed as matrix-matrix multiplication.

To reduce the memory footprint, the im2col is performed partially. Each iteration, only a few column (i.e., patches) are generated and computed with GEMM kernels similar to CMSIS-DSP arm_mat_mult functions.

Macro Definition Documentation

#define DIM_KER_X
#define DIM_KER_Y

Function Documentation

arm_status arm_convolve_1x1_HWC_q7_fast_nonsquare ( const q7_t *  Im_in,
const uint16_t  dim_im_in_x,
const uint16_t  dim_im_in_y,
const uint16_t  ch_im_in,
const q7_t *  wt,
const uint16_t  ch_im_out,
const uint16_t  dim_kernel_x,
const uint16_t  dim_kernel_y,
const uint16_t  padding_x,
const uint16_t  padding_y,
const uint16_t  stride_x,
const uint16_t  stride_y,
const q7_t *  bias,
const uint16_t  bias_shift,
const uint16_t  out_shift,
q7_t *  Im_out,
const uint16_t  dim_im_out_x,
const uint16_t  dim_im_out_y,
q15_t *  bufferA,
q7_t *  bufferB 
)
Parameters
[in]Im_inpointer to input tensor
[in]dim_im_in_xinput tensor dimension x
[in]dim_im_in_yinput tensor dimension y
[in]ch_im_innumber of input tensor channels
[in]wtpointer to kernel weights
[in]ch_im_outnumber of filters, i.e., output tensor channels
[in]dim_kernel_xfilter kernel size x
[in]dim_kernel_yfilter kernel size y
[in]padding_xpadding size x
[in]padding_ypadding size y
[in]stride_xconvolution stride x
[in]stride_yconvolution stride y
[in]biaspointer to bias
[in]bias_shiftamount of left-shift for bias
[in]out_shiftamount of right-shift for output
[in,out]Im_outpointer to output tensor
[in]dim_im_out_xoutput tensor dimension x
[in]dim_im_out_youtput tensor dimension y
[in,out]bufferApointer to buffer space for input
[in,out]bufferBpointer to buffer space for output
Returns
The function returns either ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.

This function is optimized for convolution with 1x1 kernel size (i.e., dim_kernel_x=1 and dim_kernel_y=1). It can be used for the second half of MobileNets [1] after depthwise separable convolution.

This function is the version with full list of optimization tricks, but with some constraints: ch_im_in is a multiple of 4, ch_im_out is a multiple of 2

[1] MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications https://arxiv.org/abs/1704.04861

References arm_nn_mat_mult_kernel_q7_q15_reordered(), arm_q7_to_q15_reordered_no_shift(), and NN_ROUND.

arm_status arm_convolve_1x1_s8_fast ( const q7_t *  input,
const uint16_t  input_x,
const uint16_t  input_y,
const uint16_t  input_ch,
const uint16_t  input_batches,
const q7_t *  kernel,
const uint16_t  output_ch,
const uint16_t  pad_x,
const uint16_t  pad_y,
const uint16_t  stride_x,
const uint16_t  stride_y,
const int32_t *  bias,
q7_t *  output,
const int32_t *  output_shift,
const int32_t *  output_mult,
const int32_t  out_offset,
const int32_t  input_offset,
const int32_t  out_activation_min,
const int32_t  out_activation_max,
const uint16_t  output_x,
const uint16_t  output_y,
q15_t *  buffer_a 
)
Parameters
[in]inputpointer to input tensor. Format: [N, H, W, in_ch]
[in]input_xinput tensor dimension x
[in]input_yinput tensor dimension y
[in]input_chnumber of input tensor channels
[in]input_batchesnumber of input batches
[in]kernelpointer to kernel weights. Format: [out_ch, H, W, in_ch]
[in]output_chnumber of filters, i.e., output tensor channels
[in]pad_xpadding size x
[in]pad_ypadding size y
[in]stride_xconvolution stride x
[in]stride_yconvolution stride y
[in]biaspointer to per channel bias. Range : int32
[in,out]outputpointer to output tensor. Format: [H, W, out_ch]
[in]output_shiftpointer to per output channel requantization shift parameter.
[in]output_multpointer to per output channel requantization multiplier parameter.
[in]out_offsetoutput tensor offset. Range: int8
[in]input_offsetinput tensor offset. Range: int8
[in]out_activation_minMinimum value to clamp the output to. Range: int8
[in]out_activation_maxMaximum value to clamp the output to. Range: int8
[in]output_xoutput tensor width
[in]output_youtput tensor height
[in]buffer_apointer to buffer space used for input optimization(partial im2col) and is necessary when ARM_MATH_LOOPUNROLL and ARM_MATH_DSP is defined. Required space: 2 * input_ch * sizeof(q15_t) bytes Use arm_convolve_1x1_s8_fast_get_buffer_size() to get the size
Returns
The function returns either ARM_MATH_SIZE_MISMATCH if argument constraints fail. or, ARM_MATH_SUCCESS on successful completion.
  • Supported framework : TensorFlow Lite Micro
  • The following constraints on the arguments apply
    1. input_ch is a multiple of 4
    2. output_ch is a multiple of 2
    3. padding equals 0
    4. Stride equals 1
    5. kernel dimension is 1x1 (Not provided in the argument list)

References arm_convolve_s8(), arm_nn_mat_mult_kernel_s8_s16_reordered(), arm_nn_read_q15x2_ia(), arm_nn_requantize(), arm_q7_to_q15_reordered_with_offset(), DIM_KER_X, DIM_KER_Y, MAX, and MIN.

int32_t arm_convolve_1x1_s8_fast_get_buffer_size ( const uint16_t  input_ch)
Parameters
[in]input_chnumber of input tensor channels
Returns
The function returns required buffer size
arm_status arm_convolve_HWC_q15_basic ( const q15_t *  Im_in,
const uint16_t  dim_im_in,
const uint16_t  ch_im_in,
const q15_t *  wt,
const uint16_t  ch_im_out,
const uint16_t  dim_kernel,
const uint16_t  padding,
const uint16_t  stride,
const q15_t *  bias,
const uint16_t  bias_shift,
const uint16_t  out_shift,
q15_t *  Im_out,
const uint16_t  dim_im_out,
q15_t *  bufferA,
q7_t *  bufferB 
)
Parameters
[in]Im_inpointer to input tensor
[in]dim_im_ininput tensor dimension
[in]ch_im_innumber of input tensor channels
[in]wtpointer to kernel weights
[in]ch_im_outnumber of filters, i.e., output tensor channels
[in]dim_kernelfilter kernel size
[in]paddingpadding sizes
[in]strideconvolution stride
[in]biaspointer to bias
[in]bias_shiftamount of left-shift for bias
[in]out_shiftamount of right-shift for output
[in,out]Im_outpointer to output tensor
[in]dim_im_outoutput tensor dimension
[in,out]bufferApointer to buffer space for input
[in,out]bufferBpointer to buffer space for output
Returns
The function returns ARM_MATH_SUCCESS

Buffer size:

bufferA size: ch_im_in*dim_kernel*dim_kernel

bufferB size: 0

This basic version is designed to work for any input tensor and weight dimension.

References NN_ROUND.

arm_status arm_convolve_HWC_q15_fast ( const q15_t *  Im_in,
const uint16_t  dim_im_in,
const uint16_t  ch_im_in,
const q15_t *  wt,
const uint16_t  ch_im_out,
const uint16_t  dim_kernel,
const uint16_t  padding,
const uint16_t  stride,
const q15_t *  bias,
const uint16_t  bias_shift,
const uint16_t  out_shift,
q15_t *  Im_out,
const uint16_t  dim_im_out,
q15_t *  bufferA,
q7_t *  bufferB 
)
Parameters
[in]Im_inpointer to input tensor
[in]dim_im_ininput tensor dimension
[in]ch_im_innumber of input tensor channels
[in]wtpointer to kernel weights
[in]ch_im_outnumber of filters, i.e., output tensor channels
[in]dim_kernelfilter kernel size
[in]paddingpadding sizes
[in]strideconvolution stride
[in]biaspointer to bias
[in]bias_shiftamount of left-shift for bias
[in]out_shiftamount of right-shift for output
[in,out]Im_outpointer to output tensor
[in]dim_im_outoutput tensor dimension
[in,out]bufferApointer to buffer space for input
[in,out]bufferBpointer to buffer space for output
Returns
The function returns either ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.

Buffer size:

bufferA size: 2*ch_im_in*dim_kernel*dim_kernel

bufferB size: 0

Input dimension constraints:

ch_im_in is multiple of 2

ch_im_out is a multiple of 2

References NN_ROUND.

arm_status arm_convolve_HWC_q15_fast_nonsquare ( const q15_t *  Im_in,
const uint16_t  dim_im_in_x,
const uint16_t  dim_im_in_y,
const uint16_t  ch_im_in,
const q15_t *  wt,
const uint16_t  ch_im_out,
const uint16_t  dim_kernel_x,
const uint16_t  dim_kernel_y,
const uint16_t  padding_x,
const uint16_t  padding_y,
const uint16_t  stride_x,
const uint16_t  stride_y,
const q15_t *  bias,
const uint16_t  bias_shift,
const uint16_t  out_shift,
q15_t *  Im_out,
const uint16_t  dim_im_out_x,
const uint16_t  dim_im_out_y,
q15_t *  bufferA,
q7_t *  bufferB 
)
Parameters
[in]Im_inpointer to input tensor
[in]dim_im_in_xinput tensor dimension x
[in]dim_im_in_yinput tensor dimension y
[in]ch_im_innumber of input tensor channels
[in]wtpointer to kernel weights
[in]ch_im_outnumber of filters, i.e., output tensor channels
[in]dim_kernel_xfilter kernel size x
[in]dim_kernel_yfilter kernel size y
[in]padding_xpadding size x
[in]padding_ypadding size y
[in]stride_xconvolution stride x
[in]stride_yconvolution stride y
[in]biaspointer to bias
[in]bias_shiftamount of left-shift for bias
[in]out_shiftamount of right-shift for output
[in,out]Im_outpointer to output tensor
[in]dim_im_out_xoutput tensor dimension x
[in]dim_im_out_youtput tensor dimension y
[in,out]bufferApointer to buffer space for input
[in,out]bufferBpointer to buffer space for output
Returns
The function returns either ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.

Buffer size:

bufferA size: 2*ch_im_in*dim_kernel*dim_kernel

bufferB size: 0

Input dimension constraints:

ch_im_in is multiple of 2

ch_im_out is a multiple of 2

References NN_ROUND.

arm_status arm_convolve_HWC_q7_basic ( const q7_t *  Im_in,
const uint16_t  dim_im_in,
const uint16_t  ch_im_in,
const q7_t *  wt,
const uint16_t  ch_im_out,
const uint16_t  dim_kernel,
const uint16_t  padding,
const uint16_t  stride,
const q7_t *  bias,
const uint16_t  bias_shift,
const uint16_t  out_shift,
q7_t *  Im_out,
const uint16_t  dim_im_out,
q15_t *  bufferA,
q7_t *  bufferB 
)
Parameters
[in]Im_inpointer to input tensor
[in]dim_im_ininput tensor dimension
[in]ch_im_innumber of input tensor channels
[in]wtpointer to kernel weights
[in]ch_im_outnumber of filters, i.e., output tensor channels
[in]dim_kernelfilter kernel size
[in]paddingpadding sizes
[in]strideconvolution stride
[in]biaspointer to bias
[in]bias_shiftamount of left-shift for bias
[in]out_shiftamount of right-shift for output
[in,out]Im_outpointer to output tensor
[in]dim_im_outoutput tensor dimension
[in,out]bufferApointer to buffer space for input
[in,out]bufferBpointer to buffer space for output
Returns
The function returns ARM_MATH_SUCCESS

Buffer size:

bufferA size: 2*ch_im_in*dim_kernel*dim_kernel

bufferB size: 0

This basic version is designed to work for any input tensor and weight dimension.

References arm_nn_mat_mult_kernel_q7_q15(), arm_q7_to_q15_no_shift(), and NN_ROUND.

arm_status arm_convolve_HWC_q7_basic_nonsquare ( const q7_t *  Im_in,
const uint16_t  dim_im_in_x,
const uint16_t  dim_im_in_y,
const uint16_t  ch_im_in,
const q7_t *  wt,
const uint16_t  ch_im_out,
const uint16_t  dim_kernel_x,
const uint16_t  dim_kernel_y,
const uint16_t  padding_x,
const uint16_t  padding_y,
const uint16_t  stride_x,
const uint16_t  stride_y,
const q7_t *  bias,
const uint16_t  bias_shift,
const uint16_t  out_shift,
q7_t *  Im_out,
const uint16_t  dim_im_out_x,
const uint16_t  dim_im_out_y,
q15_t *  bufferA,
q7_t *  bufferB 
)

Basic Q7 convolution function (non-square shape)

Parameters
[in]Im_inpointer to input tensor
[in]dim_im_in_xinput tensor dimension x
[in]dim_im_in_yinput tensor dimension y
[in]ch_im_innumber of input tensor channels
[in]wtpointer to kernel weights
[in]ch_im_outnumber of filters, i.e., output tensor channels
[in]dim_kernel_xfilter kernel size x
[in]dim_kernel_yfilter kernel size y
[in]padding_xpadding size x
[in]padding_ypadding size y
[in]stride_xconvolution stride x
[in]stride_yconvolution stride y
[in]biaspointer to bias
[in]bias_shiftamount of left-shift for bias
[in]out_shiftamount of right-shift for output
[in,out]Im_outpointer to output tensor
[in]dim_im_out_xoutput tensor dimension x
[in]dim_im_out_youtput tensor dimension y
[in,out]bufferApointer to buffer space for input
[in,out]bufferBpointer to buffer space for output
Returns
The function returns ARM_MATH_SUCCESS

References arm_nn_mat_mult_kernel_q7_q15(), arm_q7_to_q15_no_shift(), and NN_ROUND.

arm_status arm_convolve_HWC_q7_fast ( const q7_t *  Im_in,
const uint16_t  dim_im_in,
const uint16_t  ch_im_in,
const q7_t *  wt,
const uint16_t  ch_im_out,
const uint16_t  dim_kernel,
const uint16_t  padding,
const uint16_t  stride,
const q7_t *  bias,
const uint16_t  bias_shift,
const uint16_t  out_shift,
q7_t *  Im_out,
const uint16_t  dim_im_out,
q15_t *  bufferA,
q7_t *  bufferB 
)
Parameters
[in]Im_inpointer to input tensor
[in]dim_im_ininput tensor dimension
[in]ch_im_innumber of input tensor channels
[in]wtpointer to kernel weights
[in]ch_im_outnumber of filters, i.e., output tensor channels
[in]dim_kernelfilter kernel size
[in]paddingpadding sizes
[in]strideconvolution stride
[in]biaspointer to bias
[in]bias_shiftamount of left-shift for bias
[in]out_shiftamount of right-shift for output
[in,out]Im_outpointer to output tensor
[in]dim_im_outoutput tensor dimension
[in,out]bufferApointer to buffer space for input
[in,out]bufferBpointer to buffer space for output
Returns
The function returns either ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.

Buffer size:

bufferA size: 2*ch_im_in*dim_kernel*dim_kernel

bufferB size: 0

Input dimension constraints:

ch_im_in is multiple of 4 ( because of the SIMD32 read and swap )

ch_im_out is a multiple of 2 ( because of the 2x2 mat_mult kernel )

The im2col converts the Q7 tensor input into Q15 columns, which are stored in bufferA. There is reordering happening during this im2col process with arm_q7_to_q15_reordered_no_shift. For every four elements, the second and third elements are swapped.

The computation kernel arm_nn_mat_mult_kernel_q7_q15_reordered does the GEMM computation with the reordered columns.

To speed-up the determination of the padding condition, we split the computation into 3x3 parts, i.e., {top, mid, bottom} X {left, mid, right}. This reduces the total number of boundary condition checks and improves the data copying performance.

References arm_nn_mat_mult_kernel_q7_q15_reordered(), arm_q7_to_q15_reordered_no_shift(), and NN_ROUND.

Referenced by main().

arm_status arm_convolve_HWC_q7_fast_nonsquare ( const q7_t *  Im_in,
const uint16_t  dim_im_in_x,
const uint16_t  dim_im_in_y,
const uint16_t  ch_im_in,
const q7_t *  wt,
const uint16_t  ch_im_out,
const uint16_t  dim_kernel_x,
const uint16_t  dim_kernel_y,
const uint16_t  padding_x,
const uint16_t  padding_y,
const uint16_t  stride_x,
const uint16_t  stride_y,
const q7_t *  bias,
const uint16_t  bias_shift,
const uint16_t  out_shift,
q7_t *  Im_out,
const uint16_t  dim_im_out_x,
const uint16_t  dim_im_out_y,
q15_t *  bufferA,
q7_t *  bufferB 
)
Parameters
[in]Im_inpointer to input tensor
[in]dim_im_in_xinput tensor dimension x
[in]dim_im_in_yinput tensor dimension y
[in]ch_im_innumber of input tensor channels
[in]wtpointer to kernel weights
[in]ch_im_outnumber of filters, i.e., output tensor channels
[in]dim_kernel_xfilter kernel size x
[in]dim_kernel_yfilter kernel size y
[in]padding_xpadding size x
[in]padding_ypadding size y
[in]stride_xconvolution stride x
[in]stride_yconvolution stride y
[in]biaspointer to bias
[in]bias_shiftamount of left-shift for bias
[in]out_shiftamount of right-shift for output
[in,out]Im_outpointer to output tensor
[in]dim_im_out_xoutput tensor dimension x
[in]dim_im_out_youtput tensor dimension y
[in,out]bufferApointer to buffer space for input
[in,out]bufferBpointer to buffer space for output
Returns
The function returns either ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.

This function is the version with full list of optimization tricks, but with some constraints: ch_im_in is a multiple of 4, ch_im_out is a multiple of 2

References arm_nn_mat_mult_kernel_q7_q15_reordered(), arm_q7_to_q15_reordered_no_shift(), and NN_ROUND.

arm_status arm_convolve_HWC_q7_RGB ( const q7_t *  Im_in,
const uint16_t  dim_im_in,
const uint16_t  ch_im_in,
const q7_t *  wt,
const uint16_t  ch_im_out,
const uint16_t  dim_kernel,
const uint16_t  padding,
const uint16_t  stride,
const q7_t *  bias,
const uint16_t  bias_shift,
const uint16_t  out_shift,
q7_t *  Im_out,
const uint16_t  dim_im_out,
q15_t *  bufferA,
q7_t *  bufferB 
)

Q7 version of convolution for RGB image.

Parameters
[in]Im_inpointer to input tensor
[in]dim_im_ininput tensor dimension
[in]ch_im_innumber of input tensor channels
[in]wtpointer to kernel weights
[in]ch_im_outnumber of filters, i.e., output tensor channels
[in]dim_kernelfilter kernel size
[in]paddingpadding sizes
[in]strideconvolution stride
[in]biaspointer to bias
[in]bias_shiftamount of left-shift for bias
[in]out_shiftamount of right-shift for output
[in,out]Im_outpointer to output tensor
[in]dim_im_outoutput tensor dimension
[in,out]bufferApointer to buffer space for input
[in,out]bufferBpointer to buffer space for output
Returns
The function returns either ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.

Buffer size:

bufferA size: 2*ch_im_in*dim_kernel*dim_kernel

bufferB size: 0

Input dimension constraints:

ch_im_in equals 3

This kernel is written exclusively for convolution with ch_im_in equals 3. This applies on the first layer of CNNs which has input image with RGB format.

References arm_nn_mat_mult_kernel_q7_q15(), arm_nnword::half_words, NN_ROUND, and arm_nnword::word.

Referenced by main().

arm_status arm_convolve_s8 ( const q7_t *  input,
const uint16_t  input_x,
const uint16_t  input_y,
const uint16_t  input_ch,
const uint16_t  input_batches,
const q7_t *  kernel,
const uint16_t  output_ch,
const uint16_t  kernel_x,
const uint16_t  kernel_y,
const uint16_t  pad_x,
const uint16_t  pad_y,
const uint16_t  stride_x,
const uint16_t  stride_y,
const int32_t *  bias,
q7_t *  output,
const int32_t *  output_shift,
const int32_t *  output_mult,
const int32_t  out_offset,
const int32_t  input_offset,
const int32_t  output_activation_min,
const int32_t  output_activation_max,
const uint16_t  output_x,
const uint16_t  output_y,
q15_t *  buffer_a 
)
Parameters
[in]inputpointer to input tensor. Range: int8, format: [N,H,W,in_ch]
[in]input_xinput tensor width
[in]input_yinput tensor height
[in]input_chnumber of input tensor channels
[in]input_batchesnumber of input batches
[in]kernelpointer to kernel weights. Range: int8, format: [out_ch, H, W, in_ch]
[in]output_chnumber of filters, i.e., output tensor channels
[in]kernel_xfilter/kernel width
[in]kernel_yfilter/kernel height
[in]pad_xpadding along width
[in]pad_ypadding along height
[in]stride_xconvolution stride x
[in]stride_yconvolution stride y
[in]biaspointer to per output channel bias. Range: int32
[in,out]outputpointer to output tensor. format: [H, W, out_ch]
[in]output_shiftpointer to per output channel requantization shift parameter.
[in]output_multpointer to per output channel requantization multiplier parameter.
[in]out_offsetoutput tensor offset. Range: int8
[in]input_offsetinput tensor offset. Range: int8
[in]output_activation_minMinimum value to clamp the output to. Range: int8
[in]output_activation_maxMaximum value to clamp the output to. Range: int8
[in]output_xoutput tensor width
[in]output_youtput tensor height
[in]buffer_apointer to buffer space used for input optimization(partial im2col) and is necessary when both ARM_MATH_LOOPUNROLL and ARM_MATH_DSP are defined. Required space: (2 * input_ch * kernel_x * kernel_y) * sizeof(q15_t) bytes Use arm_convolve_s8_get_buffer_size() to get the size.
Returns
The function returns ARM_MATH_SUCCESS
  1. Supported framework: TensorFlow Lite micro
  2. q7 is used as data type even though it is s8 data. It is done so to be consistent with existing APIs.
  3. Additional memory is required for optimization. Refer to argument 'buffer_a' for details.

References arm_nn_mat_mult_kernel_s8_s16(), arm_nn_read_q15x2_ia(), arm_nn_requantize(), arm_q7_to_q15_with_offset(), MAX, and MIN.

Referenced by arm_convolve_1x1_s8_fast().

int32_t arm_convolve_s8_get_buffer_size ( const uint16_t  input_ch,
const uint16_t  kernel_x,
const uint16_t  kernel_y 
)
Parameters
[in]input_chnumber of input tensor channels
[in]kernel_xfilter/kernel width
[in]kernel_yfilter/kernel height
Returns
The function returns required buffer size
arm_status arm_depthwise_conv_s8 ( const q7_t *  input,
const uint16_t  input_x,
const uint16_t  input_y,
const uint16_t  input_ch,
const q7_t *  kernel,
const uint16_t  output_ch,
const uint16_t  ch_mult,
const uint16_t  kernel_x,
const uint16_t  kernel_y,
const uint16_t  pad_x,
const uint16_t  pad_y,
const uint16_t  stride_x,
const uint16_t  stride_y,
const int32_t *  bias,
q7_t *  output,
const int32_t *  output_shift,
const int32_t *  output_mult,
const uint16_t  output_x,
const uint16_t  output_y,
const int32_t  output_offset,
const int32_t  input_offset,
const int32_t  output_activation_min,
const int32_t  output_activation_max,
const uint16_t  dilation_x,
const uint16_t  dilation_y,
q15_t *  buffer_a 
)
Parameters
[in]inputpointer to input tensor. Range: int8, format: [H,W,in_ch]
[in]input_xinput tensor width
[in]input_yinput tensor height
[in]input_chnumber of input tensor channels
[in]kernelpointer to kernel weights. Range: int8, format: [in_ch, H, W, out_ch]
[in]output_chNumber of output channels. output_ch = ch_mult * input_ch
[in]ch_multchannel multiplier.
[in]kernel_xfilter/kernel width
[in]kernel_yfilter/kernel height
[in]pad_xpadding along width
[in]pad_ypadding along height
[in]stride_xconvolution stride along width
[in]stride_yconvolution stride along height
[in]biaspointer to per output channel bias. Range: int32
[in,out]outputpointer to output tensor. Format: [H, W, out_ch]
[in]output_shiftpointer to per output channel requantization shift parameter.
[in]output_multpointer to per output channel requantization multiplier parameter.
[in]output_xoutput tensor width
[in]output_youtput tensor height
[in]output_offsetoffset to elements of output tensor
[in]input_offsetoffset to elements of input tensor
[in]output_activation_minMinimum value to clamp the output to. Range: int8
[in]output_activation_maxMaximum value to clamp the output to. Range: int8
[in]dilation_xdilation along x. Not used. Dilation factor of 1 is used.
[in]dilation_ydilation along y. Not used. Dilation factor of 1 is used.
[in]buffer_aNot used.
Returns
The function returns ARM_MATH_SUCCESS
  1. Supported framework: TensorFlow Lite
  2. q7 is used as data type even though it is s8 data. It is done so to be consistent with existing APIs.
  3. Optimization using DSP extension is not available for the generic case where channel multiplier is > 1.

References arm_nn_requantize(), MAX, and MIN.

Referenced by arm_depthwise_conv_s8_opt().

arm_status arm_depthwise_conv_s8_opt ( const q7_t *  input,
const uint16_t  input_x,
const uint16_t  input_y,
const uint16_t  input_ch,
const q7_t *  kernel,
const uint16_t  output_ch,
const uint16_t  kernel_x,
const uint16_t  kernel_y,
const uint16_t  pad_x,
const uint16_t  pad_y,
const uint16_t  stride_x,
const uint16_t  stride_y,
const int32_t *  bias,
q7_t *  output,
const int32_t *  output_shift,
const int32_t *  output_mult,
const uint16_t  output_x,
const uint16_t  output_y,
const int32_t  output_offset,
const int32_t  input_offset,
const int32_t  output_activation_min,
const int32_t  output_activation_max,
const uint16_t  dilation_x,
const uint16_t  dilation_y,
q15_t *  buffer_a 
)
Parameters
[in]inputpointer to input tensor. Range: int8, format: [H,W,in_ch]
[in]input_xinput tensor width
[in]input_yinput tensor height
[in]input_chnumber of input tensor channels
[in]kernelpointer to kernel weights. Range: int8, Format: [in_ch, H, W, out_ch]
[in]output_chNumber of output channels.
[in]kernel_xfilter/kernel width
[in]kernel_yfilter/kernel height
[in]pad_xpadding along width
[in]pad_ypadding along height
[in]stride_xconvolution stride along width
[in]stride_yconvolution stride along height
[in]biaspointer to per output channel bias. Range: int32
[in,out]outputpointer to output tensor. Format: [H, W, out_ch]
[in]output_shiftpointer to per output channel requantization shift parameter.
[in]output_multpointer to per output channel requantization multiplier parameter.
[in]output_xoutput tensor width
[in]output_youtput tensor height
[in]output_offsetoffset to elements of output tensor
[in]input_offsetoffset to elements of input tensor
[in]output_activation_minMinimum value to clamp the output to. Range: int8
[in]output_activation_maxMaximum value to clamp the output to. Range: int8
[in]dilation_xdilation along x. Not used. Dilation factor of 1 is used.
[in]dilation_ydilation along y. Not used. Dilation factor of 1 is used.
[in]buffer_aBuffer for partial im2col optimization. This is mandatory when ARM_MATH_LOOPUNROLL and ARM_MATH_DSP are defined. Required space: (2 * input_ch * kernel_x * kernel_y) * sizeof(q15_t) bytes Use arm_depthwise_conv_s8_opt_get_buffer_size() to get the size.
Returns
The function returns one of the following ARM_MATH_SIZE_MISMATCH - Unsupported dimension of tensors ARM_MATH_SUCCESS - Successful operation
  1. Supported framework: TensorFlow Lite
  2. q7 is used as data type even though it is s8 data. It is done so to be consistent with existing APIs.
  3. Recommended when the number of channels is 4 or greater.

References arm_depthwise_conv_s8(), arm_nn_read_q15x2(), arm_nn_read_q7x4(), arm_nn_requantize(), arm_q7_to_q15_with_offset(), col_buffer, MAX, and MIN.

int32_t arm_depthwise_conv_s8_opt_get_buffer_size ( const uint16_t  input_ch,
const uint16_t  kernel_x,
const uint16_t  kernel_y 
)
Parameters
[in]input_chnumber of input tensor channels
[in]kernel_xfilter/kernel width
[in]kernel_yfilter/kernel height
Returns
The function returns required buffer size
arm_status arm_depthwise_conv_u8_basic_ver1 ( const uint8_t *  input,
const uint16_t  input_x,
const uint16_t  input_y,
const uint16_t  input_ch,
const uint8_t *  kernel,
const uint16_t  kernel_x,
const uint16_t  kernel_y,
const int16_t  ch_mult,
const int16_t  pad_x,
const int16_t  pad_y,
const int16_t  stride_x,
const int16_t  stride_y,
const int16_t  dilation_x,
const int16_t  dilation_y,
const int32_t *  bias,
const int32_t  input_offset,
const int32_t  filter_offset,
const int32_t  output_offset,
uint8_t *  output,
const uint16_t  output_x,
const uint16_t  output_y,
const int32_t  output_activation_min,
const int32_t  output_activation_max,
const int32_t  out_shift,
const int32_t  out_mult 
)

uint8 depthwise convolution function with asymmetric quantization for even number of channel multiplier and input channels. Unless specified otherwise, arguments are mandatory.

Parameters
[in]inputPointer to input tensor
[in]input_xWidth of input tensor
[in]input_yHeight of input tensor
[in]input_chChannels in input tensor
[in]kernelPointer to kernel weights
[in]kernel_xWidth of kernel
[in]kernel_yHeight of kernel
[in]ch_multNumber of channel multiplier
[in]pad_xPadding sizes x
[in]pad_yPadding sizes y
[in]stride_xConvolution stride along the width
[in]stride_yConvolution stride along the height
[in]dilation_xDilation along width. Not used and intended for future enhancement.
[in]dilation_yDilation along height. Not used and intended for future enhancement.
[in]biasPointer to optional bias values. If no bias is available, NULL is expected
[in]input_offsetInput tensor zero offset
[in]filter_offsetKernel tensor zero offset
[in]output_offsetOutput tensor zero offset
[in,out]outputPointer to output tensor
[in]output_xWidth of output tensor
[in]output_yHeight of output tensor
[in]output_activation_minMinimum value to clamp the output to. Range : {0, 255}
[in]output_activation_maxMaximum value to clamp the output to. Range : {0, 255}
[in]out_shiftAmount of right-shift for output
[in]out_multOutput multiplier for requantization
Returns
The function returns one of the following ARM_MATH_SIZE_MISMATCH - Not supported dimension of tensors ARM_MATH_SUCCESS - Successful operation ARM_MATH_ARGUMENT_ERROR - Implementation not available

Input constraints ch_mult is multiple of 2 kernel_x is multiple of 2

References arm_nn_divide_by_power_of_two(), arm_nn_sat_doubling_high_mult(), DILATION_X, DILATION_Y, LEFT_SHIFT, and RIGHT_SHIFT.

arm_status arm_depthwise_separable_conv_HWC_q7 ( const q7_t *  Im_in,
const uint16_t  dim_im_in,
const uint16_t  ch_im_in,
const q7_t *  wt,
const uint16_t  ch_im_out,
const uint16_t  dim_kernel,
const uint16_t  padding,
const uint16_t  stride,
const q7_t *  bias,
const uint16_t  bias_shift,
const uint16_t  out_shift,
q7_t *  Im_out,
const uint16_t  dim_im_out,
q15_t *  bufferA,
q7_t *  bufferB 
)
Parameters
[in]Im_inpointer to input tensor
[in]dim_im_ininput tensor dimension
[in]ch_im_innumber of input tensor channels
[in]wtpointer to kernel weights
[in]ch_im_outnumber of filters, i.e., output tensor channels
[in]dim_kernelfilter kernel size
[in]paddingpadding sizes
[in]strideconvolution stride
[in]biaspointer to bias
[in]bias_shiftamount of left-shift for bias
[in]out_shiftamount of right-shift for output
[in,out]Im_outpointer to output tensor
[in]dim_im_outoutput tensor dimension
[in,out]bufferApointer to buffer space for input
[in,out]bufferBpointer to buffer space for output
Returns
The function returns either ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.

Buffer size:

bufferA size: 2*ch_im_in*dim_kernel*dim_kernel

bufferB size: 0

Input dimension constraints:

ch_im_in equals ch_im_out

Implementation: There are 3 nested loop here: Inner loop: calculate each output value with MAC instruction over an accumulator Mid loop: loop over different output channel Outer loop: loop over different output (x, y)

References arm_nnword::bytes, NN_ROUND, and arm_nnword::word.

arm_status arm_depthwise_separable_conv_HWC_q7_nonsquare ( const q7_t *  Im_in,
const uint16_t  dim_im_in_x,
const uint16_t  dim_im_in_y,
const uint16_t  ch_im_in,
const q7_t *  wt,
const uint16_t  ch_im_out,
const uint16_t  dim_kernel_x,
const uint16_t  dim_kernel_y,
const uint16_t  padding_x,
const uint16_t  padding_y,
const uint16_t  stride_x,
const uint16_t  stride_y,
const q7_t *  bias,
const uint16_t  bias_shift,
const uint16_t  out_shift,
q7_t *  Im_out,
const uint16_t  dim_im_out_x,
const uint16_t  dim_im_out_y,
q15_t *  bufferA,
q7_t *  bufferB 
)
Parameters
[in]Im_inpointer to input tensor
[in]dim_im_in_xinput tensor dimension x
[in]dim_im_in_yinput tensor dimension y
[in]ch_im_innumber of input tensor channels
[in]wtpointer to kernel weights
[in]ch_im_outnumber of filters, i.e., output tensor channels
[in]dim_kernel_xfilter kernel size x
[in]dim_kernel_yfilter kernel size y
[in]padding_xpadding sizes x
[in]padding_ypadding sizes y
[in]stride_xconvolution stride x
[in]stride_yconvolution stride y
[in]biaspointer to bias
[in]bias_shiftamount of left-shift for bias
[in]out_shiftamount of right-shift for output
[in,out]Im_outpointer to output tensor
[in]dim_im_out_xoutput tensor dimension x
[in]dim_im_out_youtput tensor dimension y
[in,out]bufferApointer to buffer space for input
[in,out]bufferBpointer to buffer space for output
Returns
The function returns either ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.

This function is the version with the full list of optimization tricks, but with some constraints: ch_im_in is equal to ch_im_out

References arm_nnword::bytes, NN_ROUND, and arm_nnword::word.