/*
 * Copyright (c) 2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "addressing.hpp"
#include "src/core/NEON/kernels/arm_gemm/utils.hpp"

#include <algorithm>
#include <cstddef>
#include <cstring>

namespace arm_conv {
namespace addressing {

/* Pointer array constructor
 *
 * Fill a row-major array of `array_rows` x `array_cols` pointers. Entries
 * which fall in the padded border are set to `pad_buffer_raw`; the remaining
 * entries point into the tensor starting at `base_ptr_raw`, which corresponds
 * to the first valid (non-padding) entry of the array.
 *
 *   element_size             Size of one tensor element, in bytes.
 *   dest_raw                 Output; must have room for array_rows * array_cols
 *                            pointers.
 *   ld_row, ld_col           Row/column strides of the tensor, in ELEMENTS
 *                            (they are scaled by element_size here).
 *   pad_top, pad_left        Number of leading padding rows/columns.
 *   valid_rows, valid_cols   Number of rows/columns of the tensor which may be
 *                            addressed; anything beyond them is padding.
 *
 * Exactly array_rows * array_cols pointers are written, even when the padding
 * is larger than the array.
 */
void fill_pointer_array(
  size_t element_size,
  void **dest_raw,
  const unsigned int array_rows, const unsigned int array_cols,
  void *base_ptr_raw, size_t ld_row, size_t ld_col,
  void *pad_buffer_raw,
  const unsigned int pad_top, const unsigned int valid_rows,
  const unsigned int pad_left, const unsigned int valid_cols
)
{
  // Work in bytes so that strides can be applied with pointer arithmetic.
  auto dest = reinterpret_cast<char **>(dest_raw);
  auto base_ptr = reinterpret_cast<char *>(base_ptr_raw);
  auto pad_buffer = reinterpret_cast<char *>(pad_buffer_raw);
  ld_row *= element_size;
  ld_col *= element_size;

  // Clamp the padding to the array extent: without this a `pad_top` or
  // `pad_left` larger than the array would write beyond the end of `dest`.
  const auto first_valid_row = std::min(pad_top, array_rows);
  const auto first_valid_col = std::min(pad_left, array_cols);
  const auto last_valid_row = std::min(pad_top + valid_rows, array_rows);
  const auto last_valid_col = std::min(pad_left + valid_cols, array_cols);

  unsigned int i = 0;

  // Rows which lie entirely in the top padding
  for (; i < first_valid_row; i++)
  {
    dest = std::fill_n(dest, array_cols, pad_buffer);
  }

  // Rows which contain valid elements
  for (; i < last_valid_row; i++)
  {
    // Take a copy of the row pointer and then advance to the next row
    auto colptr = base_ptr;
    base_ptr += ld_row;

    // Left-hand padding
    dest = std::fill_n(dest, first_valid_col, pad_buffer);

    // Valid elements
    for (auto j = first_valid_col; j < last_valid_col; j++)
    {
      *(dest++) = colptr;
      colptr += ld_col;
    }

    // Right-hand padding
    dest = std::fill_n(dest, array_cols - last_valid_col, pad_buffer);
  }

  // Rows which lie entirely in the bottom padding
  for (; i < array_rows; i++)
  {
    dest = std::fill_n(dest, array_cols, pad_buffer);
  }
}

/* Pointer array constructor (generic kernels)
 *
 * Fill an array of pointers holding one entry per kernel point per output
 * point. The pointer for kernel point (ki, kj) and output point (oi, oj) is
 * stored at index
 *
 *   (ki * kernel_cols + kj) * (output_rows * output_cols) + (oi * output_cols + oj)
 *
 * and addresses input coordinate (oi * stride_rows + ki, oj * stride_cols + kj)
 * in the padded input. Coordinates which lie in the padding are given
 * `pad_buffer_raw`; `base_ptr_raw` corresponds to the first valid element.
 * `ld_row` and `ld_col` are strides in ELEMENTS of `element_size` bytes.
 */
void fill_pointer_array_generic_kernel(
  const size_t element_size,
  void **dest_raw,
  const unsigned int output_rows, const unsigned int output_cols,
  const unsigned int kernel_rows, const unsigned int kernel_cols,
  const unsigned int stride_rows, const unsigned int stride_cols,
  void *base_ptr_raw, size_t ld_row, size_t ld_col,
  void *pad_buffer_raw,
  const unsigned int pad_top, const unsigned int valid_rows,
  const unsigned int pad_left, const unsigned int valid_cols
)
{
  // Work in bytes so that strides can be applied with pointer arithmetic.
  auto dest = reinterpret_cast<char **>(dest_raw);
  auto base_ptr = reinterpret_cast<char *>(base_ptr_raw);
  auto pad_buffer = reinterpret_cast<char *>(pad_buffer_raw);
  ld_row *= element_size;
  ld_col *= element_size;

  const auto last_valid_row = pad_top + valid_rows;
  const auto last_valid_col = pad_left + valid_cols;

  // Distance (in pointers) between consecutive kernel points of one output
  // point.
  const auto point_stride = output_rows * output_cols;

  // Iterate over the output points, after every point increment the pointer
  // into the address array.
  for (unsigned int oi = 0; oi < output_rows; oi++)
  {
    for (unsigned int oj = 0; oj < output_cols; oj++)
    {
      auto point_dest = dest;
      dest++;

      // Iterate over kernel points and fill in the pointer array.
      unsigned int ki = 0, ii = oi*stride_rows;

      // Kernel rows which lie in the top padding
      for (; ii < pad_top && ki < kernel_rows; ii++, ki++)
      {
        // Fill with padding
        for (unsigned int j = 0; j < kernel_cols; j++)
        {
          *point_dest = pad_buffer;
          point_dest += point_stride;
        }
      }

      // Kernel rows which contain valid elements
      for (; ii < last_valid_row && ki < kernel_rows; ii++, ki++)
      {
        unsigned int kj = 0, ij = oj*stride_cols;
        for (; ij < pad_left && kj < kernel_cols; ij++, kj++)
        {
          // Padding
          *point_dest = pad_buffer;
          point_dest += point_stride;
        }
        for (; ij < last_valid_col && kj < kernel_cols; ij++, kj++)
        {
          *point_dest = base_ptr + (ii - pad_top)*ld_row + (ij - pad_left)*ld_col;
          point_dest += point_stride;
        }
        for (; kj < kernel_cols; kj++)
        {
          // Padding
          *point_dest = pad_buffer;
          point_dest += point_stride;
        }
      }

      // Kernel rows which lie in the bottom padding
      for (; ki < kernel_rows; ki++)
      {
        // Fill with padding
        for (unsigned int j = 0; j < kernel_cols; j++)
        {
          *point_dest = pad_buffer;
          point_dest += point_stride;
        }
      }
    }
  }
}

/* Patch array constructor
 *
 * Some depthwise kernels require an NCHW-ordered patch of input. Here we
 * construct such a patch, and fill in an array of pointers to the rows of the
 * patch.
 *
 * `patch_cols` is rounded up to a multiple of (16 / element_size) elements, so
 * `element_size` must be non-zero. `pad_row` must provide at least that many
 * elements of padding, and `dest_patch_raw` must have room for one rounded-up
 * row per row which has to be constructed. `ld_row` and `ld_col` are strides
 * in ELEMENTS.
 *
 * Exactly `patch_rows` row pointers are written, even when the padding is
 * larger than the patch.
 */
void fill_nchw_patch_array(
  size_t element_size,
  const void **dest_row_pointers_raw,  // Array of pointers to each row of the patch
  void *dest_patch_raw,  // Pointer to space which can be used to construct the patch
  const unsigned int patch_rows, unsigned int patch_cols,  // Patch size
  const void *src_ptr_raw, size_t ld_row, size_t ld_col,  // Source tensor
  const void *pad_row,  // Pointer to a row of padding values
  const unsigned int pad_top, const unsigned int valid_rows,
  const unsigned int pad_left, const unsigned int valid_cols
)
{
  // Convert into more useful types
  auto row_pointers = reinterpret_cast<const char **>(dest_row_pointers_raw);
  auto dest_patch = reinterpret_cast<char *>(dest_patch_raw);
  auto src = reinterpret_cast<const char *>(src_ptr_raw);
  const auto pad_row_ptr = reinterpret_cast<const char *>(pad_row);
  ld_row *= element_size;
  ld_col *= element_size;

  // Round up the patch columns to be a full quad
  patch_cols = arm_gemm::roundup<unsigned int>(patch_cols, 16 / element_size);

  // Clamp the padding to the patch extent: without this a `pad_top` larger
  // than the patch would overrun the row pointer array, and a `pad_left`
  // larger than the patch would make the copy length below wrap around.
  const auto first_valid_row = std::min(pad_top, patch_rows);
  const auto first_valid_col = std::min(pad_left, patch_cols);
  const auto last_valid_row = std::min(pad_top + valid_rows, patch_rows);
  const auto last_valid_col = std::min(pad_left + valid_cols, patch_cols);

  // Construct the patch and row pointer array together
  unsigned int i = 0;
  for (; i < first_valid_row; i++)
  {
    // Insert pointers into the padding row
    *(row_pointers++) = pad_row_ptr;
  }
  for (; i < last_valid_row; i++)
  {
    // Get a copy of the pointer for this row
    auto colptr = src;
    src += ld_row;

    // If the input is already in NCHW format (ld_col == element_size) AND
    // there is no padding, then we just use a pointer to the source tensor;
    // otherwise we need to construct a patch and provide a pointer to it.
    if (ld_col == element_size && pad_left == 0 && last_valid_col == patch_cols)
    {
      *(row_pointers++) = colptr;
    }
    else
    {
      auto patch_col = dest_patch;
      *(row_pointers++) = dest_patch;
      dest_patch += element_size * patch_cols;  // Move the patch pointer on

      // Construct the patch; fill the entirety with padding and then copy in
      // the valid elements.
      std::memcpy(patch_col, pad_row, element_size * patch_cols);
      patch_col += first_valid_col * element_size;  // Move over the left padding

      if (ld_col == element_size)
      {
        // If the input is NCHW then copy across as many columns as we can.
        std::memcpy(patch_col, colptr, (last_valid_col - first_valid_col) * element_size);
      }
      else
      {
        // If the input is NHWC then copy columns across in turn.
        for (auto j = first_valid_col; j < last_valid_col; j++)
        {
          std::memcpy(patch_col, colptr, element_size);  // Copy the valid element
          patch_col += element_size;  // Progress the patch destination
          colptr += ld_col;  // Progress the patch source
        }
      }
    }
  }
  for (; i < patch_rows; i++)
  {
    // Insert pointers into the padding row
    *(row_pointers++) = pad_row_ptr;
  }
}

/* Patch array constructor (generic kernels)
 *
 * Construct an array of pointers; one pointer for each output row for each
 * kernel point. Pointers should point at a whole number of QUADS containing an
 * input point for each output point. If the kernel column stride is 1 and the
 * data is NCHW then the input tensor might be addressed directly, otherwise a
 * new patch sample might need to be constructed.
 *
 * This implementation always constructs a sample in `patch_raw` for rows which
 * contain valid data; rows which lie wholly in the padding point at `pad_row`.
 * Pointers are written in (kernel row, kernel column, output row) order. Each
 * sample is `output_cols` rounded up to a multiple of (16 / element_size)
 * elements long, so `element_size` must be non-zero and `pad_row` must provide
 * at least that many elements. `ld_row` and `ld_col` are strides in ELEMENTS.
 */
void fill_patch_array_generic_kernel(
  size_t element_size,
  const void **dest_pointers_raw,  // Pointers: one per output row per kernel point
  void *patch_raw,  // Pointer to space which can be used to construct the patch
  const unsigned int output_rows, const unsigned int output_cols,
  const unsigned int kernel_rows, const unsigned int kernel_cols,
  const unsigned int stride_rows, const unsigned int stride_cols,
  const void *src_ptr_raw, size_t ld_row, size_t ld_col,  // Source tensor
  const void *pad_row,  // Pointer to a row of padding values
  const unsigned int pad_top, const unsigned int valid_rows,
  const unsigned int pad_left, const unsigned int valid_cols
)
{
  // Convert into more useful types
  auto dest = reinterpret_cast<const char **>(dest_pointers_raw);
  auto patch = reinterpret_cast<char *>(patch_raw);
  auto src_ptr = reinterpret_cast<const char *>(src_ptr_raw);
  const auto pad_row_ptr = reinterpret_cast<const char *>(pad_row);
  ld_row *= element_size;
  ld_col *= element_size;

  // Round up the patch columns to a multiple of quad-length
  const auto patch_cols = arm_gemm::roundup<unsigned int>(output_cols, 16 / element_size);

  // Extent of the (padded) input which is touched by the kernel
  const auto input_rows = kernel_rows + (output_rows - 1) * stride_rows;
  const auto last_valid_row = std::min(pad_top + valid_rows, input_rows);

  const auto input_cols = kernel_cols + (output_cols - 1) * stride_cols;
  const auto last_valid_col = std::min(pad_left + valid_cols, input_cols);

  for (auto ki = 0u; ki < kernel_rows; ki++)
  {
    for (auto kj = 0u; kj < kernel_cols; kj++)
    {
      // `ii` is the input row read by output row `oi` for this kernel point
      auto oi = 0u, ii = ki;
      for (; oi < output_rows && ii < pad_top; oi++, ii += stride_rows)
      {
        // Insert a pointer to the padding row
        *(dest++) = pad_row_ptr;
      }
      for (; oi < output_rows && ii < last_valid_row; oi++, ii += stride_rows)
      {
        auto rowptr = src_ptr + (ii - pad_top) * ld_row;

        // Construct a sample of the input here
        auto patch_pos = patch;
        *(dest++) = patch;
        patch += patch_cols * element_size;

        // Fill with padding
        std::memcpy(patch_pos, pad_row, patch_cols * element_size);

        // Fill in the valid elements
        auto oj = 0u, ij = kj;
        for (; oj < patch_cols && ij < pad_left; oj++, ij += stride_cols)
        {
          // Do nothing for padding
          patch_pos += element_size;
        }
        for (; oj < patch_cols && ij < last_valid_col; oj++, ij += stride_cols)
        {
          // Copy from the source tensor
          std::memcpy(patch_pos, rowptr + (ij - pad_left)*ld_col, element_size);
          patch_pos += element_size;
        }
        // No action required for right-hand padding
      }
      for (; oi < output_rows; oi++)
      {
        // Remaining output rows read from the bottom padding
        *(dest++) = pad_row_ptr;
      }
    }
  }
}

}  // namespace addressing
}  // namespace arm_conv