// Copyright 2022 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

// NOTE(review): the header operand of this directive was missing in the
// reviewed text. BEGIN_FUNCTION / END_FUNCTION are not defined in this file,
// so they must come from an included header; presumably XNNPACK's assembly
// helper header. Confirm the exact include path against the build.
#include <xnnpack/assembly.h>

.syntax unified

// void xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2(
//     size_t rows,                          r0
//     const uint32_t* input,                r1
//     const uint8_t* weight_widths,         r2
//     const uint16_t* weights,              r3
//     uint64_t* output)                     sp -> r12
//
// Target: AArch32 (ARM state), ARMv7-A with NEON, unified syntax.
//
// What the code does:
//   * weights is read as a stream of 32-bit entries, each widened as two u16
//     lanes: lane 0 is the "weight", lane 1 is the "unweight".
//   * weight_widths is a stream of bytes; each byte is the number of
//     (input, weight-pair) elements consumed by one pass.
//   * Prologue pass (first width byte): accumulates input * unweight only,
//     which seeds the weight accumulator of the first row.
//   * Per row (next width byte): d0 += input * weight, d1 += input * unweight
//     as 64-bit sums. d0 is stored to output (8 bytes), then the row's
//     unweight sum (d1) becomes the starting d0 of the next row.
//   * The unweight sum of the last row is never stored.
//
// Preconditions visible in the code:
//   * rows is decremented and tested only after a row has been stored, so
//     rows == 0 is not handled.
//   * Every loop body runs before its count is tested, so a width byte of 0
//     still consumes one element (it behaves like a width of 1).

// d8-d15, r4-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are reserved.
// Only r4 and lr are pushed here; the NEON registers used are d0-d5 only.

// Register usage
// input   r1  d2
// weights r3  d3 d4 d5
// output  r12 d0 d1

// weight_widths r2 r4

BEGIN_FUNCTION xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2
        .arm
#ifndef __APPLE__
        .arch   armv7-a
        .fpu    neon
#endif
        LDR     r12, [sp]               // output (5th argument, read before sp moves)
        PUSH    {r4,lr}                 // push 8 bytes
        VMOV.U8 d0, #0                  // weight_accumulator

        // Compute unweight as initial weight
        LDRB    r4, [r2], #1            // weight_widths: count for the prologue pass
        VMOV.U8 d1, #0                  // unweight_accumulator

        // Prologue loop: one element per iteration.
0:
        VLD1.32 {d3[]}, [r3]!           // weight+unweight: one 32-bit entry, replicated
        VLD1.32 {d2[]}, [r1]!           // input: one u32, replicated to both lanes
        SUBS    r4, r4, #1              // flags consumed by BHI below (NEON ops leave them intact)
        VMOVL.U16 q2, d3                // widen u16 -> u32: d4 = {weight, unweight}
        VMLAL.U32 q0, d2, d4[1]         // unweight: d0 += input * unweight (d1 too, but it is reset at 1:)
        BHI     0b                      // loop while the count is still above zero

        // Row loop: r0 rows remain, d0 holds the carried-in sum.
1:
        LDRB    r4, [r2], #1            // weight_widths: element count for this row
        SUBS    r4, r4, #1              // r4 = width - 1
        VMOV.U8 d1, #0                  // unweight_accumulator
        BLS     3f                      // less than 2 weights?

        // Main loop: two elements per iteration. r4 = remaining - 1 on entry.
2:
        VLD1.16 {d3}, [r3]!             // weights: two (weight, unweight) u16 pairs
        VLD1.32 {d2}, [r1]!             // input: two u32 values
        SUBS    r4, r4, #2              // >0: 2+ left, ==0: exactly 1 left, borrow: none left
        VMOVL.U16 q2, d3                // d4 = first pair, d5 = second pair, as u32
        VMLAL.U32 q0, d4, d2[0]         // d0 += weight0 * in0, d1 += unweight0 * in0
        VMLAL.U32 q0, d5, d2[1]         // d0 += weight1 * in1, d1 += unweight1 * in1
        BHI     2b

        BLO     4f                      // is there a remainder?

        // Remainder: a single trailing element.
3:
        VLD1.32 {d3[]}, [r3]!           // weights: one (weight, unweight) pair
        VLD1.32 {d2[]}, [r1]!           // input: one u32, replicated to both lanes
        VMOVL.U16 q2, d3                // d4 = {weight, unweight} as u32
        VMLAL.U32 q0, d4, d2            // d0 += weight * input, d1 += unweight * input

        // End of row: emit the weight sum and carry the unweight sum forward.
4:
        VST1.64 {d0}, [r12]!            // store 64-bit row result, advance output
        SUBS    r0, r0, #1              // one row done
        VMOV    d0, d1                  // next row starts from this row's unweight sum
        BNE     1b

        POP     {r4,pc}                 // restore r4 and return (pc <- saved lr)

END_FUNCTION xnn_u32_filterbank_accumulate_ukernel__aarch32_neon_x2

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif