// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/1x16c4-aarch64-neondot-ld32.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

// Syntax/target: GNU as, AArch64 with the dot-product extension (SDOT),
// run through the C preprocessor. Leaf function, no stack frame.
//
// Each pass starting at label 0 produces one row of up to 16 int8 outputs:
//   1. seed 16 int32 accumulators from the packed weights,
//   2. accumulate int8 dot products over k, 4 bytes of A per iteration,
//   3. convert to fp32, multiply by 16 per-channel fp32 scales, convert back
//      to int32 with FCVTNS (round to nearest, ties to even),
//   4. saturate to int16, add a 16-bit value from params, saturate to int8,
//      clamp with SMAX/SMIN against two bytes from params,
//   5. store 16 bytes, or the low (nc & 15) bytes on the final partial group.
//
// Packed weights, in the order the code reads them through x5 for each
// group of 16 columns:
//   64 bytes  initial accumulator values (16 x int32)
//   64 bytes  int8 weights per 4 values of k, repeated kc / 4 times
//   64 bytes  per-channel scales (16 x fp32)
//
// params, in the order the code reads it through x11:
//   2 bytes   16-bit value added after narrowing (called bias below;
//             presumably the output zero point - confirm against the struct)
//   1 byte    lower clamp bound (used with SMAX)
//   1 byte    upper clamp bound (used with SMIN)
//
// mr (x0), a_stride (x4) and cm_stride (x7) are never read: x0 is
// overwritten with the k counter before any use.

# void xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const int8_t* restrict a,  x3
#     size_t a_stride,           (x4)
#     const void* restrict w,    x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          (x7)
#     size_t cn_stride,          [sp] -> x12
#     const union xnn_qs8_minmax_params params)  [sp + 8] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0      x3  v0
# B       x5  v16 v17 v18 v19
# C0      x6  v28 v29 v30 v31
# scales  x5  v4 v5 v6 (v4 is loaded twice)
# bias    x11 v6
# min/max x11 v4 v5
# output      v0 v2
# unused  v7 v8 v9 v10 v11 v12 v13 v14 v15

BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32
0:
        # Load initial bias from w into accumulators
        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
        LDP     q28, q29, [x5], 32
        BIC     x2, x2, 3
        LDP     q30, q31, [x5], 32
        MOV     x0, x2                  // k = kc.  assumes kc > 0
        LDR     x11, [sp, 8]            // params (reloaded every pass: LD1R below advances x11)

        # Main loop - 4 bytes of A
        // Each SDOT adds, into every 32-bit lane of the accumulator, the dot
        // product of the matching 4 int8 weights with the 4 int8 values of A
        // held in v0.4b[0]. Runs kc / 4 times; with kc == 0 the body would
        // still execute once, hence the kc > 0 assumption above.
        .p2align 3
1:
        LDR     s0, [x3], 4
        LDR     q16, [x5], 16
        LDR     q17, [x5], 16
        LDR     q18, [x5], 16
        LDR     q19, [x5], 16
        SDOT    v28.4s, v16.16b, v0.4b[0]
        SDOT    v29.4s, v17.16b, v0.4b[0]
        SUBS    x0, x0, 4               // k -= 4; SDOT does not touch the flags
        SDOT    v30.4s, v18.16b, v0.4b[0]
        SDOT    v31.4s, v19.16b, v0.4b[0]
        B.HI    1b                      // loop while k > 0

        # Load per channel scale values from weights
        SCVTF   v28.4s, v28.4s          // int32 accumulators -> fp32
        LDR     q4, [x5], 16
        SCVTF   v29.4s, v29.4s
        LDR     q5, [x5], 16
        SCVTF   v30.4s, v30.4s
        LDR     q6, [x5], 16
        SCVTF   v31.4s, v31.4s
        FMUL    v28.4s, v28.4s, v4.4s
        LDR     q4, [x5], 16            // v4 reused for the 4th scale vector; its first use is issued above
        FMUL    v29.4s, v29.4s, v5.4s
        FMUL    v30.4s, v30.4s, v6.4s
        FMUL    v31.4s, v31.4s, v4.4s

        FCVTNS  v28.4s, v28.4s          // fp32 -> int32, round to nearest, ties to even
        FCVTNS  v29.4s, v29.4s
        FCVTNS  v30.4s, v30.4s
        FCVTNS  v31.4s, v31.4s

        LD1R    {v6.8h}, [x11], 2       // add bias
        SQXTN   v0.4h, v28.4s           // saturating narrow int32 -> int16
        SQXTN   v2.4h, v30.4s
        SQXTN2  v0.8h, v29.4s
        SQXTN2  v2.8h, v31.4s
        LD2R    {v4.16b, v5.16b}, [x11] // clamp to min/max
        SQADD   v0.8h, v0.8h, v6.8h
        SQADD   v2.8h, v2.8h, v6.8h
        LDR     x12, [sp]               // cn_stride
        SQXTN   v0.8b, v0.8h            // saturating narrow int16 -> int8
        SQXTN2  v0.16b, v2.8h
        SUBS    x1, x1, 16              // nc -= 16; flags feed both B.LO and B.NE below
        SMAX    v0.16b, v0.16b, v4.16b
        SMIN    v0.16b, v0.16b, v5.16b
        B.LO    2f                      // fewer than 16 columns were left

        # Store full 1 x 16
        ST1     {v0.16b}, [x6], x12     // c += cn_stride
        SUB     x3, x3, x2              // a0 -= kc
        B.NE    0b                      // flags still from SUBS x1: more columns remain

        RET

        # Store odd width
        // x1 holds nc - 16 here. Subtracting 16 leaves bits 3..0 unchanged, so
        // they still encode the remaining column count (1..15). Each step
        // stores the low 8/4/2/1 bytes of v0 and then moves the next bytes
        // down into lane 0.
        .p2align 3
2:
        TBZ     x1, 3, 3f
        STR     d0, [x6], 8
        DUP     d0, v0.d[1]
3:
        TBZ     x1, 2, 4f
        STR     s0, [x6], 4
        DUP     s0, v0.s[1]
4:
        TBZ     x1, 1, 5f
        STR     h0, [x6], 2
        DUP     h0, v0.h[1]
5:
        TBZ     x1, 0, 6f
        STR     b0, [x6]
6:
        RET

END_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif