/*
 * Copyright © 2024, VideoLAN and dav1d authors
 * Copyright © 2024, Janne Grunau
 * Copyright © 2024, Martin Storsjo
 * Copyright © 2024, Arm Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

#if HAVE_DOTPROD
ENABLE_DOTPROD

// No spaces in these expressions, due to gas-preprocessor. The indices are
// translated by -1 to fold the negative offset into the address computation
// for `mc_subpel_filters`.
#define REGULAR1 (((0*15-1)<<7)|(3*15-1))
#define SMOOTH1  (((1*15-1)<<7)|(4*15-1))
#define SHARP1   (((2*15-1)<<7)|(3*15-1))

#define FUNC_ALIGN 2
#define JUMP_ALIGN 2
#define LOOP_ALIGN 2


const h_tbl_neon_dotprod, align=4
        // Shuffle indices to permute horizontal samples in preparation for
        // input to SDOT instructions. The 8-tap horizontal convolution uses
        // sample indices in the interval of [-3, 4] relative to the current
        // sample position.
        .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
        .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
        .byte 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14

        // Shuffle indices to permute horizontal samples in preparation for
        // input to USMMLA instructions.
#define OFFSET_USMMLA 48
        .byte 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
        .byte 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13

        // Lookup table used to help conversion of shifted 32-bit values to 8-bit.
#define OFFSET_CVT_32_8 80
        .byte 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
endconst

const v_tbl_neon_dotprod, align=4
        // Vertical convolutions also use SDOT instructions, where a
        // 128-bit register contains a transposed 4x4 matrix of values.
        // Subsequent iterations of the vertical convolution can reuse the
        // 3x4 sub-matrix from the previous loop iteration. These shuffle
        // indices shift and merge this 4x4 matrix with the values of a new
        // line.
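        // Index bytes 0-15 select from the first TBL source register and
        // bytes 16-31 from the second, so a pattern such as "1, 2, 3, 16"
        // drops the oldest sample of a column and appends one taken from the
        // second register (the following 4x4 block or the newly loaded line).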
        .byte 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28
        .byte 1, 2, 3, 16, 5, 6, 7, 17, 9, 10, 11, 18, 13, 14, 15, 19
        .byte 1, 2, 3, 20, 5, 6, 7, 21, 9, 10, 11, 22, 13, 14, 15, 23
        .byte 1, 2, 3, 24, 5, 6, 7, 25, 9, 10, 11, 26, 13, 14, 15, 27
        .byte 1, 2, 3, 28, 5, 6, 7, 29, 9, 10, 11, 30, 13, 14, 15, 31
endconst


.macro make_8tap_fn op, type, type_h, type_v, isa, jump=1
function \op\()_8tap_\type\()_8bpc_\isa, export=1, align=FUNC_ALIGN
        mov     x9, \type_h
        mov     x10, \type_v
.if \jump
        b       \op\()_8tap_\isa
.endif
endfunc
.endm

.macro filter_8tap_fn type, dot, isa, dst, d_strd, src, s_strd, w, h, mx, my, xmx, xmy, ldst, lsrc, wd_strd
make_8tap_fn \type, sharp,          SHARP1,   SHARP1,   \isa
make_8tap_fn \type, sharp_smooth,   SHARP1,   SMOOTH1,  \isa
make_8tap_fn \type, sharp_regular,  SHARP1,   REGULAR1, \isa
make_8tap_fn \type, smooth_sharp,   SMOOTH1,  SHARP1,   \isa
make_8tap_fn \type, smooth,         SMOOTH1,  SMOOTH1,  \isa
make_8tap_fn \type, smooth_regular, SMOOTH1,  REGULAR1, \isa
make_8tap_fn \type, regular_sharp,  REGULAR1, SHARP1,   \isa
make_8tap_fn \type, regular_smooth, REGULAR1, SMOOTH1,  \isa
make_8tap_fn \type, regular,        REGULAR1, REGULAR1, \isa, jump=0

function \type\()_8tap_\isa, align=FUNC_ALIGN
        clz     w8, \w
        mov     w11, #0x4081        // (1 << 14) | (1 << 7) | (1 << 0)
        sub     w8, w8, #24         // for jump tables
        movrel  x12, X(mc_subpel_filters)
        cbnz    \mx, L(\type\()_8tap_h_hv_\isa)
        cbnz    \my, L(\type\()_8tap_v_\isa)
.ifc \type, prep
        add     \wd_strd, \w, \w    // prep_neon needs w * 2 as stride
.endif
        b       X(\type\()_neon)

        .align JUMP_ALIGN
L(\type\()_8tap_v_\isa):
        madd    \my, \my, w11, w10
        movrel  x13, v_tbl_neon_dotprod
        sub     \src, \src, \s_strd
.ifc \isa, neon_dotprod
.ifc \type, prep
        mov     w8, #0x2002         // FILTER_WEIGHT * 128 + rounding
        dup     v4.4s, w8
.else
        movi    v4.4s, #32, lsl #8  // FILTER_WEIGHT * 128, bias for SDOT
.endif
.endif
        ubfx    w11, \my, #7, #7
        and     \my, \my, #0x7F
        ldp     q6, q28, [x13]
        cmp     \h, #4
        csel    \my, \my, w11, le
        sub     \src, \src, \s_strd, lsl #1     // src - s_strd * 3
        add     \xmy, x12, \xmy, lsl #3         // subpel V filter address
        ldr     q29, [x13, #32]
.ifc \isa, neon_dotprod
        movi    v5.16b, #128
.endif
        ldr     d7, [\xmy]
        cmp     \w, #8
        b.eq    80f
        b.lt    40f

        // .align JUMP_ALIGN // fallthrough
160:    // V - 16xN+
        ldp     q30, q31, [x13, #48]
.ifc \type, prep
        add     \wd_strd, \w, \w
.endif

        .align LOOP_ALIGN
161:
        mov     \lsrc, \src
        mov     \ldst, \dst
        sub     w8, \h, #1

        ldr     q16, [\lsrc]
        ldr     q17, [\lsrc, \s_strd]
        add     \lsrc, \lsrc, \s_strd, lsl #1
        ldr     q18, [\lsrc]
        ldr     q19, [\lsrc, \s_strd]
        add     \lsrc, \lsrc, \s_strd, lsl #1

        zip1    v0.16b, v16.16b, v17.16b
        zip2    v1.16b, v16.16b, v17.16b
        zip1    v2.16b, v18.16b, v19.16b
        zip2    v3.16b, v18.16b, v19.16b

        ldr     q20, [\lsrc]
        ldr     q21, [\lsrc, \s_strd]
        add     \lsrc, \lsrc, \s_strd, lsl #1
        ldr     q22, [\lsrc]
        ldr     q23, [\lsrc, \s_strd]
        add     \lsrc, \lsrc, \s_strd, lsl #1

        zip1    v18.16b, v20.16b, v21.16b
        zip2    v21.16b, v20.16b, v21.16b
        zip1    v24.16b, v22.16b, v23.16b
        zip2    v27.16b, v22.16b, v23.16b

        zip1    v16.8h, v0.8h, v2.8h
        zip2    v19.8h, v0.8h, v2.8h
        zip1    v22.8h, v1.8h, v3.8h
        zip2    v25.8h, v1.8h, v3.8h

        zip1    v17.8h, v18.8h, v24.8h
        zip2    v20.8h, v18.8h, v24.8h
        zip1    v23.8h, v21.8h, v27.8h
        zip2    v26.8h, v21.8h, v27.8h
.ifc \isa, neon_dotprod
        sub     v16.16b, v16.16b, v5.16b
        sub     v19.16b, v19.16b, v5.16b
        sub     v22.16b, v22.16b, v5.16b
        sub     v25.16b, v25.16b, v5.16b

        sub     v17.16b, v17.16b, v5.16b
        sub     v20.16b, v20.16b, v5.16b
        sub     v23.16b, v23.16b, v5.16b
        sub     v26.16b, v26.16b, v5.16b
.endif

        .align LOOP_ALIGN
16:
.ifc \isa, neon_i8mm
        ld1     {v18.16b}, [\lsrc], \s_strd
        movi    v0.4s, #0
        movi    v1.4s, #0
        movi    v2.4s, #0
        movi    v3.4s, #0
        mov     v21.16b, v18.16b
        mov     v24.16b, v18.16b
        mov     v27.16b, v18.16b
.else   // neon_dotprod
        ld1     {v27.16b}, [\lsrc], \s_strd
        mov     v0.16b, v4.16b
        mov     v1.16b, v4.16b
        mov     v2.16b, v4.16b
        mov     v3.16b, v4.16b
        sub     v18.16b, v27.16b, v5.16b
        sub     v21.16b, v27.16b, v5.16b
        sub     v24.16b, v27.16b, v5.16b
        sub     v27.16b, v27.16b, v5.16b
.endif
        \dot    v0.4s, v16.16b, v7.4b[0]
        \dot    v1.4s, v19.16b, v7.4b[0]
        \dot    v2.4s, v22.16b, v7.4b[0]
        \dot    v3.4s, v25.16b, v7.4b[0]

        tbl     v16.16b, {v16.16b, v17.16b}, v6.16b
        tbl     v19.16b, {v19.16b, v20.16b}, v6.16b
        tbl     v22.16b, {v22.16b, v23.16b}, v6.16b
        tbl     v25.16b, {v25.16b, v26.16b}, v6.16b

        \dot    v0.4s, v17.16b, v7.4b[1]
        \dot    v1.4s, v20.16b, v7.4b[1]
        \dot    v2.4s, v23.16b, v7.4b[1]
        \dot    v3.4s, v26.16b, v7.4b[1]

        tbl     v17.16b, {v17.16b, v18.16b}, v28.16b
        tbl     v20.16b, {v20.16b, v21.16b}, v29.16b
        tbl     v23.16b, {v23.16b, v24.16b}, v30.16b
        tbl     v26.16b, {v26.16b, v27.16b}, v31.16b

        subs    w8, w8, #1
        uzp1    v0.8h, v0.8h, v1.8h
        uzp1    v2.8h, v2.8h, v3.8h
.ifc \type, prep
.ifc \isa, neon_i8mm
        srshr   v0.8h, v0.8h, #2
        srshr   v1.8h, v2.8h, #2
.else
        sshr    v0.8h, v0.8h, #2
        sshr    v1.8h, v2.8h, #2
.endif
        st1     {v0.8h, v1.8h}, [\ldst], \d_strd
.else   // put
        sqrshrun  v0.8b, v0.8h, #6
        sqrshrun2 v0.16b, v2.8h, #6
        st1     {v0.16b}, [\ldst], \d_strd
.endif
        b.gt    16b

.ifc \isa, neon_i8mm
        movi    v0.4s, #0
        movi    v1.4s, #0
        movi    v2.4s, #0
        movi    v3.4s, #0
.else   // neon_dotprod
        mov     v0.16b, v4.16b
        mov     v1.16b, v4.16b
        mov     v2.16b, v4.16b
        mov     v3.16b, v4.16b
.endif
        \dot    v0.4s, v16.16b, v7.4b[0]
        \dot    v1.4s, v19.16b, v7.4b[0]
        \dot    v2.4s, v22.16b, v7.4b[0]
        \dot    v3.4s, v25.16b, v7.4b[0]

        \dot    v0.4s, v17.16b, v7.4b[1]
        \dot    v1.4s, v20.16b, v7.4b[1]
        \dot    v2.4s, v23.16b, v7.4b[1]
        \dot    v3.4s, v26.16b, v7.4b[1]

        subs    \w, \w, #16
        uzp1    v0.8h, v0.8h, v1.8h
        uzp1    v2.8h, v2.8h, v3.8h
.ifc \type, prep
.ifc \isa, neon_i8mm
        srshr   v0.8h, v0.8h, #2
        srshr   v1.8h, v2.8h, #2
.else
        sshr    v0.8h, v0.8h, #2
        sshr    v1.8h, v2.8h, #2
.endif
        stp     q0, q1, [\ldst]
        add     \dst, \dst, #32
.else   // put
        sqrshrun  v0.8b, v0.8h, #6
        sqrshrun2 v0.16b, v2.8h, #6
        str     q0, [\ldst]
        add     \dst, \dst, #16
.endif
        add     \src, \src, #16
        b.gt    161b
        ret

        .align JUMP_ALIGN
80:     // V - 8xN
        ldr     d16, [\src]
        ldr     d17, [\src, \s_strd]
        add     \src, \src, \s_strd, lsl #1
        ldr     d18, [\src]
        ldr     d19, [\src, \s_strd]
        add     \src, \src, \s_strd, lsl #1
        ldr     d20, [\src]
        ldr     d21, [\src, \s_strd]
        add     \src, \src, \s_strd, lsl #1
        ldr     d22, [\src]
        ldr     d23, [\src, \s_strd]
        add     \src, \src, \s_strd, lsl #1
        subs    \h, \h, #2          // for prep: sub is enough

        zip1    v0.16b, v16.16b, v17.16b
        zip1    v2.16b, v18.16b, v19.16b
        zip1    v18.16b, v20.16b, v21.16b
        zip1    v24.16b, v22.16b, v23.16b

        zip1    v16.8h, v0.8h, v2.8h
        zip2    v19.8h, v0.8h, v2.8h
        zip1    v17.8h, v18.8h, v24.8h
        zip2    v20.8h, v18.8h, v24.8h
.ifc \isa, neon_dotprod
        sub     v16.16b, v16.16b, v5.16b
        sub     v19.16b, v19.16b, v5.16b
        sub     v17.16b, v17.16b, v5.16b
        sub     v20.16b, v20.16b, v5.16b
.endif
.ifc \type, put
        b.eq    82f
.endif

        .align LOOP_ALIGN
8:
.ifc \isa, neon_i8mm
        ldr     d18, [\src]
        movi    v0.4s, #0
        movi    v1.4s, #0
        ldr     d24, [\src, \s_strd]
        add     \src, \src, \s_strd, lsl #1
        movi    v2.4s, #0
        movi    v3.4s, #0
        mov     v21.8b, v18.8b
        mov     v27.8b, v24.8b
.else   // neon_dotprod
        ldr     d21, [\src]
        ldr     d27, [\src, \s_strd]
        add     \src, \src, \s_strd, lsl #1
        mov     v0.16b, v4.16b
        mov     v1.16b, v4.16b
        mov     v2.16b, v4.16b
        mov     v3.16b, v4.16b
        sub     v18.16b, v21.16b, v5.16b
        sub     v21.16b, v21.16b, v5.16b
        sub     v24.16b, v27.16b, v5.16b
        sub     v27.16b, v27.16b, v5.16b
.endif
        tbl     v22.16b, {v16.16b, v17.16b}, v6.16b
        tbl     v25.16b, {v19.16b, v20.16b}, v6.16b
        tbl     v23.16b, {v17.16b, v18.16b}, v28.16b
        tbl     v26.16b, {v20.16b, v21.16b}, v29.16b

        \dot    v0.4s, v16.16b, v7.4b[0]
        \dot    v0.4s, v17.16b, v7.4b[1]
        \dot    v1.4s, v19.16b, v7.4b[0]
        \dot    v1.4s, v20.16b, v7.4b[1]
        tbl     v16.16b, {v22.16b, v23.16b}, v6.16b
        tbl     v19.16b, {v25.16b, v26.16b}, v6.16b
        tbl     v17.16b, {v23.16b, v24.16b}, v28.16b
        tbl     v20.16b, {v26.16b, v27.16b}, v29.16b

        \dot    v2.4s, v22.16b, v7.4b[0]
        \dot    v2.4s, v23.16b, v7.4b[1]
        \dot    v3.4s, v25.16b, v7.4b[0]
        \dot    v3.4s, v26.16b, v7.4b[1]

        subs    \h, \h, #2
        uzp1    v0.8h, v0.8h, v1.8h
        uzp1    v2.8h, v2.8h, v3.8h
.ifc \type, prep
.ifc \isa, neon_i8mm
        srshr   v0.8h, v0.8h, #2
        srshr   v1.8h, v2.8h, #2
.else
        sshr    v0.8h, v0.8h, #2
        sshr    v1.8h, v2.8h, #2
.endif
        stp     q0, q1, [\dst], #32
.else   // put
        sqrshrun v0.8b, v0.8h, #6
        sqrshrun v1.8b, v2.8h, #6
        str     d0, [\dst]
        str     d1, [\dst, \d_strd]
        add     \dst, \dst, \d_strd, lsl #1
.endif
        b.gt    8b

.ifc \type, put
        .align JUMP_ALIGN
82:
.endif
.ifc \isa, neon_i8mm
        ldr     d18, [\src]
        movi    v0.4s, #0
        movi    v1.4s, #0
        movi    v2.4s, #0
        movi    v3.4s, #0
        mov     v21.8b, v18.8b
.else   // neon_dotprod
        ldr     d21, [\src]
        mov     v0.16b, v4.16b
        mov     v1.16b, v4.16b
        mov     v2.16b, v4.16b
        mov     v3.16b, v4.16b
        sub     v18.16b, v21.16b, v5.16b
        sub     v21.16b, v21.16b, v5.16b
.endif
        tbl     v22.16b, {v16.16b, v17.16b}, v6.16b
        tbl     v25.16b, {v19.16b, v20.16b}, v6.16b
        tbl     v23.16b, {v17.16b, v18.16b}, v28.16b
        tbl     v26.16b, {v20.16b, v21.16b}, v29.16b

        \dot    v0.4s, v16.16b, v7.4b[0]
        \dot    v0.4s, v17.16b, v7.4b[1]
        \dot    v1.4s, v19.16b, v7.4b[0]
        \dot    v1.4s, v20.16b, v7.4b[1]
        \dot    v2.4s, v22.16b, v7.4b[0]
        \dot    v2.4s, v23.16b, v7.4b[1]
        \dot    v3.4s, v25.16b, v7.4b[0]
        \dot    v3.4s, v26.16b, v7.4b[1]

        uzp1    v0.8h, v0.8h, v1.8h
        uzp1    v2.8h, v2.8h, v3.8h
.ifc \type, prep
.ifc \isa, neon_i8mm
        srshr   v0.8h, v0.8h, #2
        srshr   v1.8h, v2.8h, #2
.else
        sshr    v0.8h, v0.8h, #2
        sshr    v1.8h, v2.8h, #2
.endif
        stp     q0, q1, [\dst]
.else   // put
        sqrshrun v0.8b, v0.8h, #6
        sqrshrun v1.8b, v2.8h, #6
        str     d0, [\dst]
        str     d1, [\dst, \d_strd]
.endif
        ret

        .align JUMP_ALIGN
40:     // V - 4xN or 2xN (put only)
.ifc \type, put
        cmp     \w, #2
        b.eq    20f
.endif
        ldr     s16, [\src]
        ldr     s17, [\src, \s_strd]
        add     \src, \src, \s_strd, lsl #1
        ldr     s18, [\src]
        ldr     s19, [\src, \s_strd]
        add     \src, \src, \s_strd, lsl #1
        ldr     s20, [\src]
        ldr     s21, [\src, \s_strd]
        add     \src, \src, \s_strd, lsl #1
        ldr     s22, [\src]
        ldr     s23, [\src, \s_strd]
        add     \src, \src, \s_strd, lsl #1
        subs    \h, \h, #2          // for prep: sub is enough

        zip1    v0.8b, v16.8b, v17.8b
        zip1    v2.8b, v18.8b, v19.8b
        zip1    v18.8b, v20.8b, v21.8b
        zip1    v24.8b, v22.8b, v23.8b

        zip1    v16.8h, v0.8h, v2.8h
        zip1    v17.8h, v18.8h, v24.8h
.ifc \isa, neon_dotprod
        sub     v16.16b, v16.16b, v5.16b
        sub     v17.16b, v17.16b, v5.16b
.endif
.ifc \type, put
        b.eq    42f
.endif

        .align LOOP_ALIGN
4:
        ldr     s18, [\src]
        ldr     s21, [\src, \s_strd]
        add     \src, \src, \s_strd, lsl #1
.ifc \isa, neon_i8mm
        movi    v0.4s, #0
        movi    v1.4s, #0
.else   // neon_dotprod
        mov     v0.16b, v4.16b
        mov     v1.16b, v4.16b
        sub     v18.16b, v18.16b, v5.16b
        sub     v21.16b, v21.16b, v5.16b
.endif
        tbl     v19.16b, {v16.16b, v17.16b}, v6.16b
        tbl     v20.16b, {v17.16b, v18.16b}, v28.16b

        \dot    v0.4s, v16.16b, v7.4b[0]
        \dot    v0.4s, v17.16b, v7.4b[1]

        tbl     v16.16b, {v19.16b, v20.16b}, v6.16b
        tbl     v17.16b, {v20.16b, v21.16b}, v28.16b

        \dot    v1.4s, v19.16b, v7.4b[0]
        \dot    v1.4s, v20.16b, v7.4b[1]
.ifc \type, prep
        subs    \h, \h, #2
.ifc \isa, neon_i8mm
        rshrn   v0.4h, v0.4s, #2
        rshrn2  v0.8h, v1.4s, #2
.else
        shrn    v0.4h, v0.4s, #2
        shrn2   v0.8h, v1.4s, #2
.endif
        str     q0, [\dst], #16
.else
        uzp1    v0.8h, v0.8h, v1.8h
        sqrshrun v0.8b, v0.8h, #6
        subs    \h, \h, #2
        fmov    x8, d0
        lsr     x9, x8, #32
        str     w8, [\dst]
        str     w9, [\dst, \d_strd]
        add     \dst, \dst, \d_strd, lsl #1
.endif
        b.gt    4b

.ifc \type, put
        .align JUMP_ALIGN
42:
.endif
        ldr     s18, [\src]
.ifc \isa, neon_i8mm
        movi    v0.4s, #0
        movi    v1.4s, #0
.else   // neon_dotprod
        mov     v0.16b, v4.16b
        mov     v1.16b, v4.16b
        sub     v18.16b, v18.16b, v5.16b
.endif
        tbl     v19.16b, {v16.16b, v17.16b}, v6.16b
        tbl     v20.16b, {v17.16b, v18.16b}, v28.16b

        \dot    v0.4s, v16.16b, v7.4b[0]
        \dot    v0.4s, v17.16b, v7.4b[1]
        \dot    v1.4s, v19.16b, v7.4b[0]
        \dot    v1.4s, v20.16b, v7.4b[1]
.ifc \type, prep
.ifc \isa, neon_i8mm
        rshrn   v0.4h, v0.4s, #2
        rshrn2  v0.8h, v1.4s, #2
.else
        shrn    v0.4h, v0.4s, #2
        shrn2   v0.8h, v1.4s, #2
.endif
        str     q0, [\dst]
.else
        uzp1    v0.8h, v0.8h, v1.8h
        sqrshrun v0.8b, v0.8h, #6
        fmov    x8, d0
        lsr     x9, x8, #32
        str     w8, [\dst]
        str     w9, [\dst, \d_strd]
.endif
        ret

.ifc \type, put
        .align JUMP_ALIGN
20:     // V - 2xN
        ldr     h16, [\src]
        ldr     h17, [\src, \s_strd]
        add     \src, \src, \s_strd, lsl #1
        ldr     h18, [\src]
        ldr     h19, [\src, \s_strd]
        add     \src, \src, \s_strd, lsl #1
        ldr     h20, [\src]
        ldr     h21, [\src, \s_strd]
        add     \src, \src, \s_strd, lsl #1
        ldr     h22, [\src]
        ldr     h23, [\src, \s_strd]
        add     \src, \src, \s_strd, lsl #1
        subs    \h, \h, #2

        zip1    v0.8b, v16.8b, v17.8b
        zip1    v2.8b, v18.8b, v19.8b
        zip1    v18.8b, v20.8b, v21.8b
        zip1    v24.8b, v22.8b, v23.8b

        zip1    v16.4h, v0.4h, v2.4h
        zip1    v17.4h, v18.4h, v24.4h
.ifc \isa, neon_dotprod
        sub     v16.8b, v16.8b, v5.8b
        sub     v17.8b, v17.8b, v5.8b
.endif
        b.eq    22f

        .align LOOP_ALIGN
2:
        ldr     h18, [\src]
        ldr     h21, [\src, \s_strd]
        add     \src, \src, \s_strd, lsl #1
.ifc \isa, neon_i8mm
        movi    v0.4s, #0
        movi    v1.4s, #0
.else   // neon_dotprod
        mov     v0.16b, v4.16b
        mov     v1.16b, v4.16b
        sub     v18.8b, v18.8b, v5.8b
        sub     v21.8b, v21.8b, v5.8b
.endif
        tbl     v19.16b, {v16.16b, v17.16b}, v6.16b
        tbl     v20.16b, {v17.16b, v18.16b}, v28.16b

        \dot    v0.4s, v16.16b, v7.4b[0]
        \dot    v0.4s, v17.16b, v7.4b[1]

        tbl     v16.16b, {v19.16b, v20.16b}, v6.16b
        tbl     v17.16b, {v20.16b, v21.16b}, v28.16b

        \dot    v1.4s, v19.16b, v7.4b[0]
        \dot    v1.4s, v20.16b, v7.4b[1]

        uzp1    v0.8h, v0.8h, v1.8h
        sqrshrun v0.8b, v0.8h, #6
        subs    \h, \h, #2
        fmov    x8, d0
        lsr     x9, x8, #32
        strh    w8, [\dst]
        strh    w9, [\dst, \d_strd]
        add     \dst, \dst, \d_strd, lsl #1
        b.gt    2b

        .align JUMP_ALIGN
22:
        ldr     h18, [\src]
.ifc \isa, neon_i8mm
        movi    v0.4s, #0
        movi    v1.4s, #0
.else   // neon_dotprod
        mov     v0.16b, v4.16b
        mov     v1.16b, v4.16b
        sub     v18.8b, v18.8b, v5.8b
.endif
        tbl     v19.16b, {v16.16b, v17.16b}, v6.16b
        tbl     v20.16b, {v17.16b, v18.16b}, v28.16b

        \dot    v0.4s, v16.16b, v7.4b[0]
        \dot    v0.4s, v17.16b, v7.4b[1]
        \dot    v1.4s, v19.16b, v7.4b[0]
        \dot    v1.4s, v20.16b, v7.4b[1]

        uzp1    v0.8h, v0.8h, v1.8h
        sqrshrun v0.8b, v0.8h, #6
        fmov    x8, d0
        lsr     x9, x8, #32
        strh    w8, [\dst]
        strh    w9, [\dst, \d_strd]
        ret
.endif

        .align JUMP_ALIGN
L(\type\()_8tap_h_hv_\isa):
        madd    \mx, \mx, w11, w9
        madd    w14, \my, w11, w10  // for HV
.ifc \isa, neon_dotprod
        mov     w13, #0x2002        // FILTER_WEIGHT * 128 + rounding
        dup     v27.4s, w13         // put H overrides this
.endif
        movrel  x13, h_tbl_neon_dotprod
        sub     \src, \src, #3      // src - 3
        ldr     q28, [x13]          // for 4-tap & 8-tap H filters
        ubfx    w15, \mx, #7, #7
        and     \mx, \mx, #0x7F
        ubfx    w11, w14, #7, #7    // for HV
        and     w14, w14, #0x7F     // for HV
        cmp     \w, #4
        csel    \mx, \mx, w15, le
        add     \xmx, x12, \xmx, lsl #3     // subpel H filter address
.ifc \isa, neon_dotprod
        movi    v24.16b, #128
.endif
        cbz     \my, L(\type\()_8tap_h_\isa)

        // HV cases
        cmp     \h, #4
        csel    w14, w14, w11, le
        sub     \src, \src, \s_strd, lsl #1 // src - s_strd * 2 - 3
        add     \xmy, x12, x14, lsl #3      // subpel V filter address
        mov     x15, x30
        ldr     d7, [\xmy]
.ifc \type, put
        ldr     q25, [x13, #(OFFSET_CVT_32_8)]  // LUT to help conversion of 32b values to 8b
.endif
        sxtl    v7.8h, v7.8b
        cmp     w10, #SHARP1
        b.ne    L(\type\()_6tap_hv_\isa)    // vertical != SHARP1

        // HV 8-tap cases
        sub     \src, \src, \s_strd         // src - s_strd * 3 - 3
        cmp     \w, #4
        b.eq    40f
.ifc \type, put
        b.lt    20f
.endif

        // .align JUMP_ALIGN // fallthrough
80:     // HV8 - 8xN+
        ldp     q29, q30, [x13, #16]
        ldr     d26, [\xmx]
.ifc \type, prep
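        // prep stores 16-bit intermediates, so the packed destination row
        // stride is w * 2 bytes.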
        add     \wd_strd, \w, \w
.endif

        .align LOOP_ALIGN
81:
        mov     \lsrc, \src
        mov     \ldst, \dst
        mov     w8, \h
.ifc \isa, neon_i8mm
        bl      L(\type\()_hv_filter8_\isa)
        srshr   v16.8h, v22.8h, #2
        bl      L(\type\()_hv_filter8_\isa)
        srshr   v17.8h, v22.8h, #2
        bl      L(\type\()_hv_filter8_\isa)
        srshr   v18.8h, v22.8h, #2
        bl      L(\type\()_hv_filter8_\isa)
        srshr   v19.8h, v22.8h, #2
        bl      L(\type\()_hv_filter8_\isa)
        srshr   v20.8h, v22.8h, #2
        bl      L(\type\()_hv_filter8_\isa)
        srshr   v21.8h, v22.8h, #2
        bl      L(\type\()_hv_filter8_\isa)
        srshr   v22.8h, v22.8h, #2
.else
        bl      L(\type\()_hv_filter8_\isa)
        sshr    v16.8h, v22.8h, #2
        bl      L(\type\()_hv_filter8_\isa)
        sshr    v17.8h, v22.8h, #2
        bl      L(\type\()_hv_filter8_\isa)
        sshr    v18.8h, v22.8h, #2
        bl      L(\type\()_hv_filter8_\isa)
        sshr    v19.8h, v22.8h, #2
        bl      L(\type\()_hv_filter8_\isa)
        sshr    v20.8h, v22.8h, #2
        bl      L(\type\()_hv_filter8_\isa)
        sshr    v21.8h, v22.8h, #2
        bl      L(\type\()_hv_filter8_\isa)
        sshr    v22.8h, v22.8h, #2
.endif

        .align LOOP_ALIGN
8:
        ldr     q23, [\lsrc]
        add     \lsrc, \lsrc, \s_strd

        smull   v0.4s, v16.4h, v7.h[0]
        smull2  v1.4s, v16.8h, v7.h[0]
        mov     v16.16b, v17.16b
.ifc \isa, neon_i8mm
        movi    v5.4s, #0
        movi    v6.4s, #0
        tbl     v2.16b, {v23.16b}, v28.16b
        tbl     v3.16b, {v23.16b}, v29.16b
.else   // neon_dotprod
        sub     v23.16b, v23.16b, v24.16b
        mov     v5.16b, v27.16b
        mov     v6.16b, v27.16b
.endif
        smlal   v0.4s, v17.4h, v7.h[1]
        smlal2  v1.4s, v17.8h, v7.h[1]
.ifc \isa, neon_i8mm
        tbl     v4.16b, {v23.16b}, v30.16b
        mov     v17.16b, v18.16b
.else   // neon_dotprod
        mov     v17.16b, v18.16b
        tbl     v2.16b, {v23.16b}, v28.16b
        tbl     v3.16b, {v23.16b}, v29.16b
        tbl     v4.16b, {v23.16b}, v30.16b
.endif
        smlal   v0.4s, v18.4h, v7.h[2]
        smlal2  v1.4s, v18.8h, v7.h[2]
        mov     v18.16b, v19.16b

        \dot    v5.4s, v2.16b, v26.4b[0]
        \dot    v6.4s, v3.16b, v26.4b[0]

        smlal   v0.4s, v19.4h, v7.h[3]
        smlal2  v1.4s, v19.8h, v7.h[3]
        mov     v19.16b, v20.16b

        \dot    v5.4s, v3.16b, v26.4b[1]
        \dot    v6.4s, v4.16b, v26.4b[1]

        smlal   v0.4s, v20.4h, v7.h[4]
        smlal2  v1.4s, v20.8h, v7.h[4]
        mov     v20.16b, v21.16b

        smlal   v0.4s, v21.4h, v7.h[5]
        smlal2  v1.4s, v21.8h, v7.h[5]
.ifc \type, prep
        uzp1    v23.8h, v5.8h, v6.8h
.endif
        mov     v21.16b, v22.16b

        smlal   v0.4s, v22.4h, v7.h[6]
        smlal2  v1.4s, v22.8h, v7.h[6]
.ifc \isa, neon_i8mm
        subs    w8, w8, #1
.endif
.ifc \type, prep
.ifc \isa, neon_i8mm
        srshr   v22.8h, v23.8h, #2
.else
        sshr    v22.8h, v23.8h, #2
.endif
        smlal   v0.4s, v22.4h, v7.h[7]
        smlal2  v1.4s, v22.8h, v7.h[7]
        rshrn   v0.4h, v0.4s, #6
        rshrn2  v0.8h, v1.4s, #6
.else   // put
.ifc \isa, neon_i8mm
        rshrn   v22.4h, v5.4s, #2
        rshrn2  v22.8h, v6.4s, #2
.else
        shrn    v22.4h, v5.4s, #2
        shrn2   v22.8h, v6.4s, #2
.endif
        smlal   v0.4s, v22.4h, v7.h[7]
        smlal2  v1.4s, v22.8h, v7.h[7]
        tbl     v0.16b, {v0.16b, v1.16b}, v25.16b
        sqrshrun v0.8b, v0.8h, #2
.endif
.ifc \isa, neon_dotprod
        subs    w8, w8, #1
.endif
.ifc \type, prep
        st1     {v0.8h}, [\ldst], \d_strd
        b.gt    8b
        add     \dst, \dst, #16
.else
        st1     {v0.8b}, [\ldst], \d_strd
        b.gt    8b
        add     \dst, \dst, #8
.endif
        add     \src, \src, #8
        subs    \w, \w, #8
        b.gt    81b
        ret     x15

        .align JUMP_ALIGN
40:     // HV8 - 4xN
        ldur    s26, [\xmx, #2]
        add     \src, \src, #2

        bl      L(\type\()_hv_filter4_\isa)
        shrn    v16.4h, v22.4s, #2
        bl      L(\type\()_hv_filter4_\isa)
        shrn    v17.4h, v22.4s, #2
        bl      L(\type\()_hv_filter4_\isa)
        shrn    v18.4h, v22.4s, #2
        bl      L(\type\()_hv_filter4_\isa)
        shrn    v19.4h, v22.4s, #2
        bl      L(\type\()_hv_filter4_\isa)
        shrn    v20.4h, v22.4s, #2
        bl      L(\type\()_hv_filter4_\isa)
        shrn    v21.4h, v22.4s, #2
        bl      L(\type\()_hv_filter4_\isa)
        shrn    v22.4h, v22.4s, #2

        .align LOOP_ALIGN
4:
        ld1     {v4.8b}, [\src], \s_strd

        smull   v0.4s, v16.4h, v7.h[0]
        smlal   v0.4s, v17.4h, v7.h[1]
        mov     v16.16b, v17.16b
        mov     v17.16b, v18.16b
.ifc \isa, neon_dotprod
        sub     v4.16b, v4.16b, v24.16b
.endif
        smlal   v0.4s, v18.4h, v7.h[2]
        smlal   v0.4s, v19.4h, v7.h[3]
        tbl     v2.16b, {v4.16b}, v28.16b
.ifc \isa, neon_i8mm
        movi    v5.4s, #0
.else
        mov     v5.16b, v27.16b
.endif
        mov     v18.16b, v19.16b
        mov     v19.16b, v20.16b

        smlal   v0.4s, v20.4h, v7.h[4]
        smlal   v0.4s, v21.4h, v7.h[5]

        \dot    v5.4s, v2.16b, v26.4b[0]

        mov     v20.16b, v21.16b
        mov     v21.16b, v22.16b

        smlal   v0.4s, v22.4h, v7.h[6]
.ifc \isa, neon_i8mm
        rshrn   v22.4h, v5.4s, #2
.else
        shrn    v22.4h, v5.4s, #2
.endif
        smlal   v0.4s, v22.4h, v7.h[7]
.ifc \type, prep
        rshrn   v0.4h, v0.4s, #6
        str     d0, [\dst], #8
        subs    \h, \h, #1
.else
        subs    \h, \h, #1
        tbl     v0.8b, {v0.16b}, v25.8b
        sqrshrun v0.8b, v0.8h, #2
        str     s0, [\dst]
        add     \dst, \dst, \d_strd
.endif
        b.gt    4b
        ret     x15

.ifc \type, put
        .align JUMP_ALIGN
20:     // HV8 - 2xN
        ldur    s26, [\xmx, #2]
        add     \src, \src, #2

        bl      L(\type\()_hv_filter4_\isa)
        shrn    v16.4h, v22.4s, #2
        bl      L(\type\()_hv_filter4_\isa)
        shrn    v17.4h, v22.4s, #2
        bl      L(\type\()_hv_filter4_\isa)
        shrn    v18.4h, v22.4s, #2
        bl      L(\type\()_hv_filter4_\isa)
        shrn    v19.4h, v22.4s, #2
        bl      L(\type\()_hv_filter4_\isa)
        shrn    v20.4h, v22.4s, #2
        bl      L(\type\()_hv_filter4_\isa)
        shrn    v21.4h, v22.4s, #2
        bl      L(\type\()_hv_filter4_\isa)
        shrn    v22.4h, v22.4s, #2

        .align LOOP_ALIGN
2:
        ld1     {v4.8b}, [\src], \s_strd

        smull   v0.4s, v16.4h, v7.h[0]
        smlal   v0.4s, v17.4h, v7.h[1]
        mov     v16.16b, v17.16b
        mov     v17.16b, v18.16b
.ifc \isa, neon_dotprod
        sub     v4.16b, v4.16b, v24.16b
.endif
        smlal   v0.4s, v18.4h, v7.h[2]
        smlal   v0.4s, v19.4h, v7.h[3]

        tbl     v2.16b, {v4.16b}, v28.16b
.ifc \isa, neon_i8mm
        movi    v5.4s, #0
.else
        mov     v5.16b, v27.16b
.endif
        mov     v18.16b, v19.16b
        mov     v19.16b, v20.16b

        smlal   v0.4s, v20.4h, v7.h[4]
        smlal   v0.4s, v21.4h, v7.h[5]

        \dot    v5.4s, v2.16b, v26.4b[0]

        mov     v20.16b, v21.16b
        mov     v21.16b, v22.16b

        smlal   v0.4s, v22.4h, v7.h[6]
.ifc \isa, neon_i8mm
        rshrn   v22.4h, v5.4s, #2
.else
        shrn    v22.4h, v5.4s, #2
.endif
        smlal   v0.4s, v22.4h, v7.h[7]
        subs    \h, \h, #1
        tbl     v0.8b, {v0.16b}, v25.8b
        sqrshrun v0.8b, v0.8h, #2
        str     h0, [\dst]
        add     \dst, \dst, \d_strd
        b.gt    2b
        ret     x15
.endif

        .align JUMP_ALIGN
L(\type\()_6tap_hv_\isa):
        cmp     \w, #4
        b.eq    40f
.ifc \type, put
        b.lt    20f
.endif

        // .align JUMP_ALIGN // fallthrough
80:     // HV6 - 8xN+
        ldr     d26, [\xmx]
.ifc \type, prep
        add     \wd_strd, \w, \w
.endif
.ifc \isa, neon_i8mm
        cmp     w9, #SHARP1
        b.eq    88f                 // horizontal == SHARP1
        ldp     q29, q30, [x13, #(OFFSET_USMMLA)]
        ext     v0.8b, v26.8b, v26.8b, #7
        ins     v26.d[1], v0.d[0]

        .align LOOP_ALIGN
81:
        mov     \lsrc, \src
        mov     \ldst, \dst
        mov     w8, \h

        bl      L(\type\()_hv_filter6_neon_i8mm)
        srshr   v16.8h, v22.8h, #2
        bl      L(\type\()_hv_filter6_neon_i8mm)
        srshr   v17.8h, v22.8h, #2
        bl      L(\type\()_hv_filter6_neon_i8mm)
        srshr   v18.8h, v22.8h, #2
        bl      L(\type\()_hv_filter6_neon_i8mm)
        srshr   v19.8h, v22.8h, #2
        bl      L(\type\()_hv_filter6_neon_i8mm)
        srshr   v20.8h, v22.8h, #2

        .align LOOP_ALIGN
8:
        ld1     {v23.16b}, [\lsrc], \s_strd

        smull   v0.4s, v16.4h, v7.h[1]
        smull2  v1.4s, v16.8h, v7.h[1]
        mov     v16.16b, v17.16b
        movi    v5.4s, #0
        movi    v6.4s, #0

        tbl     v2.16b, {v23.16b}, v29.16b
        tbl     v3.16b, {v23.16b}, v30.16b

        smlal   v0.4s, v17.4h, v7.h[2]
        smlal2  v1.4s, v17.8h, v7.h[2]
        mov     v17.16b, v18.16b

        usmmla  v5.4s, v2.16b, v26.16b
        usmmla  v6.4s, v3.16b, v26.16b

        smlal   v0.4s, v18.4h, v7.h[3]
        smlal2  v1.4s, v18.8h, v7.h[3]
        mov     v18.16b, v19.16b
        subs    w8, w8, #1

        smlal   v0.4s, v19.4h, v7.h[4]
        smlal2  v1.4s, v19.8h, v7.h[4]
        uzp1    v23.8h, v5.8h, v6.8h
        mov     v19.16b, v20.16b

        smlal   v0.4s, v20.4h, v7.h[5]
        smlal2  v1.4s, v20.8h, v7.h[5]
        srshr   v20.8h, v23.8h, #2

        smlal   v0.4s, v20.4h, v7.h[6]
        smlal2  v1.4s, v20.8h, v7.h[6]
.ifc \type, prep
        rshrn   v0.4h, v0.4s, #6
        rshrn2  v0.8h, v1.4s, #6
        st1     {v0.8h}, [\ldst], \d_strd
        b.gt    8b
        add     \dst, \dst, #16
.else
        tbl     v0.16b, {v0.16b, v1.16b}, v25.16b
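        // The TBL above uses v25 (OFFSET_CVT_32_8) to pick bytes 1-2 of each
        // 32-bit lane, i.e. (x >> 8) narrowed to 16 bits in a single lookup.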
        sqrshrun v0.8b, v0.8h, #2
        st1     {v0.8b}, [\ldst], \d_strd
        b.gt    8b
        add     \dst, \dst, #8
.endif
        add     \src, \src, #8
        subs    \w, \w, #8
        b.gt    81b
        ret     x15

        .align JUMP_ALIGN
88:
.endif  // neon_i8mm
        ldp     q29, q30, [x13, #16]

        .align LOOP_ALIGN
81:
        mov     \lsrc, \src
        mov     \ldst, \dst
        mov     w8, \h
.ifc \isa, neon_i8mm
        bl      L(\type\()_hv_filter8_\isa)
        srshr   v16.8h, v22.8h, #2
        bl      L(\type\()_hv_filter8_\isa)
        srshr   v17.8h, v22.8h, #2
        bl      L(\type\()_hv_filter8_\isa)
        srshr   v18.8h, v22.8h, #2
        bl      L(\type\()_hv_filter8_\isa)
        srshr   v19.8h, v22.8h, #2
        bl      L(\type\()_hv_filter8_\isa)
        srshr   v20.8h, v22.8h, #2
.else
        bl      L(\type\()_hv_filter8_\isa)
        sshr    v16.8h, v22.8h, #2
        bl      L(\type\()_hv_filter8_\isa)
        sshr    v17.8h, v22.8h, #2
        bl      L(\type\()_hv_filter8_\isa)
        sshr    v18.8h, v22.8h, #2
        bl      L(\type\()_hv_filter8_\isa)
        sshr    v19.8h, v22.8h, #2
        bl      L(\type\()_hv_filter8_\isa)
        sshr    v20.8h, v22.8h, #2
.endif

        .align LOOP_ALIGN
8:
        ldr     q23, [\lsrc]
        add     \lsrc, \lsrc, \s_strd

        smull   v0.4s, v16.4h, v7.h[1]
        smull2  v1.4s, v16.8h, v7.h[1]
.ifc \isa, neon_dotprod
        sub     v23.16b, v23.16b, v24.16b
.endif
        mov     v16.16b, v17.16b
.ifc \isa, neon_i8mm
        movi    v5.4s, #0
        movi    v6.4s, #0
.else
        mov     v5.16b, v27.16b
        mov     v6.16b, v27.16b
.endif
        tbl     v2.16b, {v23.16b}, v28.16b
        tbl     v3.16b, {v23.16b}, v29.16b

        smlal   v0.4s, v17.4h, v7.h[2]
        smlal2  v1.4s, v17.8h, v7.h[2]
        tbl     v4.16b, {v23.16b}, v30.16b
        mov     v17.16b, v18.16b

        \dot    v5.4s, v2.16b, v26.4b[0]
        \dot    v6.4s, v3.16b, v26.4b[0]

        smlal   v0.4s, v18.4h, v7.h[3]
        smlal2  v1.4s, v18.8h, v7.h[3]
        mov     v18.16b, v19.16b

        \dot    v5.4s, v3.16b, v26.4b[1]
        \dot    v6.4s, v4.16b, v26.4b[1]

        smlal   v0.4s, v19.4h, v7.h[4]
        smlal2  v1.4s, v19.8h, v7.h[4]
        mov     v19.16b, v20.16b
        uzp1    v23.8h, v5.8h, v6.8h

        smlal   v0.4s, v20.4h, v7.h[5]
        smlal2  v1.4s, v20.8h, v7.h[5]
.ifc \isa, neon_i8mm
        srshr   v20.8h, v23.8h, #2
.else
        sshr    v20.8h, v23.8h, #2
.endif
        subs    w8, w8, #1

        smlal   v0.4s, v20.4h, v7.h[6]
        smlal2  v1.4s, v20.8h, v7.h[6]
.ifc \type, prep
        rshrn   v0.4h, v0.4s, #6
        rshrn2  v0.8h, v1.4s, #6
        st1     {v0.8h}, [\ldst], \d_strd
        b.gt    8b
        add     \dst, \dst, #16
.else
        tbl     v0.16b, {v0.16b, v1.16b}, v25.16b
        sqrshrun v0.8b, v0.8h, #2
        st1     {v0.8b}, [\ldst], \d_strd
        b.gt    8b
        add     \dst, \dst, #8
.endif
        add     \src, \src, #8
        subs    \w, \w, #8
        b.gt    81b
        ret     x15

        .align FUNC_ALIGN
L(\type\()_hv_filter8_\isa):
        ld1     {v4.16b}, [\lsrc], \s_strd
.ifc \isa, neon_i8mm
        movi    v22.4s, #0
        movi    v23.4s, #0
.else   // neon_dotprod
        sub     v4.16b, v4.16b, v24.16b
        mov     v22.16b, v27.16b
        mov     v23.16b, v27.16b
.endif
        tbl     v2.16b, {v4.16b}, v28.16b
        tbl     v3.16b, {v4.16b}, v29.16b
        tbl     v4.16b, {v4.16b}, v30.16b

        \dot    v22.4s, v2.16b, v26.4b[0]
        \dot    v23.4s, v3.16b, v26.4b[0]
        \dot    v22.4s, v3.16b, v26.4b[1]
        \dot    v23.4s, v4.16b, v26.4b[1]

        uzp1    v22.8h, v22.8h, v23.8h
        ret

.ifc \isa, neon_i8mm
        .align FUNC_ALIGN
L(\type\()_hv_filter6_neon_i8mm):
        ld1     {v4.16b}, [\lsrc], \s_strd
        movi    v22.4s, #0
        movi    v23.4s, #0

        tbl     v2.16b, {v4.16b}, v29.16b
        tbl     v3.16b, {v4.16b}, v30.16b

        usmmla  v22.4s, v2.16b, v26.16b
        usmmla  v23.4s, v3.16b, v26.16b

        uzp1    v22.8h, v22.8h, v23.8h
        ret
.endif

        .align FUNC_ALIGN
L(\type\()_hv_filter4_\isa):
        ld1     {v4.8b}, [\src], \s_strd
.ifc \isa, neon_i8mm
        movi    v22.4s, #2
.else
        mov     v22.16b, v27.16b
        sub     v4.16b, v4.16b, v24.16b
.endif
        tbl     v2.16b, {v4.16b}, v28.16b
        \dot    v22.4s, v2.16b, v26.4b[0]
        ret

        .align JUMP_ALIGN
40:     // HV6 - 4xN
        ldur    s26, [\xmx, #2]
        add     \src, \src, #2

        bl      L(\type\()_hv_filter4_\isa)
        shrn    v16.4h, v22.4s, #2
        bl      L(\type\()_hv_filter4_\isa)
        shrn    v17.4h, v22.4s, #2
        bl      L(\type\()_hv_filter4_\isa)
        shrn    v18.4h, v22.4s, #2
        bl      L(\type\()_hv_filter4_\isa)
        shrn    v19.4h, v22.4s, #2
        bl      L(\type\()_hv_filter4_\isa)
        shrn    v20.4h, v22.4s, #2

        .align LOOP_ALIGN
4:
        ld1     {v4.8b}, [\src], \s_strd

        smull   v0.4s, v16.4h, v7.h[1]
        smlal   v0.4s, v17.4h, v7.h[2]
.ifc \isa, neon_dotprod
        sub     v4.16b, v4.16b, v24.16b
.endif
        mov     v16.16b, v17.16b
        mov     v17.16b, v18.16b

        smlal   v0.4s, v18.4h, v7.h[3]
        smlal   v0.4s, v19.4h, v7.h[4]

        tbl     v2.16b, {v4.16b}, v28.16b
.ifc \isa, neon_i8mm
        movi    v5.4s, #0
.else
        mov     v5.16b, v27.16b
.endif
        mov     v18.16b, v19.16b
        mov     v19.16b, v20.16b

        \dot    v5.4s, v2.16b, v26.4b[0]

        smlal   v0.4s, v20.4h, v7.h[5]
.ifc \isa, neon_i8mm
        rshrn   v20.4h, v5.4s, #2
.else
        shrn    v20.4h, v5.4s, #2
.endif
        subs    \h, \h, #1
        smlal   v0.4s, v20.4h, v7.h[6]
.ifc \type, prep
        rshrn   v0.4h, v0.4s, #6
        str     d0, [\dst], #8
.else
        tbl     v0.8b, {v0.16b}, v25.8b
        sqrshrun v0.8b, v0.8h, #2
        str     s0, [\dst]
        add     \dst, \dst, \d_strd
.endif
        b.gt    4b
        ret     x15

.ifc \type, put
        .align JUMP_ALIGN
20:     // HV6 - 2xN
        ldur    s26, [\xmx, #2]
        add     \src, \src, #2

        bl      L(\type\()_hv_filter4_\isa)
        shrn    v16.4h, v22.4s, #2
        bl      L(\type\()_hv_filter4_\isa)
        shrn    v17.4h, v22.4s, #2
        bl      L(\type\()_hv_filter4_\isa)
        shrn    v18.4h, v22.4s, #2
        bl      L(\type\()_hv_filter4_\isa)
        shrn    v19.4h, v22.4s, #2
        bl      L(\type\()_hv_filter4_\isa)
        shrn    v20.4h, v22.4s, #2

        .align LOOP_ALIGN
2:
        ld1     {v4.8b}, [\src], \s_strd

        smull   v0.4s, v16.4h, v7.h[1]
        smlal   v0.4s, v17.4h, v7.h[2]
.ifc \isa, neon_dotprod
        sub     v4.16b, v4.16b, v24.16b
.endif
        mov     v16.16b, v17.16b
        mov     v17.16b, v18.16b

        smlal   v0.4s, v18.4h, v7.h[3]
        smlal   v0.4s, v19.4h, v7.h[4]

        tbl     v2.16b, {v4.16b}, v28.16b
.ifc \isa, neon_i8mm
        movi    v5.4s, #0
.else
        mov     v5.16b, v27.16b
.endif
        mov     v18.16b, v19.16b
        mov     v19.16b, v20.16b

        \dot    v5.4s, v2.16b, v26.4b[0]

        smlal   v0.4s, v20.4h, v7.h[5]
.ifc \isa, neon_i8mm
        rshrn   v20.4h, v5.4s, #2
.else
        shrn    v20.4h, v5.4s, #2
.endif
        subs    \h, \h, #1
        smlal   v0.4s, v20.4h, v7.h[6]
        tbl     v0.8b, {v0.16b}, v25.8b
        sqrshrun v0.8b, v0.8h, #2
        str     h0, [\dst]
        add     \dst, \dst, \d_strd
        b.gt    2b
        ret     x15
.endif

        .align JUMP_ALIGN
L(\type\()_8tap_h_\isa):
        movrel  x11, \type\()_8tap_h_\isa\()_tbl
        ldrsw   x8, [x11, x8, lsl #2]
.ifc \type, put
.ifc \isa, neon_i8mm
        movi    v27.4s, #34         // special rounding
.else
        mov     w10, #0x2022        // 64 * 128 + 34, bias and rounding for SDOT
        dup     v27.4s, w10
.endif
.endif
        add     x11, x11, x8
        br      x11

.ifc \type, put
        .align JUMP_ALIGN
20:     // H - 2xN
        AARCH64_VALID_JUMP_TARGET
        add     \src, \src, #2
        ldur    s26, [\xmx, #2]

        .align LOOP_ALIGN
2:
        ldr     d0, [\src]
        ldr     d1, [\src, \s_strd]
        add     \src, \src, \s_strd, lsl #1
.ifc \isa, neon_dotprod
        sub     v0.8b, v0.8b, v24.8b
        sub     v1.8b, v1.8b, v24.8b
.endif
        mov     v4.16b, v27.16b
        mov     v5.16b, v27.16b

        tbl     v2.16b, {v0.16b}, v28.16b
        tbl     v3.16b, {v1.16b}, v28.16b

        \dot    v4.4s, v2.16b, v26.4b[0]
        \dot    v5.4s, v3.16b, v26.4b[0]

        uzp1    v4.8h, v4.8h, v5.8h
        sqshrun v4.8b, v4.8h, #6
        subs    \h, \h, #2
        fmov    x8, d4
        lsr     x9, x8, #32
        strh    w8, [\dst]
        strh    w9, [\dst, \d_strd]
        add     \dst, \dst, \d_strd, lsl #1
        b.gt    2b
        ret
.endif

        .align JUMP_ALIGN
40:     // H - 4xN
        AARCH64_VALID_JUMP_TARGET
        add     \src, \src, #2
        ldur    s26, [\xmx, #2]

        .align LOOP_ALIGN
4:
        ldr     d0, [\src]
        ldr     d1, [\src, \s_strd]
        add     \src, \src, \s_strd, lsl #1
.ifc \type\()_\isa, prep_neon_i8mm
        movi    v4.4s, #0
        movi    v5.4s, #0
.else
.ifc \isa, neon_dotprod
        sub     v0.8b, v0.8b, v24.8b
        sub     v1.8b, v1.8b, v24.8b
.endif
        mov     v4.16b, v27.16b
        mov     v5.16b, v27.16b
.endif
        tbl     v2.16b, {v0.16b}, v28.16b
        tbl     v3.16b, {v1.16b}, v28.16b

        \dot    v4.4s, v2.16b, v26.4b[0]
        \dot    v5.4s, v3.16b, v26.4b[0]
.ifc \type, prep
        subs    \h, \h, #2
.ifc \isa, neon_i8mm
        uzp1    v4.8h, v4.8h, v5.8h
        srshr   v4.8h, v4.8h, #2
.else
        shrn    v4.4h, v4.4s, #2
        shrn2   v4.8h, v5.4s, #2
.endif
        str     q4, [\dst], #16
.else   // put
        uzp1    v4.8h, v4.8h, v5.8h
        sqshrun v4.8b, v4.8h, #6
        subs    \h, \h, #2
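        // Both 4-pixel output rows are packed in d4; split them via a GPR so
        // each row can be stored with its own stride.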
        fmov    x8, d4
        lsr     x9, x8, #32
        str     w8, [\dst]
        str     w9, [\dst, \d_strd]
        add     \dst, \dst, \d_strd, lsl #1
.endif
        b.gt    4b
        ret

        .align JUMP_ALIGN
80:     // H - 8xN
        AARCH64_VALID_JUMP_TARGET
        ldr     d26, [\xmx]
.ifc \isa, neon_i8mm
        cmp     w9, #SHARP1
        b.eq    88f                 // horizontal == SHARP1
        ldp     q29, q30, [x13, #(OFFSET_USMMLA)]
        ext     v0.8b, v26.8b, v26.8b, #7
        ins     v26.d[1], v0.d[0]

        .align LOOP_ALIGN
8:
        ldr     q0, [\src]
        ldr     q16, [\src, \s_strd]
        add     \src, \src, \s_strd, lsl #1
.ifc \type, prep
        movi    v4.4s, #0
        movi    v5.4s, #0
        movi    v20.4s, #0
        movi    v21.4s, #0
.else
        mov     v4.16b, v27.16b
        mov     v5.16b, v27.16b
        mov     v20.16b, v27.16b
        mov     v21.16b, v27.16b
.endif
        tbl     v1.16b, {v0.16b}, v29.16b
        tbl     v2.16b, {v0.16b}, v30.16b
        tbl     v17.16b, {v16.16b}, v29.16b
        tbl     v18.16b, {v16.16b}, v30.16b

        usmmla  v4.4s, v1.16b, v26.16b
        usmmla  v5.4s, v2.16b, v26.16b
        usmmla  v20.4s, v17.16b, v26.16b
        usmmla  v21.4s, v18.16b, v26.16b

        uzp1    v4.8h, v4.8h, v5.8h
        uzp1    v20.8h, v20.8h, v21.8h
.ifc \type, prep
        srshr   v4.8h, v4.8h, #2
        srshr   v20.8h, v20.8h, #2
        subs    \h, \h, #2
        stp     q4, q20, [\dst], #32
.else   // put
        sqshrun v4.8b, v4.8h, #6
        sqshrun v20.8b, v20.8h, #6
        subs    \h, \h, #2
        str     d4, [\dst]
        str     d20, [\dst, \d_strd]
        add     \dst, \dst, \d_strd, lsl #1
.endif
        b.gt    8b
        ret

        .align JUMP_ALIGN
88:
.endif  // neon_i8mm
        ldp     q29, q30, [x13, #16]

        .align LOOP_ALIGN
8:
        ldr     q0, [\src]
        ldr     q16, [\src, \s_strd]
        add     \src, \src, \s_strd, lsl #1
.ifc \type\()_\isa, prep_neon_i8mm
        movi    v4.4s, #0
        movi    v5.4s, #0
        movi    v20.4s, #0
        movi    v21.4s, #0
.else
.ifc \isa, neon_dotprod
        sub     v0.16b, v0.16b, v24.16b
        sub     v16.16b, v16.16b, v24.16b
.endif
        mov     v4.16b, v27.16b
        mov     v5.16b, v27.16b
        mov     v20.16b, v27.16b
        mov     v21.16b, v27.16b
.endif
        tbl     v1.16b, {v0.16b}, v28.16b
        tbl     v2.16b, {v0.16b}, v29.16b
        tbl     v3.16b, {v0.16b}, v30.16b
        tbl     v17.16b, {v16.16b}, v28.16b
        tbl     v18.16b, {v16.16b}, v29.16b
        tbl     v19.16b, {v16.16b}, v30.16b

        \dot    v4.4s, v1.16b, v26.4b[0]
        \dot    v5.4s, v2.16b, v26.4b[0]
        \dot    v20.4s, v17.16b, v26.4b[0]
        \dot    v21.4s, v18.16b, v26.4b[0]
        \dot    v4.4s, v2.16b, v26.4b[1]
        \dot    v5.4s, v3.16b, v26.4b[1]
        \dot    v20.4s, v18.16b, v26.4b[1]
        \dot    v21.4s, v19.16b, v26.4b[1]

        uzp1    v4.8h, v4.8h, v5.8h
        uzp1    v20.8h, v20.8h, v21.8h
.ifc \type, prep
.ifc \isa, neon_i8mm
        srshr   v4.8h, v4.8h, #2
        srshr   v20.8h, v20.8h, #2
.else
        sshr    v4.8h, v4.8h, #2
        sshr    v20.8h, v20.8h, #2
.endif
        subs    \h, \h, #2
        stp     q4, q20, [\dst], #32
.else   // put
        sqshrun v4.8b, v4.8h, #6
        sqshrun v20.8b, v20.8h, #6
        subs    \h, \h, #2
        str     d4, [\dst]
        str     d20, [\dst, \d_strd]
        add     \dst, \dst, \d_strd, lsl #1
.endif
        b.gt    8b
        ret

        .align JUMP_ALIGN
160:    // H - 16xN
        AARCH64_VALID_JUMP_TARGET
        ldr     d26, [\xmx]
.ifc \isa, neon_i8mm
        cmp     w9, #SHARP1
        b.eq    168f                // horizontal == SHARP1
        ldp     q29, q30, [x13, #(OFFSET_USMMLA)]
        ext     v0.8b, v26.8b, v26.8b, #7
        ins     v26.d[1], v0.d[0]

        .align LOOP_ALIGN
16:
        ldr     q16, [\src]
        ldur    q17, [\src, #8]     // avoid 2 register TBL for small cores
        add     \src, \src, \s_strd
.ifc \type, prep
        movi    v6.4s, #0
        movi    v7.4s, #0
        movi    v22.4s, #0
        movi    v23.4s, #0
.else
        mov     v6.16b, v27.16b
        mov     v7.16b, v27.16b
        mov     v22.16b, v27.16b
        mov     v23.16b, v27.16b
.endif
        tbl     v0.16b, {v16.16b}, v29.16b
        tbl     v1.16b, {v16.16b}, v30.16b
        tbl     v2.16b, {v17.16b}, v29.16b
        tbl     v3.16b, {v17.16b}, v30.16b

        usmmla  v6.4s, v0.16b, v26.16b
        usmmla  v7.4s, v1.16b, v26.16b
        usmmla  v22.4s, v2.16b, v26.16b
        usmmla  v23.4s, v3.16b, v26.16b

        uzp1    v6.8h, v6.8h, v7.8h
        uzp1    v22.8h, v22.8h, v23.8h
.ifc \type, prep
        srshr   v6.8h, v6.8h, #2
        srshr   v22.8h, v22.8h, #2
        subs    \h, \h, #1
        stp     q6, q22, [\dst], #32
.else   // put
        sqshrun  v6.8b, v6.8h, #6
        sqshrun2 v6.16b, v22.8h, #6
        subs    \h, \h, #1
        st1     {v6.16b}, [\dst], \d_strd
.endif
        b.gt    16b
        ret

        .align JUMP_ALIGN
168:
.endif  // neon_i8mm
        ldp     q29, q30, [x13, #16]

        .align LOOP_ALIGN
16:
        ldr     q16, [\src]
        ldur    q17, [\src, #12]    // avoid 2 register TBL for small cores
        add     \src, \src, \s_strd
.ifc \type\()_\isa, prep_neon_i8mm
        movi    v6.4s, #0
        movi    v7.4s, #0
        movi    v22.4s, #0
        movi    v23.4s, #0
.else
.ifc \isa, neon_dotprod
        sub     v16.16b, v16.16b, v24.16b
        sub     v17.16b, v17.16b, v24.16b
.endif
        mov     v6.16b, v27.16b
        mov     v7.16b, v27.16b
        mov     v22.16b, v27.16b
        mov     v23.16b, v27.16b
.endif
        tbl     v0.16b, {v16.16b}, v28.16b
        tbl     v1.16b, {v16.16b}, v29.16b
        tbl     v2.16b, {v16.16b}, v30.16b
        tbl     v3.16b, {v17.16b}, v28.16b
        tbl     v4.16b, {v17.16b}, v29.16b

        \dot    v6.4s, v0.16b, v26.4b[0]
        \dot    v7.4s, v1.16b, v26.4b[0]
        \dot    v22.4s, v2.16b, v26.4b[0]
        \dot    v23.4s, v3.16b, v26.4b[0]
        \dot    v6.4s, v1.16b, v26.4b[1]
        \dot    v7.4s, v2.16b, v26.4b[1]
        \dot    v22.4s, v3.16b, v26.4b[1]
        \dot    v23.4s, v4.16b, v26.4b[1]

        uzp1    v6.8h, v6.8h, v7.8h
        uzp1    v22.8h, v22.8h, v23.8h
.ifc \type, prep
.ifc \isa, neon_i8mm
        srshr   v6.8h, v6.8h, #2
        srshr   v22.8h, v22.8h, #2
.else
        sshr    v6.8h, v6.8h, #2
        sshr    v22.8h, v22.8h, #2
.endif
        subs    \h, \h, #1
        stp     q6, q22, [\dst], #32
.else   // put
        sqshrun  v6.8b, v6.8h, #6
        sqshrun2 v6.16b, v22.8h, #6
        subs    \h, \h, #1
        st1     {v6.16b}, [\dst], \d_strd
.endif
        b.gt    16b
        ret

        .align JUMP_ALIGN
320:    // H - 32xN+
640:
1280:
        AARCH64_VALID_JUMP_TARGET
        ldr     d26, [\xmx]
.ifc \type, put
        sub     \d_strd, \d_strd, \w, uxtw
.endif
        sub     \s_strd, \s_strd, \w, uxtw
        mov     w8, \w
.ifc \isa, neon_i8mm
        cmp     w9, #SHARP1
        b.eq    328f                // horizontal == SHARP1
        ldp     q29, q30, [x13, #(OFFSET_USMMLA)]
        ext     v0.8b, v26.8b, v26.8b, #7
        ins     v26.d[1], v0.d[0]

        .align LOOP_ALIGN
32:
        ldr     q16, [\src]
        ldur    q17, [\src, #8]     // avoid 2 register TBL for small cores
        add     \src, \src, #16
.ifc \type, prep
        movi    v6.4s, #0
        movi    v7.4s, #0
        movi    v22.4s, #0
        movi    v23.4s, #0
.else
        mov     v6.16b, v27.16b
        mov     v7.16b, v27.16b
        mov     v22.16b, v27.16b
        mov     v23.16b, v27.16b
.endif
        tbl     v0.16b, {v16.16b}, v29.16b
        tbl     v1.16b, {v16.16b}, v30.16b
        tbl     v2.16b, {v17.16b}, v29.16b
        tbl     v3.16b, {v17.16b}, v30.16b

        usmmla  v6.4s, v0.16b, v26.16b
        usmmla  v7.4s, v1.16b, v26.16b
        usmmla  v22.4s, v2.16b, v26.16b
        usmmla  v23.4s, v3.16b, v26.16b

        uzp1    v6.8h, v6.8h, v7.8h
        uzp1    v22.8h, v22.8h, v23.8h
.ifc \type, prep
        srshr   v6.8h, v6.8h, #2
        srshr   v22.8h, v22.8h, #2
        subs    w8, w8, #16
        stp     q6, q22, [\dst], #32
.else   // put
        sqshrun  v6.8b, v6.8h, #6
        sqshrun2 v6.16b, v22.8h, #6
        subs    w8, w8, #16
        str     q6, [\dst], #16
.endif
        b.gt    32b

        add     \src, \src, \s_strd
.ifc \type, put
        add     \dst, \dst, \d_strd
.endif
        mov     w8, \w
        subs    \h, \h, #1
        b.gt    32b
        ret

        .align JUMP_ALIGN
328:
.endif  // neon_i8mm
        ldp     q29, q30, [x13, #16]

        .align LOOP_ALIGN
32:
        ldr     q16, [\src]
        ldur    q17, [\src, #12]    // avoid 2 register TBL for small cores
        add     \src, \src, #16
.ifc \type\()_\isa, prep_neon_i8mm
        movi    v6.4s, #0
        movi    v7.4s, #0
        movi    v22.4s, #0
        movi    v23.4s, #0
.else
.ifc \isa, neon_dotprod
        sub     v16.16b, v16.16b, v24.16b
        sub     v17.16b, v17.16b, v24.16b
.endif
        mov     v6.16b, v27.16b
        mov     v7.16b, v27.16b
        mov     v22.16b, v27.16b
        mov     v23.16b, v27.16b
.endif
        tbl     v0.16b, {v16.16b}, v28.16b
        tbl     v1.16b, {v16.16b}, v29.16b
        tbl     v2.16b, {v16.16b}, v30.16b
        tbl     v3.16b, {v17.16b}, v28.16b
        tbl     v4.16b, {v17.16b}, v29.16b

        \dot    v6.4s, v0.16b, v26.4b[0]
        \dot    v7.4s, v1.16b, v26.4b[0]
        \dot    v22.4s, v2.16b, v26.4b[0]
        \dot    v23.4s, v3.16b, v26.4b[0]
        \dot    v6.4s, v1.16b, v26.4b[1]
        \dot    v7.4s, v2.16b, v26.4b[1]
        \dot    v22.4s, v3.16b, v26.4b[1]
        \dot    v23.4s, v4.16b, v26.4b[1]

        uzp1    v6.8h, v6.8h, v7.8h
        uzp1    v22.8h, v22.8h, v23.8h
.ifc \type, prep
.ifc \isa, neon_i8mm
        srshr   v6.8h, v6.8h, #2
        srshr   v22.8h, v22.8h, #2
.else
        sshr    v6.8h, v6.8h, #2
        sshr    v22.8h, v22.8h, #2
.endif
        subs    w8, w8, #16
        stp     q6, q22, [\dst], #32
.else   // put
        sqshrun  v6.8b, v6.8h, #6
        sqshrun2 v6.16b, v22.8h, #6
        subs    w8, w8, #16
        str     q6, [\dst], #16
.endif
        b.gt    32b

        add     \src, \src, \s_strd
.ifc \type, put
        add     \dst, \dst, \d_strd
.endif
        mov     w8, \w
        subs    \h, \h, #1
        b.gt    32b
        ret
endfunc

jumptable \type\()_8tap_h_\isa\()_tbl
        .word 1280b - \type\()_8tap_h_\isa\()_tbl
        .word 640b  - \type\()_8tap_h_\isa\()_tbl
        .word 320b  - \type\()_8tap_h_\isa\()_tbl
        .word 160b  - \type\()_8tap_h_\isa\()_tbl
        .word 80b   - \type\()_8tap_h_\isa\()_tbl
        .word 40b   - \type\()_8tap_h_\isa\()_tbl
.ifc \type, put
        .word 20b   - \type\()_8tap_h_\isa\()_tbl
.endif
endjumptable
.endm

// dst(x0), d_strd(x7), src(x1), s_strd(x2), w(w3), h(w4), mx(w5), my(w6)
// xmx(x5), xmy(x6), ldst(x5), lsrc(x6), wd_strd(w7)
filter_8tap_fn prep, sdot, neon_dotprod, x0, x7, x1, x2, w3, w4, w5, w6, x5, x6, x5, x6, w7

// dst(x0) d_strd(x1) src(x2) s_strd(x3) w(w4) h(w5) mx(w6) my(w7)
// xmx(x6), xmy(x7), ldst(x6), lsrc(x7), wd_strd(w1)
filter_8tap_fn put, sdot, neon_dotprod, x0, x1, x2, x3, w4, w5, w6, w7, x6, x7, x6, x7, w1

#if HAVE_I8MM
ENABLE_I8MM

// dst(x0), d_strd(x7), src(x1), s_strd(x2), w(w3), h(w4), mx(w5), my(w6)
// xmx(x5), xmy(x6), ldst(x5), lsrc(x6), wd_strd(w7)
filter_8tap_fn prep, usdot, neon_i8mm, x0, x7, x1, x2, w3, w4, w5, w6, x5, x6, x5, x6, w7

// dst(x0) d_strd(x1) src(x2) s_strd(x3) w(w4) h(w5) mx(w6) my(w7)
// xmx(x6), xmy(x7), ldst(x6), lsrc(x7), wd_strd(w1)
filter_8tap_fn put, usdot, neon_i8mm, x0, x1, x2, x3, w4, w5, w6, w7, x6, x7, x6, x7, w1

DISABLE_I8MM
#endif  // HAVE_I8MM

DISABLE_DOTPROD
#endif  // HAVE_DOTPROD