; Copyright © 2020, VideoLAN and dav1d authors ; Copyright © 2020, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. %include "config.asm" %include "ext/x86/x86inc.asm" %if ARCH_X86_64 SECTION_RODATA 64 spel_h_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 db 32, 33, 34, 35, 34, 35, 36, 37, 36, 37, 38, 39, 38, 39, 40, 41 spel_h_shufC: db 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 14, 15, 16, 17 db 40, 41, 42, 43, 42, 43, 44, 45, 44, 45, 46, 47, 46, 47, 48, 49 db 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23, 24, 25 db 48, 49, 50, 51, 50, 51, 52, 53, 52, 53, 54, 55, 54, 55, 56, 57 spel_h_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 db 36, 37, 38, 39, 38, 39, 40, 41, 40, 41, 42, 43, 42, 43, 44, 45 spel_h_shufD: db 12, 13, 14, 15, 14, 15, 16, 17, 16, 17, 18, 19, 18, 19, 20, 21 db 44, 45, 46, 47, 46, 47, 48, 49, 48, 49, 50, 51, 50, 51, 52, 53 db 20, 21, 22, 23, 22, 23, 24, 25, 24, 25, 26, 27, 26, 27, 28, 29 db 52, 53, 54, 55, 54, 55, 56, 57, 56, 57, 58, 59, 58, 59, 60, 61 spel_v_shuf8: db 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23 db 16, 17, 32, 33, 18, 19, 34, 35, 20, 21, 36, 37, 22, 23, 38, 39 db 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31 db 24, 25, 40, 41, 26, 27, 42, 43, 28, 29, 44, 45, 30, 31, 46, 47 spel_v_shuf16: db 0, 1, 32, 33, 2, 3, 34, 35, 4, 5, 36, 37, 6, 7, 38, 39 db 8, 9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47 db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55 db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63 prep_endA: db 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30 db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62 db 65, 66, 69, 70, 73, 74, 77, 78, 81, 82, 85, 86, 89, 90, 93, 94 db 97, 98,101,102,105,106,109,110,113,114,117,118,121,122,125,126 prep_endB: db 1, 2, 5, 6, 9, 10, 13, 14, 33, 34, 37, 38, 41, 42, 45, 46 db 17, 18, 21, 22, 25, 26, 29, 30, 49, 50, 53, 54, 57, 58, 61, 62 db 65, 66, 69, 70, 73, 74, 77, 78, 97, 98,101,102,105,106,109,110 db 81, 82, 85, 86, 89, 90, 93, 94,113,114,117,118,121,122,125,126 prep_endC: db 1, 2, 5, 6, 9, 10, 13, 14, 65, 66, 69, 70, 73, 74, 77, 78 db 17, 18, 21, 22, 25, 26, 29, 30, 81, 82, 85, 86, 89, 90, 93, 94 db 33, 34, 37, 38, 41, 42, 45, 46, 97, 
98,101,102,105,106,109,110 db 49, 50, 53, 54, 57, 58, 61, 62,113,114,117,118,121,122,125,126 spel_shuf4a: db 1, 2, 17, 18, 5, 6, 21, 22, 9, 10, 25, 26, 13, 14, 29, 30 db 17, 18, 33, 34, 21, 22, 37, 38, 25, 26, 41, 42, 29, 30, 45, 46 db 33, 34, 49, 50, 37, 38, 53, 54, 41, 42, 57, 58, 45, 46, 61, 62 db 49, 50, 65, 66, 53, 54, 69, 70, 57, 58, 73, 74, 61, 62, 77, 78 spel_shuf4b: db 50, 51, 65, 66, 54, 55, 69, 70, 58, 59, 73, 74, 62, 63, 77, 78 db 65, 66, 81, 82, 69, 70, 85, 86, 73, 74, 89, 90, 77, 78, 93, 94 db 81, 82, 97, 98, 85, 86,101,102, 89, 90,105,106, 93, 94,109,110 db 97, 98,113,114,101,102,117,118,105,106,121,122,109,110,125,126 spel_shuf8a: db 1, 2, 17, 18, 5, 6, 21, 22, 9, 10, 25, 26, 13, 14, 29, 30 db 17, 18, 65, 66, 21, 22, 69, 70, 25, 26, 73, 74, 29, 30, 77, 78 db 33, 34, 49, 50, 37, 38, 53, 54, 41, 42, 57, 58, 45, 46, 61, 62 db 49, 50, 97, 98, 53, 54,101,102, 57, 58,105,106, 61, 62,109,110 spel_shuf8b: db 18, 19, 65, 66, 22, 23, 69, 70, 26, 27, 73, 74, 30, 31, 77, 78 db 65, 66, 81, 82, 69, 70, 85, 86, 73, 74, 89, 90, 77, 78, 93, 94 db 50, 51, 97, 98, 54, 55,101,102, 58, 59,105,106, 62, 63,109,110 db 97, 98,113,114,101,102,117,118,105,106,121,122,109,110,125,126 spel_shuf16: db 1, 2, 33, 34, 5, 6, 37, 38, 9, 10, 41, 42, 13, 14, 45, 46 db 17, 18, 49, 50, 21, 22, 53, 54, 25, 26, 57, 58, 29, 30, 61, 62 db 65, 66, 97, 98, 69, 70,101,102, 73, 74,105,106, 77, 78,109,110 db 81, 82,113,114, 85, 86,117,118, 89, 90,121,122, 93, 94,125,126 spel_shuf32: db 1, 2, 65, 66, 5, 6, 69, 70, 9, 10, 73, 74, 13, 14, 77, 78 db 17, 18, 81, 82, 21, 22, 85, 86, 25, 26, 89, 90, 29, 30, 93, 94 db 33, 34, 97, 98, 37, 38,101,102, 41, 42,105,106, 45, 46,109,110 db 49, 50,113,114, 53, 54,117,118, 57, 58,121,122, 61, 62,125,126 spel_h_shuf2b: db 1, 2, 17, 18, 5, 6, 21, 22, 17, 18, 33, 34, 21, 22, 37, 38 db 33, 34, 49, 50, 37, 38, 53, 54, 49, 50, 9, 10, 53, 54, 13, 14 db 9, 10, 25, 26, 13, 14, 29, 30, 25, 26, 41, 42, 29, 30, 45, 46 spel_shuf2: db 10, 11, 17, 18, 14, 15, 21, 22, 17, 18, 25, 26, 21, 22, 29, 30 spel_h_shuf2a: db 0, 1, 2, 3, 2, 3, 4, 5, 16, 17, 18, 19, 18, 19, 20, 21 db 4, 5, 6, 7, 6, 7, 8, 9, 20, 21, 22, 23, 22, 23, 24, 25 w_mask_end42x: db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61 db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125 w_mask_end444: db 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62 db 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94 db 96, 98,100,102,104,106,108,110,112,114,116,118,120,122,124,126 w_mask_shuf4: db 0, 2, 8, 10, 4, 6, 12, 14, 16, 18, 24, 26, 20, 22, 28, 30 db 32, 34, 40, 42, 36, 38, 44, 46, 48, 50, 56, 58, 52, 54, 60, 62 db 64, 66, 72, 74, 68, 70, 76, 78, 80, 82, 88, 90, 84, 86, 92, 94 db 96, 98,104,106,100,102,108,110,112,114,120,122,116,118,124,126 w_mask_shuf8: db 0, 2, 16, 18, 4, 6, 20, 22, 8, 10, 24, 26, 12, 14, 28, 30 db 32, 34, 48, 50, 36, 38, 52, 54, 40, 42, 56, 58, 44, 46, 60, 62 db 64, 66, 80, 82, 68, 70, 84, 86, 72, 74, 88, 90, 76, 78, 92, 94 db 96, 98,112,114,100,102,116,118,104,106,120,122,108,110,124,126 w_mask_shuf16: db 0, 2, 32, 34, 4, 6, 36, 38, 8, 10, 40, 42, 12, 14, 44, 46 db 16, 18, 48, 50, 20, 22, 52, 54, 24, 26, 56, 58, 28, 30, 60, 62 db 64, 66, 96, 98, 68, 70,100,102, 72, 74,104,106, 76, 78,108,110 db 80, 82,112,114, 84, 86,116,118, 88, 90,120,122, 92, 94,124,126 warp8x8_permA: db 0, 1, 2, 3, 32, 33, 34, 35, 2, 3, 4, 5, 34, 35, 36, 37 db 4, 5, 6, 7, 36, 37, 38, 39, 6, 7, 8, 9, 38, 39, 40, 41 db 8, 9, 10, 11, 40, 41, 42, 43, 
10, 11, 12, 13, 42, 43, 44, 45 db 12, 13, 14, 15, 44, 45, 46, 47, 14, 15, 16, 17, 46, 47, 48, 49 warp8x8_permB: db 12, 13, 14, 15, 44, 45, 46, 47, 14, 15, 16, 17, 46, 47, 48, 49 db 16, 17, 18, 19, 48, 49, 50, 51, 18, 19, 20, 21, 50, 51, 52, 53 db 20, 21, 22, 23, 52, 53, 54, 55, 22, 23, 24, 25, 54, 55, 56, 57 db 24, 25, 26, 27, 56, 57, 58, 59, 26, 27, 28, 29, 58, 59, 60, 61 warp8x8_end: db 0, 1, 4, 5, 16, 17, 20, 21, 32, 33, 36, 37, 48, 49, 52, 53 db 2, 3, 6, 7, 18, 19, 22, 23, 34, 35, 38, 39, 50, 51, 54, 55 db 8, 9, 12, 13, 24, 25, 28, 29, 40, 41, 44, 45, 56, 57, 60, 61 db 10, 11, 14, 15, 26, 27, 30, 31, 42, 43, 46, 47, 58, 59, 62, 63 deint_q_shuf: ;dq 0, 2, 4, 6, 1, 3, 5, 7 pd_0to7: dd 0, 1, 2, 3, 4, 5, 6, 7 dd 1 pw_2048: times 2 dw 2048 dd 3 pw_8192: times 2 dw 8192 avg_shift: dw 5, 5, 3, 3 pw_27615: times 2 dw 27615 pw_32766: times 2 dw 32766 warp8x8_permC: db -1, 0, -1, 1, -1, 8, -1, 9, -1, 4, -1, 5, -1, 12, -1, 13 warp8x8_permD: db -1, 2, -1, 3, -1, 10, -1, 11, -1, 6, -1, 7, -1, 14, -1, 15 warp_shift_h: db 11, 19, 11, 19, 43, 51, 43, 51, 13, 21, 13, 21, 45, 53, 45, 53 blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 resize_permA: dd 0, 4, 8, 12, 1, 5, 9, 13, 16, 20, 24, 28, 17, 21, 25, 29 resize_permB: dd 2, 6, 10, 14, 3, 7, 11, 15, 18, 22, 26, 30, 19, 23, 27, 31 resize_permC: dq 0, 1, 4, 5, 8, 9, 12, 13 resize_permD: dq 2, 3, 6, 7, 10, 11, 14, 15 resize_permE: dq 0, 2, 4, 6 resize_shufA: db -1, 0, -1, 1, -1, 4, -1, 5, -1, 8, -1, 9, -1, 12, -1, 13 resize_shufB: db -1, 2, -1, 3, -1, 6, -1, 7, -1, 10, -1, 11, -1, 14, -1, 15 rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 resize_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 db 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15 prep_hv_shift: dq 6, 4 put_bilin_h_rnd: dw 8, 8, 10, 10 prep_mul: dw 16, 16, 4, 4 put_8tap_h_rnd: dd 34, 40 prep_8tap_rnd: dd 128 - (8192 << 8) warp_8x8_rnd_h: dd 512, 2048 warp_8x8_rnd_v: dd 262144, 65536 warp_8x8t_rnd_v: dd 16384 - (8192 << 15) avg_round: dw -16400, -16400, -16388, -16388 w_avg_round: dd 128 + (8192 << 4), 32 + (8192 << 4) mask_round: dd 512 + (8192 << 6), 128 + (8192 << 6) w_mask_round: dd 128, 64 bidir_shift: dw 6, 6, 4, 4 pb_64: times 4 db 64 pw_m512: times 2 dw -512 pw_2: times 2 dw 2 pw_64: times 2 dw 64 pd_32: dd 32 pd_63: dd 63 pd_128: dd 128 pd_640: dd 640 pd_2176: dd 2176 pd_16384: dd 16384 pd_0_4: dd 0, 4 %define pw_16 prep_mul %define pd_512 warp_8x8_rnd_h %macro BASE_JMP_TABLE 3-* %xdefine %1_%2_table (%%table - %3) %xdefine %%base %1_%2 %%table: %rep %0 - 2 dw %%base %+ _w%3 - %%base %rotate 1 %endrep %endmacro %macro HV_JMP_TABLE 5-* %xdefine %%prefix mangle(private_prefix %+ _%1_%2_16bpc_%3) %xdefine %%base %1_%3 %assign %%types %4 %if %%types & 1 %xdefine %1_%2_h_%3_table (%%h - %5) %%h: %rep %0 - 4 dw %%prefix %+ .h_w%5 - %%base %rotate 1 %endrep %rotate 4 %endif %if %%types & 2 %xdefine %1_%2_v_%3_table (%%v - %5) %%v: %rep %0 - 4 dw %%prefix %+ .v_w%5 - %%base %rotate 1 %endrep %rotate 4 %endif %if %%types & 4 %xdefine %1_%2_hv_%3_table (%%hv - %5) %%hv: %rep %0 - 4 dw %%prefix %+ .hv_w%5 - %%base %rotate 1 %endrep %endif %endmacro %macro BIDIR_JMP_TABLE 2-* %xdefine %1_%2_table (%%table - 2*%3) %xdefine %%base %1_%2_table %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2) %%table: %rep %0 - 2 dd %%prefix %+ .w%3 - %%base %rotate 1 %endrep %endmacro %xdefine put_avx512icl mangle(private_prefix %+ _put_bilin_16bpc_avx512icl.put) %xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_16bpc_avx512icl.prep) 
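; The jump-table macros above emit one entry per supported block width:
; BASE_JMP_TABLE and HV_JMP_TABLE store word-sized (dw) offsets of the
; .*_w<N> entry points relative to the put/prep base label, while
; BIDIR_JMP_TABLE stores dword (dd) offsets. The dispatch code further down
; recovers the target with a tzcnt of the width, a table load and an add of
; the base before jumping.
;
; Rough scalar sketch (C, illustrative only; put_offsets and put_base are
; hypothetical names, not dav1d symbols) of how such a table is consumed:
;
;   #include <stdint.h>
;   typedef void (*put_fn)(void);
;   extern const uint16_t put_offsets[]; /* one dw entry per width 2..128 */
;   extern const char     put_base[];    /* label the offsets relate to   */
;   static put_fn select_put(int w)      /* w is a power of two           */
;   {
;       return (put_fn)(put_base + put_offsets[__builtin_ctz(w) - 1]);
;   }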
BIDIR_JMP_TABLE avg, avx512icl, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_avg, avx512icl, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE mask, avx512icl, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_420, avx512icl, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_422, avx512icl, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_444, avx512icl, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE blend, avx512icl, 4, 8, 16, 32 BIDIR_JMP_TABLE blend_v, avx512icl, 2, 4, 8, 16, 32 BIDIR_JMP_TABLE blend_h, avx512icl, 2, 4, 8, 16, 32, 64, 128 BASE_JMP_TABLE put, avx512icl, 2, 4, 8, 16, 32, 64, 128 BASE_JMP_TABLE prep, avx512icl, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE put, bilin, avx512icl, 7, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE prep, bilin, avx512icl, 7, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE put, 6tap, avx512icl, 2, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE put, 8tap, avx512icl, 2, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE prep, 6tap, avx512icl, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE prep, 8tap, avx512icl, 2, 4, 8, 16, 32, 64, 128 %define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX cextern mc_subpel_filters %define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8) cextern mc_warp_filter cextern obmc_masks_avx2 cextern resize_filter SECTION .text %if WIN64 DECLARE_REG_TMP 4 %else DECLARE_REG_TMP 8 %endif INIT_ZMM avx512icl cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w, h, mxy mov mxyd, r6m ; mx lea r7, [put_avx512icl] tzcnt t0d, wm movifnidn hd, hm test mxyd, mxyd jnz .h mov mxyd, r7m ; my test mxyd, mxyd jnz .v .put: movzx t0d, word [r7+t0*2+table_offset(put,)] add t0, r7 jmp t0 .put_w2: mov r6d, [srcq+ssq*0] mov r7d, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mov [dstq+dsq*0], r6d mov [dstq+dsq*1], r7d lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w2 RET .put_w4: mov r6, [srcq+ssq*0] mov r7, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mov [dstq+dsq*0], r6 mov [dstq+dsq*1], r7 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w4 RET .put_w8: movu xmm0, [srcq+ssq*0] movu xmm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0], xmm0 mova [dstq+dsq*1], xmm1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w8 RET .put_w16: movu ym0, [srcq+ssq*0] movu ym1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0], ym0 mova [dstq+dsq*1], ym1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w16 RET .put_w32: movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0], m0 mova [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w32 RET .put_w64: movu m0, [srcq+ssq*0+64*0] movu m1, [srcq+ssq*0+64*1] movu m2, [srcq+ssq*1+64*0] movu m3, [srcq+ssq*1+64*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0+64*0], m0 mova [dstq+dsq*0+64*1], m1 mova [dstq+dsq*1+64*0], m2 mova [dstq+dsq*1+64*1], m3 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w64 RET .put_w128: movu m0, [srcq+64*0] movu m1, [srcq+64*1] movu m2, [srcq+64*2] movu m3, [srcq+64*3] add srcq, ssq mova [dstq+64*0], m0 mova [dstq+64*1], m1 mova [dstq+64*2], m2 mova [dstq+64*3], m3 add dstq, dsq dec hd jg .put_w128 RET .h: vpbroadcastw m5, mxyd mov mxyd, r7m ; my vpbroadcastd m4, [pw_16] psubw m4, m5 test mxyd, mxyd jnz .hv ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v movzx t0d, word [r7+t0*2+table_offset(put, _bilin_h)] mov r6d, r8m ; bitdepth_max add t0, r7 shr r6d, 11 vpbroadcastd m6, [r7-put_avx512icl+put_bilin_h_rnd+r6*4] jmp t0 .h_w2: movq xmm1, [srcq+ssq*0] movhps xmm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pmullw xmm0, xmm1, xm4 psrlq xmm1, 16 pmullw xmm1, xm5 paddw xmm0, xm6 paddw xmm0, xmm1 psrlw xmm0, 4 movd [dstq+dsq*0], xmm0 pextrd 
[dstq+dsq*1], xmm0, 2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w2 RET .h_w4: movq xmm0, [srcq+ssq*0+0] movhps xmm0, [srcq+ssq*1+0] movq xmm1, [srcq+ssq*0+2] movhps xmm1, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] pmullw xmm0, xm4 pmullw xmm1, xm5 paddw xmm0, xm6 paddw xmm0, xmm1 psrlw xmm0, 4 movq [dstq+dsq*0], xmm0 movhps [dstq+dsq*1], xmm0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w4 RET .h_w8: movu xm0, [srcq+ssq*0+0] vinserti32x4 ym0, [srcq+ssq*1+0], 1 movu xm1, [srcq+ssq*0+2] vinserti32x4 ym1, [srcq+ssq*1+2], 1 lea srcq, [srcq+ssq*2] pmullw ym0, ym4 pmullw ym1, ym5 paddw ym0, ym6 paddw ym0, ym1 psrlw ym0, 4 mova [dstq+dsq*0], xm0 vextracti32x4 [dstq+dsq*1], ym0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w8 RET .h_w16: movu ym0, [srcq+ssq*0+0] vinserti32x8 m0, [srcq+ssq*1+0], 1 movu ym1, [srcq+ssq*0+2] vinserti32x8 m1, [srcq+ssq*1+2], 1 lea srcq, [srcq+ssq*2] pmullw m0, m4 pmullw m1, m5 paddw m0, m6 paddw m0, m1 psrlw m0, 4 mova [dstq+dsq*0], ym0 vextracti32x8 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w16 RET .h_w32: pmullw m0, m4, [srcq+ssq*0+0] pmullw m2, m5, [srcq+ssq*0+2] pmullw m1, m4, [srcq+ssq*1+0] pmullw m3, m5, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] paddw m0, m6 paddw m1, m6 paddw m0, m2 paddw m1, m3 psrlw m0, 4 psrlw m1, 4 mova [dstq+dsq*0], m0 mova [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w32 RET .h_w64: pmullw m0, m4, [srcq+64*0+0] pmullw m2, m5, [srcq+64*0+2] pmullw m1, m4, [srcq+64*1+0] pmullw m3, m5, [srcq+64*1+2] add srcq, ssq paddw m0, m6 paddw m1, m6 paddw m0, m2 paddw m1, m3 psrlw m0, 4 psrlw m1, 4 mova [dstq+64*0], m0 mova [dstq+64*1], m1 add dstq, dsq dec hd jg .h_w64 RET .h_w128: pmullw m0, m4, [srcq+64*0+0] pmullw m7, m5, [srcq+64*0+2] pmullw m1, m4, [srcq+64*1+0] pmullw m8, m5, [srcq+64*1+2] pmullw m2, m4, [srcq+64*2+0] pmullw m9, m5, [srcq+64*2+2] pmullw m3, m4, [srcq+64*3+0] pmullw m10, m5, [srcq+64*3+2] add srcq, ssq REPX {paddw x, m6}, m0, m1, m2, m3 paddw m0, m7 paddw m1, m8 paddw m2, m9 paddw m3, m10 REPX {psrlw x, 4}, m0, m1, m2, m3 mova [dstq+64*0], m0 mova [dstq+64*1], m1 mova [dstq+64*2], m2 mova [dstq+64*3], m3 add dstq, dsq dec hd jg .h_w128 RET .v: movzx t0d, word [r7+t0*2+table_offset(put, _bilin_v)] shl mxyd, 11 vpbroadcastw m8, mxyd add t0, r7 jmp t0 .v_w2: movd xmm0, [srcq+ssq*0] .v_w2_loop: movd xmm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpckldq xmm2, xmm0, xmm1 movd xmm0, [srcq+ssq*0] punpckldq xmm1, xmm0 psubw xmm1, xmm2 pmulhrsw xmm1, xm8 paddw xmm1, xmm2 movd [dstq+dsq*0], xmm1 pextrd [dstq+dsq*1], xmm1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w2_loop RET .v_w4: movq xmm0, [srcq+ssq*0] .v_w4_loop: movq xmm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpcklqdq xmm2, xmm0, xmm1 movq xmm0, [srcq+ssq*0] punpcklqdq xmm1, xmm0 psubw xmm1, xmm2 pmulhrsw xmm1, xm8 paddw xmm1, xmm2 movq [dstq+dsq*0], xmm1 movhps [dstq+dsq*1], xmm1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w4_loop RET .v_w8: movu xmm0, [srcq+ssq*0] .v_w8_loop: vbroadcasti128 ymm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vpblendd ymm2, ymm0, ymm1, 0xf0 vbroadcasti128 ymm0, [srcq+ssq*0] vpblendd ymm1, ymm0, 0xf0 psubw ymm1, ymm2 pmulhrsw ymm1, ym8 paddw ymm1, ymm2 mova [dstq+dsq*0], xmm1 vextracti128 [dstq+dsq*1], ymm1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w8_loop vzeroupper RET .v_w16: movu ym0, [srcq+ssq*0] .v_w16_loop: movu ym3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] psubw ym1, ym3, ym0 pmulhrsw ym1, ym8 paddw ym1, ym0 movu ym0, [srcq+ssq*0] psubw ym2, ym0, ym3 pmulhrsw ym2, ym8 paddw ym2, ym3 mova [dstq+dsq*0], ym1 mova [dstq+dsq*1], ym2 lea dstq, [dstq+dsq*2] sub hd, 2 
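; (each .v_w* iteration emits two output rows, hence the height counter is
; decremented by 2 before looping)
; The vertical-only blend keeps the fraction preloaded as my << 11 so that
; the psubw/pmulhrsw/paddw sequence yields a + ((b - a) * my + 8) >> 4,
; the same value as ((16 - my) * a + my * b + 8) >> 4.
;
; Minimal scalar sketch (C, illustrative only) of that per-pixel blend,
; assuming my is the 4-bit subpel fraction and pixels fit in 16 bits:
;
;   #include <stdint.h>
;   static inline uint16_t blend_v(uint16_t a, uint16_t b, int my)
;   {
;       return (uint16_t)(a + (((b - a) * my + 8) >> 4));
;   }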
jg .v_w16_loop RET .v_w32: movu m0, [srcq+ssq*0] .v_w32_loop: movu m3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] psubw m1, m3, m0 pmulhrsw m1, m8 paddw m1, m0 movu m0, [srcq+ssq*0] psubw m2, m0, m3 pmulhrsw m2, m8 paddw m2, m3 mova [dstq+dsq*0], m1 mova [dstq+dsq*1], m2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w32_loop RET .v_w64: movu m0, [srcq+ssq*0+64*0] movu m1, [srcq+ssq*0+64*1] .v_w64_loop: movu m2, [srcq+ssq*1+64*0] movu m3, [srcq+ssq*1+64*1] lea srcq, [srcq+ssq*2] psubw m4, m2, m0 pmulhrsw m4, m8 paddw m4, m0 movu m0, [srcq+ssq*0+64*0] psubw m5, m3, m1 pmulhrsw m5, m8 paddw m5, m1 movu m1, [srcq+ssq*0+64*1] psubw m6, m0, m2 pmulhrsw m6, m8 psubw m7, m1, m3 pmulhrsw m7, m8 mova [dstq+dsq*0+64*0], m4 mova [dstq+dsq*0+64*1], m5 paddw m6, m2 paddw m7, m3 mova [dstq+dsq*1+64*0], m6 mova [dstq+dsq*1+64*1], m7 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w64_loop RET .v_w128: movu m0, [srcq+ssq*0+64*0] movu m1, [srcq+ssq*0+64*1] movu m2, [srcq+ssq*0+64*2] movu m3, [srcq+ssq*0+64*3] .v_w128_loop: movu m4, [srcq+ssq*1+64*0] movu m5, [srcq+ssq*1+64*1] movu m6, [srcq+ssq*1+64*2] movu m7, [srcq+ssq*1+64*3] lea srcq, [srcq+ssq*2] psubw m9, m4, m0 pmulhrsw m9, m8 paddw m9, m0 movu m0, [srcq+ssq*0+64*0] psubw m10, m5, m1 pmulhrsw m10, m8 paddw m10, m1 movu m1, [srcq+ssq*0+64*1] psubw m11, m6, m2 pmulhrsw m11, m8 paddw m11, m2 movu m2, [srcq+ssq*0+64*2] psubw m12, m7, m3 pmulhrsw m12, m8 paddw m12, m3 movu m3, [srcq+ssq*0+64*3] mova [dstq+dsq*0+64*0], m9 psubw m9, m0, m4 pmulhrsw m9, m8 mova [dstq+dsq*0+64*1], m10 psubw m10, m1, m5 pmulhrsw m10, m8 mova [dstq+dsq*0+64*2], m11 psubw m11, m2, m6 pmulhrsw m11, m8 mova [dstq+dsq*0+64*3], m12 psubw m12, m3, m7 pmulhrsw m12, m8 paddw m9, m4 paddw m10, m5 mova [dstq+dsq*1+64*0], m9 mova [dstq+dsq*1+64*1], m10 paddw m11, m6 paddw m12, m7 mova [dstq+dsq*1+64*2], m11 mova [dstq+dsq*1+64*3], m12 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w128_loop RET .hv: movzx t0d, word [r7+t0*2+table_offset(put, _bilin_hv)] shl mxyd, 11 vpbroadcastd m6, [pw_2] vpbroadcastw m7, mxyd vpbroadcastd m8, [pw_8192] add t0, r7 test dword r8m, 0x800 jnz .hv_12bpc psllw m4, 2 psllw m5, 2 vpbroadcastd m8, [pw_2048] .hv_12bpc: jmp t0 .hv_w2: vpbroadcastq xmm1, [srcq+ssq*0] pmullw xmm0, xmm1, xm4 psrlq xmm1, 16 pmullw xmm1, xm5 paddw xmm0, xm6 paddw xmm0, xmm1 psrlw xmm0, 2 .hv_w2_loop: movq xmm2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movhps xmm2, [srcq+ssq*0] pmullw xmm1, xmm2, xm4 psrlq xmm2, 16 pmullw xmm2, xm5 paddw xmm1, xm6 paddw xmm1, xmm2 psrlw xmm1, 2 ; 1 _ 2 _ shufpd xmm2, xmm0, xmm1, 0x01 ; 0 _ 1 _ mova xmm0, xmm1 psubw xmm1, xmm2 paddw xmm1, xmm1 pmulhw xmm1, xm7 paddw xmm1, xmm2 pmulhrsw xmm1, xm8 movd [dstq+dsq*0], xmm1 pextrd [dstq+dsq*1], xmm1, 2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w2_loop RET .hv_w4: pmullw xmm0, xm4, [srcq+ssq*0-8] pmullw xmm1, xm5, [srcq+ssq*0-6] paddw xmm0, xm6 paddw xmm0, xmm1 psrlw xmm0, 2 .hv_w4_loop: movq xmm1, [srcq+ssq*1+0] movq xmm2, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] movhps xmm1, [srcq+ssq*0+0] movhps xmm2, [srcq+ssq*0+2] pmullw xmm1, xm4 pmullw xmm2, xm5 paddw xmm1, xm6 paddw xmm1, xmm2 psrlw xmm1, 2 ; 1 2 shufpd xmm2, xmm0, xmm1, 0x01 ; 0 1 mova xmm0, xmm1 psubw xmm1, xmm2 paddw xmm1, xmm1 pmulhw xmm1, xm7 paddw xmm1, xmm2 pmulhrsw xmm1, xm8 movq [dstq+dsq*0], xmm1 movhps [dstq+dsq*1], xmm1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w4_loop RET .hv_w8: pmullw xmm0, xm4, [srcq+ssq*0+0] pmullw xmm1, xm5, [srcq+ssq*0+2] paddw xmm0, xm6 paddw xmm0, xmm1 psrlw xmm0, 2 vinserti32x4 ym0, xmm0, 1 .hv_w8_loop: movu xm1, [srcq+ssq*1+0] movu xm2, [srcq+ssq*1+2] lea 
srcq, [srcq+ssq*2] vinserti32x4 ym1, [srcq+ssq*0+0], 1 vinserti32x4 ym2, [srcq+ssq*0+2], 1 pmullw ym1, ym4 pmullw ym2, ym5 paddw ym1, ym6 paddw ym1, ym2 psrlw ym1, 2 ; 1 2 vshufi32x4 ym2, ym0, ym1, 0x01 ; 0 1 mova ym0, ym1 psubw ym1, ym2 paddw ym1, ym1 pmulhw ym1, ym7 paddw ym1, ym2 pmulhrsw ym1, ym8 mova [dstq+dsq*0], xm1 vextracti32x4 [dstq+dsq*1], ym1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w8_loop RET .hv_w16: pmullw ym0, ym4, [srcq+ssq*0+0] pmullw ym1, ym5, [srcq+ssq*0+2] paddw ym0, ym6 paddw ym0, ym1 psrlw ym0, 2 vinserti32x8 m0, ym0, 1 .hv_w16_loop: movu ym1, [srcq+ssq*1+0] movu ym2, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] vinserti32x8 m1, [srcq+ssq*0+0], 1 vinserti32x8 m2, [srcq+ssq*0+2], 1 pmullw m1, m4 pmullw m2, m5 paddw m1, m6 paddw m1, m2 psrlw m1, 2 ; 1 2 vshufi32x4 m2, m0, m1, q1032 ; 0 1 mova m0, m1 psubw m1, m2 paddw m1, m1 pmulhw m1, m7 paddw m1, m2 pmulhrsw m1, m8 mova [dstq+dsq*0], ym1 vextracti32x8 [dstq+dsq*1], m1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w16_loop RET .hv_w32: .hv_w64: .hv_w128: movifnidn wd, wm lea r6d, [hq+wq*8-256] mov r4, srcq mov r7, dstq .hv_w32_loop0: pmullw m0, m4, [srcq+ssq*0+0] pmullw m1, m5, [srcq+ssq*0+2] paddw m0, m6 paddw m0, m1 psrlw m0, 2 .hv_w32_loop: pmullw m3, m4, [srcq+ssq*1+0] pmullw m1, m5, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] paddw m3, m6 paddw m3, m1 psrlw m3, 2 psubw m1, m3, m0 paddw m1, m1 pmulhw m1, m7 paddw m1, m0 pmullw m0, m4, [srcq+ssq*0+0] pmullw m2, m5, [srcq+ssq*0+2] paddw m0, m6 paddw m0, m2 psrlw m0, 2 psubw m2, m0, m3 paddw m2, m2 pmulhw m2, m7 paddw m2, m3 pmulhrsw m1, m8 pmulhrsw m2, m8 mova [dstq+dsq*0], m1 mova [dstq+dsq*1], m2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w32_loop add r4, 64 add r7, 64 movzx hd, r6b mov srcq, r4 mov dstq, r7 sub r6d, 1<<8 jg .hv_w32_loop0 RET cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride, w, h, mxy, stride3 movifnidn mxyd, r5m ; mx lea r6, [prep_avx512icl] tzcnt wd, wm movifnidn hd, hm test mxyd, mxyd jnz .h mov mxyd, r6m ; my test mxyd, mxyd jnz .v .prep: movzx wd, word [r6+wq*2+table_offset(prep,)] mov r5d, r7m ; bitdepth_max vpbroadcastd m5, [r6-prep_avx512icl+pw_8192] add wq, r6 shr r5d, 11 vpbroadcastd m4, [r6-prep_avx512icl+prep_mul+r5*4] lea stride3q, [strideq*3] jmp wq .prep_w4: mov r3d, 0x0c kmovb k1, r3d .prep_w4_loop: movq xm0, [srcq+strideq*0] movhps xm0, [srcq+strideq*1] vpbroadcastq ym1, [srcq+strideq*2] vpunpcklqdq ym0{k1}, ym1, [srcq+stride3q] {1to4} lea srcq, [srcq+strideq*4] pmullw ym0, ym4 psubw ym0, ym5 mova [tmpq], ym0 add tmpq, 32 sub hd, 4 jg .prep_w4_loop RET .prep_w8: movu xm0, [srcq+strideq*0] vinserti32x4 ym0, [srcq+strideq*1], 1 vinserti32x4 m0, [srcq+strideq*2], 2 vinserti32x4 m0, [srcq+stride3q ], 3 lea srcq, [srcq+strideq*4] pmullw m0, m4 psubw m0, m5 mova [tmpq], m0 add tmpq, 64 sub hd, 4 jg .prep_w8 RET .prep_w16: movu ym0, [srcq+strideq*0] vinserti32x8 m0, [srcq+strideq*1], 1 movu ym1, [srcq+strideq*2] vinserti32x8 m1, [srcq+stride3q ], 1 lea srcq, [srcq+strideq*4] pmullw m0, m4 pmullw m1, m4 psubw m0, m5 psubw m1, m5 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 add tmpq, 64*2 sub hd, 4 jg .prep_w16 RET .prep_w32: pmullw m0, m4, [srcq+strideq*0] pmullw m1, m4, [srcq+strideq*1] pmullw m2, m4, [srcq+strideq*2] pmullw m3, m4, [srcq+stride3q ] lea srcq, [srcq+strideq*4] REPX {psubw x, m5}, m0, m1, m2, m3 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 mova [tmpq+64*2], m2 mova [tmpq+64*3], m3 add tmpq, 64*4 sub hd, 4 jg .prep_w32 RET .prep_w64: pmullw m0, m4, [srcq+strideq*0+64*0] pmullw m1, m4, [srcq+strideq*0+64*1] pmullw m2, m4, 
[srcq+strideq*1+64*0] pmullw m3, m4, [srcq+strideq*1+64*1] lea srcq, [srcq+strideq*2] REPX {psubw x, m5}, m0, m1, m2, m3 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 mova [tmpq+64*2], m2 mova [tmpq+64*3], m3 add tmpq, 64*4 sub hd, 2 jg .prep_w64 RET .prep_w128: pmullw m0, m4, [srcq+64*0] pmullw m1, m4, [srcq+64*1] pmullw m2, m4, [srcq+64*2] pmullw m3, m4, [srcq+64*3] add srcq, strideq REPX {psubw x, m5}, m0, m1, m2, m3 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 mova [tmpq+64*2], m2 mova [tmpq+64*3], m3 add tmpq, 64*4 dec hd jg .prep_w128 RET .h: vpbroadcastw m5, mxyd mov mxyd, r6m ; my vpbroadcastd m4, [pw_16] vpbroadcastd m6, [pw_32766] psubw m4, m5 test dword r7m, 0x800 jnz .h_12bpc psllw m4, 2 psllw m5, 2 .h_12bpc: test mxyd, mxyd jnz .hv movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)] add wq, r6 lea stride3q, [strideq*3] jmp wq .h_w4: movu xm1, [srcq+strideq*0] vinserti32x4 ym1, [srcq+strideq*2], 1 movu xm2, [srcq+strideq*1] vinserti32x4 ym2, [srcq+stride3q ], 1 lea srcq, [srcq+strideq*4] punpcklqdq ym0, ym1, ym2 psrldq ym1, 2 psrldq ym2, 2 pmullw ym0, ym4 punpcklqdq ym1, ym2 pmullw ym1, ym5 psubw ym0, ym6 paddw ym0, ym1 psraw ym0, 2 mova [tmpq], ym0 add tmpq, 32 sub hd, 4 jg .h_w4 RET .h_w8: movu xm0, [srcq+strideq*0+0] movu xm1, [srcq+strideq*0+2] vinserti32x4 ym0, [srcq+strideq*1+0], 1 vinserti32x4 ym1, [srcq+strideq*1+2], 1 vinserti32x4 m0, [srcq+strideq*2+0], 2 vinserti32x4 m1, [srcq+strideq*2+2], 2 vinserti32x4 m0, [srcq+stride3q +0], 3 vinserti32x4 m1, [srcq+stride3q +2], 3 lea srcq, [srcq+strideq*4] pmullw m0, m4 pmullw m1, m5 psubw m0, m6 paddw m0, m1 psraw m0, 2 mova [tmpq], m0 add tmpq, 64 sub hd, 4 jg .h_w8 RET .h_w16: movu ym0, [srcq+strideq*0+0] vinserti32x8 m0, [srcq+strideq*1+0], 1 movu ym1, [srcq+strideq*0+2] vinserti32x8 m1, [srcq+strideq*1+2], 1 lea srcq, [srcq+strideq*2] pmullw m0, m4 pmullw m1, m5 psubw m0, m6 paddw m0, m1 psraw m0, 2 mova [tmpq], m0 add tmpq, 64 sub hd, 2 jg .h_w16 RET .h_w32: pmullw m0, m4, [srcq+strideq*0+0] pmullw m2, m5, [srcq+strideq*0+2] pmullw m1, m4, [srcq+strideq*1+0] pmullw m3, m5, [srcq+strideq*1+2] lea srcq, [srcq+strideq*2] psubw m0, m6 psubw m1, m6 paddw m0, m2 paddw m1, m3 psraw m0, 2 psraw m1, 2 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 add tmpq, 64*2 sub hd, 2 jg .h_w32 RET .h_w64: pmullw m0, m4, [srcq+ 0] pmullw m2, m5, [srcq+ 2] pmullw m1, m4, [srcq+64] pmullw m3, m5, [srcq+66] add srcq, strideq psubw m0, m6 psubw m1, m6 paddw m0, m2 paddw m1, m3 psraw m0, 2 psraw m1, 2 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 add tmpq, 64*2 dec hd jg .h_w64 RET .h_w128: pmullw m0, m4, [srcq+ 0] pmullw m7, m5, [srcq+ 2] pmullw m1, m4, [srcq+ 64] pmullw m8, m5, [srcq+ 66] pmullw m2, m4, [srcq+128] pmullw m9, m5, [srcq+130] pmullw m3, m4, [srcq+192] pmullw m10, m5, [srcq+194] add srcq, strideq REPX {psubw x, m6}, m0, m1, m2, m3 paddw m0, m7 paddw m1, m8 paddw m2, m9 paddw m3, m10 REPX {psraw x, 2}, m0, m1, m2, m3 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 mova [tmpq+64*2], m2 mova [tmpq+64*3], m3 add tmpq, 64*4 dec hd jg .h_w128 RET .v: movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)] vpbroadcastw m9, mxyd vpbroadcastd m8, [pw_16] vpbroadcastd m10, [pw_32766] add wq, r6 lea stride3q, [strideq*3] psubw m8, m9 test dword r7m, 0x800 jnz .v_12bpc psllw m8, 2 psllw m9, 2 .v_12bpc: jmp wq .v_w4: movq xmm0, [srcq+strideq*0] .v_w4_loop: vpbroadcastq xmm2, [srcq+strideq*1] vpbroadcastq ymm1, [srcq+strideq*2] vpbroadcastq ymm3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vpblendd ymm2, ymm1, 0x30 vpblendd ymm2, ymm3, 0xc0 vpblendd ymm1, ymm2, ymm0, 
0x03 ; 0 1 2 3 movq xmm0, [srcq+strideq*0] valignq ymm2, ymm0, ymm2, 1 ; 1 2 3 4 pmullw ymm1, ym8 pmullw ymm2, ym9 psubw ymm1, ym10 paddw ymm1, ymm2 psraw ymm1, 2 mova [tmpq], ymm1 add tmpq, 32 sub hd, 4 jg .v_w4_loop vzeroupper RET .v_w8: movu xm0, [srcq+strideq*0] .v_w8_loop: vinserti32x4 ym1, ym0, [srcq+strideq*1], 1 vinserti32x4 m1, [srcq+strideq*2], 2 vinserti32x4 m1, [srcq+stride3q ], 3 ; 0 1 2 3 lea srcq, [srcq+strideq*4] movu xm0, [srcq+strideq*0] valignq m2, m0, m1, 2 ; 1 2 3 4 pmullw m1, m8 pmullw m2, m9 psubw m1, m10 paddw m1, m2 psraw m1, 2 mova [tmpq], m1 add tmpq, 64 sub hd, 4 jg .v_w8_loop RET .v_w16: movu ym0, [srcq+strideq*0] .v_w16_loop: vinserti32x8 m1, m0, [srcq+strideq*1], 1 ; 0 1 movu ym3, [srcq+strideq*2] vinserti32x8 m2, m3, [srcq+stride3q ], 1 ; 2 3 lea srcq, [srcq+strideq*4] movu ym0, [srcq+strideq*0] vshufi32x4 m3, m1, m3, q1032 ; 1 2 vshufi32x4 m4, m2, m0, q1032 ; 3 4 pmullw m1, m8 pmullw m2, m8 pmullw m3, m9 pmullw m4, m9 psubw m1, m10 psubw m2, m10 paddw m1, m3 paddw m2, m4 psraw m1, 2 psraw m2, 2 mova [tmpq+64*0], m1 mova [tmpq+64*1], m2 add tmpq, 64*2 sub hd, 4 jg .v_w16_loop RET .v_w32: movu m0, [srcq+strideq*0] .v_w32_loop: movu m3, [srcq+strideq*1] lea srcq, [srcq+strideq*2] pmullw m1, m8, m0 movu m0, [srcq+strideq*0] pmullw m2, m8, m3 pmullw m3, m9 pmullw m4, m9, m0 psubw m1, m10 psubw m2, m10 paddw m1, m3 paddw m2, m4 psraw m1, 2 psraw m2, 2 mova [tmpq+64*0], m1 mova [tmpq+64*1], m2 add tmpq, 64*2 sub hd, 2 jg .v_w32_loop RET .v_w64: movu m0, [srcq+64*0] movu m1, [srcq+64*1] .v_w64_loop: add srcq, strideq pmullw m2, m8, m0 movu m0, [srcq+64*0] pmullw m3, m8, m1 movu m1, [srcq+64*1] pmullw m4, m9, m0 pmullw m5, m9, m1 psubw m2, m10 psubw m3, m10 paddw m2, m4 paddw m3, m5 psraw m2, 2 psraw m3, 2 mova [tmpq+64*0], m2 mova [tmpq+64*1], m3 add tmpq, 64*2 dec hd jg .v_w64_loop RET .v_w128: movu m0, [srcq+64*0] movu m1, [srcq+64*1] movu m2, [srcq+64*2] movu m3, [srcq+64*3] .v_w128_loop: add srcq, strideq pmullw m4, m8, m0 movu m0, [srcq+64*0] pmullw m5, m8, m1 movu m1, [srcq+64*1] pmullw m6, m8, m2 movu m2, [srcq+64*2] pmullw m7, m8, m3 movu m3, [srcq+64*3] pmullw m11, m9, m0 pmullw m12, m9, m1 pmullw m13, m9, m2 pmullw m14, m9, m3 REPX {psubw x, m10}, m4, m5, m6, m7 paddw m4, m11 paddw m5, m12 paddw m6, m13 paddw m7, m14 REPX {psraw x, 2}, m4, m5, m6, m7 mova [tmpq+64*0], m4 mova [tmpq+64*1], m5 mova [tmpq+64*2], m6 mova [tmpq+64*3], m7 add tmpq, 64*4 dec hd jg .v_w128_loop RET .hv: movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)] shl mxyd, 11 vpbroadcastw m7, mxyd add wq, r6 lea stride3q, [strideq*3] jmp wq .hv_w4: movq xmm0, [srcq+strideq*0+0] movq xmm1, [srcq+strideq*0+2] pmullw xmm0, xm4 pmullw xmm1, xm5 psubw xmm0, xm6 paddw xmm0, xmm1 psraw xmm0, 2 vpbroadcastq ym0, xmm0 .hv_w4_loop: movu xm1, [srcq+strideq*1] vinserti128 ym1, [srcq+stride3q ], 1 movu xm2, [srcq+strideq*2] lea srcq, [srcq+strideq*4] vinserti128 ym2, [srcq+strideq*0], 1 punpcklqdq ym3, ym1, ym2 psrldq ym1, 2 psrldq ym2, 2 pmullw ym3, ym4 punpcklqdq ym1, ym2 pmullw ym1, ym5 psubw ym3, ym6 paddw ym1, ym3 psraw ym1, 2 ; 1 2 3 4 valignq ym2, ym1, ym0, 3 ; 0 1 2 3 mova ym0, ym1 psubw ym1, ym2 pmulhrsw ym1, ym7 paddw ym1, ym2 mova [tmpq], ym1 add tmpq, 32 sub hd, 4 jg .hv_w4_loop RET .hv_w8: pmullw xm0, xm4, [srcq+strideq*0+0] pmullw xm1, xm5, [srcq+strideq*0+2] psubw xm0, xm6 paddw xm0, xm1 psraw xm0, 2 vinserti32x4 m0, xm0, 3 .hv_w8_loop: movu xm1, [srcq+strideq*1+0] movu xm2, [srcq+strideq*1+2] vinserti32x4 ym1, [srcq+strideq*2+0], 1 vinserti32x4 ym2, [srcq+strideq*2+2], 1 
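; (lanes 2 and 3 are filled next, so after the remaining vinserti32x4 one
; zmm register holds rows 1-4 of the block at offset +0 and a second holds
; the same rows at offset +2, i.e. both horizontal bilinear taps for four
; rows are gathered before a single multiply and add per tap)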
vinserti32x4 m1, [srcq+stride3q +0], 2 vinserti32x4 m2, [srcq+stride3q +2], 2 lea srcq, [srcq+strideq*4] vinserti32x4 m1, [srcq+strideq*0+0], 3 vinserti32x4 m2, [srcq+strideq*0+2], 3 pmullw m1, m4 pmullw m2, m5 psubw m1, m6 paddw m1, m2 psraw m1, 2 ; 1 2 3 4 valignq m2, m1, m0, 6 ; 0 1 2 3 mova m0, m1 psubw m1, m2 pmulhrsw m1, m7 paddw m1, m2 mova [tmpq], m1 add tmpq, 64 sub hd, 4 jg .hv_w8_loop RET .hv_w16: pmullw ym0, ym4, [srcq+strideq*0+0] pmullw ym1, ym5, [srcq+strideq*0+2] psubw ym0, ym6 paddw ym0, ym1 psraw ym0, 2 vinserti32x8 m0, ym0, 1 .hv_w16_loop: movu ym1, [srcq+strideq*1+0] movu ym2, [srcq+strideq*1+2] lea srcq, [srcq+strideq*2] vinserti32x8 m1, [srcq+strideq*0+0], 1 vinserti32x8 m2, [srcq+strideq*0+2], 1 pmullw m1, m4 pmullw m2, m5 psubw m1, m6 paddw m1, m2 psraw m1, 2 ; 1 2 vshufi32x4 m2, m0, m1, q1032 ; 0 1 mova m0, m1 psubw m1, m2 pmulhrsw m1, m7 paddw m1, m2 mova [tmpq], m1 add tmpq, 64 sub hd, 2 jg .hv_w16_loop RET .hv_w32: pmullw m0, m4, [srcq+strideq*0+0] pmullw m1, m5, [srcq+strideq*0+2] psubw m0, m6 paddw m0, m1 psraw m0, 2 .hv_w32_loop: pmullw m3, m4, [srcq+strideq*1+0] pmullw m1, m5, [srcq+strideq*1+2] lea srcq, [srcq+strideq*2] psubw m3, m6 paddw m3, m1 psraw m3, 2 psubw m1, m3, m0 pmulhrsw m1, m7 paddw m1, m0 pmullw m0, m4, [srcq+strideq*0+0] pmullw m2, m5, [srcq+strideq*0+2] psubw m0, m6 paddw m0, m2 psraw m0, 2 psubw m2, m0, m3 pmulhrsw m2, m7 paddw m2, m3 mova [tmpq+64*0], m1 mova [tmpq+64*1], m2 add tmpq, 64*2 sub hd, 2 jg .hv_w32_loop RET .hv_w64: pmullw m0, m4, [srcq+ 0] pmullw m2, m5, [srcq+ 2] pmullw m1, m4, [srcq+64] pmullw m3, m5, [srcq+66] psubw m0, m6 psubw m1, m6 paddw m0, m2 paddw m1, m3 psraw m0, 2 psraw m1, 2 .hv_w64_loop: add srcq, strideq pmullw m2, m4, [srcq+ 0] pmullw m8, m5, [srcq+ 2] pmullw m3, m4, [srcq+64] pmullw m9, m5, [srcq+66] psubw m2, m6 psubw m3, m6 paddw m2, m8 paddw m3, m9 psraw m2, 2 psraw m3, 2 psubw m8, m2, m0 psubw m9, m3, m1 pmulhrsw m8, m7 pmulhrsw m9, m7 paddw m8, m0 mova m0, m2 paddw m9, m1 mova m1, m3 mova [tmpq+64*0], m8 mova [tmpq+64*1], m9 add tmpq, 64*2 dec hd jg .hv_w64_loop RET .hv_w128: pmullw m0, m4, [srcq+ 0] pmullw m8, m5, [srcq+ 2] pmullw m1, m4, [srcq+ 64] pmullw m9, m5, [srcq+ 66] pmullw m2, m4, [srcq+128] pmullw m10, m5, [srcq+130] pmullw m3, m4, [srcq+192] pmullw m11, m5, [srcq+194] REPX {psubw x, m6}, m0, m1, m2, m3 paddw m0, m8 paddw m1, m9 paddw m2, m10 paddw m3, m11 REPX {psraw x, 2}, m0, m1, m2, m3 .hv_w128_loop: add srcq, strideq pmullw m8, m4, [srcq+ 0] pmullw m12, m5, [srcq+ 2] pmullw m9, m4, [srcq+ 64] pmullw m13, m5, [srcq+ 66] pmullw m10, m4, [srcq+128] pmullw m14, m5, [srcq+130] pmullw m11, m4, [srcq+192] pmullw m15, m5, [srcq+194] REPX {psubw x, m6}, m8, m9, m10, m11 paddw m8, m12 paddw m9, m13 paddw m10, m14 paddw m11, m15 REPX {psraw x, 2}, m8, m9, m10, m11 psubw m12, m8, m0 psubw m13, m9, m1 psubw m14, m10, m2 psubw m15, m11, m3 REPX {pmulhrsw x, m7}, m12, m13, m14, m15 paddw m12, m0 mova m0, m8 paddw m13, m1 mova m1, m9 mova [tmpq+64*0], m12 mova [tmpq+64*1], m13 paddw m14, m2 mova m2, m10 paddw m15, m3 mova m3, m11 mova [tmpq+64*2], m14 mova [tmpq+64*3], m15 add tmpq, 64*4 dec hd jg .hv_w128_loop RET ; int8_t subpel_filters[5][15][8] %assign FILTER_REGULAR (0*15 << 16) | 3*15 %assign FILTER_SMOOTH (1*15 << 16) | 4*15 %assign FILTER_SHARP (2*15 << 16) | 3*15 %macro FN 4-5 ; prefix, type, type_h, type_v, jmp_to cglobal %1_%2_16bpc mov t0d, FILTER_%3 %ifidn %3, %4 mov t1d, t0d %else mov t1d, FILTER_%4 %endif %if %0 == 5 ; skip the jump in the last filter jmp mangle(private_prefix %+ _%5 %+ 
SUFFIX) %endif %endmacro %if WIN64 DECLARE_REG_TMP 4, 5 %define buf rsp+stack_offset+8 ; shadow space %else DECLARE_REG_TMP 7, 8 %define buf rsp-40 ; red zone %endif %define PUT_8TAP_FN FN put_8tap, PUT_8TAP_FN smooth, SMOOTH, SMOOTH, put_6tap_16bpc PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR, put_6tap_16bpc PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH, put_6tap_16bpc PUT_8TAP_FN regular, REGULAR, REGULAR cglobal put_6tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my %define base r8-put_avx512icl imul mxd, mxm, 0x010101 add mxd, t0d ; 6tap_h, mx, 4tap_h imul myd, mym, 0x010101 add myd, t1d ; 6tap_v, my, 4tap_v lea r8, [put_avx512icl] movifnidn wd, wm movifnidn hd, hm test mxd, 0xf00 jnz .h test myd, 0xf00 jnz .v .put: tzcnt wd, wd movzx wd, word [r8+wq*2+table_offset(put,)] add wq, r8 %if WIN64 pop r8 %endif jmp wq .h_w8: mova m4, [spel_h_shufA] movu m5, [spel_h_shufB] movu m6, [spel_h_shufC] .h_w8_loop: movu ym2, [srcq+ssq*0] vinserti32x8 m2, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] mova m0, m8 vpermb m1, m4, m2 vpdpwssd m0, m10, m1 vpermb m1, m5, m2 vpdpwssd m0, m11, m1 vpermb m1, m6, m2 vpdpwssd m0, m12, m1 psrad m0, 6 vextracti32x8 ym1, m0, 1 packusdw ym0, ym1 pminsw ym0, ym15 mova [dstq+dsq*0], xm0 vextracti32x4 [dstq+dsq*1], ym0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w8_loop RET .h: vpbroadcastw m15, r8m test myd, 0xf00 jnz .hv mov r7d, r8m shr r7d, 11 vpbroadcastd m8, [base+put_8tap_h_rnd+r7*4] cmp wd, 4 jle mangle(private_prefix %+ _put_8tap_16bpc_avx512icl).h_w4 shr mxd, 16 sub srcq, 4 pmovsxbw xmm0, [base+subpel_filters+1+mxq*8] mova [buf], xmm0 vpbroadcastd m10, xmm0 vpbroadcastd m12, [buf+8] vpbroadcastd m11, [buf+4] sub wd, 16 jl .h_w8 vbroadcasti32x4 m6, [spel_h_shufA] vbroadcasti32x4 m7, [spel_h_shufB] jg .h_w32 .h_w16_loop: movu ym2, [srcq+ssq*0+ 0] vinserti32x8 m2, [srcq+ssq*1+ 0], 1 movu ym3, [srcq+ssq*0+12] vinserti32x8 m3, [srcq+ssq*1+12], 1 lea srcq, [srcq+ssq*2] mova m0, m8 mova m1, m8 pshufb m4, m2, m6 vpdpwssd m0, m10, m4 ; a0 b0 pshufb m4, m3, m7 vpdpwssd m1, m12, m4 ; a2' b2' pshufb m2, m7 pshufb m3, m6 vpdpwssd m0, m11, m2 ; a1 b1 vpdpwssd m1, m11, m3 ; a1' b1' shufpd m2, m3, 0x55 vpdpwssd m0, m12, m2 ; a2 b2 vpdpwssd m1, m10, m2 ; a0' b0' psrad m0, 6 psrad m1, 6 packusdw m0, m1 pminsw m0, m15 mova [dstq+dsq*0], ym0 vextracti32x8 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w16_loop RET .h_w32: lea srcq, [srcq+wq*2] lea dstq, [dstq+wq*2] neg wq .h_w32_loop0: mov r6, wq .h_w32_loop: movu m2, [srcq+r6*2+ 0] movu m3, [srcq+r6*2+12] mova m0, m8 mova m1, m8 pshufb m4, m2, m6 vpdpwssd m0, m10, m4 ; a0 pshufb m4, m3, m7 vpdpwssd m1, m12, m4 ; b2 pshufb m2, m7 pshufb m3, m6 vpdpwssd m0, m11, m2 ; a1 vpdpwssd m1, m11, m3 ; b1 shufpd m2, m3, 0x55 vpdpwssd m0, m12, m2 ; a2 vpdpwssd m1, m10, m2 ; b0 psrad m0, 6 psrad m1, 6 packusdw m0, m1 pminsw m0, m15 mova [dstq+r6*2], m0 add r6, 32 jl .h_w32_loop add srcq, ssq add dstq, dsq dec hd jg .h_w32_loop0 RET .v: movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd vpbroadcastd m11, [pd_32] pmovsxbw xmm0, [base+subpel_filters+1+myq*8] tzcnt r7d, wd vpbroadcastw m15, r8m mov r6, ssq movzx r7d, word [r8+r7*2+table_offset(put, _6tap_v)] neg r6 mova [rsp+stack_offset+8], xmm0 vpbroadcastd m12, xmm0 add r7, r8 vpbroadcastd m13, [rsp+stack_offset+12] vpbroadcastd m14, [rsp+stack_offset+16] jmp r7 .v_w2: movd xmm2, [srcq+r6 *2] pinsrd xmm2, [srcq+r6 *1], 1 pinsrd xmm2, [srcq+ssq*0], 2 pinsrd xmm2, [srcq+ssq*1], 3 ; 0 1 2 3 lea srcq, [srcq+ssq*2] movd xmm0, [srcq+ssq*0] palignr xmm3, xmm0, xmm2, 4 ; 1 2 3 4 punpcklwd xmm1, 
xmm2, xmm3 ; 01 12 punpckhwd xmm2, xmm3 ; 23 34 .v_w2_loop: movd xmm3, [srcq+ssq*1] mova xmm4, xm11 vpdpwssd xmm4, xmm1, xm12 ; a0 b0 lea srcq, [srcq+ssq*2] mova xmm1, xmm2 vpdpwssd xmm4, xmm2, xm13 ; a1 b1 punpckldq xmm2, xmm0, xmm3 ; 4 5 movd xmm0, [srcq+ssq*0] punpckldq xmm3, xmm0 ; 5 6 punpcklwd xmm2, xmm3 ; 45 56 vpdpwssd xmm4, xmm2, xm14 ; a2 b2 psrad xmm4, 6 packusdw xmm4, xmm4 pminsw xmm4, xm15 movd [dstq+dsq*0], xmm4 pextrd [dstq+dsq*1], xmm4, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w2_loop RET .v_w4: movq xmm1, [srcq+r6 *2] vpbroadcastq ymm3, [srcq+r6 *1] vpbroadcastq ymm2, [srcq+ssq*0] vpbroadcastq ymm4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vpbroadcastq ymm0, [srcq+ssq*0] vpblendd ymm1, ymm3, 0x30 vpblendd ymm3, ymm2, 0x30 punpcklwd ymm1, ymm3 ; 01 12 vpblendd ymm2, ymm4, 0x30 vpblendd ymm4, ymm0, 0x30 punpcklwd ymm2, ymm4 ; 23 34 .v_w4_loop: vpbroadcastq ymm3, [srcq+ssq*1] mova ymm4, ym11 vpdpwssd ymm4, ymm1, ym12 ; a0 b0 lea srcq, [srcq+ssq*2] mova ymm1, ymm2 vpdpwssd ymm4, ymm2, ym13 ; a1 b1 vpblendd ymm2, ymm0, ymm3, 0x30 vpbroadcastq ymm0, [srcq+ssq*0] vpblendd ymm3, ymm0, 0x30 punpcklwd ymm2, ymm3 ; 45 56 vpdpwssd ymm4, ymm2, ym14 ; a2 b2 psrad ymm4, 6 vextracti128 xmm3, ymm4, 1 packusdw xmm4, xmm3 pminsw xmm4, xm15 movq [dstq+dsq*0], xmm4 movhps [dstq+dsq*1], xmm4 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w4_loop vzeroupper RET .v_w8: vbroadcasti32x4 m0, [srcq+ssq*0] vinserti32x4 m1, m0, [srcq+r6 *2], 0 vinserti32x4 m1, [srcq+r6 *1], 1 ; 0 1 2 vinserti32x4 ym0, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] mova m5, [spel_v_shuf8] vinserti32x4 m0, [srcq+ssq*0], 2 ; 2 3 4 vpermb m1, m5, m1 ; 01 12 vpermb m2, m5, m0 ; 23 34 .v_w8_loop: vinserti32x4 m0, [srcq+ssq*1], 3 lea srcq, [srcq+ssq*2] movu xm3, [srcq+ssq*0] mova m4, m11 vpdpwssd m4, m12, m1 ; a0 b0 vshufi32x4 m0, m3, q1032 ; 4 5 6 mova m1, m2 vpdpwssd m4, m13, m2 ; a1 b1 vpermb m2, m5, m0 ; 45 56 vpdpwssd m4, m14, m2 ; a2 b2 psrad m4, 6 vextracti32x8 ym3, m4, 1 packusdw ym4, ym3 pminsw ym4, ym15 mova [dstq+dsq*0], xm4 vextracti32x4 [dstq+dsq*1], ym4, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w8_loop RET .v_w16: vbroadcasti32x8 m0, [srcq+r6 *1] vinserti32x8 m1, m0, [srcq+ssq*0], 1 vinserti32x8 m0, [srcq+r6*2], 0 mova m6, [spel_v_shuf16] movu ym3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vinserti32x8 m3, [srcq+ssq*0], 1 vpermb m1, m6, m1 ; 12 vpermb m0, m6, m0 ; 01 vpermb m3, m6, m3 ; 34 mova m7, [deint_q_shuf] vpshrdd m2, m1, m3, 16 ; 23 .v_w16_loop: mova m5, m11 vpdpwssd m5, m12, m1 ; b0 mova m4, m11 vpdpwssd m4, m12, m0 ; a0 mova m1, m3 vpdpwssd m5, m13, m3 ; b1 mova m0, m2 vpdpwssd m4, m13, m2 ; a1 movu ym3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vinserti32x8 m3, [srcq+ssq*0], 1 vpermb m3, m6, m3 ; 56 vpshrdd m2, m1, m3, 16 ; 45 vpdpwssd m5, m14, m3 ; b2 vpdpwssd m4, m14, m2 ; a2 psrad m5, 6 psrad m4, 6 packusdw m4, m5 pminsw m4, m15 vpermq m4, m7, m4 mova [dstq+dsq*0], ym4 vextracti32x8 [dstq+dsq*1], m4, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w16_loop RET .v_w32: .v_w64: .v_w128: lea wd, [hq+wq*8-256] .v_w32_loop0: movu m16, [srcq+r6 *2] movu m17, [srcq+r6 *1] lea r7, [srcq+ssq*2] movu m18, [srcq+ssq*0] movu m19, [srcq+ssq*1] mov r8, dstq movu m20, [r7 +ssq*0] punpcklwd m0, m16, m17 ; 01 punpckhwd m16, m17 punpcklwd m1, m17, m18 ; 12 punpckhwd m17, m18 punpcklwd m2, m18, m19 ; 23 punpckhwd m18, m19 punpcklwd m3, m19, m20 ; 34 punpckhwd m19, m20 .v_w32_loop: mova m4, m11 vpdpwssd m4, m12, m0 ; a0 mova m6, m11 vpdpwssd m6, m12, m16 mova m5, m11 vpdpwssd m5, m12, m1 ; b0 mova m7, m11 vpdpwssd m7, m12, m17 mova m0, m2 vpdpwssd m4, m13, 
m2 ; a1 mova m16, m18 vpdpwssd m6, m13, m18 mova m1, m3 vpdpwssd m5, m13, m3 ; b1 mova m17, m19 vpdpwssd m7, m13, m19 movu m19, [r7+ssq*1] lea r7, [r7+ssq*2] punpcklwd m2, m20, m19 ; 45 punpckhwd m18, m20, m19 movu m20, [r7+ssq*0] vpdpwssd m4, m14, m2 ; a2 vpdpwssd m6, m14, m18 punpcklwd m3, m19, m20 ; 56 punpckhwd m19, m20 vpdpwssd m5, m14, m3 ; b2 vpdpwssd m7, m14, m19 REPX {psrad x, 6}, m4, m6, m5, m7 packusdw m4, m6 packusdw m5, m7 pminsw m4, m15 pminsw m5, m15 mova [r8+dsq*0], m4 mova [r8+dsq*1], m5 lea r8, [r8+dsq*2] sub hd, 2 jg .v_w32_loop add srcq, 64 add dstq, 64 movzx hd, wb sub wd, 1<<8 jg .v_w32_loop0 vzeroupper RET .hv: cmp wd, 4 jg .hv_w8 movzx mxd, mxb pmovsxbw xmm0, [base+subpel_filters+mxq*8] movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd pmovsxbw xmm1, [base+subpel_filters+1+myq*8] mov r6, ssq sub srcq, 2 neg r6 test dword r8m, 0x800 jnz .hv_12bit vpbroadcastd m10, [pd_2176] psllw xmm0, 6 jmp .hv_main .hv_12bit: vpbroadcastd m10, [pd_640] psllw xmm0, 4 psllw xmm1, 2 .hv_main: movu xm4, [srcq+r6 *2] vinserti32x4 ym4, [srcq+r6 *1], 1 vinserti32x4 m4, [srcq+ssq*0], 2 vbroadcasti32x4 m6, [spel_h_shufA] vinserti32x4 m4, [srcq+ssq*1], 3 ; 0 1 2 3 lea srcq, [srcq+ssq*2] movu xm5, [srcq+ssq*0] ; 4 mova [buf+ 0], xmm0 mova [buf+16], xmm1 vpbroadcastd m8, [buf+ 4] vpbroadcastd m9, [buf+ 8] vpbroadcastd ym12, xmm1 vpbroadcastd ym13, [buf+20] vpbroadcastd ym14, [buf+24] cmp wd, 4 je .hv_w4 vbroadcasti32x4 m2, [spel_h_shufA] mova m3, [spel_h_shuf2b] mova m1, m10 pshufb m4, m6 pshufb xm5, xm6 punpcklqdq m2, m4, m5 vpdpwssd m1, m8, m2 ; 04 1_ 2_ 3_ mova ym6, [spel_h_shuf2a] punpckhqdq m4, m5 mova xm5, [spel_shuf2] vpdpwssd m1, m9, m4 vpermb m1, m3, m1 ; 01 12 vextracti32x4 xm2, ym1, 1 ; 23 34 .hv_w2_loop: movu xm3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vinserti32x4 ym3, [srcq+ssq*0], 1 vpermb ym3, ym6, ym3 pmaddwd xmm0, xm12, xm1 ; a0 b0 mova xm4, xm10 vpdpwssd xm4, xm8, xm3 vextracti32x4 xm3, ym3, 1 mova xm1, xm2 vpdpwssd xmm0, xm13, xm2 ; a1 b1 vpdpwssd xm4, xm9, xm3 ; 5 6 vpermt2b xm2, xm5, xm4 ; 45 56 vpdpwssd xmm0, xm14, xm2 ; a2 b2 psrad xmm0, 10 packusdw xmm0, xmm0 pminsw xmm0, xm15 movd [dstq+dsq*0], xmm0 pextrd [dstq+dsq*1], xmm0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w2_loop RET .hv_w4: vbroadcasti32x4 m7, [spel_h_shufB] mova ym0, [spel_shuf4a] pshufb m1, m4, m6 mova m2, m10 vpdpwssd m2, m8, m1 pshufb xm1, xm5, xm6 mova xm3, xm10 vpdpwssd xm3, xm8, xm1 pshufb m4, m7 pshufb xm5, xm7 vpdpwssd m2, m9, m4 ; 0 1 2 3 vpdpwssd xm3, xm9, xm5 ; 4 mova ym5, [spel_shuf4b] vpermb m1, m0, m2 ; 01 12 vshufi32x4 m2, m3, q1032 ; 2 3 4 vpermb m2, m0, m2 ; 23 34 .hv_w4_loop: movu xm3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vinserti32x4 ym3, [srcq+ssq*0], 1 pmaddwd ym0, ym12, ym1 ; a0 b0 mova ym1, ym2 pshufb ym4, ym3, ym6 mova ym2, ym10 vpdpwssd ym2, ym8, ym4 pshufb ym3, ym7 vpdpwssd ym0, ym13, ym1 ; a1 b1 vpdpwssd ym2, ym9, ym3 ; 5 6 vpermt2b ym2, ym5, ym1 ; 45 56 vpdpwssd ym0, ym14, ym2 ; a2 b2 psrad ym0, 10 vextracti32x4 xm4, ym0, 1 packusdw xm0, xm4 pminsw xmm0, xm0, xm15 movq [dstq+dsq*0], xmm0 movhps [dstq+dsq*1], xmm0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w4_loop RET .hv_w8: shr mxd, 16 pmovsxbw xmm0, [base+subpel_filters+1+mxq*8] movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd pmovsxbw xmm1, [base+subpel_filters+1+myq*8] mov r6, ssq sub srcq, 4 neg r6 test dword r8m, 0x800 jnz .hv_w8_12bit vpbroadcastd m8, [pd_2176] psllw xmm0, 6 jmp .hv_w8_main .hv_w8_12bit: vpbroadcastd m8, [pd_640] psllw xmm0, 4 psllw xmm1, 2 .hv_w8_main: mova [buf+ 0], xmm0 mova [buf+16], xmm1 vpbroadcastd 
m9, xmm0 vpbroadcastd m10, [buf+ 4] vpbroadcastd m11, [buf+ 8] vpbroadcastd m12, xmm1 vpbroadcastd m13, [buf+20] vpbroadcastd m14, [buf+24] cmp wd, 16 jge .hv_w16 mova m6, [spel_h_shufA] movu ym16, [srcq+r6 *2] vinserti32x8 m16, [srcq+r6 *1], 1 ; 0 1 movu ym17, [srcq+ssq*0] vinserti32x8 m17, [srcq+ssq*1], 1 ; 2 3 lea srcq, [srcq+ssq*2] movu ym18, [srcq+ssq*0] ; 4 movu m7, [spel_h_shufC] vpermb m3, m6, m16 mova m1, m8 vpermb m4, m6, m17 vpdpwssd m1, m9, m3 ; a0 b0 mova m2, m8 vpermb m5, m6, m18 vpdpwssd m2, m9, m4 ; c0 d0 mova m0, m8 vpermb m16, m7, m16 vpdpwssd m0, m9, m5 ; e0 vpermb m17, m7, m17 vpdpwssd m1, m11, m16 ; a2 b2 vpermb m18, m7, m18 vpdpwssd m2, m11, m17 ; c2 d2 shufpd m3, m16, 0x55 vpdpwssd m0, m11, m18 ; e2 mova m16, [spel_shuf8a] shufpd m4, m17, 0x55 vpdpwssd m1, m10, m3 ; a1 b1 shufpd m5, m18, 0x55 vpdpwssd m2, m10, m4 ; c1 d1 vpdpwssd m0, m10, m5 ; e1 mova m5, [spel_shuf8b] vpermt2b m1, m16, m2 ; 01 12 vpermt2b m2, m16, m0 ; 23 34 .hv_w8_loop: movu ym18, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vinserti32x8 m18, [srcq+ssq*0], 1 mova m0, m8 vpermb m17, m6, m18 vpdpwssd m0, m9, m17 ; f0 g0 vpermb m18, m7, m18 pmaddwd m16, m12, m1 ; A0 B0 vpdpwssd m0, m11, m18 ; f2 g2 shufpd m17, m18, 0x55 mova m1, m2 vpdpwssd m16, m13, m2 ; A1 B1 vpdpwssd m0, m10, m17 ; f1 g1 vpermt2b m2, m5, m0 ; 45 56 vpdpwssd m16, m14, m2 ; A2 B2 psrad m16, 10 vextracti32x8 ym17, m16, 1 packusdw ym16, ym17 pminsw ym16, ym15 mova [dstq+dsq*0], xm16 vextracti128 [dstq+dsq*1], ym16, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w8_loop vzeroupper RET .hv_w16: vbroadcasti32x4 m20, [spel_h_shufA] vbroadcasti32x4 m21, [spel_h_shufB] jg .hv_w32 vbroadcasti32x8 m6, [srcq+r6 *2+ 8] vinserti32x8 m2, m6, [srcq+r6 *2+16], 1 vinserti32x8 m6, [srcq+r6 *2+ 0], 0 ; 0 movu ym16, [srcq+r6 *1+ 0] movu ym17, [srcq+r6 *1+12] vinserti32x8 m16, [srcq+ssq*0+ 0], 1 vinserti32x8 m17, [srcq+ssq*0+12], 1 ; 1 2 movu ym18, [srcq+ssq*1+ 0] movu ym19, [srcq+ssq*1+12] lea srcq, [srcq+ssq*2] vinserti32x8 m18, [srcq+ssq*0+ 0], 1 vinserti32x8 m19, [srcq+ssq*0+12], 1 ; 3 4 pshufb m2, m20 mova m1, m8 pshufb m3, m16, m20 vpdpwssd m1, m11, m2 ; a2 mova m2, m8 pshufb m4, m17, m21 vpdpwssd m2, m9, m3 ; b0 c0 mova m3, m8 pshufb m5, m18, m20 vpdpwssd m3, m11, m4 ; b2' c2' mova m4, m8 pshufb m7, m19, m21 vpdpwssd m4, m9, m5 ; d0 e0 mova m5, m8 pshufb m0, m6, m20 vpdpwssd m5, m11, m7 ; d2' e2' mova m7, [spel_shuf16] pshufb m16, m21 vpdpwssd m1, m9, m0 ; a0 pshufb m17, m20 vpdpwssd m2, m10, m16 ; b1 c1 pshufb m18, m21 vpdpwssd m3, m10, m17 ; b1' c1' pshufb m19, m20 vpdpwssd m4, m10, m18 ; d1 e1 pshufb m6, m21 vpdpwssd m5, m10, m19 ; d1' e1' shufpd m16, m17, 0x55 vpdpwssd m1, m10, m6 ; a1 shufpd m18, m19, 0x55 vpdpwssd m2, m11, m16 ; b2 c2 vpdpwssd m3, m9, m16 ; b0' c0' vpdpwssd m4, m11, m18 ; d2 e2 vpdpwssd m5, m9, m18 ; d0' e0' pslldq m1, 1 vpermt2b m2, m7, m3 ; 12 vpermt2b m4, m7, m5 ; 34 vpshrdd m1, m2, 16 ; 01 vpshrdd m3, m2, m4, 16 ; 23 .hv_w16_loop: movu ym18, [srcq+ssq*1+ 0] movu ym19, [srcq+ssq*1+12] lea srcq, [srcq+ssq*2] vinserti32x8 m18, [srcq+ssq*0+ 0], 1 vinserti32x8 m19, [srcq+ssq*0+12], 1 mova m5, m8 mova m6, m8 pshufb m17, m18, m20 vpdpwssd m5, m9, m17 ; f0 g0 pshufb m16, m19, m21 vpdpwssd m6, m11, m16 ; f2' g2' pmaddwd m17, m12, m2 ; B0 mova m2, m4 pmaddwd m16, m12, m1 ; A0 mova m1, m3 pshufb m18, m21 vpdpwssd m5, m10, m18 ; f1 g1 pshufb m19, m20 vpdpwssd m6, m10, m19 ; f1' g1' vpdpwssd m17, m13, m4 ; B1 vpdpwssd m16, m13, m3 ; A1 shufpd m18, m19, 0x55 vpdpwssd m5, m11, m18 ; f2 g2 vpdpwssd m6, m9, m18 ; f0' g0' mova m4, m7 vpermi2b m4, m5, 
m6 ; 56 vpshrdd m3, m2, m4, 16 ; 45 vpdpwssd m17, m14, m4 ; B2 vpdpwssd m16, m14, m3 ; A2 psrad m16, 10 psrad m17, 10 vshufi32x4 m18, m16, m17, q3232 vinserti32x8 m16, ym17, 1 packusdw m16, m18 pminsw m16, m15 mova [dstq+dsq*0], ym16 vextracti32x8 [dstq+dsq*1], m16, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w16_loop vzeroupper RET .hv_w32: WIN64_SPILL_XMM 28 mova m27, [spel_shuf32] lea wd, [hq+wq*8-256] .hv_w32_loop0: movu m16, [srcq+r6 *2+ 0] movu m7, [srcq+r6 *2+12] movu m6, [srcq+r6 *1+ 0] movu m18, [srcq+r6 *1+12] lea r7, [srcq+ssq*2] movu m17, [srcq+ssq*0+ 0] movu m19, [srcq+ssq*0+12] movu m22, [srcq+ssq*1+ 0] movu m24, [srcq+ssq*1+12] mov r8, dstq movu m23, [r7 +ssq*0+ 0] movu m25, [r7 +ssq*0+12] pshufb m1, m16, m20 mova m0, m8 pshufb m2, m7, m21 vpdpwssd m0, m9, m1 ; a0 mova m1, m8 pshufb m4, m6, m20 vpdpwssd m1, m11, m2 ; a2' mova m2, m8 pshufb m3, m17, m20 vpdpwssd m2, m9, m4 ; b0 mova m4, m8 pshufb m5, m18, m21 vpdpwssd m4, m9, m3 ; c0 mova m3, m8 pshufb m26, m19, m21 vpdpwssd m3, m11, m5 ; b2' mova m5, m8 pshufb m16, m21 vpdpwssd m5, m11, m26 ; c2' pshufb m7, m20 vpdpwssd m0, m10, m16 ; a1 pshufb m6, m21 vpdpwssd m1, m10, m7 ; a1' pshufb m17, m21 vpdpwssd m2, m10, m6 ; b1 pshufb m18, m20 vpdpwssd m4, m10, m17 ; c1 pshufb m19, m20 vpdpwssd m3, m10, m18 ; b1' shufpd m16, m7, 0x55 vpdpwssd m5, m10, m19 ; c1' shufpd m6, m18, 0x55 vpdpwssd m0, m11, m16 ; a2 shufpd m17, m19, 0x55 vpdpwssd m1, m9, m16 ; a0' pshufb m16, m22, m20 vpdpwssd m2, m11, m6 ; b2 pshufb m7, m23, m20 vpdpwssd m4, m11, m17 ; c2 vpdpwssd m3, m9, m6 ; b0' mova m6, m8 vpdpwssd m5, m9, m17 ; c0' pshufb m17, m24, m21 vpdpwssd m6, m9, m16 ; d0 mova m16, m8 pshufb m26, m25, m21 vpdpwssd m16, m9, m7 ; e0 mova m7, m8 pshufb m22, m21 vpdpwssd m7, m11, m17 ; d2' mova m17, m8 pshufb m23, m21 vpdpwssd m17, m11, m26 ; e2' pshufb m24, m20 vpdpwssd m6, m10, m22 ; d1 pshufb m25, m20 vpdpwssd m16, m10, m23 ; e1 shufpd m22, m24, 0x55 vpdpwssd m7, m10, m24 ; d1' shufpd m23, m25, 0x55 vpdpwssd m17, m10, m25 ; e1' pslldq m0, 1 vpdpwssd m6, m11, m22 ; d2 pslldq m1, 1 vpdpwssd m16, m11, m23 ; e2 vpermt2b m2, m27, m4 ; 12 vpdpwssd m7, m9, m22 ; d0' vpermt2b m3, m27, m5 ; 12' vpdpwssd m17, m9, m23 ; e0' vpshrdd m0, m2, 16 ; 01 vpermt2b m6, m27, m16 ; 34 vpshrdd m1, m3, 16 ; 01' vpermt2b m7, m27, m17 ; 34' vpshrdd m4, m2, m6, 16 ; 23 vpshrdd m5, m3, m7, 16 ; 23' .hv_w32_loop: movu m22, [r7+ssq*1+ 0] movu m24, [r7+ssq*1+12] lea r7, [r7+ssq*2] movu m23, [r7+ssq*0+ 0] movu m25, [r7+ssq*0+12] pmaddwd m17, m12, m2 ; B0 mova m2, m6 pmaddwd m19, m12, m3 ; B0' mova m3, m7 pmaddwd m16, m12, m0 ; A0 mova m0, m4 pmaddwd m18, m12, m1 ; A0' mova m1, m5 vpdpwssd m17, m13, m6 ; B1 vpdpwssd m19, m13, m7 ; B1' mova m6, m8 vpdpwssd m16, m13, m4 ; A1 pshufb m4, m22, m20 vpdpwssd m18, m13, m5 ; A1' pshufb m7, m23, m20 vpdpwssd m6, m9, m4 ; f0 mova m4, m8 pshufb m5, m24, m21 vpdpwssd m4, m9, m7 ; g0 mova m7, m8 pshufb m26, m25, m21 vpdpwssd m7, m11, m5 ; f2' mova m5, m8 pshufb m22, m21 vpdpwssd m5, m11, m26 ; g2' pshufb m23, m21 vpdpwssd m6, m10, m22 ; f1 pshufb m24, m20 vpdpwssd m4, m10, m23 ; g1 pshufb m25, m20 vpdpwssd m7, m10, m24 ; f1' shufpd m22, m24, 0x55 vpdpwssd m5, m10, m25 ; g1' shufpd m23, m25, 0x55 vpdpwssd m6, m11, m22 ; f2 vpdpwssd m4, m11, m23 ; g2 vpdpwssd m7, m9, m22 ; f0' vpdpwssd m5, m9, m23 ; g0' vpermt2b m6, m27, m4 ; 56 vpermt2b m7, m27, m5 ; 56' vpdpwssd m17, m14, m6 ; B2 vpshrdd m4, m2, m6, 16 ; 45 vpdpwssd m19, m14, m7 ; B2' vpshrdd m5, m3, m7, 16 ; 45' vpdpwssd m16, m14, m4 ; A2 vpdpwssd m18, m14, m5 ; A2' REPX {psrad x, 10}, m17, 
m19, m16, m18 packusdw m17, m19 packusdw m16, m18 pminsw m17, m15 pminsw m16, m15 mova [r8+dsq*0], m16 mova [r8+dsq*1], m17 lea r8, [r8+dsq*2] sub hd, 2 jg .hv_w32_loop add srcq, 64 add dstq, 64 movzx hd, wb sub wd, 1<<8 jg .hv_w32_loop0 RET PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP, put_8tap_16bpc PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH, put_8tap_16bpc PUT_8TAP_FN regular_sharp, REGULAR, SHARP, put_8tap_16bpc PUT_8TAP_FN sharp_regular, SHARP, REGULAR, put_8tap_16bpc PUT_8TAP_FN sharp, SHARP, SHARP cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my imul mxd, mxm, 0x010101 add mxd, t0d ; 8tap_h, mx, 4tap_h imul myd, mym, 0x010101 add myd, t1d ; 8tap_v, my, 4tap_v lea r8, [put_avx512icl] movifnidn wd, wm movifnidn hd, hm test mxd, 0xf00 jnz .h test myd, 0xf00 jz mangle(private_prefix %+ _put_6tap_16bpc_avx512icl).put .v: movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd vpbroadcastd m10, [pd_32] pmovsxbw xmm0, [base+subpel_filters+myq*8] tzcnt r7d, wd vpbroadcastw m11, r8m lea r6, [ssq*3] movzx r7d, word [r8+r7*2+table_offset(put, _8tap_v)] sub srcq, r6 mova [rsp+stack_offset+8], xmm0 vpbroadcastd m12, xmm0 add r7, r8 vpbroadcastd m13, [rsp+stack_offset+12] vpbroadcastd m14, [rsp+stack_offset+16] vpbroadcastd m15, [rsp+stack_offset+20] jmp r7 .v_w2: movd xmm2, [srcq+ssq*0] pinsrd xmm2, [srcq+ssq*1], 1 pinsrd xmm2, [srcq+ssq*2], 2 add srcq, r6 pinsrd xmm2, [srcq+ssq*0], 3 ; 0 1 2 3 movd xmm3, [srcq+ssq*1] vpbroadcastd xmm1, [srcq+ssq*2] add srcq, r6 vpbroadcastd xmm0, [srcq+ssq*0] vpblendd xmm3, xmm1, 0x02 ; 4 5 vpblendd xmm1, xmm0, 0x02 ; 5 6 palignr xmm4, xmm3, xmm2, 4 ; 1 2 3 4 punpcklwd xmm3, xmm1 ; 45 56 punpcklwd xmm1, xmm2, xmm4 ; 01 12 punpckhwd xmm2, xmm4 ; 23 34 .v_w2_loop: vpbroadcastd xmm4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova xmm5, xm10 vpdpwssd xmm5, xm12, xmm1 ; a0 b0 mova xmm1, xmm2 vpdpwssd xmm5, xm13, xmm2 ; a1 b1 mova xmm2, xmm3 vpdpwssd xmm5, xm14, xmm3 ; a2 b2 vpblendd xmm3, xmm0, xmm4, 0x02 ; 6 7 vpbroadcastd xmm0, [srcq+ssq*0] vpblendd xmm4, xmm0, 0x02 ; 7 8 punpcklwd xmm3, xmm4 ; 67 78 vpdpwssd xmm5, xm15, xmm3 ; a3 b3 psrad xmm5, 6 packusdw xmm5, xmm5 pminsw xmm5, xm11 movd [dstq+dsq*0], xmm5 pextrd [dstq+dsq*1], xmm5, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w2_loop RET .v_w4: movq xmm1, [srcq+ssq*0] vpbroadcastq ymm0, [srcq+ssq*1] vpbroadcastq ymm2, [srcq+ssq*2] add srcq, r6 vpbroadcastq ymm4, [srcq+ssq*0] vpbroadcastq ymm3, [srcq+ssq*1] vpbroadcastq ymm5, [srcq+ssq*2] add srcq, r6 vpblendd ymm1, ymm0, 0x30 vpblendd ymm0, ymm2, 0x30 punpcklwd ymm1, ymm0 ; 01 12 vpbroadcastq ymm0, [srcq+ssq*0] vpblendd ymm2, ymm4, 0x30 vpblendd ymm4, ymm3, 0x30 punpcklwd ymm2, ymm4 ; 23 34 vpblendd ymm3, ymm5, 0x30 vpblendd ymm5, ymm0, 0x30 punpcklwd ymm3, ymm5 ; 45 56 .v_w4_loop: vpbroadcastq ymm5, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova ymm4, ym10 vpdpwssd ymm4, ym12, ymm1 ; a0 b0 mova ymm1, ymm2 vpdpwssd ymm4, ym13, ymm2 ; a1 b1 mova ymm2, ymm3 vpdpwssd ymm4, ym14, ymm3 ; a2 b2 vpblendd ymm3, ymm0, ymm5, 0x30 vpbroadcastq ymm0, [srcq+ssq*0] vpblendd ymm5, ymm0, 0x30 punpcklwd ymm3, ymm5 ; 67 78 vpdpwssd ymm4, ym15, ymm3 ; a3 b3 psrad ymm4, 6 vextracti128 xmm5, ymm4, 1 packusdw xmm4, xmm5 pminsw xmm4, xm11 movq [dstq+dsq*0], xmm4 movhps [dstq+dsq*1], xmm4 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w4_loop vzeroupper RET .v_w8: vbroadcasti32x4 m2, [srcq+ssq*2] vinserti32x4 m1, m2, [srcq+ssq*0], 0 vinserti32x4 m1, [srcq+ssq*1], 1 ; 0 1 2 add srcq, r6 vinserti32x4 ym2, [srcq+ssq*0], 1 vinserti32x4 m2, [srcq+ssq*1], 2 ; 2 3 4 mova m6, [spel_v_shuf8] movu xm0, [srcq+ssq*1] 
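; (rows 4-6 are collected into m0 next; spel_v_shuf8 then interleaves
; neighbouring rows into the "01 12", "23 34" and "45 56" word pairs that
; vpdpwssd consumes, accumulating two filter taps per instruction into
; dword sums)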
vinserti32x4 ym0, [srcq+ssq*2], 1 add srcq, r6 vinserti32x4 m0, [srcq+ssq*0], 2 ; 4 5 6 vpermb m1, m6, m1 ; 01 12 vpermb m2, m6, m2 ; 23 34 vpermb m3, m6, m0 ; 45 56 .v_w8_loop: vinserti32x4 m0, [srcq+ssq*1], 3 lea srcq, [srcq+ssq*2] movu xm5, [srcq+ssq*0] mova m4, m10 vpdpwssd m4, m12, m1 ; a0 b0 mova m1, m2 vshufi32x4 m0, m5, q1032 ; 6 7 8 vpdpwssd m4, m13, m2 ; a1 b1 mova m2, m3 vpdpwssd m4, m14, m3 ; a2 b2 vpermb m3, m6, m0 ; 67 78 vpdpwssd m4, m15, m3 ; a3 b3 psrad m4, 6 vextracti32x8 ym5, m4, 1 packusdw ym4, ym5 pminsw ym4, ym11 mova [dstq+dsq*0], xm4 vextracti32x4 [dstq+dsq*1], ym4, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w8_loop RET .v_w16: vbroadcasti32x8 m0, [srcq+ssq*1] vinserti32x8 m1, m0, [srcq+ssq*2], 1 vinserti32x8 m0, [srcq+ssq*0], 0 mova m8, [spel_v_shuf16] add srcq, r6 movu ym3, [srcq+ssq*0] vinserti32x8 m3, [srcq+ssq*1], 1 movu ym5, [srcq+ssq*2] add srcq, r6 vinserti32x8 m5, [srcq+ssq*0], 1 vpermb m1, m8, m1 ; 12 vpermb m0, m8, m0 ; 01 vpermb m3, m8, m3 ; 34 vpermb m5, m8, m5 ; 56 mova m9, [deint_q_shuf] vpshrdd m2, m1, m3, 16 ; 23 vpshrdd m4, m3, m5, 16 ; 45 .v_w16_loop: mova m7, m10 vpdpwssd m7, m12, m1 ; b0 mova m6, m10 vpdpwssd m6, m12, m0 ; a0 mova m1, m3 vpdpwssd m7, m13, m3 ; b1 mova m0, m2 vpdpwssd m6, m13, m2 ; a1 mova m3, m5 vpdpwssd m7, m14, m5 ; b2 mova m2, m4 vpdpwssd m6, m14, m4 ; a2 movu ym5, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vinserti32x8 m5, [srcq+ssq*0], 1 vpermb m5, m8, m5 ; 78 vpshrdd m4, m3, m5, 16 ; 67 vpdpwssd m7, m15, m5 ; b3 vpdpwssd m6, m15, m4 ; a3 psrad m7, 6 psrad m6, 6 packusdw m6, m7 pminsw m6, m11 vpermq m6, m9, m6 mova [dstq+dsq*0], ym6 vextracti32x8 [dstq+dsq*1], m6, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w16_loop RET .v_w32: .v_w64: .v_w128: WIN64_SPILL_XMM 23 lea wd, [hq+wq*8-256] .v_w32_loop0: movu m16, [srcq+ssq*0] movu m17, [srcq+ssq*1] lea r7, [srcq+r6 ] movu m18, [srcq+ssq*2] movu m19, [r7 +ssq*0] mov r8, dstq movu m20, [r7 +ssq*1] movu m21, [r7 +ssq*2] add r7, r6 movu m22, [r7 +ssq*0] punpcklwd m0, m16, m17 ; 01l punpckhwd m16, m17 ; 01h punpcklwd m1, m17, m18 ; 12l punpckhwd m17, m18 ; 12h punpcklwd m2, m18, m19 ; 23l punpckhwd m18, m19 ; 23h punpcklwd m3, m19, m20 ; 34l punpckhwd m19, m20 ; 34h punpcklwd m4, m20, m21 ; 45l punpckhwd m20, m21 ; 45h punpcklwd m5, m21, m22 ; 56l punpckhwd m21, m22 ; 56h .v_w32_loop: mova m6, m10 vpdpwssd m6, m12, m0 ; a0l mova m8, m10 vpdpwssd m8, m12, m16 ; a0h mova m7, m10 vpdpwssd m7, m12, m1 ; b0l mova m9, m10 vpdpwssd m9, m12, m17 ; b0h mova m0, m2 vpdpwssd m6, m13, m2 ; a1l mova m16, m18 vpdpwssd m8, m13, m18 ; a1h mova m1, m3 vpdpwssd m7, m13, m3 ; b1l mova m17, m19 vpdpwssd m9, m13, m19 ; b1h mova m2, m4 vpdpwssd m6, m14, m4 ; a2l mova m18, m20 vpdpwssd m8, m14, m20 ; a2h mova m3, m5 vpdpwssd m7, m14, m5 ; b2l mova m19, m21 vpdpwssd m9, m14, m21 ; b2h movu m21, [r7+ssq*1] lea r7, [r7+ssq*2] punpcklwd m4, m22, m21 ; 67l punpckhwd m20, m22, m21 ; 67h movu m22, [r7+ssq*0] vpdpwssd m6, m15, m4 ; a3l vpdpwssd m8, m15, m20 ; a3h punpcklwd m5, m21, m22 ; 78l punpckhwd m21, m22 ; 78h vpdpwssd m7, m15, m5 ; b3l vpdpwssd m9, m15, m21 ; b3h REPX {psrad x, 6}, m6, m8, m7, m9 packusdw m6, m8 packusdw m7, m9 pminsw m6, m11 pminsw m7, m11 mova [r8+dsq*0], m6 mova [r8+dsq*1], m7 lea r8, [r8+dsq*2] sub hd, 2 jg .v_w32_loop add srcq, 64 add dstq, 64 movzx hd, wb sub wd, 1<<8 jg .v_w32_loop0 RET .h_w2: RESET_STACK_STATE mova ym2, [spel_h_shuf2a] sub srcq, 2 pshufd xmm3, xmm0, q1111 pshufd xmm4, xmm0, q2222 .h_w2_loop: movu xm1, [srcq+ssq*0] vinserti32x4 ym1, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] mova 
xmm0, xm8 vpermb ym1, ym2, ym1 vpdpwssd xmm0, xmm3, xm1 vextracti32x4 xm1, ym1, 1 vpdpwssd xmm0, xmm4, xm1 psrad xmm0, 6 packusdw xmm0, xmm0 pminsw xmm0, xm15 movd [dstq+dsq*0], xmm0 pextrd [dstq+dsq*1], xmm0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w2_loop RET .h_w4: movzx mxd, mxb pmovsxbw xmm0, [base+subpel_filters+mxq*8] jl .h_w2 vbroadcasti32x4 ym4, [spel_h_shufA] vbroadcasti32x4 ym5, [spel_h_shufB] sub srcq, 2 pshufd xmm0, xmm0, q2211 vpbroadcastq ym6, xmm0 vpermq ym7, ymm0, q1111 .h_w4_loop: movu xm2, [srcq+ssq*0] vinserti32x4 ym2, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] mova ym0, ym8 pshufb ym1, ym2, ym4 vpdpwssd ym0, ym6, ym1 pshufb ym2, ym5 vpdpwssd ym0, ym7, ym2 psrad ym0, 6 vextracti32x4 xm1, ym0, 1 packusdw xm0, xm1 pminsw xmm0, xm0, xm15 movq [dstq+dsq*0], xmm0 movhps [dstq+dsq*1], xmm0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w4_loop RET .h_w8: mova m4, [spel_h_shufA] movu m5, [spel_h_shufB] movu m6, [spel_h_shufC] mova m7, [spel_h_shufD] .h_w8_loop: movu ym2, [srcq+ssq*0] vinserti32x8 m2, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] mova m0, m8 vpermb m1, m4, m2 vpdpwssd m0, m10, m1 vpermb m1, m5, m2 vpdpwssd m0, m11, m1 vpermb m1, m6, m2 vpdpwssd m0, m12, m1 vpermb m1, m7, m2 vpdpwssd m0, m13, m1 psrad m0, 6 vextracti32x8 ym1, m0, 1 packusdw ym0, ym1 pminsw ym0, ym15 mova [dstq+dsq*0], xm0 vextracti32x4 [dstq+dsq*1], ym0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w8_loop RET .h: vpbroadcastw m15, r8m test myd, 0xf00 jnz .hv mov r7d, r8m shr r7d, 11 vpbroadcastd m8, [base+put_8tap_h_rnd+r7*4] cmp wd, 4 jle .h_w4 shr mxd, 16 sub srcq, 6 pmovsxbw xmm0, [base+subpel_filters+mxq*8] mova [buf], xmm0 vpbroadcastd m10, xmm0 vpbroadcastd m11, [buf+ 4] vpbroadcastd m12, [buf+ 8] vpbroadcastd m13, [buf+12] sub wd, 16 jl .h_w8 vbroadcasti32x4 m6, [spel_h_shufA] vbroadcasti32x4 m7, [spel_h_shufB] jg .h_w32 .h_w16_loop: movu ym2, [srcq+ssq*0+ 0] vinserti32x8 m2, [srcq+ssq*1+ 0], 1 movu ym3, [srcq+ssq*0+16] vinserti32x8 m3, [srcq+ssq*1+16], 1 lea srcq, [srcq+ssq*2] mova m0, m8 mova m1, m8 pshufb m4, m2, m6 vpdpwssd m0, m10, m4 ; a0 pshufb m4, m3, m6 vpdpwssd m1, m12, m4 ; b2 pshufb m4, m2, m7 vpdpwssd m0, m11, m4 ; a1 pshufb m4, m3, m7 vpdpwssd m1, m13, m4 ; b3 shufpd m2, m3, 0x55 pshufb m4, m2, m6 vpdpwssd m0, m12, m4 ; a2 vpdpwssd m1, m10, m4 ; b0 pshufb m2, m7 vpdpwssd m0, m13, m2 ; a3 vpdpwssd m1, m11, m2 ; b1 psrad m0, 6 psrad m1, 6 packusdw m0, m1 pminsw m0, m15 mova [dstq+dsq*0], ym0 vextracti32x8 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w16_loop RET .h_w32: lea srcq, [srcq+wq*2] lea dstq, [dstq+wq*2] neg wq .h_w32_loop0: mov r6, wq .h_w32_loop: movu m2, [srcq+r6*2+ 0] movu m3, [srcq+r6*2+ 8] mova m0, m8 mova m1, m8 pshufb m4, m2, m6 vpdpwssd m0, m10, m4 ; a0 pshufb m4, m3, m6 vpdpwssd m1, m10, m4 ; b0 vpdpwssd m0, m12, m4 ; a2 movu m4, [srcq+r6*2+16] pshufb m3, m7 vpdpwssd m1, m11, m3 ; b1 vpdpwssd m0, m13, m3 ; a3 pshufb m3, m4, m6 vpdpwssd m1, m12, m3 ; b2 pshufb m2, m7 vpdpwssd m0, m11, m2 ; a1 pshufb m4, m7 vpdpwssd m1, m13, m4 ; b3 psrad m0, 6 psrad m1, 6 packusdw m0, m1 pminsw m0, m15 mova [dstq+r6*2], m0 add r6, 32 jl .h_w32_loop add srcq, ssq add dstq, dsq dec hd jg .h_w32_loop0 RET .hv: cmp wd, 4 jg .hv_w8 movzx mxd, mxb pmovsxbw xmm0, [base+subpel_filters+mxq*8] movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd pmovsxbw xmm1, [base+subpel_filters+myq*8] lea r6, [ssq*3] sub srcq, 2 sub srcq, r6 test dword r8m, 0x800 jnz .hv_12bit vpbroadcastd m10, [pd_2176] psllw xmm0, 6 jmp .hv_main .hv_12bit: vpbroadcastd m10, [pd_640] psllw xmm0, 4 psllw xmm1, 2 .hv_main: 
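; spill the shifted h/v filter coefficients so each 32-bit tap pair can be broadcast below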
mova [buf+ 0], xmm0 mova [buf+16], xmm1 vpbroadcastd m8, [buf+ 4] vpbroadcastd m9, [buf+ 8] vpbroadcastd ym11, xmm1 vpbroadcastd ym12, [buf+20] vpbroadcastd ym13, [buf+24] vpbroadcastd ym14, [buf+28] movu xm4, [srcq+ssq*0] vinserti32x4 ym4, [srcq+ssq*1], 1 vinserti32x4 m4, [srcq+ssq*2], 2 add srcq, r6 vinserti32x4 m4, [srcq+ssq*0], 3 ; 0 1 2 3 movu xm0, [srcq+ssq*1] vinserti32x4 ym0, [srcq+ssq*2], 1 add srcq, r6 vinserti32x4 m0, [srcq+ssq*0], 2 ; 4 5 6 cmp wd, 4 je .hv_w4 vbroadcasti32x4 m2, [spel_h_shufA] mova m3, [spel_h_shuf2b] mova ym6, [spel_h_shuf2a] mova xm7, [spel_shuf2] mova m1, m10 pshufb m4, m2 pshufb m0, m2 punpcklqdq m2, m4, m0 vpdpwssd m1, m8, m2 ; 04 15 26 3_ punpckhqdq m4, m0 vpdpwssd m1, m9, m4 vpermb m1, m3, m1 ; 01 12 vextracti32x4 xm2, ym1, 1 ; 23 34 vextracti32x4 xm3, m1, 2 ; 45 56 .hv_w2_loop: movu xm5, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vinserti32x4 ym5, [srcq+ssq*0], 1 mova xm4, xm10 vpermb ym5, ym6, ym5 pmaddwd xmm0, xm11, xm1 ; a0 b0 vpdpwssd xm4, xm8, xm5 vextracti32x4 xm5, ym5, 1 mova xm1, xm2 vpdpwssd xmm0, xm12, xm2 ; a1 b1 vpdpwssd xm4, xm9, xm5 ; 7 8 mova xm2, xm3 vpdpwssd xmm0, xm13, xm3 ; a2 b2 vpermt2b xm3, xm7, xm4 ; 67 78 vpdpwssd xmm0, xm14, xm3 ; a3 b3 psrad xmm0, 10 packusdw xmm0, xmm0 pminsw xmm0, xm15 movd [dstq+dsq*0], xmm0 pextrd [dstq+dsq*1], xmm0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w2_loop RET .hv_w4: vbroadcasti32x4 m19, [spel_h_shufA] vbroadcasti32x4 m20, [spel_h_shufB] mova ym6, [spel_shuf4a] mova ym7, [spel_shuf4b] mova m2, m10 mova m3, m10 pshufb m1, m4, m19 vpdpwssd m2, m8, m1 pshufb m1, m0, m19 vpdpwssd m3, m8, m1 pshufb m4, m20 vpdpwssd m2, m9, m4 pshufb m0, m20 vpdpwssd m3, m9, m0 vpermb m1, m6, m2 ; 01 12 vshufi32x4 m2, m3, q1032 vpermb m3, m6, m3 ; 45 56 vpermb m2, m6, m2 ; 23 34 .hv_w4_loop: movu xm18, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vinserti128 ym18, [srcq+ssq*0], 1 pmaddwd ym16, ym11, ym1 ; a0 b0 mova ym1, ym2 mova ym2, ym3 pshufb ym17, ym18, ym19 mova ym3, ym10 vpdpwssd ym3, ym8, ym17 pshufb ym18, ym20 vpdpwssd ym16, ym12, ym1 ; a1 b1 vpdpwssd ym3, ym9, ym18 ; 7 8 vpdpwssd ym16, ym13, ym2 ; a2 b2 vpermt2b ym3, ym7, ym2 ; 67 78 vpdpwssd ym16, ym14, ym3 ; a3 b3 psrad ym16, 10 vextracti128 xm17, ym16, 1 packusdw xm16, xm17 pminsw xm16, xm15 movq [dstq+dsq*0], xm16 movhps [dstq+dsq*1], xm16 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w4_loop vzeroupper RET .hv_w8: shr mxd, 16 pmovsxbw xmm0, [base+subpel_filters+mxq*8] movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd pmovsxbw xmm1, [base+subpel_filters+myq*8] lea r6, [ssq*3] sub srcq, 6 sub srcq, r6 test dword r8m, 0x800 jnz .hv_w8_12bit vpbroadcastd m10, [pd_2176] psllw xmm0, 6 jmp .hv_w8_main .hv_w8_12bit: vpbroadcastd m10, [pd_640] psllw xmm0, 4 psllw xmm1, 2 .hv_w8_main: mova [buf+ 0], xmm0 mova [buf+16], xmm1 vpbroadcastd m11, xmm0 vpbroadcastd m12, [buf+ 4] vpbroadcastd m13, [buf+ 8] vpbroadcastd m14, [buf+12] vpbroadcastd m16, xmm1 vpbroadcastd m17, [buf+20] vpbroadcastd m18, [buf+24] vpbroadcastd m19, [buf+28] cmp wd, 8 jg .hv_w16 mova m5, [spel_h_shufA] movu ym0, [srcq+ssq*0] vinserti32x8 m0, [srcq+ssq*1], 1 ; 0 1 movu ym9, [srcq+ssq*2] add srcq, r6 vinserti32x8 m9, [srcq+ssq*0], 1 ; 2 3 movu ym20, [srcq+ssq*1] vinserti32x8 m20, [srcq+ssq*2], 1 ; 4 5 add srcq, r6 movu ym21, [srcq+ssq*0] ; 6 movu m6, [spel_h_shufB] movu m7, [spel_h_shufC] vpermb m8, m5, m0 mova m1, m10 vpdpwssd m1, m11, m8 ; a0 b0 vpermb m8, m5, m9 mova m2, m10 vpdpwssd m2, m11, m8 ; c0 d0 vpermb m8, m5, m20 mova m3, m10 vpdpwssd m3, m11, m8 ; e0 f0 vpermb m8, m5, m21 mova m4, m10 vpdpwssd m4, m11, 
m8 ; g0 vpermb m8, m6, m0 vpdpwssd m1, m12, m8 ; a1 b1 vpermb m8, m6, m9 vpdpwssd m2, m12, m8 ; c1 d1 vpermb m8, m6, m20 vpdpwssd m3, m12, m8 ; e1 f1 vpermb m8, m6, m21 vpdpwssd m4, m12, m8 ; g1 vpermb m8, m7, m0 vpdpwssd m1, m13, m8 ; a2 b2 vpermb m8, m7, m9 vpdpwssd m2, m13, m8 ; c2 d2 vpermb m8, m7, m20 vpdpwssd m3, m13, m8 ; e2 f2 vpermb m8, m7, m21 vpdpwssd m4, m13, m8 ; g2 mova m8, [spel_h_shufD] vpermb m0, m8, m0 vpdpwssd m1, m14, m0 ; a3 b3 mova m0, [spel_shuf8a] vpermb m9, m8, m9 vpdpwssd m2, m14, m9 ; c3 d3 mova m9, [spel_shuf8b] vpermb m20, m8, m20 vpdpwssd m3, m14, m20 ; e3 f3 vpermb m21, m8, m21 vpdpwssd m4, m14, m21 ; g3 vpermt2b m1, m0, m2 ; 01 12 vpermt2b m2, m0, m3 ; 23 34 vpermt2b m3, m0, m4 ; 45 56 .hv_w8_loop: movu ym0, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vinserti32x8 m0, [srcq+ssq*0], 1 mova m4, m10 vpermb m21, m5, m0 vpdpwssd m4, m11, m21 ; h0 i0 vpermb m21, m6, m0 pmaddwd m20, m16, m1 ; A0 B0 vpdpwssd m4, m12, m21 ; h1 i1 vpermb m21, m7, m0 mova m1, m2 vpdpwssd m20, m17, m2 ; A1 B1 vpdpwssd m4, m13, m21 ; h2 i2 vpermb m21, m8, m0 mova m2, m3 vpdpwssd m20, m18, m3 ; A2 B2 vpdpwssd m4, m14, m21 ; h3 i3 vpermt2b m3, m9, m4 ; 67 78 vpdpwssd m20, m19, m3 ; A3 B3 psrad m20, 10 vextracti32x8 ym21, m20, 1 packusdw ym20, ym21 pminsw ym20, ym15 mova [dstq+dsq*0], xm20 vextracti128 [dstq+dsq*1], ym20, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w8_loop vzeroupper RET .hv_w16: WIN64_SPILL_XMM 26 vbroadcasti32x4 m20, [spel_h_shufA] vbroadcasti32x4 m21, [spel_h_shufB] add wd, wd mova m9, [spel_shuf16] lea wd, [hq+wq*8-256] .hv_w16_loop0: vbroadcasti32x8 m5, [srcq+ssq*0+ 8] vinserti32x8 m4, m5, [srcq+ssq*0+ 0], 0 vinserti32x8 m5, [srcq+ssq*0+16], 1 ; 0 movu ym6, [srcq+ssq*1+ 0] movu ym7, [srcq+ssq*1+16] lea r7, [srcq+r6] vinserti32x8 m6, [srcq+ssq*2+ 0], 1 vinserti32x8 m7, [srcq+ssq*2+16], 1 ; 1 2 movu ym22, [r7 +ssq*0+ 0] movu ym23, [r7 +ssq*0+16] mov r8, dstq vinserti32x8 m22, [r7 +ssq*1+ 0], 1 vinserti32x8 m23, [r7 +ssq*1+16], 1 ; 3 4 movu ym24, [r7 +ssq*2+ 0] movu ym25, [r7 +ssq*2+16] add r7, r6 vinserti32x8 m24, [r7 +ssq*0+ 0], 1 vinserti32x8 m25, [r7 +ssq*0+16], 1 ; 5 6 pshufb m0, m4, m20 mova m1, m10 vpdpwssd m1, m11, m0 ; a0 pshufb m0, m6, m20 mova m2, m10 vpdpwssd m2, m11, m0 ; b0 pshufb m0, m7, m20 mova m3, m10 vpdpwssd m3, m13, m0 ; c2 pshufb m0, m4, m21 vpdpwssd m1, m12, m0 ; a1 pshufb m0, m6, m21 vpdpwssd m2, m12, m0 ; b1 pshufb m0, m7, m21 vpdpwssd m3, m14, m0 ; c3 pshufb m0, m5, m20 vpdpwssd m1, m13, m0 ; a2 shufpd m6, m7, 0x55 pshufb m7, m6, m20 vpdpwssd m2, m13, m7 ; b2 vpdpwssd m3, m11, m7 ; c0 pshufb m5, m21 vpdpwssd m1, m14, m5 ; a3 pshufb m6, m21 vpdpwssd m2, m14, m6 ; b3 vpdpwssd m3, m12, m6 ; c1 pshufb m0, m22, m20 mova m4, m10 vpdpwssd m4, m11, m0 ; d0 pshufb m0, m23, m20 mova m5, m10 vpdpwssd m5, m13, m0 ; e2 pshufb m0, m24, m20 mova m6, m10 vpdpwssd m6, m11, m0 ; f0 pshufb m0, m25, m20 mova m7, m10 vpdpwssd m7, m13, m0 ; g2 pshufb m0, m22, m21 vpdpwssd m4, m12, m0 ; d1 pshufb m0, m23, m21 vpdpwssd m5, m14, m0 ; e3 pshufb m0, m24, m21 vpdpwssd m6, m12, m0 ; f1 pshufb m0, m25, m21 vpdpwssd m7, m14, m0 ; g3 shufpd m22, m23, 0x55 pshufb m23, m22, m20 vpdpwssd m4, m13, m23 ; d2 vpdpwssd m5, m11, m23 ; e0 shufpd m24, m25, 0x55 pshufb m25, m24, m20 vpdpwssd m6, m13, m25 ; f2 vpdpwssd m7, m11, m25 ; g0 pshufb m22, m21 vpdpwssd m4, m14, m22 ; d3 vpdpwssd m5, m12, m22 ; e1 pshufb m24, m21 vpdpwssd m6, m14, m24 ; f3 vpdpwssd m7, m12, m24 ; g1 pslldq m1, 1 vpermt2b m2, m9, m3 ; 12 vpermt2b m4, m9, m5 ; 34 vpermt2b m6, m9, m7 ; 56 vpshrdd m1, m2, 16 ; 01 vpshrdd m3, m2, 
m4, 16 ; 23 vpshrdd m5, m4, m6, 16 ; 45 .hv_w16_loop: movu ym24, [r7+ssq*1+ 0] movu ym25, [r7+ssq*1+16] lea r7, [r7+ssq*2] vinserti32x8 m24, [r7+ssq*0+ 0], 1 vinserti32x8 m25, [r7+ssq*0+16], 1 mova m7, m10 mova m8, m10 pshufb m0, m24, m20 vpdpwssd m7, m11, m0 ; h0 pshufb m0, m25, m20 vpdpwssd m8, m13, m0 ; i2 pmaddwd m22, m16, m1 ; A0 mova m1, m3 pmaddwd m23, m16, m2 ; B0 mova m2, m4 pshufb m0, m24, m21 vpdpwssd m7, m12, m0 ; h1 pshufb m0, m25, m21 vpdpwssd m8, m14, m0 ; i3 vpdpwssd m22, m17, m3 ; A1 mova m3, m5 vpdpwssd m23, m17, m4 ; B1 mova m4, m6 shufpd m24, m25, 0x55 pshufb m25, m24, m20 vpdpwssd m7, m13, m25 ; h2 vpdpwssd m8, m11, m25 ; i0 vpdpwssd m22, m18, m5 ; A2 vpdpwssd m23, m18, m6 ; B2 pshufb m24, m21 vpdpwssd m7, m14, m24 ; h3 vpdpwssd m8, m12, m24 ; i1 vpermt2b m7, m9, m8 ; 78 vpshrdd m5, m6, m7, 16 ; 67 vpdpwssd m22, m19, m5 ; A3 vpdpwssd m23, m19, m7 ; B3 mova m6, m7 psrad m22, 10 psrad m23, 10 vshufi32x4 m0, m22, m23, q3232 vinserti32x8 m22, ym23, 1 packusdw m22, m0 pminsw m22, m15 mova [r8+dsq*0], ym22 vextracti32x8 [r8+dsq*1], m22, 1 lea r8, [r8+dsq*2] sub hd, 2 jg .hv_w16_loop add srcq, 32 add dstq, 32 movzx hd, wb sub wd, 1<<8 jg .hv_w16_loop0 RET %if WIN64 DECLARE_REG_TMP 6, 4 %else DECLARE_REG_TMP 6, 7 %endif %define PREP_8TAP_FN FN prep_8tap, PREP_8TAP_FN smooth, SMOOTH, SMOOTH, prep_6tap_16bpc PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR, prep_6tap_16bpc PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH, prep_6tap_16bpc PREP_8TAP_FN regular, REGULAR, REGULAR cglobal prep_6tap_16bpc, 3, 8, 0, tmp, src, ss, w, h, mx, my %define base r7-prep_avx512icl imul mxd, mxm, 0x010101 add mxd, t0d ; 6tap_h, mx, 4tap_h imul myd, mym, 0x010101 add myd, t1d ; 6tap_v, my, 4tap_v lea r7, [prep_avx512icl] mov wd, wm movifnidn hd, hm test mxd, 0xf00 jnz .h test myd, 0xf00 jnz .v .prep: tzcnt wd, wd mov r5d, r7m ; bitdepth_max vpbroadcastd m5, [pw_8192] movzx wd, word [r7+wq*2+table_offset(prep,)] shr r5d, 11 vpbroadcastd m4, [r7-prep_avx512icl+prep_mul+r5*4] add wq, r7 lea r6, [ssq*3] %if WIN64 pop r7 %endif jmp wq .h_w8: mova m6, [spel_h_shufA] movu m7, [spel_h_shufC] mova m8, [prep_endB] .h_w8_loop: movu ym4, [srcq+ssq*0] vinserti32x8 m4, [srcq+ssq*1], 1 movu ym5, [srcq+ssq*2] vinserti32x8 m5, [srcq+r6 ], 1 lea srcq, [srcq+ssq*4] mova m0, m10 mova m1, m10 vpermb m2, m6, m4 vpermb m3, m6, m5 vpdpwssd m0, m12, m2 ; a0 b0 vpdpwssd m1, m12, m3 ; c0 d0 vpermb m4, m7, m4 vpermb m5, m7, m5 vpdpwssd m0, m14, m4 ; a2 b2 vpdpwssd m1, m14, m5 ; c2 d2 shufpd m2, m4, 0x55 shufpd m3, m5, 0x55 vpdpwssd m0, m13, m2 ; a1 b1 vpdpwssd m1, m13, m3 ; c1 d1 vpermt2b m0, m8, m1 mova [tmpq], m0 add tmpq, 64 sub hd, 4 jg .h_w8_loop RET .h: vpbroadcastd m10, [prep_8tap_rnd] test myd, 0xf00 jnz .hv lea r6, [ssq*3] cmp wd, 4 je mangle(private_prefix %+ _prep_8tap_16bpc_avx512icl).h_w4 shr mxd, 16 pmovsxbw xmm0, [base+subpel_filters+1+mxq*8] mov r5d, r7m sub srcq, 4 shr r5d, 11 psllw xmm0, [base+prep_hv_shift+r5*8] mova [tmpq], xmm0 vpbroadcastd m12, xmm0 vpbroadcastd m13, [tmpq+ 4] vpbroadcastd m14, [tmpq+ 8] cmp wd, 16 jl .h_w8 vbroadcasti32x4 m5, [spel_h_shufA] vbroadcasti32x4 m6, [spel_h_shufB] mova m7, [prep_endC] jg .h_w32 .h_w16_loop: movu ym2, [srcq+ssq*0+ 0] vinserti32x8 m2, [srcq+ssq*1+ 0], 1 movu ym3, [srcq+ssq*0+12] vinserti32x8 m3, [srcq+ssq*1+12], 1 lea srcq, [srcq+ssq*2] mova m0, m10 mova m1, m10 pshufb m4, m2, m5 ; 01 vpdpwssd m0, m12, m4 ; a0 b0 pshufb m4, m3, m6 ; 89 vpdpwssd m1, m14, m4 ; a2' b2' pshufb m2, m6 ; 23 pshufb m3, m5 ; 67 vpdpwssd m0, m13, m2 ; a1 b1 vpdpwssd m1, m13, m3 ; a1' b1' 
shufpd m2, m3, 0x55 ; 45 vpdpwssd m0, m14, m2 ; a2 b2 vpdpwssd m1, m12, m2 ; a0' b0' vpermt2b m0, m7, m1 mova [tmpq], m0 add tmpq, 64 sub hd, 2 jg .h_w16_loop RET .h_w32: lea srcq, [srcq+wq*2] neg wq .h_w32_loop0: mov r6, wq .h_w32_loop: movu m2, [srcq+r6*2+ 0] movu m3, [srcq+r6*2+12] mova m0, m10 mova m1, m10 pshufb m4, m2, m5 vpdpwssd m0, m12, m4 pshufb m4, m3, m6 vpdpwssd m1, m14, m4 pshufb m2, m6 pshufb m3, m5 vpdpwssd m0, m13, m2 vpdpwssd m1, m13, m3 shufpd m2, m3, 0x55 vpdpwssd m0, m14, m2 vpdpwssd m1, m12, m2 vpermt2b m0, m7, m1 mova [tmpq], m0 add tmpq, 64 add r6, 32 jl .h_w32_loop add srcq, ssq dec hd jg .h_w32_loop0 RET .v: movzx mxd, myb shr myd, 16 cmp hd, 4 cmove myd, mxd mov r5d, r7m vpbroadcastd m10, [prep_8tap_rnd] pmovsxbw xmm0, [base+subpel_filters+1+myq*8] tzcnt r6d, wd shr r5d, 11 movzx r6d, word [r7+r6*2+table_offset(prep, _6tap_v)] psllw xmm0, [base+prep_hv_shift+r5*8] add r7, r6 mova [tmpq], xmm0 vpbroadcastd m12, xmm0 mov r6, ssq vpbroadcastd m13, [tmpq+ 4] neg r6 vpbroadcastd m14, [tmpq+ 8] jmp r7 .v_w4: mov r3d, 0x330c movq xm1, [srcq+r6 *2] kmovw k1, r3d vpbroadcastq ym1{k1}, [srcq+r6 *1] vpbroadcastq m2, [srcq+ssq*0] vinserti32x4 m1{k1}, m2, [srcq+ssq*1], 3 movq xm0, [srcq+ssq*2] mova ym4, [prep_endA] valignq m0, m1, 2 punpcklwd m1, m0 ; 01 12 23 34 .v_w4_loop: lea srcq, [srcq+ssq*4] movq xm2, [srcq+r6 *1] vpbroadcastq ym2{k1}, [srcq+ssq*0] vpbroadcastq m3, [srcq+ssq*1] vinserti32x4 m2{k1}, m3, [srcq+ssq*2], 3 mova m3, m10 vpdpwssd m3, m12, m1 ; a0 b0 c0 d0 valignq m0, m2, m0, 6 ; 4 5 6 7 punpcklwd m0, m2 ; 45 56 67 78 vpdpwssd m3, m14, m0 ; a2 b2 c2 d2 vshufi32x4 m1, m0, q1032 ; 23 34 45 56 vpdpwssd m3, m13, m1 ; a1 b1 c1 d1 mova m1, m0 mova m0, m2 vpermb m3, m4, m3 mova [tmpq], ym3 add tmpq, 32 sub hd, 4 jg .v_w4_loop RET .v_w8: vbroadcasti32x4 ym1, [srcq+r6 *1] mov r3d, 0x33 vbroadcasti32x4 m2, [srcq+ssq*0] kmovb k1, r3d mova m6, [spel_v_shuf8] vinserti64x2 m1{k1}, m2, [srcq+r6 *2], 0 ; 0 1 2 vbroadcasti32x4 ym0, [srcq+ssq*1] vinserti64x2 m0{k1}, m2, [srcq+ssq*2], 2 ; 2 3 4 mova m7, [prep_endB] vpermb m1, m6, m1 ; 01 12 vpermb m2, m6, m0 ; 23 34 .v_w8_loop: lea srcq, [srcq+ssq*4] vbroadcasti32x4 ym3, [srcq+r6 *1] movu xm4, [srcq+ssq*0] vshufi64x2 m3{k1}, m0, m4, q1032 ; 4 5 6 vbroadcasti32x4 ym0, [srcq+ssq*1] vinserti64x2 m0{k1}, m4, [srcq+ssq*2], 2 ; 6 7 8 mova m4, m10 vpdpwssd m4, m12, m1 ; a0 b0 mova m5, m10 vpdpwssd m5, m12, m2 ; c0 d0 vpermb m1, m6, m3 ; 45 56 vpdpwssd m4, m13, m2 ; a1 b1 vpermb m2, m6, m0 ; 67 78 vpdpwssd m5, m13, m1 ; c1 d1 vpdpwssd m4, m14, m1 ; a2 b2 vpdpwssd m5, m14, m2 ; c2 d2 vpermt2b m4, m7, m5 mova [tmpq], m4 add tmpq, 64 sub hd, 4 jg .v_w8_loop RET .v_w16: vbroadcasti32x8 m0, [srcq+r6 *1] vinserti32x8 m1, m0, [srcq+ssq*0], 1 ; 1 2 vinserti32x8 m0, [srcq+r6 *2], 0 ; 0 1 mova m6, [spel_v_shuf16] movu ym3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vinserti32x8 m3, [srcq+ssq*0], 1 ; 3 4 mova m7, [prep_endA] vpermb m1, m6, m1 ; 12 vpermb m0, m6, m0 ; 01 vpermb m3, m6, m3 ; 34 vpshrdd m2, m1, m3, 16 ; 23 .v_w16_loop: mova m5, m10 vpdpwssd m5, m12, m1 ; b0 mova m4, m10 vpdpwssd m4, m12, m0 ; a0 mova m1, m3 vpdpwssd m5, m13, m3 ; b1 movu ym3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vpdpwssd m4, m13, m2 ; a1 vinserti32x8 m3, [srcq+ssq*0], 1 mova m0, m2 vpermb m3, m6, m3 ; 56 vpshrdd m2, m1, m3, 16 ; 45 vpdpwssd m5, m14, m3 ; b2 vpdpwssd m4, m14, m2 ; a2 vpermt2b m4, m7, m5 mova [tmpq], m4 add tmpq, 64 sub hd, 2 jg .v_w16_loop RET .v_w32: .v_w64: .v_w128: %if WIN64 push r8 %endif mova m11, [prep_endC] lea r5, [hq+wq*8-256] .v_w32_loop0: movu m4, 
[srcq+r6 *2] movu m5, [srcq+r6 *1] lea r7, [srcq+ssq*2] movu m6, [srcq+ssq*0] movu m7, [srcq+ssq*1] mov r8, tmpq movu m8, [r7 +ssq*0] punpcklwd m0, m4, m5 ; 01 punpckhwd m4, m5 punpcklwd m1, m5, m6 ; 12 punpckhwd m5, m6 punpcklwd m2, m6, m7 ; 23 punpckhwd m6, m7 punpcklwd m3, m7, m8 ; 34 punpckhwd m7, m8 .v_w32_loop: mova m16, m10 movu m9, [r7+ssq*1] mova m18, m10 vpdpwssd m16, m12, m0 ; a0 mova m17, m10 vpdpwssd m18, m12, m4 mova m19, m10 vpdpwssd m17, m12, m1 ; b0 lea r7, [r7+ssq*2] vpdpwssd m19, m12, m5 mova m0, m2 vpdpwssd m16, m13, m2 ; a1 punpcklwd m2, m8, m9 ; 45 mova m4, m6 vpdpwssd m18, m13, m6 punpckhwd m6, m8, m9 movu m8, [r7+ssq*0] vpdpwssd m17, m13, m3 ; b1 mova m1, m3 vpdpwssd m19, m13, m7 mova m5, m7 vpdpwssd m16, m14, m2 ; a2 punpcklwd m3, m9, m8 ; 56 vpdpwssd m18, m14, m6 punpckhwd m7, m9, m8 vpdpwssd m17, m14, m3 ; b2 vpdpwssd m19, m14, m7 vpermt2b m16, m11, m18 vpermt2b m17, m11, m19 mova [r8+wq*0], m16 mova [r8+wq*2], m17 lea r8, [r8+wq*4] sub hd, 2 jg .v_w32_loop add srcq, 64 add tmpq, 64 movzx hd, r5b sub r5d, 1<<8 jg .v_w32_loop0 %if WIN64 pop r8 %endif vzeroupper RET .hv_w4: movzx mxd, mxb pmovsxbw xmm0, [base+subpel_filters+mxq*8] movzx mxd, myb shr myd, 16 cmp hd, 4 cmove myd, mxd mov r5d, r7m pmovsxbw xmm1, [base+subpel_filters+1+myq*8] mov r6, ssq sub srcq, 2 shr r5d, 11 neg r6 psllw xmm0, [base+prep_hv_shift+r5*8] psllw xmm1, 2 mova [tmpq+ 0], xmm0 mova [tmpq+16], xmm1 vpbroadcastd m8, [tmpq+ 4] mov r3d, 0xf0 vpbroadcastd m9, [tmpq+ 8] vpbroadcastd m12, xmm1 movu xm3, [srcq+r6 *2] kmovb k1, r3d vinserti32x4 ym3, [srcq+r6 *1], 1 vbroadcasti32x4 m2, [srcq+ssq*0] vinserti64x2 m3{k1}, m2, [srcq+ssq*1], 3 movu xm4, [srcq+ssq*2] vbroadcasti32x4 m5, [spel_h_shufA] vbroadcasti32x4 m6, [spel_h_shufB] mova m1, m11 mova m15, [spel_shuf4a] mova xm2, xm11 pshufb m0, m3, m5 vpdpwssd m1, m8, m0 pshufb xm0, xm4, xm5 vpdpwssd xm2, xm8, xm0 vpbroadcastd m13, [tmpq+20] pshufb m3, m6 vpbroadcastd m14, [tmpq+24] pshufb xm4, xm6 mova m7, [spel_shuf4b] vpdpwssd m1, m9, m3 ; 0 1 2 3 vpdpwssd xm2, xm9, xm4 ; 4 vpermt2b m1, m15, m2 ; 01 12 23 34 mova ym15, [prep_endA] .hv_w4_loop: lea srcq, [srcq+ssq*4] movu xm4, [srcq+r6 *1] vinserti32x4 ym4, [srcq+ssq*0], 1 vbroadcasti32x4 m3, [srcq+ssq*1] vinserti64x2 m4{k1}, m3, [srcq+ssq*2], 3 mova m2, m11 pshufb m3, m4, m5 vpdpwssd m2, m8, m3 mova m3, m10 vpdpwssd m3, m12, m1 ; a0 b0 c0 d0 pshufb m4, m6 vpdpwssd m2, m9, m4 ; 5 6 7 8 mova m4, m1 vpermt2b m1, m7, m2 ; 45 56 67 78 vpdpwssd m3, m14, m1 ; a2 b2 c2 d2 vshufi32x4 m4, m1, q1032 ; 23 34 45 56 vpdpwssd m3, m13, m4 ; a1 b1 c1 d1 vpermb m3, m15, m3 mova [tmpq], ym3 add tmpq, 32 sub hd, 4 jg .hv_w4_loop RET .hv_w8: mova m8, [spel_h_shufA] movu ym18, [srcq+r6 *2] vinserti32x8 m18, [srcq+r6 *1], 1 ; 0 1 movu ym19, [srcq+ssq*0] vinserti32x8 m19, [srcq+ssq*1], 1 ; 2 3 movu ym20, [srcq+ssq*2] ; 4 movu m9, [spel_h_shufC] mova m21, [spel_shuf8a] mova m0, [spel_shuf8b] vpermb m4, m8, m18 mova m1, m10 vpermb m5, m8, m19 vpdpwssd m1, m12, m4 ; a0 b0 mova m2, m10 vpermb m6, m8, m20 vpdpwssd m2, m12, m5 ; c0 d0 mova m3, m10 vpermb m18, m9, m18 vpdpwssd m3, m12, m6 ; e0 mova m7, [prep_endB] vpermb m19, m9, m19 vpdpwssd m1, m14, m18 ; a2 b2 vpermb m20, m9, m20 vpdpwssd m2, m14, m19 ; c2 d2 shufpd m4, m18, 0x55 vpdpwssd m3, m14, m20 ; e2 shufpd m5, m19, 0x55 vpdpwssd m1, m13, m4 ; a1 b1 shufpd m6, m20, 0x55 vpdpwssd m2, m13, m5 ; c1 d1 vpdpwssd m3, m13, m6 ; e1 vpermt2b m1, m21, m2 ; 01 12 vpermt2b m2, m21, m3 ; 23 34 .hv_w8_loop: lea srcq, [srcq+ssq*4] movu ym18, [srcq+r6 *1] vinserti32x8 m18, [srcq+ssq*0], 
1 movu ym19, [srcq+ssq*1] vinserti32x8 m19, [srcq+ssq*2], 1 mova m3, m10 vpermb m5, m8, m18 mova m4, m10 vpermb m6, m8, m19 vpdpwssd m3, m12, m5 ; f0 g0 mova m20, m11 vpdpwssd m4, m12, m6 ; h0 i0 mova m21, m11 vpdpwssd m20, m15, m1 ; A0 B0 vpermb m18, m9, m18 vpdpwssd m21, m15, m2 ; C0 D0 vpermb m19, m9, m19 vpdpwssd m3, m14, m18 ; f2 g2 vpdpwssd m4, m14, m19 ; h2 i2 shufpd m5, m18, 0x55 vpdpwssd m20, m16, m2 ; A1 B1 shufpd m6, m19, 0x55 vpdpwssd m3, m13, m5 ; f1 g1 vpdpwssd m4, m13, m6 ; h1 i1 vpermt2b m2, m0, m3 ; 45 56 vpdpwssd m21, m16, m2 ; C1 D1 mova m1, m2 vpermt2b m2, m0, m4 ; 67 78 vpdpwssd m20, m17, m1 ; A2 B2 vpdpwssd m21, m17, m2 ; A2 B2 vpermt2b m20, m7, m21 mova [tmpq], m20 add tmpq, 64 sub hd, 4 jg .hv_w8_loop vzeroupper RET .hv: vpbroadcastd m11, [pd_128] cmp wd, 4 je .hv_w4 shr mxd, 16 pmovsxbw xmm0, [base+subpel_filters+1+mxq*8] movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd mov r5d, r7m pmovsxbw xmm1, [base+subpel_filters+1+myq*8] mov r6, ssq sub srcq, 4 shr r5d, 11 neg r6 psllw xmm0, [base+prep_hv_shift+r5*8] psllw xmm1, 2 mova [tmpq+ 0], xmm0 mova [tmpq+16], xmm1 vpbroadcastd m12, xmm0 vpbroadcastd m13, [tmpq+ 4] vpbroadcastd m14, [tmpq+ 8] vpbroadcastd m15, xmm1 vpbroadcastd m16, [tmpq+20] vpbroadcastd m17, [tmpq+24] cmp wd, 16 jl .hv_w8 vbroadcasti32x4 m8, [spel_h_shufA] vbroadcasti32x4 m9, [spel_h_shufB] jg .hv_w32 vbroadcasti32x8 m6, [srcq+r6 *2+ 8] vinserti32x8 m2, m6, [srcq+r6 *2+16], 1 vinserti32x8 m6, [srcq+r6 *2+ 0], 0 ; 0 movu ym18, [srcq+r6 *1+ 0] movu ym19, [srcq+r6 *1+12] vinserti32x8 m18, [srcq+ssq*0+ 0], 1 vinserti32x8 m19, [srcq+ssq*0+12], 1 ; 1 2 movu ym20, [srcq+ssq*1+ 0] movu ym21, [srcq+ssq*1+12] lea srcq, [srcq+ssq*2] vinserti32x8 m20, [srcq+ssq*0+ 0], 1 vinserti32x8 m21, [srcq+ssq*0+12], 1 ; 3 4 pshufb m2, m8 mova m1, m10 pshufb m3, m18, m8 vpdpwssd m1, m14, m2 ; a2 mova m2, m10 pshufb m4, m19, m9 vpdpwssd m2, m12, m3 ; b0 c0 mova m3, m10 pshufb m5, m20, m8 vpdpwssd m3, m14, m4 ; b2' c2' mova m4, m10 pshufb m7, m21, m9 vpdpwssd m4, m12, m5 ; d0 e0 mova m5, m10 pshufb m0, m6, m8 vpdpwssd m5, m14, m7 ; d2' e2' mova m7, [spel_shuf16] pshufb m18, m9 vpdpwssd m1, m12, m0 ; a0 pshufb m19, m8 vpdpwssd m2, m13, m18 ; b1 c1 pshufb m20, m9 vpdpwssd m3, m13, m19 ; b1' c1' pshufb m21, m8 vpdpwssd m4, m13, m20 ; d1 e1 pshufb m6, m9 vpdpwssd m5, m13, m21 ; d1' e1' mova m0, [prep_endB] shufpd m18, m19, 0x55 vpdpwssd m1, m13, m6 ; a1 shufpd m20, m21, 0x55 vpdpwssd m2, m14, m18 ; b2 c2 vpdpwssd m3, m12, m18 ; b0' c0' vpdpwssd m4, m14, m20 ; d2 e2 vpdpwssd m5, m12, m20 ; d0' e0' pslldq m1, 1 vpermt2b m2, m7, m3 ; 12 vpermt2b m4, m7, m5 ; 34 vpshrdd m1, m2, 16 ; 01 vpshrdd m3, m2, m4, 16 ; 23 .hv_w16_loop: movu ym18, [srcq+ssq*1+ 0] movu ym19, [srcq+ssq*1+12] lea srcq, [srcq+ssq*2] vinserti32x8 m18, [srcq+ssq*0+ 0], 1 vinserti32x8 m19, [srcq+ssq*0+12], 1 mova m5, m10 mova m6, m10 pshufb m21, m18, m8 vpdpwssd m5, m12, m21 ; f0 g0 pshufb m20, m19, m9 mova m21, m11 vpdpwssd m6, m14, m20 ; f2' g2' mova m20, m11 vpdpwssd m21, m15, m2 ; B0 mova m2, m4 vpdpwssd m20, m15, m1 ; A0 mova m1, m3 pshufb m18, m9 vpdpwssd m5, m13, m18 ; f1 g1 pshufb m19, m8 vpdpwssd m6, m13, m19 ; f1' g1' vpdpwssd m21, m16, m4 ; B1 vpdpwssd m20, m16, m3 ; A1 shufpd m18, m19, 0x55 vpdpwssd m5, m14, m18 ; f2 g2 vpdpwssd m6, m12, m18 ; f0' g0' mova m4, m7 vpermi2b m4, m5, m6 ; 56 vpshrdd m3, m2, m4, 16 ; 45 vpdpwssd m21, m17, m4 ; B2 vpdpwssd m20, m17, m3 ; A2 vpermt2b m20, m0, m21 mova [tmpq], m20 add tmpq, 64 sub hd, 2 jg .hv_w16_loop vzeroupper RET .hv_w32: WIN64_SPILL_XMM 29 %if WIN64 push 
r8 %endif mova m27, [spel_shuf32] lea r5d, [hq+wq*8-256] mova m28, [prep_endC] .hv_w32_loop0: movu m18, [srcq+r6 *2+ 0] movu m7, [srcq+r6 *2+12] movu m6, [srcq+r6 *1+ 0] movu m20, [srcq+r6 *1+12] lea r7, [srcq+ssq*2] movu m19, [srcq+ssq*0+ 0] movu m21, [srcq+ssq*0+12] movu m22, [srcq+ssq*1+ 0] movu m24, [srcq+ssq*1+12] mov r8, tmpq movu m23, [r7 +ssq*0+ 0] movu m25, [r7 +ssq*0+12] pshufb m1, m18, m8 mova m0, m10 pshufb m2, m7, m9 vpdpwssd m0, m12, m1 ; a0 mova m1, m10 pshufb m4, m6, m8 vpdpwssd m1, m14, m2 ; a2' mova m2, m10 pshufb m3, m19, m8 vpdpwssd m2, m12, m4 ; b0 mova m4, m10 pshufb m5, m20, m9 vpdpwssd m4, m12, m3 ; c0 mova m3, m10 pshufb m26, m21, m9 vpdpwssd m3, m14, m5 ; b2' mova m5, m10 pshufb m18, m9 vpdpwssd m5, m14, m26 ; c2' pshufb m7, m8 vpdpwssd m0, m13, m18 ; a1 pshufb m6, m9 vpdpwssd m1, m13, m7 ; a1' pshufb m19, m9 vpdpwssd m2, m13, m6 ; b1 pshufb m20, m8 vpdpwssd m4, m13, m19 ; c1 pshufb m21, m8 vpdpwssd m3, m13, m20 ; b1' shufpd m18, m7, 0x55 vpdpwssd m5, m13, m21 ; c1' shufpd m6, m20, 0x55 vpdpwssd m0, m14, m18 ; a2 shufpd m19, m21, 0x55 vpdpwssd m1, m12, m18 ; a0' pshufb m18, m22, m8 vpdpwssd m2, m14, m6 ; b2 pshufb m7, m23, m8 vpdpwssd m4, m14, m19 ; c2 vpdpwssd m3, m12, m6 ; b0' mova m6, m10 vpdpwssd m5, m12, m19 ; c0' pshufb m19, m24, m9 vpdpwssd m6, m12, m18 ; d0 mova m18, m10 pshufb m26, m25, m9 vpdpwssd m18, m12, m7 ; e0 mova m7, m10 pshufb m22, m9 vpdpwssd m7, m14, m19 ; d2' mova m19, m10 pshufb m23, m9 vpdpwssd m19, m14, m26 ; e2' pshufb m24, m8 vpdpwssd m6, m13, m22 ; d1 pshufb m25, m8 vpdpwssd m18, m13, m23 ; e1 shufpd m22, m24, 0x55 vpdpwssd m7, m13, m24 ; d1' shufpd m23, m25, 0x55 vpdpwssd m19, m13, m25 ; e1' pslldq m0, 1 vpdpwssd m6, m14, m22 ; d2 pslldq m1, 1 vpdpwssd m18, m14, m23 ; e2 vpermt2b m2, m27, m4 ; 12 vpdpwssd m7, m12, m22 ; d0' vpermt2b m3, m27, m5 ; 12' vpdpwssd m19, m12, m23 ; e0' vpshrdd m0, m2, 16 ; 01 vpermt2b m6, m27, m18 ; 34 vpshrdd m1, m3, 16 ; 01' vpermt2b m7, m27, m19 ; 34' vpshrdd m4, m2, m6, 16 ; 23 vpshrdd m5, m3, m7, 16 ; 23' .hv_w32_loop: movu m22, [r7+ssq*1+ 0] movu m24, [r7+ssq*1+12] lea r7, [r7+ssq*2] movu m23, [r7+ssq*0+ 0] movu m25, [r7+ssq*0+12] mova m19, m11 vpdpwssd m19, m15, m2 ; B0 mova m21, m11 vpdpwssd m21, m15, m3 ; B0' mova m18, m11 vpdpwssd m18, m15, m0 ; A0 mova m20, m11 vpdpwssd m20, m15, m1 ; A0' mova m2, m6 vpdpwssd m19, m16, m6 ; B1 mova m3, m7 vpdpwssd m21, m16, m7 ; B1' mova m0, m4 vpdpwssd m18, m16, m4 ; A1 mova m1, m5 pshufb m4, m22, m8 vpdpwssd m20, m16, m5 ; A1' mova m6, m10 pshufb m7, m23, m8 vpdpwssd m6, m12, m4 ; f0 mova m4, m10 pshufb m5, m24, m9 vpdpwssd m4, m12, m7 ; g0 mova m7, m10 pshufb m26, m25, m9 vpdpwssd m7, m14, m5 ; f2' mova m5, m10 pshufb m22, m9 vpdpwssd m5, m14, m26 ; g2' pshufb m23, m9 vpdpwssd m6, m13, m22 ; f1 pshufb m24, m8 vpdpwssd m4, m13, m23 ; g1 pshufb m25, m8 vpdpwssd m7, m13, m24 ; f1' shufpd m22, m24, 0x55 vpdpwssd m5, m13, m25 ; g1' shufpd m23, m25, 0x55 vpdpwssd m6, m14, m22 ; f2 vpdpwssd m4, m14, m23 ; g2 vpdpwssd m7, m12, m22 ; f0' vpdpwssd m5, m12, m23 ; g0' vpermt2b m6, m27, m4 ; 56 vpermt2b m7, m27, m5 ; 56' vpdpwssd m19, m17, m6 ; B2 vpshrdd m4, m2, m6, 16 ; 45 vpdpwssd m21, m17, m7 ; B2' vpshrdd m5, m3, m7, 16 ; 45' vpdpwssd m18, m17, m4 ; A2 vpdpwssd m20, m17, m5 ; A2' vpermt2b m19, m28, m21 vpermt2b m18, m28, m20 mova [r8+wq*0], m18 mova [r8+wq*2], m19 lea r8, [r8+wq*4] sub hd, 2 jg .hv_w32_loop add srcq, 64 add tmpq, 64 movzx hd, r5b sub r5d, 1<<8 jg .hv_w32_loop0 %if WIN64 pop r8 %endif RET PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP, prep_8tap_16bpc 
PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH, prep_8tap_16bpc PREP_8TAP_FN regular_sharp, REGULAR, SHARP, prep_8tap_16bpc PREP_8TAP_FN sharp_regular, SHARP, REGULAR, prep_8tap_16bpc PREP_8TAP_FN sharp, SHARP, SHARP cglobal prep_8tap_16bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my %define base r7-prep_avx512icl imul mxd, mxm, 0x010101 add mxd, t0d ; 8tap_h, mx, 4tap_h imul myd, mym, 0x010101 add myd, t1d ; 8tap_v, my, 4tap_v lea r7, [prep_avx512icl] mov wd, wm movifnidn hd, hm test mxd, 0xf00 jnz .h test myd, 0xf00 jz mangle(private_prefix %+ _prep_6tap_16bpc_avx512icl).prep .v: movzx mxd, myb shr myd, 16 cmp hd, 4 cmove myd, mxd mov r5d, r7m vpbroadcastd m10, [prep_8tap_rnd] pmovsxbw xmm0, [base+subpel_filters+myq*8] tzcnt r6d, wd shr r5d, 11 movzx r6d, word [r7+r6*2+table_offset(prep, _8tap_v)] psllw xmm0, [base+prep_hv_shift+r5*8] add r7, r6 lea r6, [strideq*3] sub srcq, r6 mova [tmpq], xmm0 vpbroadcastd m12, xmm0 vpbroadcastd m13, [tmpq+ 4] vpbroadcastd m14, [tmpq+ 8] vpbroadcastd m15, [tmpq+12] jmp r7 .v_w4: mov r3d, 0x330c movq xm1, [srcq+strideq*0] kmovw k1, r3d vpbroadcastq ym1{k1}, [srcq+strideq*1] vpbroadcastq m0, [srcq+r6 ] vinserti32x4 m1{k1}, m0, [srcq+strideq*2], 2 ; 0 1 2 3 lea srcq, [srcq+strideq*4] vpbroadcastq ym0{k1}, [srcq+strideq*0] vpbroadcastq m2, [srcq+strideq*1] vinserti32x4 m0{k1}, m2, [srcq+strideq*2], 3 ; 3 4 5 6 mova ym5, [prep_endA] vshufi32x4 m3, m1, m0, q1021 ; 1 2 3 4 vshufi32x4 m2, m1, m0, q2132 ; 2 3 4 5 punpcklwd m1, m3 ; 01 12 23 34 punpcklwd m2, m0 ; 23 34 45 56 .v_w4_loop: movq xm4, [srcq+r6 ] lea srcq, [srcq+strideq*4] vpbroadcastq ym4{k1}, [srcq+strideq*0] vpbroadcastq m3, [srcq+strideq*1] vinserti32x4 m4{k1}, m3, [srcq+strideq*2], 3 ; 7 8 9 a mova m3, m10 vpdpwssd m3, m12, m1 ; a0 b0 c0 d0 valignq m1, m4, m0, 6 ; 6 7 8 9 vpdpwssd m3, m13, m2 ; a1 b1 c1 d1 mova m0, m4 punpcklwd m4, m1, m4 ; 67 78 89 9a vpdpwssd m3, m15, m4 ; a3 b3 c3 d3 vshufi32x4 m1, m2, m4, q1032 ; 45 56 67 78 vpdpwssd m3, m14, m1 ; a2 b2 c2 d2 mova m2, m4 vpermb m3, m5, m3 mova [tmpq], ym3 add tmpq, 32 sub hd, 4 jg .v_w4_loop RET .v_w8: movu xm0, [srcq+strideq*0] mov r3d, 0x33 vbroadcasti32x4 ym1, [srcq+strideq*1] kmovb k1, r3d mova m7, [spel_v_shuf8] vinserti64x2 m1{k1}, m0, [srcq+strideq*2], 2 ; 0 1 2 add srcq, r6 vbroadcasti32x4 ym2, [srcq+strideq*0] vbroadcasti32x4 m3, [srcq+strideq*1] vbroadcasti32x4 ym0, [srcq+strideq*2] vshufi64x2 m2{k1}, m1, m3, q1032 ; 2 3 4 vinserti64x2 m0{k1}, m3, [srcq+r6], 2 ; 4 5 6 mova m8, [prep_endB] vpermb m1, m7, m1 ; 01 12 vpermb m2, m7, m2 ; 23 34 vpermb m3, m7, m0 ; 45 56 .v_w8_loop: lea srcq, [srcq+strideq*4] vbroadcasti32x4 ym4, [srcq+strideq*0] movu xm5, [srcq+strideq*1] vshufi64x2 m4{k1}, m0, m5, q1032 ; 6 7 8 vbroadcasti32x4 ym0, [srcq+strideq*2] vinserti64x2 m0{k1}, m5, [srcq+r6], 2 ; 8 9 a mova m5, m10 vpdpwssd m5, m12, m1 ; a0 b0 mova m6, m10 vpdpwssd m6, m12, m2 ; c0 d0 mova m1, m3 vpdpwssd m5, m13, m2 ; c1 d1 vpdpwssd m6, m13, m3 ; c1 d1 vpermb m2, m7, m4 ; 67 78 vpdpwssd m5, m14, m3 ; a2 b2 vpermb m3, m7, m0 ; 89 9a vpdpwssd m6, m14, m2 ; c2 d2 vpdpwssd m5, m15, m2 ; a3 b3 vpdpwssd m6, m15, m3 ; c3 d3 vpermt2b m5, m8, m6 mova [tmpq], m5 add tmpq, 64 sub hd, 4 jg .v_w8_loop RET .v_w16: vbroadcasti32x8 m0, [srcq+strideq*1] vinserti32x8 m1, m0, [srcq+strideq*2], 1 vinserti32x8 m0, [srcq+strideq*0], 0 mova m8, [spel_v_shuf16] add srcq, r6 movu ym3, [srcq+strideq*0] vinserti32x8 m3, [srcq+strideq*1], 1 movu ym5, [srcq+strideq*2] add srcq, r6 vinserti32x8 m5, [srcq+strideq*0], 1 mova m11, [prep_endA] vpermb m1, m8, m1 ; 12 vpermb m0, m8, m0 ; 
01 vpermb m3, m8, m3 ; 34 vpermb m5, m8, m5 ; 56 vpshrdd m2, m1, m3, 16 ; 23 vpshrdd m4, m3, m5, 16 ; 45 .v_w16_loop: mova m7, m10 vpdpwssd m7, m12, m1 ; b0 mova m6, m10 vpdpwssd m6, m12, m0 ; a0 mova m1, m3 vpdpwssd m7, m13, m3 ; b1 mova m0, m2 vpdpwssd m6, m13, m2 ; a1 mova m3, m5 vpdpwssd m7, m14, m5 ; b2 mova m2, m4 vpdpwssd m6, m14, m4 ; a2 movu ym5, [srcq+strideq*1] lea srcq, [srcq+strideq*2] vinserti32x8 m5, [srcq+strideq*0], 1 vpermb m5, m8, m5 ; 78 vpshrdd m4, m3, m5, 16 ; 67 vpdpwssd m7, m15, m5 ; b3 vpdpwssd m6, m15, m4 ; a3 vpermt2b m6, m11, m7 mova [tmpq], m6 add tmpq, 64 sub hd, 2 jg .v_w16_loop RET .v_w32: .v_w64: .v_w128: WIN64_PUSH_XMM 23 %if WIN64 push r8 %endif mova m11, [prep_endC] lea r5, [hq+wq*8-256] .v_w32_loop0: movu m16, [srcq+strideq*0] movu m17, [srcq+strideq*1] lea r7, [srcq+r6] movu m18, [srcq+strideq*2] movu m19, [r7 +strideq*0] mov r8, tmpq movu m20, [r7 +strideq*1] movu m21, [r7 +strideq*2] add r7, r6 movu m22, [r7 +strideq*0] punpcklwd m0, m16, m17 ; 01l punpckhwd m16, m17 ; 01h punpcklwd m1, m17, m18 ; 12l punpckhwd m17, m18 ; 12h punpcklwd m2, m18, m19 ; 23l punpckhwd m18, m19 ; 23h punpcklwd m3, m19, m20 ; 34l punpckhwd m19, m20 ; 34h punpcklwd m4, m20, m21 ; 45l punpckhwd m20, m21 ; 45h punpcklwd m5, m21, m22 ; 56l punpckhwd m21, m22 ; 56h .v_w32_loop: mova m6, m10 vpdpwssd m6, m12, m0 ; a0l mova m8, m10 vpdpwssd m8, m12, m16 ; a0h mova m7, m10 vpdpwssd m7, m12, m1 ; b0l mova m9, m10 vpdpwssd m9, m12, m17 ; b0h mova m0, m2 vpdpwssd m6, m13, m2 ; a1l mova m16, m18 vpdpwssd m8, m13, m18 ; a1h mova m1, m3 vpdpwssd m7, m13, m3 ; b1l mova m17, m19 vpdpwssd m9, m13, m19 ; b1h mova m2, m4 vpdpwssd m6, m14, m4 ; a2l mova m18, m20 vpdpwssd m8, m14, m20 ; a2h mova m3, m5 vpdpwssd m7, m14, m5 ; b2l mova m19, m21 vpdpwssd m9, m14, m21 ; b2h movu m21, [r7+strideq*1] lea r7, [r7+strideq*2] punpcklwd m4, m22, m21 ; 67l punpckhwd m20, m22, m21 ; 67h movu m22, [r7+strideq*0] vpdpwssd m6, m15, m4 ; a3l vpdpwssd m8, m15, m20 ; a3h punpcklwd m5, m21, m22 ; 78l punpckhwd m21, m22 ; 78h vpdpwssd m7, m15, m5 ; b3l vpdpwssd m9, m15, m21 ; b3h vpermt2b m6, m11, m8 vpermt2b m7, m11, m9 mova [r8+wq*0], m6 mova [r8+wq*2], m7 lea r8, [r8+wq*4] sub hd, 2 jg .v_w32_loop add srcq, 64 add tmpq, 64 movzx hd, r5b sub r5d, 1<<8 jg .v_w32_loop0 %if WIN64 pop r8 %endif RET .h_w4: RESET_STACK_STATE movzx mxd, mxb sub srcq, 2 pmovsxbw xmm0, [base+subpel_filters+mxq*8] mov r5d, r7m vbroadcasti32x4 m4, [spel_h_shufA] vbroadcasti32x4 m5, [spel_h_shufB] shr r5d, 11 mova ym9, [prep_endA] psllw xmm0, [base+prep_hv_shift+r5*8] mova [tmpq], xmm0 vpbroadcastd m6, [tmpq+4] vpbroadcastd m7, [tmpq+8] .h_w4_loop: movu xm2, [srcq+strideq*0] vinserti32x4 ym2, [srcq+strideq*1], 1 vinserti32x4 m2, [srcq+strideq*2], 2 vinserti32x4 m2, [srcq+r6 ], 3 lea srcq, [srcq+strideq*4] mova m0, m10 pshufb m1, m2, m4 vpdpwssd m0, m6, m1 pshufb m2, m5 vpdpwssd m0, m7, m2 vpermb m0, m9, m0 mova [tmpq], ym0 add tmpq, 32 sub hd, 4 jg .h_w4_loop RET .h_w8: mova m6, [spel_h_shufA] movu m7, [spel_h_shufB] movu m8, [spel_h_shufC] mova m9, [spel_h_shufD] mova m11, [prep_endB] .h_w8_loop: movu ym4, [srcq+strideq*0] vinserti32x8 m4, [srcq+strideq*1], 1 movu ym5, [srcq+strideq*2] vinserti32x8 m5, [srcq+r6 ], 1 lea srcq, [srcq+strideq*4] mova m0, m10 mova m1, m10 vpermb m2, m6, m4 vpermb m3, m6, m5 vpdpwssd m0, m12, m2 vpdpwssd m1, m12, m3 vpermb m2, m7, m4 vpermb m3, m7, m5 vpdpwssd m0, m13, m2 vpdpwssd m1, m13, m3 vpermb m2, m8, m4 vpermb m3, m8, m5 vpdpwssd m0, m14, m2 vpdpwssd m1, m14, m3 vpermb m2, m9, m4 vpermb m3, m9, m5 
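; last tap, then prep_endB packs both row pairs into the prep buffer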
vpdpwssd m0, m15, m2 vpdpwssd m1, m15, m3 vpermt2b m0, m11, m1 mova [tmpq], m0 add tmpq, 64 sub hd, 4 jg .h_w8_loop RET .h: vpbroadcastd m10, [prep_8tap_rnd] test myd, 0xf00 jnz .hv lea r6, [strideq*3] cmp wd, 4 je .h_w4 shr mxd, 16 pmovsxbw xmm0, [base+subpel_filters+mxq*8] mov r5d, r7m sub srcq, 6 shr r5d, 11 psllw xmm0, [base+prep_hv_shift+r5*8] mova [tmpq], xmm0 vpbroadcastd m12, xmm0 vpbroadcastd m13, [tmpq+ 4] vpbroadcastd m14, [tmpq+ 8] vpbroadcastd m15, [tmpq+12] cmp wd, 16 jl .h_w8 vbroadcasti32x4 m6, [spel_h_shufA] vbroadcasti32x4 m7, [spel_h_shufB] mova m11, [prep_endC] jg .h_w32 .h_w16_loop: movu ym2, [srcq+strideq*0+ 0] vinserti32x8 m2, [srcq+strideq*1+ 0], 1 movu ym3, [srcq+strideq*0+16] vinserti32x8 m3, [srcq+strideq*1+16], 1 lea srcq, [srcq+strideq*2] mova m0, m10 mova m1, m10 pshufb m4, m2, m6 vpdpwssd m0, m12, m4 ; a0 pshufb m4, m3, m6 vpdpwssd m1, m14, m4 ; b2 pshufb m4, m2, m7 vpdpwssd m0, m13, m4 ; a1 pshufb m4, m3, m7 vpdpwssd m1, m15, m4 ; b3 shufpd m2, m3, 0x55 pshufb m4, m2, m6 vpdpwssd m0, m14, m4 ; a2 vpdpwssd m1, m12, m4 ; b0 pshufb m2, m7 vpdpwssd m0, m15, m2 ; a3 vpdpwssd m1, m13, m2 ; b1 vpermt2b m0, m11, m1 mova [tmpq], m0 add tmpq, 64 sub hd, 2 jg .h_w16_loop RET .h_w32: lea srcq, [srcq+wq*2] neg wq .h_w32_loop0: mov r6, wq .h_w32_loop: movu m2, [srcq+r6*2+ 0] movu m3, [srcq+r6*2+ 8] mova m0, m10 mova m1, m10 pshufb m4, m2, m6 vpdpwssd m0, m12, m4 ; a0 pshufb m4, m3, m6 vpdpwssd m1, m12, m4 ; b0 vpdpwssd m0, m14, m4 ; a2 movu m4, [srcq+r6*2+16] pshufb m3, m7 vpdpwssd m1, m13, m3 ; b1 vpdpwssd m0, m15, m3 ; a3 pshufb m3, m4, m6 vpdpwssd m1, m14, m3 ; b2 pshufb m2, m7 vpdpwssd m0, m13, m2 ; a1 pshufb m4, m7 vpdpwssd m1, m15, m4 ; b3 vpermt2b m0, m11, m1 mova [tmpq], m0 add tmpq, 64 add r6, 32 jl .h_w32_loop add srcq, strideq dec hd jg .h_w32_loop0 RET .hv: vpbroadcastd m11, [pd_128] cmp wd, 4 jg .hv_w8 movzx mxd, mxb pmovsxbw xmm0, [base+subpel_filters+mxq*8] movzx mxd, myb shr myd, 16 cmp hd, 4 cmove myd, mxd mov r5d, r7m pmovsxbw xmm1, [base+subpel_filters+myq*8] lea r6, [strideq*3] sub srcq, 2 shr r5d, 11 sub srcq, r6 psllw xmm0, [base+prep_hv_shift+r5*8] psllw xmm1, 2 mova [tmpq+ 0], xmm0 mova [tmpq+16], xmm1 vpbroadcastd m12, xmm1 movu xm16, [srcq+strideq*0] mov r3d, 0xff0 vinserti128 ym16, [srcq+strideq*1], 1 kmovw k1, r3d vbroadcasti32x4 m18, [srcq+strideq*2] add srcq, r6 vinserti64x2 m16{k1}, m18, [srcq+strideq*0], 3 movu xm17, [srcq+strideq*1] vbroadcasti32x4 ym18, [srcq+strideq*2] add srcq, r6 vinserti32x4 m17{k1}, m18, [srcq+strideq*0], 2 vbroadcasti32x4 m5, [spel_h_shufA] vbroadcasti32x4 m6, [spel_h_shufB] vpbroadcastd m8, [tmpq+ 4] vpbroadcastd m9, [tmpq+ 8] mova m1, m10 mova m19, [spel_shuf4a] mova m2, m10 pshufb m0, m16, m5 vpdpwssd m1, m8, m0 pshufb m0, m17, m5 vpdpwssd m2, m8, m0 vpbroadcastd m13, [tmpq+20] pshufb m16, m6 vpbroadcastd m14, [tmpq+24] pshufb m17, m6 vpbroadcastd m15, [tmpq+28] vpdpwssd m1, m9, m16 ; 0 1 2 3 vpdpwssd m2, m9, m17 ; 4 5 6 mova m7, [spel_shuf4b] vpermt2b m1, m19, m2 ; 01 12 23 34 vpermb m2, m19, m2 ; 45 56 mova ym19, [prep_endA] vshufi32x4 m2, m1, m2, q1032 ; 23 34 45 56 .hv_w4_loop: movu xm17, [srcq+strideq*1] vinserti128 ym17, [srcq+strideq*2], 1 vbroadcasti32x4 m16, [srcq+r6 ] lea srcq, [srcq+strideq*4] vinserti64x2 m17{k1}, m16, [srcq+strideq*0], 3 mova m18, m10 pshufb m16, m17, m5 vpdpwssd m18, m8, m16 mova m16, m11 vpdpwssd m16, m12, m1 ; a0 b0 c0 d0 pshufb m17, m6 vpdpwssd m18, m9, m17 ; 7 8 9 a mova m1, m2 vpdpwssd m16, m13, m2 ; a1 b1 c1 d1 vpermt2b m2, m7, m18 ; 67 78 89 9a vpdpwssd m16, m15, m2 ; a3 b3 
c3 d3 vshufi32x4 m1, m2, q1032 ; 45 56 67 78 vpdpwssd m16, m14, m1 ; a2 b2 c2 d2 vpermb m16, m19, m16 mova [tmpq], ym16 add tmpq, 32 sub hd, 4 jg .hv_w4_loop vzeroupper RET .hv_w8: shr mxd, 16 pmovsxbw xmm0, [base+subpel_filters+mxq*8] movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd mov r5d, r7m pmovsxbw xmm1, [base+subpel_filters+myq*8] lea r6, [strideq*3] sub srcq, 6 shr r5d, 11 sub srcq, r6 psllw xmm0, [base+prep_hv_shift+r5*8] psllw xmm1, 2 mova [tmpq+ 0], xmm0 mova [tmpq+16], xmm1 vpbroadcastd m12, xmm0 vpbroadcastd m13, [tmpq+ 4] vpbroadcastd m14, [tmpq+ 8] vpbroadcastd m15, [tmpq+12] vpbroadcastd m16, xmm1 vpbroadcastd m17, [tmpq+20] vpbroadcastd m18, [tmpq+24] vpbroadcastd m19, [tmpq+28] cmp wd, 8 jg .hv_w16 WIN64_SPILL_XMM 23 mova m5, [spel_h_shufA] movu ym0, [srcq+strideq*0] vinserti32x8 m0, [srcq+strideq*1], 1 ; 0 1 movu ym9, [srcq+strideq*2] add srcq, r6 vinserti32x8 m9, [srcq+strideq*0], 1 ; 2 3 movu ym20, [srcq+strideq*1] vinserti32x8 m20, [srcq+strideq*2], 1 ; 4 5 add srcq, r6 movu ym21, [srcq+strideq*0] ; 6 movu m6, [spel_h_shufB] movu m7, [spel_h_shufC] mova ym22, [prep_endB] vpermb m8, m5, m0 mova m1, m10 vpdpwssd m1, m12, m8 ; a0 b0 vpermb m8, m5, m9 mova m2, m10 vpdpwssd m2, m12, m8 ; c0 d0 vpermb m8, m5, m20 mova m3, m10 vpdpwssd m3, m12, m8 ; e0 f0 vpermb m8, m5, m21 mova m4, m10 vpdpwssd m4, m12, m8 ; g0 vpermb m8, m6, m0 vpdpwssd m1, m13, m8 ; a1 b1 vpermb m8, m6, m9 vpdpwssd m2, m13, m8 ; c1 d1 vpermb m8, m6, m20 vpdpwssd m3, m13, m8 ; e1 f1 vpermb m8, m6, m21 vpdpwssd m4, m13, m8 ; g1 vpermb m8, m7, m0 vpdpwssd m1, m14, m8 ; a2 b2 vpermb m8, m7, m9 vpdpwssd m2, m14, m8 ; c2 d2 vpermb m8, m7, m20 vpdpwssd m3, m14, m8 ; e2 f2 vpermb m8, m7, m21 vpdpwssd m4, m14, m8 ; g2 mova m8, [spel_h_shufD] vpermb m0, m8, m0 vpdpwssd m1, m15, m0 ; a3 b3 mova m0, [spel_shuf8a] vpermb m9, m8, m9 vpdpwssd m2, m15, m9 ; c3 d3 mova m9, [spel_shuf8b] vpermb m20, m8, m20 vpdpwssd m3, m15, m20 ; e3 f3 vpermb m21, m8, m21 vpdpwssd m4, m15, m21 ; g3 vpermt2b m1, m0, m2 ; 01 12 vpermt2b m2, m0, m3 ; 23 34 vpermt2b m3, m0, m4 ; 45 56 .hv_w8_loop: movu ym0, [srcq+strideq*1] lea srcq, [srcq+strideq*2] vinserti32x8 m0, [srcq+strideq*0], 1 mova m4, m10 mova m20, m11 vpermb m21, m5, m0 vpdpwssd m4, m12, m21 ; h0 i0 vpermb m21, m6, m0 vpdpwssd m20, m16, m1 ; A0 B0 vpdpwssd m4, m13, m21 ; h1 i1 vpermb m21, m7, m0 mova m1, m2 vpdpwssd m20, m17, m2 ; A1 B1 vpdpwssd m4, m14, m21 ; h2 i2 vpermb m21, m8, m0 mova m2, m3 vpdpwssd m20, m18, m3 ; A2 B2 vpdpwssd m4, m15, m21 ; h3 i3 vpermt2b m3, m9, m4 ; 67 78 vpdpwssd m20, m19, m3 ; A3 B3 vpermb m20, m22, m20 mova [tmpq], ym20 add tmpq, 32 sub hd, 2 jg .hv_w8_loop RET .hv_w16: WIN64_SPILL_XMM 27 %if WIN64 push r8 %endif vbroadcasti32x4 m20, [spel_h_shufA] vbroadcasti32x4 m21, [spel_h_shufB] add wd, wd mova m9, [spel_shuf16] mova m26, [prep_endB] lea r5d, [hq+wq*8-256] .hv_w16_loop0: vbroadcasti32x8 m5, [srcq+strideq*0+ 8] vinserti32x8 m4, m5, [srcq+strideq*0+ 0], 0 vinserti32x8 m5, [srcq+strideq*0+16], 1 ; 0 movu ym6, [srcq+strideq*1+ 0] movu ym7, [srcq+strideq*1+16] lea r7, [srcq+r6] vinserti32x8 m6, [srcq+strideq*2+ 0], 1 vinserti32x8 m7, [srcq+strideq*2+16], 1 ; 1 2 movu ym22, [r7 +strideq*0+ 0] movu ym23, [r7 +strideq*0+16] mov r8, tmpq vinserti32x8 m22, [r7 +strideq*1+ 0], 1 vinserti32x8 m23, [r7 +strideq*1+16], 1 ; 3 4 movu ym24, [r7 +strideq*2+ 0] movu ym25, [r7 +strideq*2+16] add r7, r6 vinserti32x8 m24, [r7 +strideq*0+ 0], 1 vinserti32x8 m25, [r7 +strideq*0+16], 1 ; 5 6 pshufb m0, m4, m20 mova m1, m10 vpdpwssd m1, m12, m0 ; a0 pshufb m0, m6, 
m20 mova m2, m10 vpdpwssd m2, m12, m0 ; b0 pshufb m0, m7, m20 mova m3, m10 vpdpwssd m3, m14, m0 ; c2 pshufb m0, m4, m21 vpdpwssd m1, m13, m0 ; a1 pshufb m0, m6, m21 vpdpwssd m2, m13, m0 ; b1 pshufb m0, m7, m21 vpdpwssd m3, m15, m0 ; c3 pshufb m0, m5, m20 vpdpwssd m1, m14, m0 ; a2 shufpd m6, m7, 0x55 pshufb m7, m6, m20 vpdpwssd m2, m14, m7 ; b2 vpdpwssd m3, m12, m7 ; c0 pshufb m5, m21 vpdpwssd m1, m15, m5 ; a3 pshufb m6, m21 vpdpwssd m2, m15, m6 ; b3 vpdpwssd m3, m13, m6 ; c1 pshufb m0, m22, m20 mova m4, m10 vpdpwssd m4, m12, m0 ; d0 pshufb m0, m23, m20 mova m5, m10 vpdpwssd m5, m14, m0 ; e2 pshufb m0, m24, m20 mova m6, m10 vpdpwssd m6, m12, m0 ; f0 pshufb m0, m25, m20 mova m7, m10 vpdpwssd m7, m14, m0 ; g2 pshufb m0, m22, m21 vpdpwssd m4, m13, m0 ; d1 pshufb m0, m23, m21 vpdpwssd m5, m15, m0 ; e3 pshufb m0, m24, m21 vpdpwssd m6, m13, m0 ; f1 pshufb m0, m25, m21 vpdpwssd m7, m15, m0 ; g3 shufpd m22, m23, 0x55 pshufb m23, m22, m20 vpdpwssd m4, m14, m23 ; d2 vpdpwssd m5, m12, m23 ; e0 shufpd m24, m25, 0x55 pshufb m25, m24, m20 vpdpwssd m6, m14, m25 ; f2 vpdpwssd m7, m12, m25 ; g0 pshufb m22, m21 vpdpwssd m4, m15, m22 ; d3 vpdpwssd m5, m13, m22 ; e1 pshufb m24, m21 vpdpwssd m6, m15, m24 ; f3 vpdpwssd m7, m13, m24 ; g1 pslldq m1, 1 vpermt2b m2, m9, m3 ; 12 vpermt2b m4, m9, m5 ; 34 vpermt2b m6, m9, m7 ; 56 vpshrdd m1, m2, 16 ; 01 vpshrdd m3, m2, m4, 16 ; 23 vpshrdd m5, m4, m6, 16 ; 45 .hv_w16_loop: movu ym24, [r7+strideq*1+ 0] movu ym25, [r7+strideq*1+16] lea r7, [r7+strideq*2] vinserti32x8 m24, [r7+strideq*0+ 0], 1 vinserti32x8 m25, [r7+strideq*0+16], 1 mova m7, m10 mova m8, m10 pshufb m0, m24, m20 vpdpwssd m7, m12, m0 ; h0 mova m22, m11 pshufb m0, m25, m20 vpdpwssd m8, m14, m0 ; i2 mova m23, m11 vpdpwssd m22, m16, m1 ; A0 mova m1, m3 vpdpwssd m23, m16, m2 ; B0 mova m2, m4 pshufb m0, m24, m21 vpdpwssd m7, m13, m0 ; h1 pshufb m0, m25, m21 vpdpwssd m8, m15, m0 ; i3 vpdpwssd m22, m17, m3 ; A1 mova m3, m5 vpdpwssd m23, m17, m4 ; B1 mova m4, m6 shufpd m24, m25, 0x55 pshufb m25, m24, m20 vpdpwssd m7, m14, m25 ; h2 vpdpwssd m8, m12, m25 ; i0 vpdpwssd m22, m18, m5 ; A2 vpdpwssd m23, m18, m6 ; B2 pshufb m24, m21 vpdpwssd m7, m15, m24 ; h3 vpdpwssd m8, m13, m24 ; i1 vpermt2b m7, m9, m8 ; 78 vpshrdd m5, m6, m7, 16 ; 67 vpdpwssd m22, m19, m5 ; A3 vpdpwssd m23, m19, m7 ; B3 mova m6, m7 vpermt2b m22, m26, m23 mova [r8+wq*0], ym22 vextracti32x8 [r8+wq*1], m22, 1 lea r8, [r8+wq*2] sub hd, 2 jg .hv_w16_loop add srcq, 32 add tmpq, 32 movzx hd, r5b sub r5d, 1<<8 jg .hv_w16_loop0 %if WIN64 pop r8 %endif RET %if WIN64 DECLARE_REG_TMP 5 %else DECLARE_REG_TMP 7 %endif cglobal warp_affine_8x8t_16bpc, 4, 7, 22, tmp, ts %define base r6-pd_0to7 mov t0d, r7m lea r6, [pd_0to7] shr t0d, 11 vpbroadcastd m8, [base+warp_8x8t_rnd_v] vpbroadcastd m1, [base+warp_8x8_rnd_h+t0*4] call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main psrad m14, m16, 15 call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2 psrad m16, 15 packssdw m14, m16 call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2 psrad m15, m16, 15 call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2 add tsq, tsq psrad m16, 15 packssdw m15, m16 jmp mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).end cglobal warp_affine_8x8_16bpc, 4, 7, 22, dst, ds, src, ss, abcd mov t0d, r7m ; pixel_max lea r6, [pd_0to7] shr t0d, 11 vpbroadcastd m1, [base+warp_8x8_rnd_h+t0*4] vpbroadcastd m8, [base+warp_8x8_rnd_v+t0*4] call .main psrad m14, m16, 13 call .main2 psrad m16, 13 packusdw m14, m16 call .main2 psrad m15, 
m16, 13 call .main2 vpbroadcastd m0, [base+bidir_shift+t0*4] vpsrlvw m14, m0 psrad m16, 13 packusdw m15, m16 vpsrlvw m15, m0 .end: mova m0, [base+warp8x8_end] vpermb m16, m0, m14 lea r2, [dsq*3] mova [dstq+dsq*0], xm16 vextracti128 [dstq+dsq*1], ym16, 1 vextracti32x4 [dstq+dsq*2], m16, 2 vextracti32x4 [dstq+r2 ], m16, 3 vpermb m16, m0, m15 lea dstq, [dstq+dsq*4] mova [dstq+dsq*0], xm16 vextracti128 [dstq+dsq*1], ym16, 1 vextracti32x4 [dstq+dsq*2], m16, 2 vextracti32x4 [dstq+r2 ], m16, 3 RET .main: vpbroadcastd ym3, [base+pd_512] %if WIN64 mov abcdq, r5mp vpaddd ym18, ym3, r6m {1to8} ; mx %else add r5d, 512 vpbroadcastd ym18, r5d %endif vpaddd ym20, ym3, r7m {1to8} ; my mova ym16, [base+pd_0to7] vpbroadcastd ym19, [abcdq+4*0] ; alpha vpbroadcastd ym21, [abcdq+4*1] ; gamma lea r4, [ssq*3+6] vpdpwssd ym18, ym19, ym16 ; tmx vpdpwssd ym20, ym21, ym16 ; tmy sub srcq, r4 mova m10, [base+warp8x8_permA] lea r4, [mc_warp_filter+64*8] vbroadcasti32x4 m12, [base+warp8x8_permC] kxnorb k1, k1, k1 vbroadcasti32x4 m13, [base+warp8x8_permD] movu ym5, [srcq+0] vinserti32x8 m5, [srcq+8], 1 psrad ym17, ym18, 10 mova m11, [base+warp8x8_permB] kmovb k2, k1 vpgatherdq m3{k1}, [r4+ym17*8] ; filter_x0 psrad ym19, 16 ; beta psrad ym21, 16 ; delta paddd ym18, ym19 vpermb m4, m10, m5 vpbroadcastq m9, [base+warp_shift_h+t0*8] pshufd m3, m3, q3120 paddd m7, m1, m1 pshufb m2, m3, m12 vpdpwssd m1, m4, m2 vpermb m5, m11, m5 vshufi32x4 m4, m5, q1021 pshufb m3, m13 vpdpwssd m1, m4, m3 call .h psllq m2, m1, 32 paddd m1, m2 vpmultishiftqb m1, m9, m1 vpshrdq m1, m0, 48 ; 01 12 call .h vpshrdq m2, m1, m0, 48 ; 23 34 call .h vpshrdq m3, m2, m0, 48 ; 45 56 .main2: call .h psrad ym6, ym20, 10 kmovb k1, k2 paddd ym17, ym20, ym21 ; my += delta vpgatherdq m20{k2}, [r4+ym6*8] ; filter_y0 psrad ym16, ym17, 10 kmovb k2, k1 vpgatherdq m6{k1}, [r4+ym16*8] ; filter_y1 shufps m5, m20, m6, q2020 mova m16, m8 pshufb m4, m5, m12 vpdpwssd m16, m1, m4 ; a0 b0 pshufb m5, m13 mova m1, m2 vpdpwssd m16, m2, m5 ; a1 b1 shufps m6, m20, m6, q3131 paddd ym20, ym17, ym21 pshufb m4, m6, m12 mova m2, m3 vpdpwssd m16, m3, m4 ; a2 b2 vpshrdq m3, m0, 48 ; 67 78 pshufb m6, m13 vpdpwssd m16, m3, m6 ; a3 b3 ret ALIGN function_align .h: movu ym16, [srcq+ssq*1] psrad ym6, ym18, 10 lea srcq, [srcq+ssq*2] vinserti32x8 m5, m16, [srcq+ssq*0], 1 kmovb k1, k2 paddd ym17, ym18, ym19 ; mx += beta vpgatherdq m18{k2}, [r4+ym6*8] ; filter_x1 psrad ym16, ym17, 10 kmovb k2, k1 vpgatherdq m6{k1}, [r4+ym16*8] ; filter_x2 vpermb m4, m10, m5 shufps m16, m18, m6, q2020 shufps m6, m18, m6, q3131 mova m0, m7 pshufb m18, m16, m12 vpdpwssd m0, m4, m18 ; a0 b0 vpermb m5, m11, m5 pshufb m18, m6, m13 vpdpwssd m0, m5, m18 ; a3 b3 paddd ym18, ym17, ym19 vshufi32x4 m17, m4, m5, q1021 pshufb m16, m13 vpdpwssd m0, m17, m16 ; a1 b1 vshufi32x4 m4, m5, q2132 pshufb m6, m12 vpdpwssd m0, m4, m6 ; a2 b2 vpmultishiftqb m0, m9, m0 ; a a b b ret %macro BIDIR_FN 0 call .main lea stride3q, [strideq*3] jmp wq .w4: movq [dstq ], xm0 movhps [dstq+strideq*1], xm0 vextracti32x4 xm2, ym0, 1 movq [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm2 cmp hd, 8 jl .w4_end vextracti32x4 xm2, m0, 2 lea dstq, [dstq+strideq*4] movq [dstq ], xm2 movhps [dstq+strideq*1], xm2 vextracti32x4 xm0, m0, 3 movq [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm0 je .w4_end lea dstq, [dstq+strideq*4] movq [dstq ], xm1 movhps [dstq+strideq*1], xm1 vextracti32x4 xm0, ym1, 1 movq [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm0 vextracti32x4 xm0, m1, 2 lea dstq, [dstq+strideq*4] movq [dstq ], xm0 movhps [dstq+strideq*1], xm0 
vextracti32x4 xm1, m1, 3 movq [dstq+strideq*2], xm1 movhps [dstq+stride3q ], xm1 .w4_end: RET .w8_loop: call .main lea dstq, [dstq+strideq*4] .w8: mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], ym0, 1 vextracti32x4 [dstq+strideq*2], m0, 2 vextracti32x4 [dstq+stride3q ], m0, 3 sub hd, 8 jl .w8_end lea dstq, [dstq+strideq*4] mova [dstq+strideq*0], xm1 vextracti32x4 [dstq+strideq*1], ym1, 1 vextracti32x4 [dstq+strideq*2], m1, 2 vextracti32x4 [dstq+stride3q ], m1, 3 jg .w8_loop .w8_end: RET .w16_loop: call .main lea dstq, [dstq+strideq*4] .w16: mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], ym1 vextracti32x8 [dstq+stride3q ], m1, 1 sub hd, 4 jg .w16_loop RET .w32_loop: call .main lea dstq, [dstq+strideq*2] .w32: mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 sub hd, 2 jg .w32_loop RET .w64_loop: call .main add dstq, strideq .w64: mova [dstq+64*0], m0 mova [dstq+64*1], m1 dec hd jg .w64_loop RET .w128_loop: call .main add dstq, strideq .w128: mova [dstq+64*0], m0 mova [dstq+64*1], m1 call .main mova [dstq+64*2], m0 mova [dstq+64*3], m1 dec hd jg .w128_loop RET %endmacro %if WIN64 DECLARE_REG_TMP 5 %else DECLARE_REG_TMP 7 %endif cglobal avg_16bpc, 4, 7, 4, dst, stride, tmp1, tmp2, w, h, stride3 %define base r6-avg_avx512icl_table lea r6, [avg_avx512icl_table] tzcnt wd, wm mov t0d, r6m ; pixel_max movsxd wq, [r6+wq*4] shr t0d, 11 vpbroadcastd m2, [base+avg_round+t0*4] vpbroadcastd m3, [base+avg_shift+t0*4] movifnidn hd, hm add wq, r6 BIDIR_FN ALIGN function_align .main: mova m0, [tmp1q+64*0] paddsw m0, [tmp2q+64*0] mova m1, [tmp1q+64*1] paddsw m1, [tmp2q+64*1] add tmp1q, 64*2 add tmp2q, 64*2 pmaxsw m0, m2 pmaxsw m1, m2 psubsw m0, m2 psubsw m1, m2 vpsrlvw m0, m3 vpsrlvw m1, m3 ret cglobal w_avg_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, h, stride3 %define base r6-w_avg_avx512icl_table lea r6, [w_avg_avx512icl_table] tzcnt wd, wm mov t0d, r7m ; pixel_max shr t0d, 11 movsxd wq, [r6+wq*4] vpbroadcastd m5, [base+w_avg_round+t0*4] vpbroadcastd m7, [base+bidir_shift+t0*4] add wq, r6 mov r6d, r6m ; weight lea t0d, [r6-16] shl r6d, 16 sub r6d, t0d ; 16-weight, weight movifnidn hd, hm vpbroadcastd m6, r6d BIDIR_FN ALIGN function_align .main: mova m3, [tmp1q+64*0] mova m1, [tmp2q+64*0] mova m0, [tmp1q+64*1] mova m4, [tmp2q+64*1] add tmp1q, 64*2 add tmp2q, 64*2 punpcklwd m2, m1, m3 punpckhwd m1, m3 punpcklwd m3, m4, m0 punpckhwd m4, m0 mova m0, m5 vpdpwssd m0, m6, m2 mova m2, m5 vpdpwssd m2, m6, m1 mova m1, m5 vpdpwssd m1, m6, m3 mova m3, m5 vpdpwssd m3, m6, m4 REPX {psrad x, 2}, m0, m2, m1, m3 packusdw m0, m2 packusdw m1, m3 vpsrlvw m0, m7 vpsrlvw m1, m7 ret cglobal mask_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3 %define base r7-mask_avx512icl_table lea r7, [mask_avx512icl_table] tzcnt wd, wm mov r6d, r7m ; pixel_max movifnidn hd, hm shr r6d, 11 movsxd wq, [r7+wq*4] vpbroadcastd m8, [base+pw_64] vpbroadcastd m9, [base+mask_round+r6*4] vpbroadcastd m10, [base+bidir_shift+r6*4] mov maskq, maskmp add wq, r7 BIDIR_FN ALIGN function_align .main: pmovzxbw m1, [maskq+32*0] mova m4, [tmp1q+64*0] mova m2, [tmp2q+64*0] pmovzxbw m6, [maskq+32*1] mova m5, [tmp1q+64*1] mova m3, [tmp2q+64*1] add maskq, 32*2 add tmp1q, 64*2 add tmp2q, 64*2 punpcklwd m7, m4, m2 punpckhwd m4, m2 psubw m0, m8, m1 punpcklwd m2, m1, m0 ; m, 64-m punpckhwd m1, m0 mova m0, m9 vpdpwssd m0, m7, m2 mova m2, m9 vpdpwssd m2, m4, m1 ; tmp1 * m + tmp2 * (64-m) punpcklwd m7, m5, m3 punpckhwd m5, m3 psubw m1, m8, m6 punpcklwd m3, m6, m1 punpckhwd m6, m1 mova m1, m9 vpdpwssd m1, 
m7, m3 mova m3, m9 vpdpwssd m3, m5, m6 REPX {psrad x, 4}, m0, m2, m1, m3 packusdw m0, m2 packusdw m1, m3 vpsrlvw m0, m10 vpsrlvw m1, m10 ret cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3 %define base r7-w_mask_420_avx512icl_table lea r7, [w_mask_420_avx512icl_table] tzcnt wd, wm mov r6d, r8m ; pixel_max movifnidn hd, hm shr r6d, 11 movsxd wq, [r7+wq*4] vpbroadcastd m10, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32 vpbroadcastd m11, [base+pw_64] vpbroadcastd m12, [base+mask_round+r6*4] vpbroadcastd m13, [base+bidir_shift+r6*4] mov r6d, r7m ; sign vpbroadcastd m14, [base+w_mask_round+r6*4] mova ym15, [w_mask_end42x] mov maskq, maskmp add wq, r7 call .main lea stride3q, [strideq*3] jmp wq .w4: mova m4, [w_mask_shuf4] vpermt2b m2, m4, m3 mova m3, m14 vpdpbusd m3, m2, [pb_64] {1to16} vpermb m3, m15, m3 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 vextracti32x4 xm2, ym0, 1 movq [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm2 mova [maskq], xm3 cmp hd, 8 jl .w4_end vextracti32x4 xm2, m0, 2 lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm2 movhps [dstq+strideq*1], xm2 vextracti32x4 xm0, m0, 3 movq [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm0 je .w4_end lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm1 movhps [dstq+strideq*1], xm1 vextracti32x4 xm2, ym1, 1 movq [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm2 vextracti32x4 xm2, m1, 2 lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm2 movhps [dstq+strideq*1], xm2 vextracti32x4 xm1, m1, 3 movq [dstq+strideq*2], xm1 movhps [dstq+stride3q ], xm1 .w4_end: RET .w8: mova m8, [w_mask_shuf8] vpbroadcastd m9, [pb_64] jmp .w8_start .w8_loop: call .main lea dstq, [dstq+strideq*4] add maskq, 16 .w8_start: vpermt2b m2, m8, m3 mova m3, m14 vpdpbusd m3, m2, m9 vpermb m3, m15, m3 mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], ym0, 1 vextracti32x4 [dstq+strideq*2], m0, 2 vextracti32x4 [dstq+stride3q ], m0, 3 mova [maskq], xm3 sub hd, 8 jl .w8_end lea dstq, [dstq+strideq*4] mova [dstq+strideq*0], xm1 vextracti32x4 [dstq+strideq*1], ym1, 1 vextracti32x4 [dstq+strideq*2], m1, 2 vextracti32x4 [dstq+stride3q ], m1, 3 jg .w8_loop .w8_end: RET .w16: mova m8, [w_mask_shuf16] vpbroadcastd m9, [pb_64] jmp .w16_start .w16_loop: call .main lea dstq, [dstq+strideq*4] add maskq, 16 .w16_start: vpermt2b m2, m8, m3 mova m3, m14 vpdpbusd m3, m2, m9 vpermb m3, m15, m3 mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], ym1 vextracti32x8 [dstq+stride3q ], m1, 1 mova [maskq], xm3 sub hd, 4 jg .w16_loop RET .w32_loop: call .main lea dstq, [dstq+strideq*4] add maskq, 32 .w32: paddw m2, m3 mova m8, m14 vpdpwssd m8, m11, m2 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 call .main paddw m2, m3 mova m3, m14 vpdpwssd m3, m11, m2 vpermt2b m8, m15, m3 mova [dstq+strideq*2], m0 mova [dstq+stride3q ], m1 mova [maskq], ym8 sub hd, 4 jg .w32_loop RET .w64_loop: call .main lea dstq, [dstq+strideq*2] add maskq, 32 .w64: mova m8, m2 mova m9, m3 mova [dstq+strideq*0+64*0], m0 mova [dstq+strideq*0+64*1], m1 call .main paddw m8, m2 paddw m9, m3 mova m2, m14 vpdpwssd m2, m11, m8 mova m3, m14 vpdpwssd m3, m11, m9 vpermt2b m2, m15, m3 mova [dstq+strideq*1+64*0], m0 mova [dstq+strideq*1+64*1], m1 mova [maskq], ym2 sub hd, 2 jg .w64_loop RET .w128_loop: call .main lea dstq, [dstq+strideq*2] add maskq, 64 .w128: mova m16, m2 mova m8, m3 mova [dstq+strideq*0+64*0], m0 mova [dstq+strideq*0+64*1], m1 call .main mova m17, m2 mova m9, m3 mova [dstq+strideq*0+64*2], m0 mova 
[dstq+strideq*0+64*3], m1
    call          .main
    paddw         m2, m16
    paddw         m3, m8
    mova          m16, m14
    vpdpwssd      m16, m11, m2
    mova          m8, m14
    vpdpwssd      m8, m11, m3
    mova          [dstq+strideq*1+64*0], m0
    mova          [dstq+strideq*1+64*1], m1
    call          .main
    paddw         m2, m17
    paddw         m3, m9
    mova          m17, m14
    vpdpwssd      m17, m11, m2
    mova          m9, m14
    vpdpwssd      m9, m11, m3
    vpermt2b      m16, m15, m8
    vpermt2b      m17, m15, m9
    mova          [dstq+strideq*1+64*2], m0
    mova          [dstq+strideq*1+64*3], m1
    mova          [maskq+32*0], ym16
    mova          [maskq+32*1], ym17
    sub           hd, 2
    jg            .w128_loop
    vzeroupper
    RET
ALIGN function_align
.main:
    mova          m1, [tmp1q+64*0]
    mova          m3, [tmp2q+64*0]
    mova          m4, [tmp1q+64*1]
    mova          m7, [tmp2q+64*1]
    add           tmp1q, 64*2
    add           tmp2q, 64*2
    psubsw        m6, m1, m3
    punpcklwd     m5, m3, m1
    pabsw         m6, m6
    punpckhwd     m3, m1
    psubusw       m6, m10, m6
    psrlw         m6, 10  ; 64-m
    psubw         m2, m11, m6  ; m
    punpcklwd     m1, m6, m2
    punpckhwd     m6, m2
    mova          m0, m12
    vpdpwssd      m0, m5, m1
    mova          m1, m12
    vpdpwssd      m1, m3, m6
    psubsw        m5, m4, m7
    punpcklwd     m6, m7, m4
    pabsw         m5, m5
    punpckhwd     m7, m4
    psubusw       m5, m10, m5
    psrlw         m5, 10
    psubw         m3, m11, m5
    punpcklwd     m4, m5, m3
    psrad         m0, 4
    punpckhwd     m5, m3
    psrad         m1, 4
    packusdw      m0, m1
    mova          m1, m12
    vpdpwssd      m1, m6, m4
    mova          m4, m12
    vpdpwssd      m4, m7, m5
    psrad         m1, 4
    psrad         m4, 4
    packusdw      m1, m4
    vpsrlvw       m0, m13
    vpsrlvw       m1, m13
    ret

cglobal w_mask_422_16bpc, 4, 8, 15, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_422_avx512icl_table
    lea           r7, [w_mask_422_avx512icl_table]
    tzcnt         wd, wm
    mov           r6d, r8m  ; pixel_max
    movifnidn     hd, hm
    shr           r6d, 11
    movsxd        wq, [r7+wq*4]
    vpbroadcastd  m8, [base+pw_27615]  ; ((64 - 38) << 10) + 1023 - 32
    vpbroadcastd  m9, [base+pw_64]
    vpbroadcastd  m10, [base+mask_round+r6*4]
    vpbroadcastd  m11, [base+bidir_shift+r6*4]
    mov           r6d, r7m  ; sign
    vpbroadcastd  m12, [base+w_mask_round+r6*4]
    mova          ym13, [w_mask_end42x]
    mov           maskq, maskmp
    add           wq, r7
    paddw         m14, m9, m9  ; pw_128
    call          .main
    lea           stride3q, [strideq*3]
    jmp           wq
.w4:
    movq          [dstq+strideq*0], xm0
    movhps        [dstq+strideq*1], xm0
    vextracti32x4 xm2, ym0, 1
    movq          [dstq+strideq*2], xm2
    movhps        [dstq+stride3q ], xm2
    cmp           hd, 8
    jl            .w4_end
    vextracti32x4 xm2, m0, 2
    lea           dstq, [dstq+strideq*4]
    movq          [dstq+strideq*0], xm2
    movhps        [dstq+strideq*1], xm2
    vextracti32x4 xm0, m0, 3
    movq          [dstq+strideq*2], xm0
    movhps        [dstq+stride3q ], xm0
    je            .w4_end
    lea           dstq, [dstq+strideq*4]
    movq          [dstq+strideq*0], xm1
    movhps        [dstq+strideq*1], xm1
    vextracti32x4 xm2, ym1, 1
    movq          [dstq+strideq*2], xm2
    movhps        [dstq+stride3q ], xm2
    vextracti32x4 xm2, m1, 2
    lea           dstq, [dstq+strideq*4]
    movq          [dstq+strideq*0], xm2
    movhps        [dstq+strideq*1], xm2
    vextracti32x4 xm1, m1, 3
    movq          [dstq+strideq*2], xm1
    movhps        [dstq+stride3q ], xm1
.w4_end:
    RET
.w8_loop:
    call          .main
    lea           dstq, [dstq+strideq*4]
.w8:
    mova          [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [dstq+strideq*2], m0, 2
    vextracti32x4 [dstq+stride3q ], m0, 3
    sub           hd, 8
    jl            .w8_end
    lea           dstq, [dstq+strideq*4]
    mova          [dstq+strideq*0], xm1
    vextracti32x4 [dstq+strideq*1], ym1, 1
    vextracti32x4 [dstq+strideq*2], m1, 2
    vextracti32x4 [dstq+stride3q ], m1, 3
    jg            .w8_loop
.w8_end:
    RET
.w16_loop:
    call          .main
    lea           dstq, [dstq+strideq*4]
.w16:
    mova          [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    mova          [dstq+strideq*2], ym1
    vextracti32x8 [dstq+stride3q ], m1, 1
    sub           hd, 4
    jg            .w16_loop
    RET
.w32_loop:
    call          .main
    lea           dstq, [dstq+strideq*2]
.w32:
    mova          [dstq+strideq*0], m0
    mova          [dstq+strideq*1], m1
    sub           hd, 2
    jg            .w32_loop
    RET
.w64_loop:
    call          .main
    add           dstq, strideq
.w64:
    mova          [dstq+64*0], m0
    mova          [dstq+64*1], m1
    dec           hd
    jg            .w64_loop
    RET
.w128_loop:
    call          .main
    add           dstq, strideq
.w128:
    mova          [dstq+64*0], m0
    mova          [dstq+64*1], m1
    call          .main
    mova          [dstq+64*2], m0
    mova          [dstq+64*3], m1
    dec           hd
    jg            .w128_loop
    RET
ALIGN function_align
.main:
    mova          m1, [tmp1q+64*0]
    mova          m3, [tmp2q+64*0]
    mova          m4, [tmp1q+64*1]
    mova          m7, [tmp2q+64*1]
    add           tmp1q, 64*2
    add           tmp2q, 64*2
    psubsw        m6, m1, m3
    punpcklwd     m5, m3, m1
    pabsw         m6, m6
    punpckhwd     m3, m1
    psubusw       m6, m8, m6
    psrlw         m6, 10
    psubw         m2, m9, m6
    punpcklwd     m1, m6, m2
    punpckhwd     m6, m2
    mova          m0, m10
    vpdpwssd      m0, m5, m1
    mova          m1, m10
    vpdpwssd      m1, m3, m6
    psubsw        m5, m4, m7
    punpcklwd     m6, m7, m4
    pabsw         m5, m5
    punpckhwd     m7, m4
    psubusw       m5, m8, m5
    psrlw         m5, 10
    psubw         m3, m9, m5
    punpcklwd     m4, m5, m3
    psrad         m0, 4
    punpckhwd     m5, m3
    psrad         m1, 4
    packusdw      m0, m1
    mova          m1, m10
    vpdpwssd      m1, m6, m4
    mova          m4, m10
    vpdpwssd      m4, m7, m5
    mova          m5, m12
    vpdpwssd      m5, m14, m2
    mova          m2, m12
    vpdpwssd      m2, m14, m3
    psrad         m1, 4
    psrad         m4, 4
    packusdw      m1, m4
    vpermt2b      m5, m13, m2
    vpsrlvw       m0, m11
    vpsrlvw       m1, m11
    mova          [maskq], ym5
    add           maskq, 32
    ret

cglobal w_mask_444_16bpc, 4, 8, 13, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_444_avx512icl_table
    lea           r7, [w_mask_444_avx512icl_table]
    tzcnt         wd, wm
    mov           r6d, r8m  ; pixel_max
    movifnidn     hd, hm
    shr           r6d, 11
    movsxd        wq, [r7+wq*4]
    vpbroadcastd  m8, [base+pw_27615]  ; ((64 - 38) << 10) + 1023 - 32
    vpbroadcastd  m9, [base+pw_64]
    vpbroadcastd  m10, [base+mask_round+r6*4]
    mova          m11, [w_mask_end444]
    vpbroadcastd  m12, [base+bidir_shift+r6*4]
    mov           maskq, maskmp
    add           wq, r7
    call          .main
    lea           stride3q, [strideq*3]
    jmp           wq
.w4:
    movq          [dstq+strideq*0], xm0
    movhps        [dstq+strideq*1], xm0
    vextracti32x4 xm2, ym0, 1
    movq          [dstq+strideq*2], xm2
    movhps        [dstq+stride3q ], xm2
    cmp           hd, 8
    jl            .w4_end
    vextracti32x4 xm2, m0, 2
    lea           dstq, [dstq+strideq*4]
    movq          [dstq+strideq*0], xm2
    movhps        [dstq+strideq*1], xm2
    vextracti32x4 xm0, m0, 3
    movq          [dstq+strideq*2], xm0
    movhps        [dstq+stride3q ], xm0
    je            .w4_end
    lea           dstq, [dstq+strideq*4]
    movq          [dstq+strideq*0], xm1
    movhps        [dstq+strideq*1], xm1
    vextracti32x4 xm2, ym1, 1
    movq          [dstq+strideq*2], xm2
    movhps        [dstq+stride3q ], xm2
    vextracti32x4 xm2, m1, 2
    lea           dstq, [dstq+strideq*4]
    movq          [dstq+strideq*0], xm2
    movhps        [dstq+strideq*1], xm2
    vextracti32x4 xm1, m1, 3
    movq          [dstq+strideq*2], xm1
    movhps        [dstq+stride3q ], xm1
.w4_end:
    RET
.w8_loop:
    call          .main
    lea           dstq, [dstq+strideq*4]
.w8:
    mova          [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [dstq+strideq*2], m0, 2
    vextracti32x4 [dstq+stride3q ], m0, 3
    sub           hd, 8
    jl            .w8_end
    lea           dstq, [dstq+strideq*4]
    mova          [dstq+strideq*0], xm1
    vextracti32x4 [dstq+strideq*1], ym1, 1
    vextracti32x4 [dstq+strideq*2], m1, 2
    vextracti32x4 [dstq+stride3q ], m1, 3
    jg            .w8_loop
.w8_end:
    RET
.w16_loop:
    call          .main
    lea           dstq, [dstq+strideq*4]
.w16:
    mova          [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    mova          [dstq+strideq*2], ym1
    vextracti32x8 [dstq+stride3q ], m1, 1
    sub           hd, 4
    jg            .w16_loop
    RET
.w32_loop:
    call          .main
    lea           dstq, [dstq+strideq*2]
.w32:
    mova          [dstq+strideq*0], m0
    mova          [dstq+strideq*1], m1
    sub           hd, 2
    jg            .w32_loop
    RET
.w64_loop:
    call          .main
    add           dstq, strideq
.w64:
    mova          [dstq+64*0], m0
    mova          [dstq+64*1], m1
    dec           hd
    jg            .w64_loop
    RET
.w128_loop:
    call          .main
    add           dstq, strideq
.w128:
    mova          [dstq+64*0], m0
    mova          [dstq+64*1], m1
    call          .main
    mova          [dstq+64*2], m0
    mova          [dstq+64*3], m1
    dec           hd
    jg            .w128_loop
    RET
ALIGN function_align
.main:
    mova          m1, [tmp1q+64*0]
    mova          m3, [tmp2q+64*0]
    mova          m4, [tmp1q+64*1]
    mova          m7, [tmp2q+64*1]
    add           tmp1q, 64*2
    add           tmp2q, 64*2
    psubsw        m6, m1, m3
    punpcklwd     m5, m3, m1
    pabsw         m6, m6
    punpckhwd     m3, m1
    psubusw       m6, m8, m6
    psrlw         m6, 10
    psubw         m2, m9, m6
    punpcklwd     m1, m6, m2
    punpckhwd     m6, m2
    mova          m0, m10
    vpdpwssd      m0, m5, m1
    mova          m1, m10
    vpdpwssd      m1, m3, m6
    psubsw        m5, m4, m7
    punpcklwd     m6, m7, m4
    pabsw         m5, m5
    punpckhwd     m7, m4
    psubusw       m5, m8, m5
    psrlw         m5, 10
    psubw         m3, m9, m5
    punpcklwd     m4, m5, m3
    psrad         m0, 4
    punpckhwd     m5, m3
    psrad         m1, 4
    packusdw      m0, m1
    mova          m1, m10
    vpdpwssd      m1, m6, m4
    mova          m4, m10
    vpdpwssd      m4, m7, m5
    vpermt2b      m2, m11, m3
    psrad         m1, 4
    psrad         m4, 4
    packusdw      m1, m4
    vpsrlvw       m0, m12
    vpsrlvw       m1, m12
    mova          [maskq], m2
    add           maskq, 64
    ret

cglobal blend_16bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
%define base r6-blend_avx512icl_table
    lea           r6, [blend_avx512icl_table]
    tzcnt         wd, wm
    movifnidn     hd, hm
    movsxd        wq, [r6+wq*4]
    movifnidn     maskq, maskmp
    vpbroadcastd  m6, [base+pw_m512]
    add           wq, r6
    lea           r6, [dsq*3]
    jmp           wq
.w4:
    pmovzxbw      ym19, [maskq]
    movq          xm16, [dstq+dsq*0]
    movhps        xm16, [dstq+dsq*1]
    vpbroadcastq  ym17, [dstq+dsq*2]
    vpbroadcastq  ym18, [dstq+r6 ]
    pmullw        ym19, ym6
    vpblendd      ym16, ym17, 0x30
    vpblendd      ym16, ym18, 0xc0
    psubw         ym17, ym16, [tmpq]
    add           maskq, 16
    add           tmpq, 32
    pmulhrsw      ym17, ym19
    paddw         ym16, ym17
    vextracti128  xm17, ym16, 1
    movq          [dstq+dsq*0], xm16
    movhps        [dstq+dsq*1], xm16
    movq          [dstq+dsq*2], xm17
    movhps        [dstq+r6 ], xm17
    lea           dstq, [dstq+dsq*4]
    sub           hd, 4
    jg            .w4
    vzeroupper
    RET
.w8:
    pmovzxbw      m2, [maskq]
    mova          xm0, [dstq+dsq*0]
    vinserti32x4  ym0, [dstq+dsq*1], 1
    vinserti32x4  m0, [dstq+dsq*2], 2
    vinserti32x4  m0, [dstq+r6 ], 3
    pmullw        m2, m6
    psubw         m1, m0, [tmpq]
    add           maskq, 32
    add           tmpq, 64
    pmulhrsw      m1, m2
    paddw         m0, m1
    mova          [dstq+dsq*0], xm0
    vextracti32x4 [dstq+dsq*1], ym0, 1
    vextracti32x4 [dstq+dsq*2], m0, 2
    vextracti32x4 [dstq+r6 ], m0, 3
    lea           dstq, [dstq+dsq*4]
    sub           hd, 4
    jg            .w8
    RET
.w16:
    pmovzxbw      m4, [maskq+32*0]
    pmovzxbw      m5, [maskq+32*1]
    mova          ym0, [dstq+dsq*0]
    vinserti32x8  m0, [dstq+dsq*1], 1
    mova          ym1, [dstq+dsq*2]
    vinserti32x8  m1, [dstq+r6 ], 1
    pmullw        m4, m6
    pmullw        m5, m6
    psubw         m2, m0, [tmpq+64*0]
    psubw         m3, m1, [tmpq+64*1]
    add           maskq, 32*2
    add           tmpq, 64*2
    pmulhrsw      m2, m4
    pmulhrsw      m3, m5
    paddw         m0, m2
    paddw         m1, m3
    mova          [dstq+dsq*0], ym0
    vextracti32x8 [dstq+dsq*1], m0, 1
    mova          [dstq+dsq*2], ym1
    vextracti32x8 [dstq+r6 ], m1, 1
    lea           dstq, [dstq+dsq*4]
    sub           hd, 4
    jg            .w16
    RET
.w32:
    pmovzxbw      m4, [maskq+32*0]
    pmovzxbw      m5, [maskq+32*1]
    mova          m0, [dstq+dsq*0]
    mova          m1, [dstq+dsq*1]
    pmullw        m4, m6
    pmullw        m5, m6
    psubw         m2, m0, [tmpq+ 64*0]
    psubw         m3, m1, [tmpq+ 64*1]
    add           maskq, 32*2
    add           tmpq, 64*2
    pmulhrsw      m2, m4
    pmulhrsw      m3, m5
    paddw         m0, m2
    paddw         m1, m3
    mova          [dstq+dsq*0], m0
    mova          [dstq+dsq*1], m1
    lea           dstq, [dstq+dsq*2]
    sub           hd, 2
    jg            .w32
    RET

cglobal blend_v_16bpc, 3, 6, 5, dst, ds, tmp, w, h
    lea           r5, [blend_v_avx512icl_table]
    tzcnt         wd, wm
    movifnidn     hd, hm
    movsxd        wq, [r5+wq*4]
    add           wq, r5
    jmp           wq
.w2:
    vpbroadcastd  xmm2, [obmc_masks_avx2+2*2]
.w2_loop:
    movd          xmm0, [dstq+dsq*0]
    pinsrd        xmm0, [dstq+dsq*1], 1
    movq          xmm1, [tmpq]
    add           tmpq, 4*2
    psubw         xmm1, xmm0, xmm1
    pmulhrsw      xmm1, xmm2
    paddw         xmm0, xmm1
    movd          [dstq+dsq*0], xmm0
    pextrd        [dstq+dsq*1], xmm0, 1
    lea           dstq, [dstq+dsq*2]
    sub           hd, 2
    jg            .w2_loop
    RET
.w4:
    vpbroadcastq  xmm2, [obmc_masks_avx2+4*2]
.w4_loop:
    movq          xmm0, [dstq+dsq*0]
    movhps        xmm0, [dstq+dsq*1]
    psubw         xmm1, xmm0, [tmpq]
    add           tmpq, 8*2
    pmulhrsw      xmm1, xmm2
    paddw         xmm0, xmm1
    movq          [dstq+dsq*0], xmm0
    movhps        [dstq+dsq*1], xmm0
    lea           dstq, [dstq+dsq*2]
    sub           hd, 2
    jg            .w4_loop
    RET
.w8:
    vbroadcasti32x4 ym2, [obmc_masks_avx2+8*2]
.w8_loop:
    mova          xm0, [dstq+dsq*0]
    vinserti32x4  ym0, [dstq+dsq*1], 1
    psubw         ym1, ym0, [tmpq]
    add           tmpq, 16*2
    pmulhrsw      ym1, ym2
    paddw         ym0, ym1
    mova          [dstq+dsq*0], xm0
    vextracti32x4 [dstq+dsq*1], ym0, 1
    lea           dstq, [dstq+dsq*2]
    sub           hd, 2
    jg            .w8_loop
    RET
.w16:
    vbroadcasti32x8 m2, [obmc_masks_avx2+16*2]
.w16_loop:
    mova          ym0, [dstq+dsq*0]
    vinserti32x8  m0, [dstq+dsq*1], 1
    psubw         m1, m0, [tmpq]
    add           tmpq, 32*2
    pmulhrsw      m1, m2
    paddw         m0, m1
    mova          [dstq+dsq*0], ym0
    vextracti32x8 [dstq+dsq*1], m0, 1
    lea           dstq, [dstq+dsq*2]
    sub           hd, 2
    jg            .w16_loop
    RET
.w32:
    mova          m4, [obmc_masks_avx2+32*2]
.w32_loop:
    mova          m0, [dstq+dsq*0]
    psubw         m2, m0, [tmpq+ 64*0]
    mova          m1, [dstq+dsq*1]
    psubw         m3, m1, [tmpq+ 64*1]
    add           tmpq, 64*2
    pmulhrsw      m2, m4
    pmulhrsw      m3, m4
    paddw         m0, m2
    paddw         m1, m3
    mova          [dstq+dsq*0], m0
    mova          [dstq+dsq*1], m1
    lea           dstq, [dstq+dsq*2]
    sub           hd, 2
    jg            .w32_loop
    RET

cglobal blend_h_16bpc, 3, 7, 9, dst, ds, tmp, w, h, mask
%define base r6-$$
    lea           r6, [$$]
    tzcnt         wd, wm
    mov           hd, hm
    movsxd        wq, [base+blend_h_avx512icl_table+wq*4]
    lea           maskq, [base+obmc_masks_avx2+hq*2]
    lea           hd, [hq*3]
    lea           wq, [base+blend_h_avx512icl_table+wq]
    shr           hd, 2  ; h * 3/4
    lea           maskq, [maskq+hq*2]
    neg           hq
    jmp           wq
.w2:
    movd          xmm0, [dstq+dsq*0]
    pinsrd        xmm0, [dstq+dsq*1], 1
    movd          xmm2, [maskq+hq*2]
    movq          xmm1, [tmpq]
    add           tmpq, 4*2
    punpcklwd     xmm2, xmm2
    psubw         xmm1, xmm0, xmm1
    pmulhrsw      xmm1, xmm2
    paddw         xmm0, xmm1
    movd          [dstq+dsq*0], xmm0
    pextrd        [dstq+dsq*1], xmm0, 1
    lea           dstq, [dstq+dsq*2]
    add           hq, 2
    jl            .w2
    RET
.w4:
    mova          xmm3, [blend_shuf]
.w4_loop:
    movq          xmm0, [dstq+dsq*0]
    movhps        xmm0, [dstq+dsq*1]
    movd          xmm2, [maskq+hq*2]
    psubw         xmm1, xmm0, [tmpq]
    add           tmpq, 8*2
    pshufb        xmm2, xmm3
    pmulhrsw      xmm1, xmm2
    paddw         xmm0, xmm1
    movq          [dstq+dsq*0], xmm0
    movhps        [dstq+dsq*1], xmm0
    lea           dstq, [dstq+dsq*2]
    add           hq, 2
    jl            .w4_loop
    RET
.w8:
    vbroadcasti32x4 ym3, [blend_shuf]
    shufpd        ym3, ym3, 0x0c
.w8_loop:
    mova          xm0, [dstq+dsq*0]
    vinserti32x4  ym0, [dstq+dsq*1], 1
    vpbroadcastd  ym2, [maskq+hq*2]
    psubw         ym1, ym0, [tmpq]
    add           tmpq, 16*2
    pshufb        ym2, ym3
    pmulhrsw      ym1, ym2
    paddw         ym0, ym1
    mova          [dstq+dsq*0], xm0
    vextracti32x4 [dstq+dsq*1], ym0, 1
    lea           dstq, [dstq+dsq*2]
    add           hq, 2
    jl            .w8_loop
    RET
.w16:
    vbroadcasti32x4 m3, [blend_shuf]
    shufpd        m3, m3, 0xf0
.w16_loop:
    mova          ym0, [dstq+dsq*0]
    vinserti32x8  m0, [dstq+dsq*1], 1
    vpbroadcastd  m2, [maskq+hq*2]
    psubw         m1, m0, [tmpq]
    add           tmpq, 32*2
    pshufb        m2, m3
    pmulhrsw      m1, m2
    paddw         m0, m1
    mova          [dstq+dsq*0], ym0
    vextracti32x8 [dstq+dsq*1], m0, 1
    lea           dstq, [dstq+dsq*2]
    add           hq, 2
    jl            .w16_loop
    RET
.w32:
    vpbroadcastw  m4, [maskq+hq*2]
    vpbroadcastw  m5, [maskq+hq*2+2]
    mova          m0, [dstq+dsq*0]
    psubw         m2, m0, [tmpq+ 64*0]
    mova          m1, [dstq+dsq*1]
    psubw         m3, m1, [tmpq+ 64*1]
    add           tmpq, 64*2
    pmulhrsw      m2, m4
    pmulhrsw      m3, m5
    paddw         m0, m2
    paddw         m1, m3
    mova          [dstq+dsq*0], m0
    mova          [dstq+dsq*1], m1
    lea           dstq, [dstq+dsq*2]
    add           hq, 2
    jl            .w32
    RET
.w64:
    vpbroadcastw  m4, [maskq+hq*2]
    mova          m0, [dstq+64*0]
    psubw         m2, m0, [tmpq+64*0]
    mova          m1, [dstq+64*1]
    psubw         m3, m1, [tmpq+64*1]
    add           tmpq, 64*2
    pmulhrsw      m2, m4
    pmulhrsw      m3, m4
    paddw         m0, m2
    paddw         m1, m3
    mova          [dstq+64*0], m0
    mova          [dstq+64*1], m1
    add           dstq, dsq
    inc           hq
    jl            .w64
    RET
.w128:
    vpbroadcastw  m8, [maskq+hq*2]
    mova          m0, [dstq+64*0]
    psubw         m4, m0, [tmpq+64*0]
    mova          m1, [dstq+64*1]
    psubw         m5, m1, [tmpq+64*1]
    mova          m2, [dstq+64*2]
    psubw         m6, m2, [tmpq+64*2]
    mova          m3, [dstq+64*3]
    psubw         m7, m3, [tmpq+64*3]
    add           tmpq, 64*4
    REPX {pmulhrsw x, m8}, m4, m5, m6, m7
    paddw         m0, m4
    paddw         m1, m5
    paddw         m2, m6
    paddw         m3, m7
    mova          [dstq+64*0], m0
    mova          [dstq+64*1], m1
    mova          [dstq+64*2], m2
    mova          [dstq+64*3], m3
    add           dstq, dsq
    inc           hq
    jl            .w128
    RET

cglobal resize_16bpc, 6, 12, 32, dst, dst_stride, src, src_stride, \
                                 dst_w, h, src_w, dx, mx0, pxmax
    sub           dword mx0m, 4<<14
    sub           dword src_wm, 8
    mov           r6, ~0
    vpbroadcastd  m5, dxm
    vpbroadcastd  m8, mx0m
    vpbroadcastd  m6, src_wm
    kmovq         k6, r6
    DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, _, _, pxmax
    LEA           r7, $$
%define base r7-$$
    vpbroadcastd  m3, [base+pd_16384]
    vpbroadcastd  m7, [base+pd_63]
    mova          m24, [base+resize_permA]
    mova          m25, [base+resize_permB]
    mova          m26, [base+resize_permC]
    mova          m27, [base+resize_permD]
    vbroadcasti32x4 m28, [base+resize_shufA]
    vbroadcasti32x4 m29, [base+resize_shufB]
    mova          m30, [base+resize_permE]
    vpbroadcastw  ym31, pxmaxm
    vpdpwssd      m8, m5, [base+rescale_mul]  ; mx+dx*[0-15]
    pslld         m5, 4  ; dx*16
    pslld         m6, 14
    pxor          m2, m2
.loop_y:
    xor           xd, xd
    mova          m4, m8  ; per-line working version of mx
.loop_x:
    pmaxsd        m0, m4, m2
    psrad         m9, m4, 8  ; filter offset (unmasked)
    pminsd        m0, m6  ; iclip(mx, 0, src_w-8)
    psubd         m1, m4, m0  ; pshufb offset
    psrad         m0, 14  ; clipped src_x offset
    psrad         m1, 14  ; pshufb edge_emu offset
    vptestmd      k5, m1, m1
    pand          m9, m7  ; filter offset (masked)
    ktestw        k5, k5
    jz            .load
    vpbroadcastq  m14, [base+pd_0_4]
    vpermq        m10, m0, q1100
    vpermq        m11, m0, q3322
    vpermq        m20, m1, q1100
    vpermq        m21, m1, q3322
    punpckldq     m10, m10
    punpckldq     m11, m11
    punpckldq     m20, m20
    punpckldq     m21, m21
    paddd         m10, m14
    paddd         m11, m14
    paddd         m20, m14
    paddd         m21, m14
    vextracti32x8 ym12, m10, 1
    vextracti32x8 ym13, m11, 1
    vextracti32x8 ym22, m20, 1
    vextracti32x8 ym23, m21, 1
    kmovq         k1, k6
    kmovq         k2, k6
    kmovq         k3, k6
    kmovq         k4, k6
    vpgatherdq    m16{k1}, [srcq+ym10*2]  ; 0 1 2 3
    vpgatherdq    m17{k2}, [srcq+ym11*2]  ; 4 5 6 7
    vpgatherdq    m18{k3}, [srcq+ym12*2]  ; 8 9 A B
    vpgatherdq    m19{k4}, [srcq+ym13*2]  ; C D E F
    kmovq         k1, k6
    kmovq         k2, k6
    kmovq         k3, k6
    kmovq         k4, k6
    vpgatherdq    m0{k1}, [base+resize_shuf+8+ym20*2]
    vpgatherdq    m1{k2}, [base+resize_shuf+8+ym21*2]
    vpgatherdq    m14{k3}, [base+resize_shuf+8+ym22*2]
    vpgatherdq    m15{k4}, [base+resize_shuf+8+ym23*2]
    pshufb        m16, m0
    pshufb        m17, m1
    pshufb        m18, m14
    pshufb        m19, m15
    mova          m20, m24
    mova          m22, m24
    mova          m21, m25
    mova          m23, m25
    vpermi2d      m20, m16, m17  ; 0-3a 0-3b 4-7a 4-7b
    vpermi2d      m21, m16, m17  ; 0-3c 0-3d 4-7c 4-7d
    vpermi2d      m22, m18, m19  ; 8-Ba 8-Bb C-Fa C-Fb
    vpermi2d      m23, m18, m19  ; 8-Bc 8-Bd C-Fc C-Fd
    mova          m15, m26
    mova          m17, m26
    mova          m16, m27
    mova          m18, m27
    vpermi2q      m15, m20, m22  ; 0-3a 4-7a 8-Ba C-Fa
    vpermi2q      m16, m20, m22  ; 0-3b 4-7b 8-Bb C-Fb
    vpermi2q      m17, m21, m23  ; 0-3c 4-7c 8-Bc C-Fc
    vpermi2q      m18, m21, m23  ; 0-3d 4-7d 8-Bd C-Fd
    kmovq         k1, k6
    kmovq         k2, k6
    vpgatherdd    m11{k1}, [base+resize_filter+m9*8+0]
    vpgatherdd    m13{k2}, [base+resize_filter+m9*8+4]
    pshufb        m10, m11, m28
    pshufb        m11, m11, m29
    pshufb        m12, m13, m28
    pshufb        m13, m13, m29
    jmp           .filter
.load:
    kmovq         k1, k6
    kmovq         k2, k6
    kmovq         k3, k6
    kmovq         k4, k6
    vpgatherdd    m11{k1}, [base+resize_filter+m9*8+0]
    vpgatherdd    m13{k2}, [base+resize_filter+m9*8+4]
    pshufb        m10, m11, m28
    pshufb        m11, m11, m29
    pshufb        m12, m13, m28
    pshufb        m13, m13, m29
    vpgatherdd    m15{k3}, [srcq+m0*2+ 0]
    vpgatherdd    m16{k4}, [srcq+m0*2+ 4]
    kmovq         k1, k6
    kmovq         k2, k6
    vpgatherdd    m17{k1}, [srcq+m0*2+ 8]
    vpgatherdd    m18{k2}, [srcq+m0*2+12]
.filter:
    mova          m14, m2
    vpdpwssd      m14, m15, m10
    vpdpwssd      m14, m16, m11
    vpdpwssd      m14, m17, m12
    vpdpwssd      m14, m18, m13
    psubd         m14, m3, m14
    psrad         m14, 15
    packusdw      m14, m14
    vpermq        m14, m30, m14
    pminsw        ym14, ym31
    mova          [dstq+xq*2], ym14
    paddd         m4, m5
    add           xd, 16
    cmp           xd, dst_wd
    jl            .loop_x
    add           dstq, dst_strideq
    add           srcq, src_strideq
    dec           hd
    jg            .loop_y
    RET

%endif ; ARCH_X86_64