/*
 * Copyright © 2019, VideoLAN and dav1d authors
 * Copyright © 2019, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

// AArch64 NEON implementations of the msac symbol/bool decoders.
//
// Conventions shared by every function in this file:
//   x0  = MsacContext pointer (fields addressed with the offsets below)
//   w0  = decoded value on return
// Only x0-x17 and v0-v7, v16-v18, v28-v31 are written. The 16-symbol decoder
// additionally uses 48 bytes of stack as scratch space.
// function/endfunc, const/endconst, movrel and L() are macros that come from
// the included headers and are not defined here.

// Byte offsets of the MsacContext fields accessed through x0.
#define BUF_POS 0
#define BUF_END 8
#define DIF 16
#define RNG 24
#define CNT 28
#define ALLOW_UPDATE_CDF 32

// The coeffs table is always addressed relative to coeffs + COEFFS_BASE_OFFSET,
// which is the last entry (0) of its first row. MASKS8_OFFSET is the distance
// from that point to the third row (masks8), located at byte 64 of the table.
#define COEFFS_BASE_OFFSET 30
#define MASKS8_OFFSET (64-COEFFS_BASE_OFFSET)

const coeffs
        // Row 0: 4*15 ... 4*0. A load starting at (base - 2*n_symbols) yields
        // 4*n_symbols, ..., 4, 0 in its leading lanes.
        .short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
        // Row 1: zeros, read as padding when a vector load runs past row 0.
        .short 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
        // masks8
        .short -0x202, -0x202, -0x202, -0x202, -0x202, -0x202, -0x202, 0xF0E
endconst

// Width-generic helpers used by decode_update. Each one emits the operation
// on the first register (d0/s0/idx0) and, for the 16-symbol variant, repeats
// it on the second register (d1/s1/idx1) so that two vectors are processed.
// ld1_n/st1_n pick the two-register form with "n > 8", the remaining helpers
// with "n == 16". ushr_n is not invoked anywhere in the code visible here.

.macro ld1_n d0, d1, src, sz, n
.if \n <= 8
        ld1             {\d0\sz},  [\src]
.else
        ld1             {\d0\sz, \d1\sz},  [\src]
.endif
.endm

.macro st1_n s0, s1, dst, sz, n
.if \n <= 8
        st1             {\s0\sz},  [\dst]
.else
        st1             {\s0\sz, \s1\sz},  [\dst]
.endif
.endm

.macro ushr_n d0, d1, s0, s1, shift, sz, n
        ushr            \d0\sz,  \s0\sz,  \shift
.if \n == 16
        ushr            \d1\sz,  \s1\sz,  \shift
.endif
.endm

.macro add_n d0, d1, s0, s1, s2, s3, sz, n
        add             \d0\sz,  \s0\sz,  \s2\sz
.if \n == 16
        add             \d1\sz,  \s1\sz,  \s3\sz
.endif
.endm

.macro sub_n d0, d1, s0, s1, s2, s3, sz, n
        sub             \d0\sz,  \s0\sz,  \s2\sz
.if \n == 16
        sub             \d1\sz,  \s1\sz,  \s3\sz
.endif
.endm

.macro and_n d0, d1, s0, s1, s2, s3, sz, n
        and             \d0\sz,  \s0\sz,  \s2\sz
.if \n == 16
        and             \d1\sz,  \s1\sz,  \s3\sz
.endif
.endm

.macro cmhs_n d0, d1, s0, s1, s2, s3, sz, n
        cmhs            \d0\sz,  \s0\sz,  \s2\sz
.if \n == 16
        cmhs            \d1\sz,  \s1\sz,  \s3\sz
.endif
.endm

.macro sshl_n d0, d1, s0, s1, s2, s3, sz, n
        sshl            \d0\sz,  \s0\sz,  \s2\sz
.if \n == 16
        sshl            \d1\sz,  \s1\sz,  \s3\sz
.endif
.endm

.macro sqdmulh_n d0, d1, s0, s1, s2, s3, sz, n
        sqdmulh         \d0\sz,  \s0\sz,  \s2\sz
.if \n == 16
        sqdmulh         \d1\sz,  \s1\sz,  \s3\sz
.endif
.endm

// Stores idx0 at [dstreg, dstoff] and, for n == 16, idx1 16 bytes above it.
.macro str_n idx0, idx1, dstreg, dstoff, n
        str             \idx0,  [\dstreg, \dstoff]
.if \n == 16
        str             \idx1,  [\dstreg, \dstoff + 16]
.endif
.endm

// unsigned dav1d_msac_decode_symbol_adapt4_neon(MsacContext *s, uint16_t *cdf,
//                                               size_t n_symbols);
//
// In:  x0 = s, x1 = cdf, x2 = n_symbols
// Out: w0 = decoded symbol index (ret)
// Updates s->rng, s->cnt and s->dif. When s->allow_update_cdf is non-zero
// the cdf entries and the counter stored at cdf[n_symbols] are rewritten too.

function msac_decode_symbol_adapt4_neon, export=1
// decode_update sz, szb, n
//   sz:  vector arrangement for the 16-bit lanes (.4h or .8h)
//   szb: matching byte arrangement (.8b or .16b)
//   n:   width of the variant in symbols (4, 8 or 16)
// Expands to a complete function body: compute v for every lane, locate ret
// by comparing against the top 16 bits of dif, optionally adapt the cdf, then
// renormalize. Each variant either returns ret in w0 directly or branches to
// L(refill) with w6 = cnt, x7 = dif and w15 = ret.
.macro decode_update sz, szb, n
.if \n == 16
        // Scratch area: [sp, #14] = u (rng), [sp, #16..47] = the 16 v values.
        sub             sp,  sp,  #48
.endif
        add             x8,  x0,  #RNG
        ld1_n           v0,  v1,  x1,  \sz, \n                    // cdf
        ld1r            {v29\sz},  [x8]                           // rng
        movrel          x9,  coeffs, COEFFS_BASE_OFFSET
        movi            v31\sz, #0x7f, lsl #8                     // 0x7f00
        sub             x10, x9,  x2, lsl #1
        mvni            v30\sz, #0x3f                             // 0xffc0
        and             v7\szb, v29\szb, v31\szb                  // rng & 0x7f00
.if \n == 16
        str             h29, [sp, #14]                            // store original u = s->rng
.endif
        and_n           v2,  v3,  v0,  v1,  v30, v30, \szb, \n    // cdf & 0xffc0

        ld1_n           v4,  v5,  x10, \sz, \n                    // EC_MIN_PROB * (n_symbols - ret)
        sqdmulh_n       v6,  v7,  v2,  v3,  v7,  v7,  \sz, \n     // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
        ldr             d28, [x0, #DIF]

        add_n           v4,  v5,  v2,  v3,  v4,  v5,  \sz, \n     // v = cdf + EC_MIN_PROB * (n_symbols - ret)
        add_n           v4,  v5,  v6,  v7,  v4,  v5,  \sz, \n     // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)

        dup             v30\sz, v28.h[3]                          // dif >> (EC_WIN_SIZE - 16)
.if \n == 8
        ldur            q31, [x9, #MASKS8_OFFSET]
.elseif \n == 16
        str_n           q4,  q5,  sp, #16, \n                     // store v values to allow indexed access
.endif

        // After the condition starts being true it continues, such that the vector looks like:
        //   0, 0, 0 ... -1, -1
        cmhs_n          v2,  v3,  v30, v30, v4,  v5,  \sz, \n     // c >= v
.if \n == 4
        ext             v29\szb, v29\szb, v4\szb, #6              // u
        umov            x15, v2.d[0]
        ldr             w4,  [x0, #ALLOW_UPDATE_CDF]
        rev             x15, x15
        sub             v29\sz, v29\sz, v4\sz                     // rng = u-v
        // rev + clz = count trailing zeros
        clz             x15, x15                                  // 16*ret
.elseif \n == 8
        // The final short of the compare is always set.
        // Using addv, subtract -0x202*ret from this value to create a lookup table for a short.
        // For n == 8:
        //   -0x202 + -0x202 + ... + 0xF0E
        //     (0x202*7) | (1 << 8)
        //        ^-------offset for second byte of the short
        and             v31\szb, v31\szb, v2\szb
        ext             v29\szb, v29\szb, v4\szb, #14             // u
        addv            h31, v31\sz                               // ((2*ret + 1) << 8) | (2*ret)
        ldr             w4,  [x0, #ALLOW_UPDATE_CDF]
        sub             v30\sz, v30\sz, v4\sz                     // (dif >> 48) - v
        smov            w15, v31.b[0]                             // 2*ret
        sub             v29\sz, v29\sz, v4\sz                     // rng = u-v
.elseif \n == 16
        add             v6\sz, v2\sz, v3\sz
        addv            h31, v6\sz                                // -n + ret
        ldr             w4,  [x0, #ALLOW_UPDATE_CDF]
        smov            w15, v31.h[0]                             // w15 = ret - 16, sign-extended
.endif

        cbz             w4,  0f                                   // skip adaptation if !allow_update_cdf

        // update_cdf
        ldrh            w3,  [x1, x2, lsl #1]                     // count = cdf[n_symbols]

.if \n == 16
        // 16 case has a lower bound that guarantees n_symbols > 2
        mov             w4,  #-5
.elseif \n == 8
        mvn             w14, w2
        mov             w4,  #-4
        cmn             w14, #3                                   // set C if n_symbols <= 2
.else
        // if n_symbols < 4 (or < 6 even) then
        // (1 + n_symbols) >> 2 == n_symbols > 2
        add             w14, w2,  #17                             // (1 + n_symbols) + (4 << 2)
.endif
        sub_n           v16, v17, v0,  v1,  v2,  v3,  \sz, \n     // cdf + (i >= val ? 1 : 0)
        orr             v2\sz, #0x80, lsl #8
.if \n == 16
        orr             v3\sz, #0x80, lsl #8
.endif
.if \n == 16
        sub             w4,  w4,  w3, lsr #4                      // -((count >> 4) + 5)
.elseif \n == 8
        lsr             w14, w3,  #4                              // count >> 4
        // sbc consumes the carry produced by the cmn above.
        sbc             w4,  w4,  w14                             // -((count >> 4) + (n_symbols > 2) + 4)
.else
        neg             w4,  w14, lsr #2                          // -((n_symbols > 2) + 4)
        sub             w4,  w4,  w3, lsr #4                      // -((count >> 4) + (n_symbols > 2) + 4)
.endif
        sub_n           v2,  v3,  v2,  v3,  v0,  v1,  \sz, \n     // (32768 - cdf[i]) or (-1 - cdf[i])
        dup             v6\sz,    w4                              // -rate

        sub             w3,  w3,  w3, lsr #5                      // count - (count == 32)
        sshl_n          v2,  v3,  v2,  v3,  v6,  v6,  \sz, \n     // ({32768,-1} - cdf[i]) >> rate
        add             w3,  w3,  #1                              // count + (count < 32)
        add_n           v0,  v1,  v16, v17, v2,  v3,  \sz, \n     // cdf + (32768 - cdf[i]) >> rate
        st1_n           v0,  v1,  x1,  \sz, \n                    // write back the adapted cdf
        strh            w3,  [x1, x2, lsl #1]                     // cdf[n_symbols] = count

0:
        // renorm
.if \n == 4
        ldr             w6,  [x0, #CNT]
        ldr             x7,  [x0, #DIF]
        mov             x4,  v29.d[0]                             // rng (packed)
        mov             x3,  v4.d[0]                              // v (packed)

        // Shift 'v'/'rng' for ret into the 16 least sig bits. There is
        //  garbage in the remaining bits, but we can work around this.
        lsr             x4,  x4,  x15                             // rng
        lsr             x3,  x3,  x15                             // v
        lsl             w5,  w4,  #16                             // rng << 16
        sub             x7,  x7,  x3, lsl #48                     // dif - (v << 48)
        clz             w5,  w5                                   // d = clz(rng << 16)
        lsl             w4,  w4,  w5                              // rng << d
        subs            w6,  w6,  w5                              // cnt -= d
        lsl             x7,  x7,  x5                              // (dif - (v << 48)) << d
        strh            w4,  [x0, #RNG]
        b.lo            1f                                        // cnt -= d borrowed: refill needed
        str             w6,  [x0, #CNT]
        str             x7,  [x0, #DIF]
        lsr             w0,  w15, #4                              // ret = (16*ret) >> 4
        ret
1:
        lsr             w15, w15, #4                              // ret, returned by L(refill)
        b               L(refill)
.elseif \n == 8
        ldr             w6,  [x0, #CNT]
        // v31.h[0] holds the byte indices (2*ret, 2*ret + 1), so each tbl
        // moves the 16-bit element number ret into lane 0.
        tbl             v30.8b, {v30.16b}, v31.8b
        tbl             v29.8b, {v29.16b}, v31.8b
        ins             v28.h[3], v30.h[0]                        // dif - (v << 48)
        clz             v0.4h, v29.4h                             // d = clz(rng)
        umov            w5,  v0.h[0]
        ushl            v29.4h, v29.4h, v0.4h                     // rng << d

        // The vec for clz(rng) is filled with garbage after the first short,
        //  but ushl/sshl conveniently uses only the first byte for the shift
        //  amount.
        ushl            d28, d28, d0                              // (dif - (v << 48)) << d

        subs            w6,  w6,  w5                              // cnt -= d
        str             h29, [x0, #RNG]
        b.lo            1f                                        // cnt -= d borrowed: refill needed
        str             w6,  [x0, #CNT]
        str             d28, [x0, #DIF]
        lsr             w0,  w15, #1                              // ret
        ret
1:
        lsr             w15, w15, #1                              // ret
        mov             x7,  v28.d[0]                             // L(refill) expects dif in x7
        b               L(refill)
.elseif \n == 16
        // x8 = sp + 2*(ret - 16), so #48 addresses v[ret] and #46 the entry
        // just below it (v[ret - 1], or the saved rng at [sp, #14] for ret == 0).
        add             x8,  sp,  w15, sxtw #1
        ldrh            w3,  [x8, #48]                            // v
        ldurh           w4,  [x8, #46]                            // u
        ldr             w6,  [x0, #CNT]
        ldr             x7,  [x0, #DIF]
        sub             w4,  w4,  w3                              // rng = u - v
        clz             w5,  w4                                   // clz(rng)
        eor             w5,  w5,  #16                             // d = clz(rng) ^ 16
        sub             x7,  x7,  x3, lsl #48                     // dif - (v << 48)
        lsl             w4,  w4,  w5                              // rng << d
        subs            w6,  w6,  w5                              // cnt -= d
        lsl             x7,  x7,  x5                              // (dif - (v << 48)) << d
        str             w4,  [x0, #RNG]
        // Release the scratch area. This add does not write the flags, so the
        // result of the subs above is still what b.lo tests.
        add             sp,  sp,  #48
        b.lo            1f
        str             w6,  [x0, #CNT]
        str             x7,  [x0, #DIF]
        add             w0,  w15, #\n                             // ret
        ret
1:
        add             w15, w15, #\n                             // ret
        b               L(refill)
.endif
.endm

        decode_update   .4h, .8b, 4

// Shared refill tail. Besides the code directly above, it is branched to from
// the 8/16-symbol decoders and from the bool decoders further down.
// In:  x0 = MsacContext, w6 = cnt after the borrowing "cnt -= d",
//      x7 = dif (already shifted), w15 = value to return
// Out: w0 = w15, with s->cnt, s->dif and (when bytes were consumed)
//      s->buf_pos updated.
L(refill):
        // refill
        ldp             x3,  x4,  [x0]                            // BUF_POS, BUF_END
        add             x5,  x3,  #8
        subs            x5,  x5,  x4                              // x5 = buf_pos + 8 - buf_end
        b.hi            6f                                        // fewer than 8 bytes left

        ldr             x8,  [x3]                                 // next_bits
        add             w4,  w6,  #-48                            // shift_bits = cnt + 16 (- 64)
        mvn             x8,  x8
        neg             w5,  w4
        rev             x8,  x8                                   // next_bits = bswap(next_bits)
        lsr             w5,  w5,  #3                              // num_bytes_read
        lsr             x8,  x8,  x4                              // next_bits >>= (shift_bits & 63)

2:      // refill_end
        add             x3,  x3,  x5
        add             w6,  w6,  w5, lsl #3                      // cnt += num_bits_read
        str             x3,  [x0, #BUF_POS]

3:      // refill_end2
        orr             x7,  x7,  x8                              // dif |= next_bits

4:      // end
        str             w6,  [x0, #CNT]
        str             x7,  [x0, #DIF]

        mov             w0,  w15
        ret

5:      // pad_with_ones
        add             w8,  w6,  #-16
        ror             x8,  x8,  x8
        b               3b

6:      // refill_eob
        cmp             x3,  x4
        b.hs            5b                                        // buf_pos >= buf_end: nothing left to read

        ldr             x8,  [x4, #-8]                            // the 8 bytes that end at buf_end
        lsl             w5,  w5,  #3                              // 8 * (buf_pos + 8 - buf_end)
        lsr             x8,  x8,  x5                              // shift out the loaded bytes that lie below buf_pos
        add             w5,  w6,  #-48
        mvn             x8,  x8
        sub             w4,  w4,  w3                              // num_bytes_left
        rev             x8,  x8
        lsr             x8,  x8,  x5
        neg             w5,  w5
        lsr             w5,  w5,  #3
        cmp             w5,  w4
        csel            w5,  w5,  w4,  lo                         // num_bytes_read
        b               2b
endfunc

// 8-lane instantiation of decode_update; same register interface as
// msac_decode_symbol_adapt4_neon (x0 = context, x1 = cdf, x2 = n_symbols).
function msac_decode_symbol_adapt8_neon, export=1
        decode_update   .8h, .16b, 8
endfunc

// 16-lane instantiation of decode_update; same register interface as
// msac_decode_symbol_adapt4_neon. Temporarily lowers sp by 48 bytes.
function msac_decode_symbol_adapt16_neon, export=1
        decode_update   .8h, .16b, 16
endfunc

// msac_decode_hi_tok_neon
// In:  x0 = MsacContext, x1 = cdf (loaded as four 16-bit lanes, with the
//      adaptation counter read from and written to byte offset 6)
// Out: w0 = w13 >> 4 once the loop at label 1 terminates
// Decodes symbols with the 4-lane scheme in a loop, keeping rng (v3), the
// top 16 bits of dif (v1), cnt (w6) and dif (x7) in registers between
// iterations. The loop exit condition is the carry produced at label 5.
function msac_decode_hi_tok_neon, export=1
        ld1             {v0.4h},  [x1]                            // cdf
        add             x16, x0,  #RNG
        movi            v31.4h, #0x7f, lsl #8                     // 0x7f00
        movrel          x17, coeffs, COEFFS_BASE_OFFSET-2*3
        mvni            v30.4h, #0x3f                             // 0xffc0
        ldrh            w9,  [x1, #6]                             // count = cdf[n_symbols]
        ld1r            {v3.4h},  [x16]                           // rng
        ld1             {v29.4h}, [x17]                           // EC_MIN_PROB * (n_symbols - ret)
        add             x17, x0,  #DIF + 6
        mov             w13, #-24*8
        and             v17.8b,  v0.8b,   v30.8b                  // cdf & 0xffc0
        ldr             w10, [x0, #ALLOW_UPDATE_CDF]
        ld1r            {v1.8h},  [x17]                           // dif >> (EC_WIN_SIZE - 16)
        ldr             w6,  [x0, #CNT]
        ldr             x7,  [x0, #DIF]
1:
        and             v7.8b,   v3.8b,   v31.8b                  // rng & 0x7f00
        sqdmulh         v6.4h,   v17.4h,  v7.4h                   // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
        add             v4.4h,   v17.4h,  v29.4h                  // v = cdf + EC_MIN_PROB * (n_symbols - ret)
        add             v4.4h,   v6.4h,   v4.4h                   // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
        cmhs            v2.4h,   v1.4h,   v4.4h                   // c >= v
        add             w13, w13, #5*8
        ext             v18.8b,  v3.8b,   v4.8b,   #6             // u
        umov            x15, v2.d[0]
        rev             x15, x15
        sub             v18.4h,  v18.4h,  v4.4h                   // rng = u-v
        // rev + clz = count trailing zeros
        clz             x15, x15                                  // 16*ret

        cbz             w10, 2f                                   // skip adaptation if !allow_update_cdf
        // update_cdf
        sub             v5.4h,   v0.4h,   v2.4h                   // cdf[i] + (i >= val ? 1 : 0)
        mov             w4,  #-5
        orr             v2.4h, #0x80, lsl #8                      // i >= val ? -1 : 32768
        sub             w4,  w4,  w9, lsr #4                      // -((count >> 4) + 5)
        sub             v2.4h,   v2.4h,   v0.4h                   // (32768 - cdf[i]) or (-1 - cdf[i])
        dup             v6.4h,    w4                              // -rate

        sub             w9,  w9,  w9, lsr #5                      // count - (count == 32)
        sshl            v2.4h,   v2.4h,   v6.4h                   // ({32768,-1} - cdf[i]) >> rate
        add             w9,  w9,  #1                              // count + (count < 32)
        add             v0.4h,   v5.4h,   v2.4h                   // cdf[i] + (32768 - cdf[i]) >> rate
        st1             {v0.4h},  [x1]
        and             v17.8b,  v0.8b,   v30.8b                  // cdf & 0xffc0
        strh            w9,  [x1, #6]

2:
        mov             x4,  v18.d[0]                             // rng (packed)
        mov             x3,  v4.d[0]                              // v (packed)

        // Shift 'v'/'rng' for ret into the 16 least sig bits. There is
        //  garbage in the remaining bits, but we can work around this.
        lsr             x4,  x4,  x15                             // rng
        lsr             x3,  x3,  x15                             // v
        lsl             w5,  w4,  #16                             // rng << 16
        sub             x7,  x7,  x3, lsl #48                     // dif - (v << 48)
        clz             w5,  w5                                   // d = clz(rng << 16)
        lsl             w4,  w4,  w5                              // rng << d
        subs            w6,  w6,  w5                              // cnt -= d
        lsl             x7,  x7,  x5                              // (dif - (v << 48)) << d
        strh            w4,  [x0, #RNG]
        dup             v3.4h,   w4                               // rng for the next iteration
        b.hs            5f                                        // no borrow: skip the refill

        // refill
        // Inline copy of the L(refill) sequence that falls through to label 5
        // instead of returning.
        ldp             x3,  x4,  [x0]                            // BUF_POS, BUF_END
        add             x5,  x3,  #8
        subs            x5,  x5,  x4
        b.hi            7f

        ldr             x8,  [x3]                                 // next_bits
        add             w4,  w6,  #-48                            // shift_bits = cnt + 16 (- 64)
        mvn             x8,  x8
        neg             w5,  w4
        rev             x8,  x8                                   // next_bits = bswap(next_bits)
        lsr             w5,  w5,  #3                              // num_bytes_read
        lsr             x8,  x8,  x4                              // next_bits >>= (shift_bits & 63)

3:      // refill_end
        add             x3,  x3,  x5
        add             w6,  w6,  w5, lsl #3                      // cnt += num_bits_read
        str             x3,  [x0, #BUF_POS]

4:      // refill_end2
        orr             x7,  x7,  x8                              // dif |= next_bits

5:      // end
        sub             w15, w15, #5*8
        lsr             x12, x7,  #48
        adds            w13, w13, w15                             // carry = tok_br < 3 || tok == 15
        dup             v1.8h,   w12                              // dif >> 48 for the next iteration
        b.cc            1b                                        // loop if !carry
        add             w13, w13, #30*8
        str             w6,  [x0, #CNT]
        str             x7,  [x0, #DIF]
        lsr             w0,  w13, #4
        ret

6:      // pad_with_ones
        add             w8,  w6,  #-16
        ror             x8,  x8,  x8
        b               4b

7:      // refill_eob
        cmp             x3,  x4
        b.hs            6b

        ldr             x8,  [x4, #-8]
        lsl             w5,  w5,  #3
        lsr             x8,  x8,  x5
        add             w5,  w6,  #-48
        mvn             x8,  x8
        sub             w4,  w4,  w3                              // num_bytes_left
        rev             x8,  x8
        lsr             x8,  x8,  x5
        neg             w5,  w5
        lsr             w5,  w5,  #3
        cmp             w5,  w4
        csel            w5,  w5,  w4,  lo                         // num_bytes_read
        b               3b
endfunc

// msac_decode_bool_equi_neon
// In:  x0 = MsacContext
// Out: w0 = 1 if dif < vw, else 0
// Uses v = ((rng & ~0xff) + 8) >> 1. In the inline comments "ret" denotes the
// comparison dif >= vw (condition hs), the returned value in w15 is its
// inverse (condition lo).
function msac_decode_bool_equi_neon, export=1
        ldp             w5,  w6,  [x0, #RNG]                      // + CNT
        ldr             x7,  [x0, #DIF]
        bic             w4,  w5,  #0xff                           // r &= 0xff00
        add             w4,  w4,  #8
        subs            x8,  x7,  x4, lsl #47                     // dif - vw
        lsr             w4,  w4,  #1                              // v
        sub             w5,  w5,  w4                              // r - v
        cset            w15, lo
        csel            w4,  w5,  w4,  hs                         // if (ret) v = r - v;
        csel            x7,  x8,  x7,  hs                         // if (ret) dif = dif - vw;

        clz             w5,  w4                                   // clz(rng)
        eor             w5,  w5,  #16                             // d = clz(rng) ^ 16
        lsl             w4,  w4,  w5                              // rng << d
        subs            w6,  w6,  w5                              // cnt -= d
        lsl             x7,  x7,  x5                              // (dif - (v << 48)) << d
        str             w4,  [x0, #RNG]
        b.lo            L(refill)                                 // borrow: refill stores cnt/dif and returns w15

        str             w6,  [x0, #CNT]
        str             x7,  [x0, #DIF]
        mov             w0,  w15
        ret
endfunc

// msac_decode_bool_neon
// In:  x0 = MsacContext, w1 = f (its low 6 bits are cleared before use)
// Out: w0 = 1 if dif < vw, else 0
// Uses v = (((rng >> 8) * (f & ~63)) >> 7) + 4. As in the function above,
// "ret" in the inline comments is the hs condition, w15 is its inverse.
function msac_decode_bool_neon, export=1
        ldp             w5,  w6,  [x0, #RNG]                      // + CNT
        ldr             x7,  [x0, #DIF]
        lsr             w4,  w5,  #8                              // r >> 8
        bic             w1,  w1,  #0x3f                           // f &= ~63
        mul             w4,  w4,  w1
        lsr             w4,  w4,  #7
        add             w4,  w4,  #4                              // v
        subs            x8,  x7,  x4, lsl #48                     // dif - vw
        sub             w5,  w5,  w4                              // r - v
        cset            w15, lo
        csel            w4,  w5,  w4,  hs                         // if (ret) v = r - v;
        csel            x7,  x8,  x7,  hs                         // if (ret) dif = dif - vw;

        clz             w5,  w4                                   // clz(rng)
        eor             w5,  w5,  #16                             // d = clz(rng) ^ 16
        lsl             w4,  w4,  w5                              // rng << d
        subs            w6,  w6,  w5                              // cnt -= d
        lsl             x7,  x7,  x5                              // (dif - (v << 48)) << d
        str             w4,  [x0, #RNG]
        b.lo            L(refill)                                 // borrow: refill stores cnt/dif and returns w15

        str             w6,  [x0, #CNT]
        str             x7,  [x0, #DIF]
        mov             w0,  w15
        ret
endfunc

// msac_decode_bool_adapt_neon
// In:  x0 = MsacContext, x1 = cdf; cdf[0] (probability) and cdf[1] (count)
//      are fetched together with a single 32-bit load
// Out: w0 = 1 if dif < vw, else 0
// Same decode as msac_decode_bool_neon with f = cdf[0]. When
// s->allow_update_cdf is non-zero, cdf[0] and cdf[1] are rewritten.
function msac_decode_bool_adapt_neon, export=1
        ldr             w9,  [x1]                                 // cdf[0-1]
        ldp             w5,  w6,  [x0, #RNG]                      // + CNT
        ldr             x7,  [x0, #DIF]
        lsr             w4,  w5,  #8                              // r >> 8
        and             w2,  w9,  #0xffc0                         // f &= ~63
        mul             w4,  w4,  w2
        lsr             w4,  w4,  #7
        add             w4,  w4,  #4                              // v
        subs            x8,  x7,  x4, lsl #48                     // dif - vw
        sub             w5,  w5,  w4                              // r - v
        cset            w15, lo
        csel            w4,  w5,  w4,  hs                         // if (ret) v = r - v;
        csel            x7,  x8,  x7,  hs                         // if (ret) dif = dif - vw;

        ldr             w10, [x0, #ALLOW_UPDATE_CDF]

        clz             w5,  w4                                   // clz(rng)
        eor             w5,  w5,  #16                             // d = clz(rng) ^ 16

        cbz             w10, 1f                                   // skip adaptation if !allow_update_cdf

        lsr             w2,  w9,  #16                             // count = cdf[1]
        and             w9,  w9,  #0xffff                         // cdf[0]

        sub             w3,  w2,  w2, lsr #5                      // count - (count >= 32)
        lsr             w2,  w2,  #4                              // count >> 4
        add             w10, w3,  #1                              // count + (count < 32)
        add             w2,  w2,  #4                              // rate = (count >> 4) | 4

        sub             w9,  w9,  w15                             // cdf[0] -= bit
        sub             w11, w9,  w15, lsl #15                    // {cdf[0], cdf[0] - 32769}
        asr             w11, w11, w2                              // {cdf[0], cdf[0] - 32769} >> rate
        sub             w9,  w9,  w11                             // cdf[0]

        strh            w9,  [x1]
        strh            w10, [x1, #2]

1:
        lsl             w4,  w4,  w5                              // rng << d
        subs            w6,  w6,  w5                              // cnt -= d
        lsl             x7,  x7,  x5                              // (dif - (v << 48)) << d
        str             w4,  [x0, #RNG]
        b.lo            L(refill)                                 // borrow: refill stores cnt/dif and returns w15

        str             w6,  [x0, #CNT]
        str             x7,  [x0, #DIF]
        mov             w0,  w15
        ret
endfunc