@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@******************************************************************************
@* @file
@*  ihevc_inter_pred_luma_horz_w16out.s
@*
@* @brief
@*  contains function definitions for inter prediction interpolation.
@*  functions are coded in neon assembly and can be compiled using
@*  rvct
@*
@* @author
@*  parthiban v
@*
@* @par list of functions:
@*
@*  - ihevc_inter_pred_luma_horz_w16out()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*  inter prediction luma filter for horizontal 16bit output
@*
@* @par description:
@*  applies a horizontal filter with coefficients pointed to by 'pi1_coeff'
@*  to the elements pointed by 'pu1_src' and writes to the location pointed
@*  by 'pi2_dst'. no downshifting or clipping is done and the output is used
@*  as an input for vertical filtering or weighted prediction.
@*
@*  assumptions : the function is optimized considering the fact width is
@*  multiple of 4 or 8. if width is multiple of 4 then height should be
@*  multiple of 2, width 8 is optimized further.
@*
@* @param[in] pu1_src
@*  uword8 pointer to the source
@*
@* @param[out] pi2_dst
@*  word16 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride (the code scales it by 2 when stepping a row,
@*  i.e. it is counted in word16 elements)
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the filter coefficients
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_inter_pred_luma_horz_w16out(uword8 *pu1_src,
@                                       word16 *pi2_dst,
@                                       word32 src_strd,
@                                       word32 dst_strd,
@                                       word8 *pi1_coeff,
@                                       word32 ht,
@                                       word32 wd)

@ argument passing as used by the code below:
@   r0 = pu1_src, r1 = pi2_dst, r2 = src_strd, r3 = dst_strd,
@   pi1_coeff / ht / wd are read from the stack.
@
@ overview:
@   * the absolute values of the 8 coefficients are broadcast into d24-d31.
@     the sign of every tap is fixed by the instruction that applies it:
@     taps 1, 3, 4, 6 are accumulated with vmull/vmlal (added) and
@     taps 0, 2, 5, 7 with vmlsl (subtracted).
@   * rows are processed two at a time by one of three column kernels
@     (4, 8 or 16 outputs per row per iteration). a last odd row, if any,
@     is handled by the single-row 4-column kernel at height_residue_4.
@   * wd == 24 (even ht) is done as a 16-wide pass followed by an 8-wide
@     pass over the remaining columns; wd == 12 (even ht) as an 8-wide pass
@     followed by a 4-wide pass.
@   * the 16-bit accumulators are stored directly; the narrowing
@     vqrshrun instructions of the 8-bit variant are left disabled.
@
@ register usage:
@r0  - pu1_src (temporarily overwritten inside outer_loop_16, restored from stack)
@r1  - dst pointer, first row of the current row pair
@r2  - src_strd
@r3  - dst_strd
@r4  - pi1_coeff during setup, afterwards src pointer of the second row
@r5  - inner loop counter (columns left in the current row pair)
@r6  - dst pointer, second row of the current row pair
@r7  - ht / (ht & 1) during setup, then copy of the original dst pointer,
@      and odd-row counter in the height residue loop
@r8  - dst_strd - (columns handled per pass), used scaled by 2
@r9  - 2 * src_strd - (columns handled per pass)
@r10 - number of columns handled by the current pass
@r11 - #1 (post-increment step of the sliding window loads)
@r12 - src pointer, first row (starts at pu1_src - 3)
@r14 - row loop counter (ht with bit 0 cleared)
@d24-d31 - |coeff[0]| .. |coeff[7]| broadcast to all byte lanes
@q4, q5, q10, q11 - 16-bit accumulators

@ byte offsets of the stack arguments from sp once the prologue has pushed
@ 10 core registers (40 bytes) and d8-d15 (64 bytes)
.equ    coeff_offset,   104
.equ    ht_offset,      108
.equ    wd_offset,      112

.text
.align 4
.syntax unified

.globl ihevc_inter_pred_luma_horz_w16out_a9q

.type ihevc_inter_pred_luma_horz_w16out_a9q, %function

ihevc_inter_pred_luma_horz_w16out_a9q:

    @ NOTE(review): bit[0] of lr is cleared before lr is saved, and the saved
    @ value is later popped straight into pc. presumably this forces an
    @ arm-state return - confirm that no thumb caller relies on bit[0].
    bic         r14, #1                     @ clear bit[0] of the return address
    stmfd       sp!, {r4-r12, r14}          @ save r4-r12 and lr (40 bytes)
    vpush        {d8 - d15}                 @ save d8-d15 (64 bytes)
    ldr         r4,[sp,#coeff_offset]       @ r4 = pi1_coeff
    ldr         r7,[sp,#ht_offset]          @ r7 = ht
    vld1.8      {d0},[r4]                   @ d0 = the 8 filter coefficients
    sub         r14,r7,#0                   @ r14 = ht (plain copy, flags are not updated)
    vabs.s8     d2,d0                       @ d2 = absolute value of each coefficient
    mov         r11,#1                      @ r11 = 1, post-increment step for the window loads
    ldr         r10,[sp,#wd_offset]         @ r10 = wd
    vdup.8      d24,d2[0]                   @ d24 = |coeff[0]| in every lane
    sub         r12,r0,#3                   @ r12 = pu1_src - 3 (first tap position)
    vdup.8      d25,d2[1]                   @ d25 = |coeff[1]| in every lane
    add         r4,r12,r2                   @ r4 = r12 + src_strd (second row)
    vdup.8      d26,d2[2]                   @ d26 = |coeff[2]| in every lane
    rsb         r9,r10,r2,lsl #1            @ r9 = 2*src_strd - wd
    vdup.8      d27,d2[3]                   @ d27 = |coeff[3]| in every lane
    rsb         r8,r10,r3                   @ r8 = dst_strd - wd
    vdup.8      d28,d2[4]                   @ d28 = |coeff[4]| in every lane

    vdup.8      d29,d2[5]                   @ d29 = |coeff[5]| in every lane
    and         r7,r14,#1                   @ r7 = ht_residue = ht & 1
    vdup.8      d30,d2[6]                   @ d30 = |coeff[6]| in every lane
    sub         r14,r14,r7                  @ r14 = ht - ht_residue (even row count)
    vdup.8      d31,d2[7]                   @ d31 = |coeff[7]| in every lane

    cmp         r7,#1
    beq         odd_height_decision         @ odd ht: wd 24 and wd 12 pick a different kernel

@ kernel selection. also entered from odd_height_decision for every width
@ other than 24 and 12.
even_height_decision:
    mov         r7,r1                       @ keep the original dst pointer for the residual column passes
    cmp         r10,#4
    ble         outer_loop_4                @ wd <= 4: 4-column kernel

    cmp         r10,#24
    moveq       r10,#16                     @ wd == 24: first pass covers 16 columns ...
    addeq       r8,#8                       @ ... so both row-advance values skip the
    addeq       r9,#8                       @ 8 columns left for the second pass

    cmp         r10,#16
    bge         outer_loop_16_branch        @ 16 or more columns: 16-column kernel

    cmp         r10,#12
    addeq       r8,#4                       @ wd == 12: the 8-column pass leaves 4 columns,
    addeq       r9,#4                       @ skip them when advancing to the next row pair
outer_loop_8_branch:
    b           outer_loop_8

outer_loop_16_branch:
    b           outer_loop_16


@ ht is odd here
odd_height_decision:
    cmp         r10,#24
    beq         outer_loop_8_branch         @ wd == 24: whole width with the 8-column kernel
    cmp         r10,#12
    beq         outer_loop_4                @ wd == 12: whole width with the 4-column kernel
    b           even_height_decision        @ any other width: same selection as for even ht

@ second pass for wd == 12 (entered after the 8-column pass): columns 8..11.
@ NOTE(review): the row counter is hard-coded to 16 rather than derived from
@ ht - presumably wd == 12 only occurs together with ht == 16; confirm.
outer_loop4_residual:
    sub         r12,r0,#3                   @ r12 = pu1_src - 3
    mov         r1,r7                       @ r1 = original dst pointer
    add         r1,#16                      @ skip 8 outputs (16 bytes) already written per row
    mov         r10,#4                      @ 4 columns in this pass
    add         r12,#8                      @ skip the 8 source columns already filtered
    mov         r14,#16                     @ row counter (hard-coded, see note above)
    add         r8,#4                       @ row-advance values now account for
    add         r9,#4                       @ only 4 processed columns

@ 4-column kernel: two rows per outer iteration, 4 outputs per row per inner
@ iteration. both rows are packed into one set of d registers so that a
@ single multiply-accumulate chain filters them together.
outer_loop_4:
    add         r6,r1,r3,lsl #1             @ r6 = dst of the second row (r1 + 2*dst_strd bytes)
    add         r4,r12,r2                   @ r4 = src of the second row

    subs        r5,r10,#0                   @ r5 = columns to process, flags set for the check
    ble         end_inner_loop_4            @ nothing to do when the column count is <= 0

inner_loop_4:
    @ first row: d0..d7 = 8-byte windows starting at src + 0 .. src + 7
    vld1.u32    {d0},[r12],r11
    vld1.u32    {d1},[r12],r11
    vld1.u32    {d2},[r12],r11
    vld1.u32    {d3},[r12],r11
    vld1.u32    {d4},[r12],r11
    vld1.u32    {d5},[r12],r11
    vld1.u32    {d6},[r12],r11
    vld1.u32    {d7},[r12],r11
    sub         r12,r12,#4                  @ 8 single-byte steps minus 4: net advance of 4 columns
    @ (an older variant that built the shifted windows with vext from two
    @ loads is disabled; every shifted window is loaded directly instead)
    @ second row: d12..d19 = windows starting at src + 0 .. src + 7.
    @ vzip.32 dN,dM leaves {first row bytes 0-3, second row bytes 0-3} in dN.
    vld1.u32    {d12},[r4],r11
    vld1.u32    {d13},[r4],r11
    vzip.32     d0,d12
    vld1.u32    {d14},[r4],r11
    vzip.32     d1,d13
    vld1.u32    {d15},[r4],r11
    vzip.32     d2,d14
    vld1.u32    {d16},[r4],r11
    vzip.32     d3,d15
    vld1.u32    {d17},[r4],r11
    vzip.32     d4,d16
    vld1.u32    {d18},[r4],r11
    vzip.32     d5,d17
    vld1.u32    {d19},[r4],r11
    sub         r4,r4,#4                    @ net advance of 4 columns on the second row
    vzip.32     d6,d18
    vzip.32     d7,d19

    @ q4 lanes 0-3 (d8) = first row, lanes 4-7 (d9) = second row
    vmull.u8    q4,d1,d25                   @ q4  = window1 * |coeff[1]|
    vmlsl.u8    q4,d0,d24                   @ q4 -= window0 * |coeff[0]|
    vmlsl.u8    q4,d2,d26                   @ q4 -= window2 * |coeff[2]|
    vmlal.u8    q4,d3,d27                   @ q4 += window3 * |coeff[3]|
    vmlal.u8    q4,d4,d28                   @ q4 += window4 * |coeff[4]|
    vmlsl.u8    q4,d5,d29                   @ q4 -= window5 * |coeff[5]|
    vmlal.u8    q4,d6,d30                   @ q4 += window6 * |coeff[6]|
    vmlsl.u8    q4,d7,d31                   @ q4 -= window7 * |coeff[7]|
    @ (disabled) vqrshrun.s16 d8,q4,#6 - no rounding shift / narrowing here
    vst1.64     {d8},[r1]!                  @ store 4 outputs of the first row
    vst1.64     {d9},[r6]!                  @ store 4 outputs of the second row
    subs        r5,r5,#4                    @ 4 columns done
    bgt         inner_loop_4

end_inner_loop_4:
    subs        r14,r14,#2                  @ two rows done; flags are consumed by the bgt below
    add         r12,r12,r9                  @ src: advance to the next row pair
    add         r1,r6,r8,lsl #1             @ dst: advance to the next row pair (r8 scaled to bytes)
    bgt         outer_loop_4

@ last row when ht is odd. also entered from the 8- and 16-column paths.
height_residue_4:
    ldr         r7,[sp,#ht_offset]          @ r7 = ht
    and         r7,r7,#1                    @ r7 = ht_residue = ht & 1
    cmp         r7,#0
    beq         end_loops                   @ even ht: nothing left to do

outer_loop_height_residue_4:
    subs        r5,r10,#0                   @ r5 = columns to process, flags set for the check
    ble         end_inner_loop_height_residue_4

inner_loop_height_residue_4:
    @ single row: loads of the shifted windows interleaved with the filter
    vld1.u32    {d0},[r12],r11
    vld1.u32    {d1},[r12],r11
    vld1.u32    {d2},[r12],r11
    vmull.u8    q4,d1,d25                   @ q4  = window1 * |coeff[1]|
    vld1.u32    {d3},[r12],r11
    vmlsl.u8    q4,d0,d24                   @ q4 -= window0 * |coeff[0]|
    vld1.u32    {d4},[r12],r11
    vmlsl.u8    q4,d2,d26                   @ q4 -= window2 * |coeff[2]|
    vld1.u32    {d5},[r12],r11
    vmlal.u8    q4,d3,d27                   @ q4 += window3 * |coeff[3]|
    vld1.u32    {d6},[r12],r11
    vmlal.u8    q4,d4,d28                   @ q4 += window4 * |coeff[4]|
    vld1.u32    {d7},[r12],r11
    vmlsl.u8    q4,d5,d29                   @ q4 -= window5 * |coeff[5]|
    sub         r12,r12,#4                  @ net advance of 4 columns
    vmlal.u8    q4,d6,d30                   @ q4 += window6 * |coeff[6]|
    vmlsl.u8    q4,d7,d31                   @ q4 -= window7 * |coeff[7]|
    subs        r5,r5,#4                    @ 4 columns done; flags are consumed by the bgt below
    vst1.64     {d8},[r1]!                  @ store only lanes 0-3 of q4 (4 outputs)
    bgt         inner_loop_height_residue_4

end_inner_loop_height_residue_4:
    subs        r7,r7,#1                    @ r7 was 1 on entry, so the bgt below falls through
    rsb         r9,r10,r2                   @ r9 = src_strd - columns processed
    add         r12,r12,r9                  @ src: advance by one row
    @ NOTE(review): r8 is added without the "lsl #1" used everywhere else;
    @ the updated r1 is not used again because the loop runs only once.
    add         r1,r1,r8
    bgt         outer_loop_height_residue_4

    vpop         {d8 - d15}                 @ restore d8-d15
    ldmfd       sp!,{r4-r12,r15}            @ restore r4-r12 and return (saved lr is loaded into pc)

@ second pass for wd == 24 (entered from epilog_16): columns 16..23.
@ NOTE(review): the row counter is hard-coded to 32 rather than derived from
@ ht - presumably wd == 24 only occurs together with ht == 32; confirm.
outer_loop8_residual:
    sub         r12,r0,#3                   @ r12 = pu1_src - 3
    mov         r1,r7                       @ r1 = original dst pointer
    mov         r14,#32                     @ row counter (hard-coded, see note above)
    add         r1,#32                      @ skip 16 outputs (32 bytes) already written per row
    add         r12,#16                     @ skip the 16 source columns already filtered
    mov         r10,#8                      @ 8 columns in this pass
    add         r8,#8                       @ row-advance values now account for
    add         r9,#8                       @ only 8 processed columns

@ 8-column kernel: two rows per outer iteration, 8 outputs per row per inner
@ iteration (q4 = first row, q5 = second row).
outer_loop_8:

    add         r6,r1,r3,lsl #1             @ r6 = dst of the second row (r1 + 2*dst_strd bytes)
    add         r4,r12,r2                   @ r4 = src of the second row
    subs        r5,r10,#0                   @ r5 = columns to process, flags set for the check

    ble         end_inner_loop_8

inner_loop_8:
    @ first row: d0..d7 = 8-byte windows starting at src + 0 .. src + 7
    vld1.u32    {d0},[r12],r11
    vld1.u32    {d1},[r12],r11
    vld1.u32    {d2},[r12],r11
    vld1.u32    {d3},[r12],r11
    @ (the vext based window extraction of an older variant is disabled)
    vld1.u32    {d4},[r12],r11
    vmull.u8    q4,d1,d25                   @ q4  = window1 * |coeff[1]|
    vld1.u32    {d5},[r12],r11
    vmlal.u8    q4,d3,d27                   @ q4 += window3 * |coeff[3]|
    vld1.u32    {d6},[r12],r11
    vmlsl.u8    q4,d0,d24                   @ q4 -= window0 * |coeff[0]|
    vld1.u32    {d7},[r12],r11
    vmlsl.u8    q4,d2,d26                   @ q4 -= window2 * |coeff[2]|
    @ second row: d12..d19 = windows starting at src + 0 .. src + 7
    vld1.u32    {d12},[r4],r11
    vmlal.u8    q4,d4,d28                   @ q4 += window4 * |coeff[4]|
    vld1.u32    {d13},[r4],r11
    vmlsl.u8    q4,d5,d29                   @ q4 -= window5 * |coeff[5]|
    vld1.u32    {d14},[r4],r11
    vmlal.u8    q4,d6,d30                   @ q4 += window6 * |coeff[6]|
    vld1.u32    {d15},[r4],r11
    vmlsl.u8    q4,d7,d31                   @ q4 -= window7 * |coeff[7]|
    vld1.u32    {d16},[r4],r11
    vmull.u8    q5,d15,d27                  @ q5  = window3 * |coeff[3]|
    vld1.u32    {d17},[r4],r11
    vmlsl.u8    q5,d14,d26                  @ q5 -= window2 * |coeff[2]|
    vld1.u32    {d18},[r4],r11
    vmlal.u8    q5,d16,d28                  @ q5 += window4 * |coeff[4]|
    vld1.u32    {d19},[r4],r11
    vmlsl.u8    q5,d17,d29                  @ q5 -= window5 * |coeff[5]|
    @ (disabled) vqrshrun.s16 d20,q4,#6 - no rounding shift / narrowing here
    vmlal.u8    q5,d18,d30                  @ q5 += window6 * |coeff[6]|
    vmlsl.u8    q5,d19,d31                  @ q5 -= window7 * |coeff[7]|
    vst1.16     {q4},[r1]!                  @ store 8 outputs of the first row
    vmlsl.u8    q5,d12,d24                  @ q5 -= window0 * |coeff[0]|
    vmlal.u8    q5,d13,d25                  @ q5 += window1 * |coeff[1]|
    @ (disabled) vqrshrun.s16 d8,q5,#6 - no rounding shift / narrowing here
    subs        r5,r5,#8                    @ 8 columns done
    vst1.16     {q5},[r6]!                  @ store 8 outputs of the second row
    cmp         r5,#4
    bgt         inner_loop_8                @ continue only while more than 4 columns remain

end_inner_loop_8:
    subs        r14,r14,#2                  @ two rows done; flags are consumed by the bgt below
    add         r12,r12,r9                  @ src: advance to the next row pair
    add         r1,r6,r8,lsl #1             @ dst: advance to the next row pair (r8 scaled to bytes)
    bgt         outer_loop_8

    ldr         r10,[sp,#wd_offset]         @ r10 = wd (original width)
    cmp         r10,#12

    beq         outer_loop4_residual        @ wd == 12: do the remaining 4 columns

    ldr         r7,[sp,#ht_offset]          @ r7 = ht
    and         r7,r7,#1
    cmp         r7,#1
    beq         height_residue_4            @ odd ht: filter the last row

    vpop         {d8 - d15}                 @ restore d8-d15
    ldmfd       sp!,{r4-r12,r15}            @ restore r4-r12 and return (saved lr is loaded into pc)

@ 16-column kernel, software pipelined: q4/q10 hold columns 0-7/8-15 of the
@ first row, q5/q11 the same for the second row. each q load fetches a
@ 16-byte window; its low d register feeds columns 0-7 and its high d
@ register columns 8-15.
@ NOTE(review): the loop is left only when r14 becomes exactly 0 after being
@ decreased by 2 - this assumes r14 is a positive even number on entry;
@ confirm against the callers.
outer_loop_16:
    str         r0, [sp, #-4]!              @ push r0 (pu1_src), r0 is overwritten below
    str         r7, [sp, #-4]!              @ push r7 (original dst pointer)

    add         r6,r1,r3,lsl #1             @ r6 = dst of the second row (r1 + 2*dst_strd bytes)
    add         r4,r12,r2                   @ r4 = src of the second row
    and         r0, r12, #31                @ r0 = r12 & 31; the compare that read it is disabled below
    sub         r5,r10,#0                   @ r5 = columns to process (plain copy, flags are not updated)
    pld         [r12, r2, lsl #1]           @ prefetch first row + 2*src_strd
    vld1.u32    {q0},[r12],r11              @ q0..q3, q6..q9 = 16-byte windows at src + 0 .. src + 7
    pld         [r4, r2, lsl #1]            @ prefetch second row + 2*src_strd
    vld1.u32    {q1},[r12],r11
    vld1.u32    {q2},[r12],r11
    vld1.u32    {q3},[r12],r11
    vld1.u32    {q6},[r12],r11
    vmull.u8    q4,d2,d25                   @ q4  = window1 * |coeff[1]|   (first row, columns 0-7)
    vld1.u32    {q7},[r12],r11
    vmlal.u8    q4,d6,d27                   @ q4 += window3 * |coeff[3]|
    vld1.u32    {q8},[r12],r11
    vmlsl.u8    q4,d0,d24                   @ q4 -= window0 * |coeff[0]|
    vld1.u32    {q9},[r12],r11
    vmlsl.u8    q4,d4,d26                   @ q4 -= window2 * |coeff[2]|
    vmlal.u8    q4,d12,d28                  @ q4 += window4 * |coeff[4]|
    vmlsl.u8    q4,d14,d29                  @ q4 -= window5 * |coeff[5]|
    vmlal.u8    q4,d16,d30                  @ q4 += window6 * |coeff[6]|
    vmlsl.u8    q4,d18,d31                  @ q4 -= window7 * |coeff[7]|


inner_loop_16:

    @ the Z flag from this subs is consumed by the addeq/subeq instructions
    @ further down; nothing in between updates the flags.
    subs        r5,r5,#16                   @ 16 columns of the row pair done
    vmull.u8    q10,d3,d25                  @ q10  = window1 * |coeff[1]|  (first row, columns 8-15)

    add         r12,#8                      @ 8 single-byte steps + 8: net advance of 16 columns
    vmlsl.u8    q10,d1,d24                  @ q10 -= window0 * |coeff[0]|

    vld1.u32    {q0},[r4],r11               @ reload q0..q3, q6..q9 with the second row windows
    vmlal.u8    q10,d7,d27                  @ q10 += window3 * |coeff[3]|

    vld1.u32    {q1},[r4],r11
    vmlsl.u8    q10,d5,d26                  @ q10 -= window2 * |coeff[2]|

    vld1.u32    {q2},[r4],r11
    vmlal.u8    q10,d13,d28                 @ q10 += window4 * |coeff[4]|

    vld1.u32    {q3},[r4],r11
    vmlal.u8    q10,d17,d30                 @ q10 += window6 * |coeff[6]|

    vld1.u32    {q6},[r4],r11
    vmlsl.u8    q10,d15,d29                 @ q10 -= window5 * |coeff[5]|

    vld1.u32    {q7},[r4],r11
    vmlsl.u8    q10,d19,d31                 @ q10 -= window7 * |coeff[7]|

    vld1.u32    {q8},[r4],r11
    vmull.u8    q5,d2,d25                   @ q5  = window1 * |coeff[1]|   (second row, columns 0-7)

    vld1.u32    {q9},[r4],r11
    vmlal.u8    q5,d6,d27                   @ q5 += window3 * |coeff[3]|

    add         r4,#8                       @ net advance of 16 columns on the second row
    vmlsl.u8    q5,d0,d24                   @ q5 -= window0 * |coeff[0]|
    pld         [r12, r2, lsl #2]           @ prefetch first row + 4*src_strd
    pld         [r4, r2, lsl #2]            @ prefetch second row + 4*src_strd
    vst1.8      {q4},[r1]!                  @ store first row, columns 0-7
    vmlsl.u8    q5,d4,d26                   @ q5 -= window2 * |coeff[2]|

    addeq       r12,r12,r9                  @ row pair finished: src advances to the next row pair
    vmlal.u8    q5,d12,d28                  @ q5 += window4 * |coeff[4]|

    addeq       r4,r12,r2                   @ row pair finished: r4 = src of the new second row
    vmlsl.u8    q5,d14,d29                  @ q5 -= window5 * |coeff[5]|

    @ (disabled) and r7, r12, #31
    vmlal.u8    q5,d16,d30                  @ q5 += window6 * |coeff[6]|

    subeq       r14,r14,#2                  @ row pair finished: two rows done
    vmlsl.u8    q5,d18,d31                  @ q5 -= window7 * |coeff[7]|

    @ (disabled) cmp r7, r0
    vmull.u8    q11,d3,d25                  @ q11  = window1 * |coeff[1]|  (second row, columns 8-15)

    @ (disabled) pld [r12, r2, lsl #2]
    vmlsl.u8    q11,d1,d24                  @ q11 -= window0 * |coeff[0]|
    vst1.16     {q10},[r1]!                 @ store first row, columns 8-15
    vmlal.u8    q11,d7,d27                  @ q11 += window3 * |coeff[3]|

    @ (disabled) pld [r4, r2, lsl #2]
    vmlsl.u8    q11,d5,d26                  @ q11 -= window2 * |coeff[2]|

    @ (disabled) mov r0, r7
    vmlal.u8    q11,d13,d28                 @ q11 += window4 * |coeff[4]|

    cmp         r14,#0                      @ all rows done? flags are consumed by beq epilog_16
    vmlal.u8    q11,d17,d30                 @ q11 += window6 * |coeff[6]|

    vst1.16     {q5},[r6]!                  @ store second row, columns 0-7
    vmlsl.u8    q11,d15,d29                 @ q11 -= window5 * |coeff[5]|
    vmlsl.u8    q11,d19,d31                 @ q11 -= window7 * |coeff[7]|
    beq         epilog_16                   @ r14 == 0: only q11 is left to store

    @ start of the next iteration: load the first row windows and compute q4
    vld1.u32    {q0},[r12],r11
    vld1.u32    {q1},[r12],r11
    vld1.u32    {q2},[r12],r11
    vld1.u32    {q3},[r12],r11
    vld1.u32    {q6},[r12],r11
    vmull.u8    q4,d2,d25                   @ q4  = window1 * |coeff[1]|   (first row, columns 0-7)
    vld1.u32    {q7},[r12],r11
    vmlal.u8    q4,d6,d27                   @ q4 += window3 * |coeff[3]|
    vld1.u32    {q8},[r12],r11
    vmlsl.u8    q4,d0,d24                   @ q4 -= window0 * |coeff[0]|
    vld1.u32    {q9},[r12],r11
    vmlsl.u8    q4,d4,d26                   @ q4 -= window2 * |coeff[2]|
    vmlal.u8    q4,d12,d28                  @ q4 += window4 * |coeff[4]|
    cmp         r5,#0                       @ was the row pair finished by the previous iteration?
    vmlsl.u8    q4,d14,d29                  @ q4 -= window5 * |coeff[5]|
    moveq       r5,r10                      @ yes: reload the column counter
    vmlal.u8    q4,d16,d30                  @ q4 += window6 * |coeff[6]|
    vst1.8      {q11},[r6]!                 @ store second row, columns 8-15
    vmlsl.u8    q4,d18,d31                  @ q4 -= window7 * |coeff[7]|
    addeq       r1,r6,r8,lsl #1             @ yes: dst advances to the next row pair
    addeq       r6,r1,r3,lsl #1             @ yes: r6 = dst of the new second row
    b           inner_loop_16


epilog_16:
    @ (disabled) vqrshrun.s16 d11,q11,#6 - no rounding shift / narrowing here
    vst1.8      {q11},[r6]!                 @ store second row, columns 8-15 of the last row pair

    ldr         r7, [sp], #4                @ pop r7 (original dst pointer)
    ldr         r0, [sp], #4                @ pop r0 (pu1_src)
    ldr         r10,[sp,#wd_offset]         @ r10 = wd (original width)
    cmp         r10,#24
    beq         outer_loop8_residual        @ wd == 24: do the remaining 8 columns
    add         r1,r6,r8,lsl #1             @ dst: start of the row after the last row pair
    ldr         r7,[sp,#ht_offset]          @ r7 = ht
    and         r7,r7,#1
    cmp         r7,#1
    beq         height_residue_4            @ odd ht: filter the last row

end_loops:
    vpop         {d8 - d15}                 @ restore d8-d15
    ldmfd       sp!,{r4-r12,r15}            @ restore r4-r12 and return (saved lr is loaded into pc)