/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/

/**
*******************************************************************************
* @file
*  ih264_inter_pred_filters.c
*
* @brief
*  Contains function definitions for inter prediction interpolation filters
*
* @author
*  ittiam
*
* @par List of Functions:
*  - ih264_inter_pred_luma_copy
*  - ih264_interleave_copy
*  - ih264_inter_pred_luma_horz
*  - ih264_inter_pred_luma_vert
*  - ih264_inter_pred_luma_horz_hpel_vert_hpel
*  - ih264_inter_pred_luma_horz_qpel
*  - ih264_inter_pred_luma_vert_qpel
*  - ih264_inter_pred_luma_horz_qpel_vert_qpel
*  - ih264_inter_pred_luma_horz_hpel_vert_qpel
*  - ih264_inter_pred_luma_horz_qpel_vert_hpel
*  - ih264_inter_pred_luma_bilinear
*  - ih264_inter_pred_chroma
*
* @remarks
*  none
*
*******************************************************************************
*/

/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/

/* User include files */
#include "ih264_typedefs.h"
#include "ih264_macros.h"
#include "ih264_inter_pred_filters.h"
#include "ih264_platform_macros.h"


/*****************************************************************************/
/* Constant Data variables                                                   */
/*****************************************************************************/
/**
 ******************************************************************************
 * @brief coefficients for 6 tap filtering
 ******************************************************************************
 */
const WORD32 ih264_g_six_tap[3] =
{
    1, -5, 20
};


/*****************************************************************************/
/*  Function definitions .                                                   */
/*****************************************************************************/

/**
*******************************************************************************
*
* @brief function for luma copy
*
* @par Description:
*  Copies the array of width 'wd' and height 'ht' from the location pointed
*  by 'src' to the location pointed by 'dst'
*
* @param[in] pu1_src
*  pointer to the source
*
* @param[out] pu1_dst
*  pointer to the destination
*
* @param[in] src_strd
*  source stride
*
* @param[in] dst_strd
*  destination stride
*
* @param[in] ht
*  height of the array
*
* @param[in] wd
*  width of the array
*
* @param[in] pu1_tmp
*  temporary buffer
*
* @param[in] dydx
*  x and y reference offset for qpel calculations
*
* @returns
*
* @remarks
*  none
*
*******************************************************************************
*/
void ih264_inter_pred_luma_copy(UWORD8 *pu1_src,
                                UWORD8 *pu1_dst,
                                WORD32 src_strd,
                                WORD32 dst_strd,
                                WORD32 ht,
                                WORD32 wd,
                                UWORD8 *pu1_tmp,
                                WORD32 dydx)
{
    WORD32 row, col;

    UNUSED(pu1_tmp);
    UNUSED(dydx);
    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < wd; col++)
        {
            pu1_dst[col] = pu1_src[col];
        }

        pu1_src += src_strd;
        pu1_dst += dst_strd;
    }
}

/**
*******************************************************************************
*
* @brief interleaved copy
*
* @par Description:
*  Copies the array of width 'wd' and height 'ht' from the location pointed
*  by 'src' to the location pointed by 'dst'
*
* @param[in] pu1_src
*  pointer to the source
*
* @param[out] pu1_dst
*  pointer to the destination
*
* @param[in] src_strd
*  source stride
*
* @param[in] dst_strd
*  destination stride
*
* @param[in] ht
*  height of the array
*
* @param[in] wd
*  width of the array
*
* @returns
*
* @remarks
*  The alternate elements of src will be copied to alternate locations in dsr
*  Other locations are not touched
*
*******************************************************************************
*/
void ih264_interleave_copy(UWORD8 *pu1_src,
                           UWORD8 *pu1_dst,
                           WORD32 src_strd,
                           WORD32 dst_strd,
                           WORD32 ht,
                           WORD32 wd)
{
    WORD32 row, col;

    wd *= 2;
    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < wd; col += 2)
        {
            pu1_dst[col] = pu1_src[col];
        }
        pu1_src += src_strd;
        pu1_dst += dst_strd;
    }
}

/**
*******************************************************************************
*
* @brief
*  Luma hpel Horizontal Interprediction
*
* @par Description:
*  Applies a 6 tap horizontal filter. The output is clipped to 8 bits.
*  Refer sec 8.4.2.2.1 titled "Luma sample interpolation process"
*
* @param[in] pu1_src
*  pointer to the source
*
* @param[out] pu1_dst
*  pointer to the destination
*
* @param[in] src_strd
*  source stride
*
* @param[in] dst_strd
*  destination stride
*
* @param[in] ht
*  height of the array
*
* @param[in] wd
*  width of the array
*
* @param[in] pu1_tmp
*  temporary buffer
*
* @param[in] dydx
*  x and y reference offset for qpel calculations
*
* @returns
*
* @remarks
*  none
*
*******************************************************************************
*/
void ih264_inter_pred_luma_horz(UWORD8 *pu1_src,
                                UWORD8 *pu1_dst,
                                WORD32 src_strd,
                                WORD32 dst_strd,
                                WORD32 ht,
                                WORD32 wd,
                                UWORD8 *pu1_tmp,
                                WORD32 dydx)
{
    WORD32 row, col;
    WORD16 i2_tmp;

    UNUSED(pu1_tmp);
    UNUSED(dydx);
    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < wd; col++)
        {
            i2_tmp = ih264_g_six_tap[0] * (pu1_src[col - 2] + pu1_src[col + 3])
                     + ih264_g_six_tap[1] * (pu1_src[col - 1] + pu1_src[col + 2])
                     + ih264_g_six_tap[2] * (pu1_src[col] + pu1_src[col + 1]);
            i2_tmp = (i2_tmp + 16) >> 5;
            pu1_dst[col] = CLIP_U8(i2_tmp);
        }
        pu1_src += src_strd;
        pu1_dst += dst_strd;
    }
}

/**
*******************************************************************************
*
* @brief
*  Luma hpel Vertical Interprediction
*
* @par Description:
*  Applies a 6 tap vertical filter.The output is clipped to 8 bits
*  Refer sec 8.4.2.2.1 titled "Luma sample interpolation process"
*
* @param[in] pu1_src
*  pointer to the source
*
* @param[out] pu1_dst
*  pointer to the destination
*
* @param[in] src_strd
*  source stride
*
* @param[in] dst_strd
*  destination stride
*
* @param[in] ht
*  height of the array
*
* @param[in] wd
*  width of the array
*
* @param[in] pu1_tmp
*  temporary buffer
*
* @param[in] dydx
*  x and y reference offset for qpel calculations
*
* @returns
*
* @remarks
*  none
*
*******************************************************************************
*/
void ih264_inter_pred_luma_vert(UWORD8 *pu1_src,
                                UWORD8 *pu1_dst,
                                WORD32 src_strd,
                                WORD32 dst_strd,
                                WORD32 ht,
                                WORD32 wd,
                                UWORD8 *pu1_tmp,
                                WORD32 dydx)
{
    WORD32 row, col;
    WORD16 i2_tmp;

    UNUSED(pu1_tmp);
    UNUSED(dydx);
    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < wd; col++)
        {
            i2_tmp = ih264_g_six_tap[0] * (pu1_src[col - 2 * src_strd] + pu1_src[col + 3 * src_strd])
                     + ih264_g_six_tap[1] * (pu1_src[col - 1 * src_strd] + pu1_src[col + 2 * src_strd])
                     + ih264_g_six_tap[2] * (pu1_src[col] + pu1_src[col + 1 * src_strd]);
            i2_tmp = (i2_tmp + 16) >> 5;
            pu1_dst[col] = CLIP_U8(i2_tmp);
        }
        pu1_src += src_strd;
        pu1_dst += dst_strd;
    }
}

/**
*******************************************************************************
*
* @brief
*  Luma Horizontal hpel & Vertical hpel Interprediction
*
* @par Description:
*  This function implements a two stage cascaded six tap filter. It
*  applies the six tap filter in the horizontal direction on the
*  predictor values, followed by applying the same filter in the
*  vertical direction on the output of the first stage. The six tap
*  filtering operation is described in sec 8.4.2.2.1 titled "Luma sample
*  interpolation process"
*
* @param[in] pu1_src
*  pointer to the source
*
* @param[out] pu1_dst
*  pointer to the destination
*
* @param[in] src_strd
*  source stride
*
* @param[in] dst_strd
*  destination stride
*
* @param[in] ht
*  height of the array
*
* @param[in] wd
*  width of the array
*
* @param[in] pu1_tmp
*  temporary buffer
*
* @param[in] dydx
*  x and y reference offset for qpel calculations
*
* @returns
*
* @remarks
*  This function takes the 8 bit predictor values, applies the six tap
*  filter in the horizontal direction and outputs the result clipped to
*  8 bit precision. The input is stored in the buffer pointed to by
*  pu1_src while the output is stored in the buffer pointed by pu1_dst.
*  Both pu1_src and pu1_dst could point to the same buffer i.e. the
*  six tap filter could be done in place.
*
*******************************************************************************
*/
void ih264_inter_pred_luma_horz_hpel_vert_hpel(UWORD8 *pu1_src,
                                               UWORD8 *pu1_dst,
                                               WORD32 src_strd,
                                               WORD32 dst_strd,
                                               WORD32 ht,
                                               WORD32 wd,
                                               UWORD8 *pu1_tmp,
                                               WORD32 dydx)
{
    WORD32 row, col;
    WORD32 tmp;
    WORD16 *pi2_pred1_temp;
    WORD16 *pi2_pred1;

    UNUSED(dydx);
    pi2_pred1_temp = (WORD16*)pu1_tmp;
    pi2_pred1_temp += 2;
    pi2_pred1 = pi2_pred1_temp;

    for(row = 0; row < ht; row++)
    {
        for(col = -2; col < wd + 3; col++)
        {
            tmp = ih264_g_six_tap[0] * (pu1_src[col - 2 * src_strd] + pu1_src[col + 3 * src_strd])
                  + ih264_g_six_tap[1] * (pu1_src[col - 1 * src_strd] + pu1_src[col + 2 * src_strd])
                  + ih264_g_six_tap[2] * (pu1_src[col] + pu1_src[col + 1 * src_strd]);
            pi2_pred1_temp[col] = tmp;
        }
        pu1_src += src_strd;
        pi2_pred1_temp = pi2_pred1_temp + wd + 5;
    }

    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < wd; col++)
        {
            tmp = ih264_g_six_tap[0] * (pi2_pred1[col - 2] + pi2_pred1[col + 3])
                  + ih264_g_six_tap[1] * (pi2_pred1[col - 1] + pi2_pred1[col + 2])
                  + ih264_g_six_tap[2] * (pi2_pred1[col] + pi2_pred1[col + 1]);
            tmp = (tmp + 512) >> 10;
            pu1_dst[col] = CLIP_U8(tmp);
        }
        pi2_pred1 += (wd + 5);
        pu1_dst += dst_strd;
    }
}

/**
*******************************************************************************
*
* @brief
*  Luma Horizontal qpel Interprediction
*
* @par Description:
*  This routine applies the six tap filter to the predictors in the
*  horizontal direction and interpolates them to obtain pixels at quarter
*  horizontal positions (1/4, 0) and (3/4, 0). The six tap filtering operation
*  is described in sec 8.4.2.2.1 titled "Luma sample interpolation process"
*
* @param[in] pu1_src
*  Pointer to the buffer containing the predictor values.
*
* @param[out] pu1_dst
*  Pointer to the destination buffer where the output of the six tap filter is
*  stored
*
* @param[in] src_strd
*  source stride
*
* @param[in] dst_strd
*  destination stride
*
* @param[in] ht
*  Height of the rectangular pixel grid to be interpolated
*
* @param[in] wd
*  Width of the rectangular pixel grid to be interpolated
*
* @param[in] pu1_tmp
*  temporary buffer. UNUSED in this function
*
* @param[in] dydx
*  x and y reference offset for qpel calculations
*
* @returns
*
* @remarks
*  This function takes the 8 bit predictor values, applies the six tap
*  filter in the horizontal direction and outputs the result clipped to
*  8 bit precision. The input is stored in the buffer pointed to by
*  pu1_src while the output is stored in the buffer pointed by pu1_dst.
*  Both pu1_src and pu1_dst could point to the same buffer i.e. the
*  six tap filter could be done in place.
*
*******************************************************************************
*/
void ih264_inter_pred_luma_horz_qpel(UWORD8 *pu1_src,
                                     UWORD8 *pu1_dst,
                                     WORD32 src_strd,
                                     WORD32 dst_strd,
                                     WORD32 ht,
                                     WORD32 wd,
                                     UWORD8* pu1_tmp,
                                     WORD32 dydx)
{
    WORD32 row, col;
    UWORD8 *pu1_pred1;
    WORD32 x_offset = dydx & 0x3;
    UNUSED(pu1_tmp);
    pu1_pred1 = pu1_src + (x_offset >> 1);

    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < wd; col++, pu1_src++, pu1_dst++)
        {
            WORD16 i2_temp;
            /* The logic below implements the following equation
             i2_temp = puc_pred[-2] - 5 * (puc_pred[-1] + puc_pred[2]) +
             20 * (puc_pred[0] + puc_pred[1]) + puc_pred[3]; */
            i2_temp = pu1_src[-2] + pu1_src[3]
                      - (pu1_src[-1] + pu1_src[2])
                      + ((pu1_src[0] + pu1_src[1] - pu1_src[-1] - pu1_src[2]) << 2)
                      + ((pu1_src[0] + pu1_src[1]) << 4);
            i2_temp = (i2_temp + 16) >> 5;
            i2_temp = CLIP_U8(i2_temp);
            *pu1_dst = (i2_temp + *pu1_pred1 + 1) >> 1;

            pu1_pred1++;
        }
        pu1_dst += dst_strd - wd;
        pu1_src += src_strd - wd;
        pu1_pred1 += src_strd - wd;
    }
}

/**
*******************************************************************************
*
* @brief
*  Luma vertical qpel Interprediction
*
* @par Description:
*  This routine applies the six tap filter to the predictors in the
*  vertical direction and interpolates them to obtain pixels at quarter
*  vertical positions (0, 1/4) and (0, 3/4). The six tap filtering operation
*  is described in sec 8.4.2.2.1 titled "Luma sample interpolation process"
*
* @param[in] pu1_src
*  Pointer to the buffer containing the predictor values.
*
* @param[out] pu1_dst
*  Pointer to the destination buffer where the output of the six tap filter is
*  stored
*
* @param[in] src_strd
*  source stride
*
* @param[in] dst_strd
*  destination stride
*
* @param[in] ht
*  Height of the rectangular pixel grid to be interpolated
*
* @param[in] wd
*  Width of the rectangular pixel grid to be interpolated
*
* @param[in] pu1_tmp
*  temporary buffer. UNUSED in this function
*
* @param[in] dydx
*  x and y reference offset for qpel calculations
*
* @returns
*
* @remarks
*  This function takes the 8 bit predictor values, applies the six tap
*  filter in the horizontal direction and outputs the result clipped to
*  8 bit precision. The input is stored in the buffer pointed to by
*  pu1_src while the output is stored in the buffer pointed by pu1_dst.
*  Both pu1_src and pu1_dst could point to the same buffer i.e. the
*  six tap filter could be done in place.
*
*******************************************************************************
*/
void ih264_inter_pred_luma_vert_qpel(UWORD8 *pu1_src,
                                     UWORD8 *pu1_dst,
                                     WORD32 src_strd,
                                     WORD32 dst_strd,
                                     WORD32 ht,
                                     WORD32 wd,
                                     UWORD8* pu1_tmp,
                                     WORD32 dydx)
{
    WORD32 row, col;
    WORD32 y_offset = dydx >> 2;
    WORD32 off1, off2, off3;
    UWORD8 *pu1_pred1;
    UNUSED(pu1_tmp);
    y_offset = y_offset & 0x3;

    off1 = src_strd;
    off2 = src_strd << 1;
    off3 = off1 + off2;

    pu1_pred1 = pu1_src + (y_offset >> 1) * src_strd;

    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < wd; col++, pu1_dst++, pu1_src++, pu1_pred1++)
        {
            WORD16 i2_temp;
            /* The logic below implements the following equation
             i16_temp = puc_pred[-2*src_strd] + puc_pred[3*src_strd] -
             5 * (puc_pred[-1*src_strd] + puc_pred[2*src_strd])  +
             20 * (puc_pred[0] + puc_pred[src_strd]); */
            i2_temp = pu1_src[-off2] + pu1_src[off3]
                       - (pu1_src[-off1] + pu1_src[off2])
                       + ((pu1_src[0] + pu1_src[off1] - pu1_src[-off1] - pu1_src[off2]) << 2)
                       + ((pu1_src[0] + pu1_src[off1]) << 4);
            i2_temp = (i2_temp + 16) >> 5;
            i2_temp = CLIP_U8(i2_temp);

            *pu1_dst = (i2_temp + *pu1_pred1 + 1) >> 1;
        }
        pu1_src += src_strd - wd;
        pu1_pred1 += src_strd - wd;
        pu1_dst += dst_strd - wd;
    }
}

/**
*******************************************************************************
*
* @brief
*  Luma Horizontal qpel and vertical qpel Interprediction
*
* @par Description:
*  This routine applies the six tap filter to the predictors in the
*  vertical and horizontal direction averages them to get pixels at locations
*  (1/4, 1/4), (1/4, 3/4), (3/4, 1/4) & (3/4, 3/4). The six tap filtering operation
*  is described in sec 8.4.2.2.1 titled "Luma sample interpolation process"
*
* @param[in] pu1_src
*  Pointer to the buffer containing the predictor values.
*
* @param[out] pu1_dst
*  Pointer to the destination buffer where the output of the six tap filter is
*  stored
*
* @param[in] src_strd
*  source stride
*
* @param[in] dst_strd
*  destination stride
*
* @param[in] ht
*  Height of the rectangular pixel grid to be interpolated
*
* @param[in] wd
*  Width of the rectangular pixel grid to be interpolated
*
* @param[in] pu1_tmp
*  temporary buffer. UNUSED in this function
*
* @param[in] dydx
*  x and y reference offset for qpel calculations
*
* @returns
*
* @remarks
*  This function takes the 8 bit predictor values, applies the six tap
*  filter in the horizontal direction and outputs the result clipped to
*  8 bit precision. The input is stored in the buffer pointed to by
*  pu1_src while the output is stored in the buffer pointed by pu1_dst.
*  Both pu1_src and pu1_dst could point to the same buffer i.e. the
*  six tap filter could be done in place.
*
*******************************************************************************
*/
void ih264_inter_pred_luma_horz_qpel_vert_qpel(UWORD8 *pu1_src,
                                               UWORD8 *pu1_dst,
                                               WORD32 src_strd,
                                               WORD32 dst_strd,
                                               WORD32 ht,
                                               WORD32 wd,
                                               UWORD8* pu1_tmp,
                                               WORD32 dydx)
{
    WORD32 row, col;
    WORD32 x_offset = dydx & 0x3;
    WORD32 y_offset = dydx >> 2;

    WORD32 off1, off2, off3;
    UWORD8* pu1_pred_vert, *pu1_pred_horz;
    UNUSED(pu1_tmp);
    y_offset = y_offset & 0x3;

    off1 = src_strd;
    off2 = src_strd << 1;
    off3 = off1 + off2;

    pu1_pred_horz = pu1_src + (y_offset >> 1) * src_strd;
    pu1_pred_vert = pu1_src + (x_offset >> 1);

    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < wd;
                        col++, pu1_dst++, pu1_pred_vert++, pu1_pred_horz++)
        {
            WORD16 i2_temp_vert, i2_temp_horz;
            /* The logic below implements the following equation
             i2_temp = puc_pred[-2*src_strd] + puc_pred[3*src_strd] -
             5 * (puc_pred[-1*src_strd] + puc_pred[2*src_strd])  +
             20 * (puc_pred[0] + puc_pred[src_strd]); */
            i2_temp_vert = pu1_pred_vert[-off2] + pu1_pred_vert[off3]
                            - (pu1_pred_vert[-off1] + pu1_pred_vert[off2])
                            + ((pu1_pred_vert[0] + pu1_pred_vert[off1]
                                            - pu1_pred_vert[-off1]
                                            - pu1_pred_vert[off2]) << 2)
                            + ((pu1_pred_vert[0] + pu1_pred_vert[off1]) << 4);
            i2_temp_vert = (i2_temp_vert + 16) >> 5;
            i2_temp_vert = CLIP_U8(i2_temp_vert);

            /* The logic below implements the following equation
             i16_temp = puc_pred[-2] - 5 * (puc_pred[-1] + puc_pred[2]) +
             20 * (puc_pred[0] + puc_pred[1]) + puc_pred[3]; */
            i2_temp_horz = pu1_pred_horz[-2] + pu1_pred_horz[3]
                            - (pu1_pred_horz[-1] + pu1_pred_horz[2])
                            + ((pu1_pred_horz[0] + pu1_pred_horz[1]
                                            - pu1_pred_horz[-1]
                                            - pu1_pred_horz[2]) << 2)
                            + ((pu1_pred_horz[0] + pu1_pred_horz[1]) << 4);
            i2_temp_horz = (i2_temp_horz + 16) >> 5;
            i2_temp_horz = CLIP_U8(i2_temp_horz);
            *pu1_dst = (i2_temp_vert + i2_temp_horz + 1) >> 1;
        }
        pu1_pred_vert += (src_strd - wd);
        pu1_pred_horz += (src_strd - wd);
        pu1_dst += (dst_strd - wd);
    }
}

/**
*******************************************************************************
*
* @brief
*  Luma Horizontal qpel and vertical hpel Interprediction
*
* @par Description:
*  This routine applies the six tap filter to the predictors in the vertical
*  and horizontal direction to obtain the pixel at (1/2,1/2). It then interpolates
*  pixel at (0, 1/2) and (1/2, 1/2) to obtain pixel at (1/4, 1/2). Similarly for
*  (3/4,1/2). The six tap filtering operation is described in sec 8.4.2.2.1
*  titled "Luma sample interpolation process"
*
* @param[in] pu1_src
*  Pointer to the buffer containing the predictor values.
*
* @param[out] pu1_dst
*  Pointer to the destination buffer where the output of the six tap filter is
*  stored
*
* @param[in] src_strd
*  source stride
*
* @param[in] dst_strd
*  destination stride
*
* @param[in] ht
*  Height of the rectangular pixel grid to be interpolated
*
* @param[in] wd
*  Width of the rectangular pixel grid to be interpolated
*
* @param[in] pu1_tmp
*  temporary buffer. UNUSED in this function
*
* @param[in] dydx
*  x and y reference offset for qpel calculations
*
* @returns
*
* @remarks
*  This function takes the 8 bit predictor values, applies the six tap
*  filter in the horizontal direction and outputs the result clipped to
*  8 bit precision. The input is stored in the buffer pointed to by
*  pu1_src while the output is stored in the buffer pointed by pu1_dst.
*  Both pu1_src and pu1_dst could point to the same buffer i.e. the
*  six tap filter could be done in place.
*
*******************************************************************************
*/
void ih264_inter_pred_luma_horz_qpel_vert_hpel(UWORD8 *pu1_src,
                                               UWORD8 *pu1_dst,
                                               WORD32 src_strd,
                                               WORD32 dst_strd,
                                               WORD32 ht,
                                               WORD32 wd,
                                               UWORD8* pu1_tmp,
                                               WORD32 dydx)
{
    WORD32 row, col;
    WORD32 tmp;
    WORD16* pi2_pred1_temp, *pi2_pred1;
    UWORD8* pu1_dst_tmp;
    WORD32 x_offset = dydx & 0x3;
    WORD16 i2_macro;

    pi2_pred1_temp = (WORD16*)pu1_tmp;
    pi2_pred1_temp += 2;
    pi2_pred1 = pi2_pred1_temp;
    pu1_dst_tmp = pu1_dst;

    for(row = 0; row < ht; row++)
    {
        for(col = -2; col < wd + 3; col++)
        {
            tmp = 0;/*ih264_g_six_tap[] is the array containing the filter coeffs*/
            tmp = ih264_g_six_tap[0] *
                            (pu1_src[col - 2 * src_strd] + pu1_src[col + 3 * src_strd])
                  + ih264_g_six_tap[1] *
                            (pu1_src[col - 1 * src_strd] + pu1_src[col + 2 * src_strd])
                  + ih264_g_six_tap[2] *
                            (pu1_src[col] + pu1_src[col + 1 * src_strd]);
            pi2_pred1_temp[col] = tmp;
        }

        pu1_src += src_strd;
        pi2_pred1_temp = pi2_pred1_temp + wd + 5;
    }

    pi2_pred1_temp = pi2_pred1;
    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < wd; col++)
        {
            tmp = 0;/*ih264_g_six_tap[] is the array containing the filter coeffs*/
            tmp = ih264_g_six_tap[0] *
                            (pi2_pred1[col - 2] + pi2_pred1[col + 3])
                  + ih264_g_six_tap[1] *
                            (pi2_pred1[col - 1] + pi2_pred1[col + 2])
                  + ih264_g_six_tap[2] *
                            (pi2_pred1[col] + pi2_pred1[col + 1]);
            tmp = (tmp + 512) >> 10;
            pu1_dst[col] = CLIP_U8(tmp);
        }
        pi2_pred1 += (wd + 5);
        pu1_dst += dst_strd;
    }

    pu1_dst = pu1_dst_tmp;
    pi2_pred1_temp += (x_offset >> 1);
    for(row = ht; row != 0; row--)
    {
        for(col = wd; col != 0; col--, pu1_dst++, pi2_pred1_temp++)
        {
            UWORD8 uc_temp;
            /* Clipping the output of the six tap filter obtained from the
             first stage of the 2d filter stage */
            *pi2_pred1_temp = (*pi2_pred1_temp + 16) >> 5;
            i2_macro = (*pi2_pred1_temp);
            uc_temp = CLIP_U8(i2_macro);
            *pu1_dst = (*pu1_dst + uc_temp + 1) >> 1;
        }
        pi2_pred1_temp += 5;
        pu1_dst += dst_strd - wd;
    }
}

/*!
 **************************************************************************
 * \if Function name : ih264_inter_pred_luma_horz_hpel_vert_qpel \endif
 *
 * \brief
 *    This routine applies the six tap filter to the predictors in the horizontal
 *    and vertical direction to obtain the pixel at (1/2,1/2). It then interpolates
 *    pixel at (1/2,0) and (1/2,1/2) to obtain pixel at (1/2,1/4). Similarly for (1/2,3/4).
 *    The six tap filtering operation is described in sec 8.4.2.2.1 titled
 *    "Luma sample interpolation process"
 *
 * \param pu1_src: Pointer to the buffer containing the predictor values.
 *     pu1_src could point to the frame buffer or the predictor buffer.
 * \param pu1_dst: Pointer to the destination buffer where the output of
 *     the six tap filter followed by interpolation is stored.
 * \param wd: Width of the rectangular pixel grid to be interpolated
 * \param ht: Height of the rectangular pixel grid to be interpolated
 * \param src_strd: Width of the buffer pointed to by puc_pred.
 * \param dst_strd: Width of the destination buffer
 * \param pu1_tmp: buffer to store temporary output after 1st 6-tap filter.
 * \param dydx: x and y reference offset for qpel calculations.
 *
 * \return
 *    void
 *
 * \note
 *    This function takes the 8 bit predictor values, applies the six tap
 *    filter in the vertical direction and outputs the result clipped to
 *    8 bit precision. The input is stored in the buffer pointed to by
 *    puc_pred while the output is stored in the buffer pointed by puc_dest.
 *    Both puc_pred and puc_dest could point to the same buffer i.e. the
 *    six tap filter could be done in place.
 *
 * \para <title>
 *    <paragraph>
 *  ...
 **************************************************************************
 */
/**
*******************************************************************************
*
* @brief
*  Luma Horizontal hpel and vertical qpel Interprediction
*
* @par Description:
*  This routine applies the six tap filter to the predictors in the vertical
*  and horizontal direction to obtain the pixel at (1/2,1/2). It then interpolates
*  pixel at (1/2, 0) and (1/2, 1/2) to obtain pixel at (1/2, 1/4). Similarly for
*  (1/2, 3/4). The six tap filtering operation is described in sec 8.4.2.2.1
*  titled "Luma sample interpolation process"
*
* @param[in] pu1_src
*  Pointer to the buffer containing the predictor values.
*
* @param[out] pu1_dst
*  Pointer to the destination buffer where the output of the six tap filter is
*  stored
*
* @param[in] src_strd
*  source stride
*
* @param[in] dst_strd
*  destination stride
*
* @param[in] ht
*  Height of the rectangular pixel grid to be interpolated
*
* @param[in] wd
*  Width of the rectangular pixel grid to be interpolated
*
* @param[in] pu1_tmp
*  temporary buffer. UNUSED in this function
*
* @param[in] dydx
*  x and y reference offset for qpel calculations
*
* @returns
*
* @remarks
*  This function takes the 8 bit predictor values, applies the six tap
*  filter in the horizontal direction and outputs the result clipped to
*  8 bit precision. The input is stored in the buffer pointed to by
*  pu1_src while the output is stored in the buffer pointed by pu1_dst.
*  Both pu1_src and pu1_dst could point to the same buffer i.e. the
*  six tap filter could be done in place.
*
*******************************************************************************
*/
void ih264_inter_pred_luma_horz_hpel_vert_qpel(UWORD8 *pu1_src,
                                               UWORD8 *pu1_dst,
                                               WORD32 src_strd,
                                               WORD32 dst_strd,
                                               WORD32 ht,
                                               WORD32 wd,
                                               UWORD8* pu1_tmp,
                                               WORD32 dydx)
{

    WORD32 row, col;
    WORD32 tmp;
    WORD32 y_offset = dydx >> 2;
    WORD16* pi2_pred1_temp, *pi2_pred1;
    UWORD8* pu1_dst_tmp;
    //WORD32 x_offset = dydx & 0x3;
    WORD16 i2_macro;

    y_offset = y_offset & 0x3;

    pi2_pred1_temp = (WORD16*)pu1_tmp;
    pi2_pred1_temp += 2 * wd;
    pi2_pred1 = pi2_pred1_temp;
    pu1_dst_tmp = pu1_dst;
    pu1_src -= 2 * src_strd;
    for(row = -2; row < ht + 3; row++)
    {
        for(col = 0; col < wd; col++)
        {
            tmp = 0;/*ih264_g_six_tap[] is the array containing the filter coeffs*/
            tmp = ih264_g_six_tap[0] * (pu1_src[col - 2] + pu1_src[col + 3])
                  + ih264_g_six_tap[1] * (pu1_src[col - 1] + pu1_src[col + 2])
                  + ih264_g_six_tap[2] * (pu1_src[col] + pu1_src[col + 1]);
            pi2_pred1_temp[col - 2 * wd] = tmp;
        }

        pu1_src += src_strd;
        pi2_pred1_temp += wd;
    }
    pi2_pred1_temp = pi2_pred1;
    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < wd; col++)
        {
            tmp = 0;/*ih264_g_six_tap[] is the array containing the filter coeffs*/
            tmp = ih264_g_six_tap[0] * (pi2_pred1[col - 2 * wd] + pi2_pred1[col + 3 * wd])
                  + ih264_g_six_tap[1] * (pi2_pred1[col - 1 * wd] + pi2_pred1[col + 2 * wd])
                  + ih264_g_six_tap[2] * (pi2_pred1[col] + pi2_pred1[col + 1 * wd]);
            tmp = (tmp + 512) >> 10;
            pu1_dst[col] = CLIP_U8(tmp);
        }
        pi2_pred1 += wd;
        pu1_dst += dst_strd;
    }
    pu1_dst = pu1_dst_tmp;
    pi2_pred1_temp += (y_offset >> 1) * wd;
    for(row = ht; row != 0; row--)

    {
        for(col = wd; col != 0; col--, pu1_dst++, pi2_pred1_temp++)
        {
            UWORD8 u1_temp;
            /* Clipping the output of the six tap filter obtained from the
             first stage of the 2d filter stage */
            *pi2_pred1_temp = (*pi2_pred1_temp + 16) >> 5;
            i2_macro = (*pi2_pred1_temp);
            u1_temp = CLIP_U8(i2_macro);
            *pu1_dst = (*pu1_dst + u1_temp + 1) >> 1;
        }
        //pi16_pred1_temp += wd;
        pu1_dst += dst_strd - wd;
    }
}

/**
*******************************************************************************
*
* @brief This routine applies the bilinear filter to the predictors.
*
* @par Description This routine applies the bilinear filter to the predictors.
*  The  filtering operation is described in sec 8.4.2.2.1 titled
*  "Luma sample interpolation process". This function is called to obtain
*  pixels lying at the following locations (1/4,1), (3/4,1),(1,1/4), (1,3/4),
*  (1/4,1/2), (3/4,1/2), (1/2,1/4), (1/2,3/4), (3/4,1/4), (1/4,3/4), (3/4,3/4)
*  and (1/4,1/4). The function averages the two adjacent values from the two
*  input arrays in horizontal direction.
*
* @param[in] pu1_src1:
*  Pointer to the buffer containing the first input array.
*
* @param[in] pu1_src2:
*  Pointer to the buffer containing the second input array.
*
* @param[out] pu1_dst
*  pointer to the destination where the output of bilinear filter is stored.
*
* @param[in] src_strd1
*  Stride of the first input buffer
*
* @param[in] src_strd2
*  Stride of the second input buffer
*
* @param[in] dst_strd
*  destination stride of pu1_dst
*
* @param[in] ht
*  height of the array
*
* @param[in] wd
*  width of the array
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/
void ih264_inter_pred_luma_bilinear(UWORD8 *pu1_src1,
                                    UWORD8 *pu1_src2,
                                    UWORD8 *pu1_dst,
                                    WORD32 src_strd1,
                                    WORD32 src_strd2,
                                    WORD32 dst_strd,
                                    WORD32 ht,
                                    WORD32 wd)
{
    WORD32 row, col;
    WORD16 i2_tmp;

    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < wd; col++)
        {
            i2_tmp = pu1_src1[col] + pu1_src2[col];
            i2_tmp = (i2_tmp + 1) >> 1;
            pu1_dst[col] = CLIP_U8(i2_tmp);
        }
        pu1_src1 += src_strd1;
        pu1_src2 += src_strd2;
        pu1_dst += dst_strd;
    }
}

/**
*******************************************************************************
*
* @brief Interprediction chroma filter
*
* @par Description:
*  Applies filtering to chroma samples as mentioned in sec 8.4.2.2.2 titled
*  "chroma sample interpolation process"
*
* @param[in] pu1_src
*  pointer to the source containing alternate U and V samples
*
* @param[out] pu1_dst
*  pointer to the destination
*
* @param[in] src_strd
*  source stride
*
* @param[in] dst_strd
*  destination stride
*
* @param[in] dx
*  dx value where the sample is to be produced (refer sec 8.4.2.2.2 )
*
* @param[in] dy
*  dy value where the sample is to be produced (refer sec 8.4.2.2.2 )
*
* @param[in] ht
*  integer height of the array
*
* @param[in] wd
*  integer width of the array
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/
void ih264_inter_pred_chroma(UWORD8 *pu1_src,
                             UWORD8 *pu1_dst,
                             WORD32 src_strd,
                             WORD32 dst_strd,
                             WORD32 dx,
                             WORD32 dy,
                             WORD32 ht,
                             WORD32 wd)
{
    WORD32 row, col;
    WORD16 i2_tmp;

    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < 2 * wd; col++)
        {
            i2_tmp = 0; /* applies equation (8-266) in section 8.4.2.2.2 */
            i2_tmp = (8 - dx) * (8 - dy) * pu1_src[col]
                     + (dx) * (8 - dy) * pu1_src[col + 2]
                     + (8 - dx) * (dy) * (pu1_src + src_strd)[col]
                     + (dx) * (dy) * (pu1_src + src_strd)[col + 2];
            i2_tmp = (i2_tmp + 32) >> 6;
            pu1_dst[col] = CLIP_U8(i2_tmp);
        }
        pu1_src += src_strd;
        pu1_dst += dst_strd;
    }
}
