/******************************************************************************
*
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
/**
 *******************************************************************************
 * @file
 *  ihevcd_iquant_itrans_recon_ctb.c
 *
 * @brief
 *  Contains functions for inverse quantization, inverse transform and recon
 *
 * @author
 *  Ittiam
 *
 * @par List of Functions:
 * - ihevcd_iquant_itrans_recon_ctb()
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
#include <stdio.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>

#include "ihevc_typedefs.h"
#include "iv.h"
#include "ivd.h"
#include "ihevcd_cxa.h"

#include "ihevc_defs.h"
#include "ihevc_debug.h"
#include "ihevc_structs.h"
#include "ihevc_cabac_tables.h"
#include "ihevc_macros.h"
#include "ihevc_platform_macros.h"

#include "ihevcd_defs.h"
#include "ihevcd_function_selector.h"
#include "ihevcd_structs.h"
#include "ihevcd_error.h"
#include "ihevcd_bitstream.h"
#include "ihevc_common_tables.h"

/* Intra pred includes */
#include "ihevc_intra_pred.h"

/* Inverse transform common module includes */
#include "ihevc_trans_tables.h"
#include "ihevc_trans_macros.h"
#include "ihevc_itrans_recon.h"
#include "ihevc_recon.h"
#include "ihevc_chroma_itrans_recon.h"
#include "ihevc_chroma_recon.h"

/* Decoder includes */
#include "ihevcd_common_tables.h"
#include "ihevcd_iquant_itrans_recon_ctb.h"
#include "ihevcd_debug.h"
#include "ihevcd_profile.h"
#include "ihevcd_statistics.h"
#include "ihevcd_itrans_recon_dc.h"

/* Bit-order mirror of a 4-bit value: entry i is i with bit 0 <-> bit 3 and   */
/* bit 1 <-> bit 2 exchanged (for example 0x1 -> 0x8 and 0x3 -> 0xC)         */
static const UWORD32 gau4_ihevcd_4_bit_reverse[] =
{
    0x0, 0x8, 0x4, 0xC, /* inputs 0x0 .. 0x3 */
    0x2, 0xA, 0x6, 0xE, /* inputs 0x4 .. 0x7 */
    0x1, 0x9, 0x5, 0xD, /* inputs 0x8 .. 0xB */
    0x3, 0xB, 0x7, 0xF  /* inputs 0xC .. 0xF */
};


/* Globals */
/* Intra prediction function selector, indexed by intra prediction mode       */
/* number. Modes that share one selector are grouped on consecutive rows.     */
static const WORD32 g_i4_ip_funcs[MAX_NUM_IP_MODES] =
{
    /* Mode 0 */
    IP_FUNC_MODE_0,
    /* Mode 1 */
    IP_FUNC_MODE_1,
    /* Mode 2 */
    IP_FUNC_MODE_2,
    /* Modes 3 to 9 (7 entries) */
    IP_FUNC_MODE_3TO9, IP_FUNC_MODE_3TO9, IP_FUNC_MODE_3TO9, IP_FUNC_MODE_3TO9,
    IP_FUNC_MODE_3TO9, IP_FUNC_MODE_3TO9, IP_FUNC_MODE_3TO9,
    /* Mode 10 */
    IP_FUNC_MODE_10,
    /* Modes 11 to 17 (7 entries) */
    IP_FUNC_MODE_11TO17, IP_FUNC_MODE_11TO17, IP_FUNC_MODE_11TO17, IP_FUNC_MODE_11TO17,
    IP_FUNC_MODE_11TO17, IP_FUNC_MODE_11TO17, IP_FUNC_MODE_11TO17,
    /* Mode 18 (shares its selector with mode 34) */
    IP_FUNC_MODE_18_34,
    /* Modes 19 to 25 (7 entries) */
    IP_FUNC_MODE_19TO25, IP_FUNC_MODE_19TO25, IP_FUNC_MODE_19TO25, IP_FUNC_MODE_19TO25,
    IP_FUNC_MODE_19TO25, IP_FUNC_MODE_19TO25, IP_FUNC_MODE_19TO25,
    /* Mode 26 */
    IP_FUNC_MODE_26,
    /* Modes 27 to 33 (7 entries) */
    IP_FUNC_MODE_27TO33, IP_FUNC_MODE_27TO33, IP_FUNC_MODE_27TO33, IP_FUNC_MODE_27TO33,
    IP_FUNC_MODE_27TO33, IP_FUNC_MODE_27TO33, IP_FUNC_MODE_27TO33,
    /* Mode 34 (shares its selector with mode 18) */
    IP_FUNC_MODE_18_34,
};


/* Start addresses of the transform coefficient tables. Going by the table    */
/* names the order is: 4x4 DST, then the 4x4, 8x8, 16x16 and 32x32 tables.    */
const WORD16 *g_ai2_ihevc_trans_tables[] =
{
    g_ai2_ihevc_trans_dst_4[0], /* slot 0 */
    g_ai2_ihevc_trans_4[0],     /* slot 1 */
    g_ai2_ihevc_trans_8[0],     /* slot 2 */
    g_ai2_ihevc_trans_16[0],    /* slot 3 */
    g_ai2_ihevc_trans_32[0]     /* slot 4 */
};


/*****************************************************************************/
/* Function Prototypes                                                       */
/*****************************************************************************/
/**
 *******************************************************************************
 *
 * @brief
 *  Expands the packed coefficient data of one TU into a 2D coefficient block,
 *  applying inverse quantisation unless transquant bypass is set
 *
 * @par Description:
 *  The packed data starts with two header bytes: the number of coded 4x4
 *  sub-blocks, then a byte holding the transform-skip flag in bit 0 and the
 *  scan type in the remaining bits. These are followed by one
 *  tu_sblk_coeff_data_t record per coded sub-block. Each record holds the
 *  sub-block position (x in the low byte, y in the high byte, in units of
 *  MIN_TU_SIZE), a significant-coefficient map and the levels. Levels are
 *  stored from the highest set bit of the map (highest scan position)
 *  downwards. Every coefficient is written at its raster position in
 *  pi2_tu_coeff, which uses a row stride of trans_size.
 *
 *  When the TU is not DST_4x4 and contains exactly one sub-block at position
 *  0 whose map is 1 (only the DC coefficient), a short path is taken. In that
 *  path only pi2_tu_coeff[0] is written (plus zeroing of column 0 for
 *  transquant bypass, or the whole block for transform skip) and
 *  *pu4_coeff_type is set to 1.
 *
 * @param[out] pi2_tu_coeff
 *  Destination block of trans_size x trans_size WORD16 coefficients
 *
 * @param[in] log2_trans_size
 *  log2 of the transform size
 *
 * @param[in] pu1_tu_coeff_data
 *  Packed coefficient data for this TU (layout described above)
 *
 * @param[in] pi2_dequant_matrix
 *  Scaling matrix, indexed as x + y * trans_size
 *
 * @param[in] qp_rem
 *  Index into g_ihevc_iquant_scales
 *
 * @param[in] qp_div
 *  Forwarded to the IQUANT / IQUANT_4x4 macros together with the shift
 *
 * @param[in] e_trans_type
 *  Transform type; DST_4x4 disables the DC-only short path
 *
 * @param[in] trans_quant_bypass
 *  When non-zero the levels are copied without inverse quantisation
 *
 * @param[out] pu4_zero_cols
 *  Bit n is left set when column n received no coefficient
 *
 * @param[out] pu4_zero_rows
 *  Bit n is left set when row n received no coefficient
 *
 * @param[out] pu4_coeff_type
 *  Set to 1 for the DC-only short path, 0 otherwise
 *
 * @param[out] pi2_coeff_value
 *  Receives the DC value; written only in the DC-only short path
 *
 * @returns
 *  Pointer to the first byte after the data consumed for this TU
 *
 * @remarks
 *  Assumes every coded sub-block record has a non-zero significant
 *  coefficient map - TODO confirm against the parser that produces the data
 *
 *******************************************************************************
 */
UWORD8* ihevcd_unpack_coeffs(WORD16 *pi2_tu_coeff,
                             WORD32 log2_trans_size,
                             UWORD8 *pu1_tu_coeff_data,
                             WORD16 *pi2_dequant_matrix,
                             WORD32 qp_rem,
                             WORD32 qp_div,
                             TRANSFORM_TYPE e_trans_type,
                             WORD32 trans_quant_bypass,
                             UWORD32 *pu4_zero_cols,
                             UWORD32 *pu4_zero_rows,
                             UWORD32 *pu4_coeff_type,
                             WORD16 *pi2_coeff_value)
{
    /* Generating coeffs from coeff-map */
    WORD32 i;
    WORD16 *pi2_sblk_ptr;
    WORD32 subblk_pos_x, subblk_pos_y;
    WORD32 sblk_scan_idx, coeff_raster_idx;
    WORD32 sblk_non_zero_coeff_idx;
    tu_sblk_coeff_data_t *ps_tu_sblk_coeff_data;
    UWORD8 u1_num_coded_sblks, u1_scan_type;
    WORD32 trans_size;
    WORD32 xs, ys;
    WORD32 trans_skip;
    WORD16 iquant_out;
    WORD32 shift_iq;
    {
        WORD32 bit_depth;

        /* Bit depth is fixed at 8 here */
        bit_depth = 8 + 0;
        shift_iq = bit_depth + log2_trans_size - 5;
    }
    trans_size = (1 << log2_trans_size);

    /* First byte holds the number of coded sub-blocks */
    u1_num_coded_sblks = *pu1_tu_coeff_data++;

    /* Next byte holds the scan type, with trans_skip in bit 0 */
    u1_scan_type = *pu1_tu_coeff_data++;
    trans_skip = u1_scan_type & 1;
    u1_scan_type >>= 1;

    pi2_sblk_ptr = pi2_tu_coeff;

    /* Initially all columns are assumed to be zero */
    *pu4_zero_cols = 0xFFFFFFFF;
    /* Initially all rows are assumed to be zero */
    *pu4_zero_rows = 0xFFFFFFFF;

    ps_tu_sblk_coeff_data = (tu_sblk_coeff_data_t *)(pu1_tu_coeff_data);

    /* Transform skip always starts from a fully cleared block */
    if(trans_skip)
        memset(pi2_tu_coeff, 0, trans_size * trans_size * sizeof(WORD16));

    STATS_INIT_SBLK_AND_COEFF_POS();

    /* DC only case */
    if((e_trans_type != DST_4x4) && (1 == u1_num_coded_sblks)
                    && (0 == ps_tu_sblk_coeff_data->u2_subblk_pos)
                    && (1 == ps_tu_sblk_coeff_data->u2_sig_coeff_map))
    {
        *pu4_coeff_type = 1;

        if(!trans_quant_bypass)
        {
            if(4 == trans_size)
            {
                IQUANT_4x4(iquant_out,
                           ps_tu_sblk_coeff_data->ai2_level[0],
                           pi2_dequant_matrix[0]
                                           * g_ihevc_iquant_scales[qp_rem],
                           shift_iq, qp_div);
            }
            else
            {
                IQUANT(iquant_out, ps_tu_sblk_coeff_data->ai2_level[0],
                       pi2_dequant_matrix[0] * g_ihevc_iquant_scales[qp_rem],
                       shift_iq, qp_div);
            }
            if(trans_skip)
                iquant_out = (iquant_out + 16) >> 5;
        }
        else
        {
            /* Only the first column is cleared; the remaining columns stay */
            /* flagged as zero in *pu4_zero_cols                            */
            for(i = 0; i < trans_size; i++)
                *(pi2_tu_coeff + i * trans_size) = 0;

            iquant_out = ps_tu_sblk_coeff_data->ai2_level[0];
        }
        *pi2_coeff_value = iquant_out;
        *pi2_tu_coeff = iquant_out;
        *pu4_zero_cols &= ~0x1U;
        *pu4_zero_rows &= ~0x1U;
        /* Exactly one level was consumed */
        ps_tu_sblk_coeff_data =
                        (void *)&ps_tu_sblk_coeff_data->ai2_level[1];

        STATS_UPDATE_COEFF_COUNT();
        STATS_LAST_SBLK_POS_UPDATE(e_trans_type, (trans_skip || trans_quant_bypass),  0, 0);
        STATS_UPDATE_SBLK_AND_COEFF_HISTOGRAM(e_trans_type, (trans_quant_bypass || trans_skip));
        return ((UWORD8 *)ps_tu_sblk_coeff_data);
    }
    else
    {
        *pu4_coeff_type = 0;
        /* In case of trans skip, memset has already happened */
        if(!trans_skip)
            memset(pi2_tu_coeff, 0, trans_size * trans_size * sizeof(WORD16));
    }

    for(i = 0; i < u1_num_coded_sblks; i++)
    {
        UWORD32 u4_sig_coeff_map;
        subblk_pos_x = ps_tu_sblk_coeff_data->u2_subblk_pos & 0x00FF;
        subblk_pos_y = (ps_tu_sblk_coeff_data->u2_subblk_pos & 0xFF00) >> 8;

        STATS_LAST_SBLK_POS_UPDATE(e_trans_type, (trans_skip || trans_quant_bypass), subblk_pos_x, subblk_pos_y);

        /* Convert the sub-block position to coefficient units */
        subblk_pos_x = subblk_pos_x * MIN_TU_SIZE;
        subblk_pos_y = subblk_pos_y * MIN_TU_SIZE;

        pi2_sblk_ptr = pi2_tu_coeff + subblk_pos_y * trans_size
                        + subblk_pos_x;

        sblk_non_zero_coeff_idx = 0;
        u4_sig_coeff_map = ps_tu_sblk_coeff_data->u2_sig_coeff_map;

        /* Walk the set bits of the map from the most significant one down. */
        /* sblk_scan_idx tracks the scan position of the bit currently at   */
        /* the MSB of u4_sig_coeff_map.                                     */
        sblk_scan_idx = 31;
        do
        {
            WORD32 clz = CLZ(u4_sig_coeff_map);

            sblk_scan_idx -= clz;
            /* The map is shifted in two stages (clz here, 1 at the end of  */
            /* the iteration): a single shift by clz + 1 could reach 32,    */
            /* which is not a valid shift count for a 32-bit value          */
            u4_sig_coeff_map = u4_sig_coeff_map << clz;
            /* Copying coeffs and storing in reverse order */
            {
                STATS_UPDATE_COEFF_COUNT();
                coeff_raster_idx =
                                gau1_ihevc_invscan4x4[u1_scan_type][sblk_scan_idx];

                /* Position of the coefficient inside the 4x4 sub-block */
                xs = coeff_raster_idx & 0x3;
                ys = coeff_raster_idx >> 2;

                if(!trans_quant_bypass)
                {
                    if(4 == trans_size)
                    {
                        IQUANT_4x4(iquant_out,
                                   ps_tu_sblk_coeff_data->ai2_level[sblk_non_zero_coeff_idx],
                                   pi2_dequant_matrix[(subblk_pos_x + xs)
                                                   + (subblk_pos_y + ys)
                                                   * trans_size]
                                   * g_ihevc_iquant_scales[qp_rem],
                                   shift_iq, qp_div);
                        sblk_non_zero_coeff_idx++;
                    }
                    else
                    {
                        IQUANT(iquant_out,
                               ps_tu_sblk_coeff_data->ai2_level[sblk_non_zero_coeff_idx],
                               pi2_dequant_matrix[(subblk_pos_x + xs)
                                               + (subblk_pos_y + ys)
                                               * trans_size]
                               * g_ihevc_iquant_scales[qp_rem],
                               shift_iq, qp_div);
                        sblk_non_zero_coeff_idx++;
                    }

                    if(trans_skip)
                        iquant_out = (iquant_out + 16) >> 5;
                }
                else
                {
                    iquant_out = ps_tu_sblk_coeff_data->ai2_level[sblk_non_zero_coeff_idx++];
                }
                /* The shift count reaches 31 for a 32x32 TU, so the        */
                /* constant has to be unsigned: shifting a signed 1 into    */
                /* the sign bit is undefined behaviour                      */
                *pu4_zero_cols &= ~(0x1U << (subblk_pos_x + xs));
                *pu4_zero_rows &= ~(0x1U << (subblk_pos_y + ys));
                *(pi2_sblk_ptr + xs + ys * trans_size) = iquant_out;
            }
            sblk_scan_idx--;
            u4_sig_coeff_map <<= 1;

        }while(u4_sig_coeff_map);
        /* The next record starts right after the levels consumed here */
        ps_tu_sblk_coeff_data =
                        (void *)&ps_tu_sblk_coeff_data->ai2_level[sblk_non_zero_coeff_idx];
    }

    STATS_UPDATE_SBLK_AND_COEFF_HISTOGRAM(e_trans_type, (trans_quant_bypass || trans_skip));

    return ((UWORD8 *)ps_tu_sblk_coeff_data);
}

/**
 *******************************************************************************
 *
 * @brief
 *  Builds the neighbour availability flags used for intra prediction of a TU
 *
 * @par Description:
 *  The per-CTB availability map is consulted for the bottom-left, left, top,
 *  top-right and top-left neighbours of the TU. Without constrained intra
 *  prediction an available neighbour contributes all of its flag bits. With
 *  constrained intra prediction the bits are instead read from the picture
 *  level intra flag map, which stores one bit per 8x8 block and one byte per
 *  64 luma columns. Top-right and bottom-left are finally clipped against
 *  the picture boundary.
 *
 * @param[in] ps_proc
 *  Process context; supplies SPS, CTB position and the picture intra flag map
 *
 * @param[in] ps_tu
 *  TU whose b4_pos_x / b4_pos_y / b3_size fields are read
 *
 * @param[in] pu4_intra_nbr_avail
 *  Availability map of the CTB; row 0 is the row above the CTB and the MSB
 *  of each word is the column left of the CTB
 *
 * @param[in] i2_pic_width_in_luma_samples
 *  Picture width, used for the row stride of the intra flag map
 *
 * @param[in] i1_constrained_intra_pred_flag
 *  Non-zero selects the constrained intra prediction path
 *
 * @param[in] trans_size
 *  Size of the block being predicted
 *
 * @param[in] ctb_size
 *  CTB size in luma samples
 *
 * @returns
 *  Flags packed as: bit 16 top-left, bits 15..12 top-right, bits 11..8 top,
 *  bits 7..4 left (bit order mirrored), bits 3..0 bottom-left (bit order
 *  mirrored)
 *
 *******************************************************************************
 */
WORD32 ihevcd_get_intra_nbr_flag(process_ctxt_t *ps_proc,
                                 tu_t *ps_tu,
                                 UWORD32 *pu4_intra_nbr_avail,
                                 WORD16 i2_pic_width_in_luma_samples,
                                 UWORD8 i1_constrained_intra_pred_flag,
                                 WORD32 trans_size,
                                 WORD32 ctb_size)
{
    sps_t *ps_sps = ps_proc->ps_sps;
    UWORD8 *pu1_intra_map = ps_proc->pu1_pic_intra_flag;
    /* One map byte covers 64 luma columns (eight 8x8 blocks) */
    WORD32 map_strd = (i2_pic_width_in_luma_samples + 63) / 64;

    /* TU position inside the CTB and its extent, in MIN_TU_SIZE units */
    WORD32 tu_col = ps_tu->b4_pos_x;
    WORD32 tu_row = ps_tu->b4_pos_y;
    WORD32 tu_span = trans_size / MIN_TU_SIZE;

    /* Map rows: index 0 is the row above the CTB, so TU row r lives at r + 1 */
    UWORD32 u4_row_above = pu4_intra_nbr_avail[tu_row];
    UWORD32 u4_row_cur = pu4_intra_nbr_avail[tu_row + 1];
    UWORD32 u4_row_below = pu4_intra_nbr_avail[tu_row + 1 + tu_span];

    /* Bit 31 is the column left of the CTB, so TU column c sits at bit 30 - c */
    UWORD8 u1_bot_left_ok = (u4_row_below >> (31 - tu_col)) & 1;
    UWORD8 u1_left_ok = (u4_row_cur >> (31 - tu_col)) & 1;
    UWORD8 u1_top_ok = (u4_row_above >> (30 - tu_col)) & 1;
    UWORD8 u1_top_right_ok = (u4_row_above >> (30 - tu_col - tu_span)) & 1;
    UWORD8 u1_top_left_ok = (u4_row_above >> (31 - tu_col)) & 1;

    /* Top-left luma sample of the TU in picture coordinates */
    WORD32 x_tu = ps_proc->i4_ctb_x * ctb_size + tu_col * MIN_TU_SIZE;
    WORD32 y_tu = ps_proc->i4_ctb_y * ctb_size + tu_row * MIN_TU_SIZE;

    /* Number of 8x8 blocks along one TU edge and the matching bit mask */
    WORD32 num_8_blks = (trans_size > 4) ? (trans_size / 8) : 1;
    WORD32 blk_mask = (1 << num_8_blks) - 1;

    UWORD8 top_left = 0;
    UWORD8 top_right = 0;
    UWORD8 top = 0;
    UWORD8 left = 0;
    UWORD8 bot_left = 0;
    WORD32 blk;

    if(!i1_constrained_intra_pred_flag)
    {
        /* Every available neighbour is usable in full */
        top = u1_top_ok ? 0xF : 0;
        top_right = u1_top_right_ok ? 0xF : 0;
        bot_left = u1_bot_left_ok ? 0xF : 0;
        left = u1_left_ok ? 0xF : 0;
        top_left = u1_top_left_ok ? 0x1 : 0;
    }
    else
    {
        /* TODO: constrained intra pred not tested */
        if(u1_bot_left_ok)
        {
            WORD32 x_nbr = x_tu - 1;
            WORD32 y_nbr = y_tu + trans_size;
            UWORD8 *pu1_flag = pu1_intra_map + y_nbr / 8 * map_strd
                            + x_nbr / 64;
            WORD32 bit_pos = (x_nbr / 8) % 8;

            /* One bit per 8x8 block going down the column left of the TU */
            for(blk = 0; blk < num_8_blks; blk++)
            {
                bot_left |= ((pu1_flag[blk * map_strd] >> bit_pos) & 1) << blk;
            }
            bot_left &= blk_mask;
        }
        if(u1_left_ok)
        {
            WORD32 x_nbr = x_tu - 1;
            WORD32 y_nbr = y_tu;
            UWORD8 *pu1_flag = pu1_intra_map + y_nbr / 8 * map_strd
                            + x_nbr / 64;
            WORD32 bit_pos = (x_nbr / 8) % 8;

            for(blk = 0; blk < num_8_blks; blk++)
            {
                left |= ((pu1_flag[blk * map_strd] >> bit_pos) & 1) << blk;
            }
            left &= blk_mask;
        }
        if(u1_top_ok)
        {
            WORD32 x_nbr = x_tu;
            WORD32 y_nbr = y_tu - 1;
            UWORD8 *pu1_flag = pu1_intra_map + y_nbr / 8 * map_strd
                            + x_nbr / 64;
            WORD32 bit_pos = (x_nbr / 8) % 8;

            /* All bits of the row above come from a single map byte */
            top = (*pu1_flag >> bit_pos) & blk_mask;
        }
        if(u1_top_right_ok)
        {
            WORD32 x_nbr = x_tu + trans_size;
            WORD32 y_nbr = y_tu - 1;
            UWORD8 *pu1_flag = pu1_intra_map + y_nbr / 8 * map_strd
                            + x_nbr / 64;
            WORD32 bit_pos = (x_nbr / 8) % 8;

            top_right = (*pu1_flag >> bit_pos) & blk_mask;
        }
        if(u1_top_left_ok)
        {
            WORD32 x_nbr = x_tu - 1;
            WORD32 y_nbr = y_tu - 1;
            UWORD8 *pu1_flag = pu1_intra_map + y_nbr / 8 * map_strd
                            + x_nbr / 64;
            WORD32 bit_pos = (x_nbr / 8) % 8;

            top_left = (*pu1_flag >> bit_pos) & 1;
        }
    }

    /* Handling incomplete CTBs: drop top-right / bottom-left bits that would */
    /* fall outside the picture                                               */
    {
        WORD32 pu_size_limit = MIN(trans_size, 8);
        WORD32 tu_luma_size = 1 << (ps_tu->b3_size + 2);
        WORD32 ctb_x_pix = ps_proc->i4_ctb_x << ps_sps->i1_log2_ctb_size;
        WORD32 ctb_y_pix = ps_proc->i4_ctb_y << ps_sps->i1_log2_ctb_size;

        /* Luma columns left in the picture to the right of this TU */
        WORD32 cols_remaining = ps_sps->i2_pic_width_in_luma_samples
                        - ctb_x_pix
                        - (ps_tu->b4_pos_x * MIN_TU_SIZE)
                        - tu_luma_size;
        /* Luma rows left in the picture below this TU */
        WORD32 rows_remaining = ps_sps->i2_pic_height_in_luma_samples
                        - ctb_y_pix
                        - (ps_tu->b4_pos_y * MIN_TU_SIZE)
                        - tu_luma_size;

        WORD32 ctb_size_top = MIN(ctb_size, cols_remaining);
        WORD32 ctb_size_bot = MIN(ctb_size, rows_remaining);

        top_right &= (1 << (ctb_size_top / pu_size_limit)) - 1;
        bot_left &= (1 << (ctb_size_bot / pu_size_limit)) - 1;
    }

    /*    Top-Left | Top-Right | Top | Left | Bottom-Left
     *      1           4         4     4         4
     * Left and bottom-left are stored with their bit order mirrored
     */
    return (top_left << 16)
                    | (top_right << 12)
                    | (top << 8)
                    | (gau4_ihevcd_4_bit_reverse[left] << 4)
                    | gau4_ihevcd_4_bit_reverse[bot_left];
}

WORD32 ihevcd_iquant_itrans_recon_ctb(process_ctxt_t *ps_proc)
{
    WORD16 *pi2_scaling_mat;
    UWORD8 *pu1_y_dst_ctb;
    UWORD8 *pu1_uv_dst_ctb;
    WORD32 ctb_size;
    codec_t *ps_codec;
    slice_header_t *ps_slice_hdr;
    tu_t *ps_tu;
    WORD16 *pi2_ctb_coeff;
    WORD32 tu_cnt;
    WORD16 *pi2_tu_coeff;
    WORD16 *pi2_tmp;
    WORD32 pic_strd;
    WORD32 luma_nbr_flags;
    WORD32 luma_nbr_flags_4x4[4] = { 0 };
    WORD32 chroma_nbr_flags = 0;
    UWORD8 u1_luma_pred_mode_first_tu = 0;
    /* Pointers for generating 2d coeffs from coeff-map */
    UWORD8 *pu1_tu_coeff_data;
    /* nbr avail map for CTB */
    /* 1st bit points to neighbor (left/top_left/bot_left) */
    /* 1Tb starts at 2nd bit from msb of 2nd value in array, followed by number of min_tu's in that ctb */
    UWORD32 au4_intra_nbr_avail[MAX_CTB_SIZE / MIN_TU_SIZE
                    + 2 /* Top nbr + bot nbr */]; UWORD32
                    top_avail_bits;
    sps_t *ps_sps;
    pps_t *ps_pps;
    WORD32 intra_flag;
    UWORD8 *pu1_pic_intra_flag;
    /*************************************************************************/
    /* Contains scaling matrix offset in the following order in a 1D buffer  */
    /* Entries that are listed as UNUSED are invalid combinations where      */
    /* scaling matrix is not used. eg: 64x64 SKIP CU, 64x64 PCM CU           */
    /* Intra 4 x 4 Y, 4 x 4 U, 4 x 4 V                                       */
    /* Inter 4 x 4 Y, 4 x 4 U, 4 x 4 V                                       */
    /* Intra 8 x 8 Y, 8 x 8 U, 8 x 8 V                                       */
    /* Inter 8 x 8 Y, 8 x 8 U, 8 x 8 V                                       */
    /* Intra 16x16 Y, 16x16 U, 16x16 V                                       */
    /* Inter 16x16 Y, 16x16 U, 16x16 V                                       */
    /* Intra 32x32 Y, UNUSED,  UNUSED                                        */
    /* Inter 32x32 Y, UNUSED,  UNUSED                                        */
    /* UNUSED,        UNUSED,  UNUSED                                        */
    /* UNUSED,        UNUSED,  UNUSED                                        */
    /*************************************************************************/
    static const WORD32 scaling_mat_offset[] =
      { 0, 16, 32, 48, 64, 80, 96, 160, 224, 288, 352, 416, 480, 736, 992,
        1248, 1504, 1760, 2016, 0, 0, 3040, 0, 0, 0, 0, 0, 0, 0, 0};

    PROFILE_DISABLE_IQ_IT_RECON_INTRA_PRED();

    ps_sps = ps_proc->ps_sps;
    ps_pps = ps_proc->ps_pps;
    ps_slice_hdr = ps_proc->ps_slice_hdr;
    ps_codec = ps_proc->ps_codec;

    pu1_y_dst_ctb = ps_proc->pu1_cur_ctb_luma;
    pu1_uv_dst_ctb = ps_proc->pu1_cur_ctb_chroma;

    pi2_ctb_coeff = ps_proc->pi2_invscan_out;

    ctb_size = (1 << ps_sps->i1_log2_ctb_size);
    pu1_tu_coeff_data = (UWORD8 *)ps_proc->pv_tu_coeff_data;

    pic_strd = ps_codec->i4_strd;

    pi2_tmp = ps_proc->pi2_itrans_intrmd_buf;

    pi2_tu_coeff = pi2_ctb_coeff;

    ps_tu = ps_proc->ps_tu;

    if((1 == ps_sps->i1_scaling_list_enable_flag) && (1 == ps_pps->i1_pps_scaling_list_data_present_flag))
    {
        pi2_scaling_mat = ps_pps->pi2_scaling_mat;
    }
    else
    {
        pi2_scaling_mat = ps_sps->pi2_scaling_mat;
    }

    {
        /* Updating the initial availability map */
        WORD32 i;
        UWORD8 u1_left_ctb_avail, u1_top_lt_ctb_avail, u1_top_rt_ctb_avail,
                        u1_top_ctb_avail;

        u1_left_ctb_avail = ps_proc->u1_left_ctb_avail;
        u1_top_lt_ctb_avail = ps_proc->u1_top_lt_ctb_avail;
        u1_top_ctb_avail = ps_proc->u1_top_ctb_avail;
        u1_top_rt_ctb_avail = ps_proc->u1_top_rt_ctb_avail;

        /* Initializing the availability array */
        memset(au4_intra_nbr_avail, 0,
               (MAX_CTB_SIZE / MIN_TU_SIZE + 2) * sizeof(UWORD32));
        /* Initializing the availability array with CTB level availability flags */
        {
            WORD32 rows_remaining = ps_sps->i2_pic_height_in_luma_samples - (ps_proc->i4_ctb_y << ps_sps->i1_log2_ctb_size);
            WORD32 ctb_size_left = MIN(ctb_size, rows_remaining);
            for(i = 0; i < ctb_size_left / MIN_TU_SIZE; i++)
            {
                au4_intra_nbr_avail[i + 1] = ((UWORD32)u1_left_ctb_avail << 31);
            }
        }
        au4_intra_nbr_avail[0] |= (((UWORD32)u1_top_rt_ctb_avail << 31)
                        >> (1 + ctb_size / MIN_TU_SIZE)); /* 1+ctb_size/4 position bit pos from msb */

        au4_intra_nbr_avail[0] |= ((UWORD32)u1_top_lt_ctb_avail << 31);

        {
            WORD32 cols_remaining = ps_sps->i2_pic_width_in_luma_samples - (ps_proc->i4_ctb_x << ps_sps->i1_log2_ctb_size);
            WORD32 ctb_size_top = MIN(ctb_size, cols_remaining);
            WORD32 shift = (31 - (ctb_size / MIN_TU_SIZE));

            /* ctb_size_top gives number of valid pixels remaining in the current row */
            /* Since we need pattern of 1's starting from the MSB, an additional shift */
            /* is needed */
            shift += ((ctb_size - ctb_size_top) / MIN_TU_SIZE);

            top_avail_bits = ((1 << (ctb_size_top / MIN_TU_SIZE)) - 1)
                            << shift;
        }
        au4_intra_nbr_avail[0] |= (
                        (u1_top_ctb_avail == 1) ? top_avail_bits : 0x0);
        /* Starting from msb 2nd bit to (1+ctb_size/4) bit, set 1 if top avail,or 0 */

    }

    /* Applying Inverse transform on all the TU's in CTB */
    for(tu_cnt = 0; tu_cnt < ps_proc->i4_ctb_tu_cnt; tu_cnt++, ps_tu++)
    {
        WORD32 transform_skip_flag = 0;
        WORD32 transform_skip_flag_v = 0;
        WORD32 num_comp, c_idx, func_idx;
        WORD32 src_strd, pred_strd, dst_strd;
        WORD32 qp_div = 0, qp_rem = 0;
        WORD32 qp_div_v = 0, qp_rem_v = 0;
        UWORD32 zero_cols = 0, zero_cols_v = 0;
        UWORD32 zero_rows = 0, zero_rows_v = 0;
        UWORD32 coeff_type = 0, coeff_type_v = 0;
        WORD16 i2_coeff_value, i2_coeff_value_v;
        WORD32 trans_size = 0;
        TRANSFORM_TYPE e_trans_type;
        WORD32 log2_y_trans_size_minus_2, log2_uv_trans_size_minus_2;
        WORD32 log2_trans_size;
        WORD32 chroma_qp_idx;
        WORD16 *pi2_src = NULL, *pi2_src_v = NULL;
        UWORD8 *pu1_pred = NULL, *pu1_pred_v = NULL;
        UWORD8 *pu1_dst = NULL, *pu1_dst_v = NULL;
        WORD16 *pi2_dequant_matrix = NULL, *pi2_dequant_matrix_v = NULL;
        WORD32 tu_x, tu_y;
        WORD32 tu_y_offset, tu_uv_offset;
        WORD8 i1_chroma_pic_qp_offset, i1_chroma_slice_qp_offset;
        UWORD8 u1_cbf = 0, u1_cbf_v = 0, u1_luma_pred_mode, u1_chroma_pred_mode;
        WORD32 offset;
        WORD32 pcm_flag;
        WORD32  chroma_yuv420sp_vu = (ps_codec->e_ref_chroma_fmt == IV_YUV_420SP_VU);
        /* If 420SP_VU is chroma format, pred and dst pointer   */
        /* will be added +1 to point to U                       */
        WORD32 chroma_yuv420sp_vu_u_offset = 1 * chroma_yuv420sp_vu;
        /* If 420SP_VU is chroma format, pred and dst pointer   */
        /* will be added U offset of +1 and subtracted 2        */
        /* to point to V                                        */
        WORD32 chroma_yuv420sp_vu_v_offset = -2 * chroma_yuv420sp_vu;

        tu_x = ps_tu->b4_pos_x * 4; /* Converting minTU unit to pixel unit */
        tu_y = ps_tu->b4_pos_y * 4; /* Converting minTU unit to pixel unit */
        {
            WORD32 tu_abs_x = (ps_proc->i4_ctb_x << ps_sps->i1_log2_ctb_size) + (tu_x);
            WORD32 tu_abs_y = (ps_proc->i4_ctb_y << ps_sps->i1_log2_ctb_size) + (tu_y);

            WORD32 numbytes_row =  (ps_sps->i2_pic_width_in_luma_samples + 63) / 64;

            pu1_pic_intra_flag = ps_proc->pu1_pic_intra_flag;
            pu1_pic_intra_flag += (tu_abs_y >> 3) * numbytes_row;
            pu1_pic_intra_flag += (tu_abs_x >> 6);

            intra_flag = *pu1_pic_intra_flag;
            intra_flag &= (1 << ((tu_abs_x >> 3) % 8));
        }

        u1_luma_pred_mode = ps_tu->b6_luma_intra_mode;
        u1_chroma_pred_mode = ps_tu->b3_chroma_intra_mode_idx;

        if(u1_chroma_pred_mode != 7)
            num_comp = 2; /* Y and UV */
        else
            num_comp = 1; /* Y */


        pcm_flag = 0;

        if((intra_flag) && (u1_luma_pred_mode == INTRA_PRED_NONE))
        {
            UWORD8 *pu1_buf;
            UWORD8 *pu1_y_dst = pu1_y_dst_ctb;
            UWORD8 *pu1_uv_dst = pu1_uv_dst_ctb;
            WORD32 i, j;
            tu_sblk_coeff_data_t *ps_tu_sblk_coeff_data;
            WORD32 cb_size = 1 << (ps_tu->b3_size + 2);

            /* trans_size is used to update availability after reconstruction */
            trans_size = cb_size;

            pcm_flag = 1;

            tu_y_offset = tu_x + tu_y * pic_strd;
            pu1_y_dst += tu_x + tu_y * pic_strd;
            pu1_uv_dst += tu_x + (tu_y >> 1) * pic_strd;

            /* First byte points to number of coded blocks */
            pu1_tu_coeff_data++;

            /* Next byte points to scan type */
            pu1_tu_coeff_data++;

            ps_tu_sblk_coeff_data = (tu_sblk_coeff_data_t *)pu1_tu_coeff_data;

            pu1_buf = (UWORD8 *)&ps_tu_sblk_coeff_data->ai2_level[0];
            {

                for(i = 0; i < cb_size; i++)
                {
                    //pu1_y_dst[i * pic_strd + j] = *pu1_buf++;
                    memcpy(&pu1_y_dst[i * pic_strd], pu1_buf, cb_size);
                    pu1_buf += cb_size;
                }

                pu1_uv_dst = pu1_uv_dst + chroma_yuv420sp_vu_u_offset;

                /* U */
                for(i = 0; i < cb_size / 2; i++)
                {
                    for(j = 0; j < cb_size / 2; j++)
                    {
                        pu1_uv_dst[i * pic_strd + 2 * j] = *pu1_buf++;
                    }
                }

                pu1_uv_dst = pu1_uv_dst + 1 + chroma_yuv420sp_vu_v_offset;

                /* V */
                for(i = 0; i < cb_size / 2; i++)
                {
                    for(j = 0; j < cb_size / 2; j++)
                    {
                        pu1_uv_dst[i * pic_strd + 2 * j] = *pu1_buf++;
                    }
                }
            }

            pu1_tu_coeff_data = pu1_buf;

        }





        for(c_idx = 0; c_idx < num_comp; c_idx++)
        {
            if(0 == pcm_flag)
            {
                /* Initializing variables */
                pred_strd = pic_strd;
                dst_strd = pic_strd;

                if(c_idx == 0) /* Y */
                {
                    log2_y_trans_size_minus_2 = ps_tu->b3_size;
                    trans_size = 1 << (log2_y_trans_size_minus_2 + 2);
                    log2_trans_size = log2_y_trans_size_minus_2 + 2;

                    tu_y_offset = tu_x + tu_y * pic_strd;

                    pi2_src = pi2_tu_coeff;
                    pu1_pred = pu1_y_dst_ctb + tu_y_offset;
                    pu1_dst = pu1_y_dst_ctb + tu_y_offset;

                    /* Calculating scaling matrix offset */
                    offset = log2_y_trans_size_minus_2 * 6
                                    + (!intra_flag) * 3 + c_idx;
                    pi2_dequant_matrix = pi2_scaling_mat
                                    + scaling_mat_offset[offset];

                    src_strd = trans_size;

                    /* 4x4 transform Luma in INTRA mode is DST */
                    if(log2_y_trans_size_minus_2 == 0 && intra_flag)
                    {
                        func_idx = log2_y_trans_size_minus_2;
                        e_trans_type = DST_4x4;
                    }
                    else
                    {
                        func_idx = log2_y_trans_size_minus_2 + 1;
                        e_trans_type = (TRANSFORM_TYPE)(log2_y_trans_size_minus_2 + 1);
                    }

                    qp_div = ps_tu->b7_qp / 6;
                    qp_rem = ps_tu->b7_qp % 6;

                    u1_cbf = ps_tu->b1_y_cbf;

                    transform_skip_flag = pu1_tu_coeff_data[1] & 1;
                    /* Unpacking coeffs */
                    if(1 == u1_cbf)
                    {
                        pu1_tu_coeff_data = ihevcd_unpack_coeffs(
                                        pi2_src, log2_y_trans_size_minus_2 + 2,
                                        pu1_tu_coeff_data, pi2_dequant_matrix,
                                        qp_rem, qp_div, e_trans_type,
                                        ps_tu->b1_transquant_bypass, &zero_cols,
                                        &zero_rows, &coeff_type,
                                        &i2_coeff_value);
                    }
                }
                else /* UV interleaved */
                {
                    /* Chroma :If Transform size is 4x4, keep 4x4 else do transform on (trans_size/2 x trans_size/2) */
                    if(ps_tu->b3_size == 0)
                    {
                        /* Chroma 4x4 is present with 4th luma 4x4 block. For this case chroma postion has to be (luma pos x- 4,luma pos y- 4) */
                        log2_uv_trans_size_minus_2 = ps_tu->b3_size;
                        tu_uv_offset = (tu_x - 4) + ((tu_y - 4) / 2) * pic_strd;
                    }
                    else
                    {
                        log2_uv_trans_size_minus_2 = ps_tu->b3_size - 1;
                        tu_uv_offset = tu_x + (tu_y >> 1) * pic_strd;
                    }
                    trans_size = 1 << (log2_uv_trans_size_minus_2 + 2);
                    log2_trans_size = log2_uv_trans_size_minus_2 + 2;

                    pi2_src = pi2_tu_coeff;
                    pi2_src_v = pi2_tu_coeff + trans_size * trans_size;
                    pu1_pred = pu1_uv_dst_ctb + tu_uv_offset + chroma_yuv420sp_vu_u_offset; /* Pointing to start byte of U*/
                    pu1_pred_v = pu1_pred + 1 + chroma_yuv420sp_vu_v_offset; /* Pointing to start byte of V*/
                    pu1_dst = pu1_uv_dst_ctb + tu_uv_offset + chroma_yuv420sp_vu_u_offset; /* Pointing to start byte of U*/
                    pu1_dst_v = pu1_dst + 1 + chroma_yuv420sp_vu_v_offset; /* Pointing to start byte of V*/

                    /*TODO: Add support for choosing different tables for U and V,
                     * change this to a single array to handle flat/default/custom, intra/inter, luma/chroma and various sizes
                     */
                    /* Calculating scaling matrix offset */
                    /* ((log2_uv_trans_size_minus_2 == 3) ? 1:3) condition check is not needed, since
                     * max uv trans size is 16x16
                     */
                    offset = log2_uv_trans_size_minus_2 * 6
                                    + (!intra_flag) * 3 + c_idx;
                    pi2_dequant_matrix = pi2_scaling_mat
                                    + scaling_mat_offset[offset];
                    pi2_dequant_matrix_v = pi2_scaling_mat
                                    + scaling_mat_offset[offset + 1];

                    src_strd = trans_size;

                    func_idx = 1 + 4 + log2_uv_trans_size_minus_2; /* DST func + Y funcs + cur func index*/

                    /* Handle error cases where 64x64 TU is signalled which results in 32x32 chroma.
                     * By limiting func_idx to 7, max of 16x16 chroma is called */
                    func_idx = MIN(func_idx, 7);

                    e_trans_type = (TRANSFORM_TYPE)(log2_uv_trans_size_minus_2 + 1);
                    /* QP for U */
                    i1_chroma_pic_qp_offset = ps_pps->i1_pic_cb_qp_offset;
                    i1_chroma_slice_qp_offset = ps_slice_hdr->i1_slice_cb_qp_offset;
                    u1_cbf = ps_tu->b1_cb_cbf;

                    chroma_qp_idx = ps_tu->b7_qp + i1_chroma_pic_qp_offset
                                    + i1_chroma_slice_qp_offset;
                    chroma_qp_idx = CLIP3(chroma_qp_idx, 0, 57);
                    qp_div = gai2_ihevcd_chroma_qp[chroma_qp_idx] / 6;
                    qp_rem = gai2_ihevcd_chroma_qp[chroma_qp_idx] % 6;

                    /* QP for V */
                    i1_chroma_pic_qp_offset = ps_pps->i1_pic_cr_qp_offset;
                    i1_chroma_slice_qp_offset = ps_slice_hdr->i1_slice_cr_qp_offset;
                    u1_cbf_v = ps_tu->b1_cr_cbf;

                    chroma_qp_idx = ps_tu->b7_qp + i1_chroma_pic_qp_offset
                                    + i1_chroma_slice_qp_offset;
                    chroma_qp_idx = CLIP3(chroma_qp_idx, 0, 57);
                    qp_div_v = gai2_ihevcd_chroma_qp[chroma_qp_idx] / 6;
                    qp_rem_v = gai2_ihevcd_chroma_qp[chroma_qp_idx] % 6;

                    /* Unpacking coeffs */
                    transform_skip_flag = pu1_tu_coeff_data[1] & 1;
                    if(1 == u1_cbf)
                    {
                        pu1_tu_coeff_data = ihevcd_unpack_coeffs(
                                        pi2_src, log2_uv_trans_size_minus_2 + 2,
                                        pu1_tu_coeff_data, pi2_dequant_matrix,
                                        qp_rem, qp_div, e_trans_type,
                                        ps_tu->b1_transquant_bypass, &zero_cols,
                                        &zero_rows, &coeff_type,
                                        &i2_coeff_value);
                    }

                    transform_skip_flag_v = pu1_tu_coeff_data[1] & 1;
                    if(1 == u1_cbf_v)
                    {
                        pu1_tu_coeff_data = ihevcd_unpack_coeffs(
                                        pi2_src_v, log2_uv_trans_size_minus_2 + 2,
                                        pu1_tu_coeff_data, pi2_dequant_matrix_v,
                                        qp_rem_v, qp_div_v, e_trans_type,
                                        ps_tu->b1_transquant_bypass, &zero_cols_v,
                                        &zero_rows_v, &coeff_type_v, &i2_coeff_value_v);
                    }
                }
                /***************************************************************/
                /******************  Intra Prediction **************************/
                /***************************************************************/
                if(intra_flag) /* Intra */
                {
                    /* While (MAX_TU_SIZE * 2 * 2) + 1 is the actual size needed,
                       the au1_ref_sub_out size is kept as a multiple of 8,
                       so that SIMD functions can load 64 bits. Also, some SIMD
                       modules read a few bytes before the start of the array, so
                       allocate 16 extra bytes at the start */
                    UWORD8 au1_ref_sub_out[16 + (MAX_TU_SIZE * 2 * 2) + 8] = {0};
                    UWORD8 *pu1_ref_sub_out = &au1_ref_sub_out[16];
                    UWORD8 *pu1_top_left, *pu1_top, *pu1_left;
                    WORD32 luma_pred_func_idx, chroma_pred_func_idx;

                    /* Get the neighbour availability flags */
                    /* Done for only Y */
                    if(c_idx == 0)
                    {
                        /* Get neighbor availability for Y only */
                        luma_nbr_flags = ihevcd_get_intra_nbr_flag(ps_proc,
                                                                   ps_tu,
                                                                   au4_intra_nbr_avail,
                                                                   ps_sps->i2_pic_width_in_luma_samples,
                                                                   ps_pps->i1_constrained_intra_pred_flag,
                                                                   trans_size,
                                                                   ctb_size);

                        if(trans_size == 4)
                            luma_nbr_flags_4x4[(ps_tu->b4_pos_x % 2) + (ps_tu->b4_pos_y % 2) * 2] = luma_nbr_flags;

                        if((ps_tu->b4_pos_x % 2 == 0) && (ps_tu->b4_pos_y % 2 == 0))
                        {
                            chroma_nbr_flags = luma_nbr_flags;
                        }

                        /* Initializing nbr pointers */
                        pu1_top = pu1_pred - pic_strd;
                        pu1_left = pu1_pred - 1;
                        pu1_top_left = pu1_pred - pic_strd - 1;

                        /* call reference array substitution */
                        if(luma_nbr_flags == 0x1ffff)
                            ps_codec->s_func_selector.ihevc_intra_pred_luma_ref_subst_all_avlble_fptr(
                                            pu1_top_left,
                                            pu1_top, pu1_left, pred_strd, trans_size, luma_nbr_flags, pu1_ref_sub_out, 1);
                        else
                            ps_codec->s_func_selector.ihevc_intra_pred_luma_ref_substitution_fptr(
                                            pu1_top_left,
                                            pu1_top, pu1_left, pred_strd, trans_size, luma_nbr_flags, pu1_ref_sub_out, 1);

                        /* call reference filtering */
                        ps_codec->s_func_selector.ihevc_intra_pred_ref_filtering_fptr(
                                        pu1_ref_sub_out, trans_size,
                                        pu1_ref_sub_out,
                                        u1_luma_pred_mode, ps_sps->i1_strong_intra_smoothing_enable_flag);

                        /* use the look up to get the function idx */
                        luma_pred_func_idx = g_i4_ip_funcs[u1_luma_pred_mode];

                        /* call the intra prediction function */
                        ps_codec->apf_intra_pred_luma[luma_pred_func_idx](pu1_ref_sub_out, 1, pu1_pred, pred_strd, trans_size, u1_luma_pred_mode);
                    }
                    else
                    {
                        /* In case of yuv420sp_vu, prediction happens as usual.         */
                        /* So point the pu1_pred pointer to original prediction pointer */
                        UWORD8 *pu1_pred_orig = pu1_pred - chroma_yuv420sp_vu_u_offset;

                        /*    Top-Left | Top-Right | Top | Left | Bottom-Left
                         *      1         4         4     4         4
                         *
                         * Generating chroma_nbr_flags depending upon the transform size */
                        if(ps_tu->b3_size == 0)
                        {
                            /* Take TL,T,L flags of First luma 4x4 block */
                            chroma_nbr_flags = (luma_nbr_flags_4x4[0] & 0x10FF0);
                            /* Take TR flags of Second luma 4x4 block */
                            chroma_nbr_flags |= (luma_nbr_flags_4x4[1] & 0x0F000);
                            /* Take BL flags of Third luma 4x4 block */
                            chroma_nbr_flags |= (luma_nbr_flags_4x4[2] & 0x0000F);
                        }

                        /* Initializing nbr pointers */
                        pu1_top = pu1_pred_orig - pic_strd;
                        pu1_left = pu1_pred_orig - 2;
                        pu1_top_left = pu1_pred_orig - pic_strd - 2;

                        /* Chroma pred  mode derivation from luma pred mode */
                        {
                            tu_t *ps_tu_tmp = ps_tu;
                            while(!ps_tu_tmp->b1_first_tu_in_cu)
                            {
                                ps_tu_tmp--;
                            }
                            u1_luma_pred_mode_first_tu = ps_tu_tmp->b6_luma_intra_mode;
                        }
                        if(4 == u1_chroma_pred_mode)
                            u1_chroma_pred_mode = u1_luma_pred_mode_first_tu;
                        else
                        {
                            u1_chroma_pred_mode = gau1_intra_pred_chroma_modes[u1_chroma_pred_mode];

                            if(u1_chroma_pred_mode ==
                                                            u1_luma_pred_mode_first_tu)
                            {
                                u1_chroma_pred_mode = INTRA_ANGULAR(34);
                            }
                        }

                        /* call the chroma reference array substitution */
                        ps_codec->s_func_selector.ihevc_intra_pred_chroma_ref_substitution_fptr(
                                        pu1_top_left,
                                        pu1_top, pu1_left, pic_strd, trans_size, chroma_nbr_flags, pu1_ref_sub_out, 1);

                        /* use the look up to get the function idx */
                        chroma_pred_func_idx =
                                        g_i4_ip_funcs[u1_chroma_pred_mode];

                        /* call the intra prediction function */
                        ps_codec->apf_intra_pred_chroma[chroma_pred_func_idx](pu1_ref_sub_out, 1, pu1_pred_orig, pred_strd, trans_size, u1_chroma_pred_mode);
                    }
                }

                /* Updating number of transform types */
                STATS_UPDATE_ALL_TRANS(e_trans_type, c_idx);

                /* IQ, IT and Recon for Y if c_idx == 0, and U if c_idx !=0 */
                if(1 == u1_cbf)
                {
                    if(ps_tu->b1_transquant_bypass || transform_skip_flag)
                    {
                        /* Recon */
                        ps_codec->apf_recon[func_idx](pi2_src, pu1_pred, pu1_dst,
                                                      src_strd, pred_strd, dst_strd,
                                                      zero_cols);
                    }
                    else
                    {

                        /* Updating coded number of transform types(excluding trans skip and trans quant skip) */
                        STATS_UPDATE_CODED_TRANS(e_trans_type, c_idx, 0);

                        /* iQuant , iTrans and Recon */
                        if((0 == coeff_type))
                        {
                            ps_codec->apf_itrans_recon[func_idx](pi2_src, pi2_tmp,
                                                                 pu1_pred, pu1_dst,
                                                                 src_strd, pred_strd,
                                                                 dst_strd, zero_cols,
                                                                 zero_rows);
                        }
                        else /* DC only */
                        {
                            STATS_UPDATE_CODED_TRANS(e_trans_type, c_idx, 1);
                            ps_codec->apf_itrans_recon_dc[c_idx](pu1_pred, pu1_dst,
                                                                 pred_strd, dst_strd,
                                                                 log2_trans_size,
                                                                 i2_coeff_value);
                        }
                    }
                }
                /* IQ, IT and Recon for V */
                if(c_idx != 0)
                {
                    if(1 == u1_cbf_v)
                    {
                        if(ps_tu->b1_transquant_bypass || transform_skip_flag_v)
                        {
                            /* Recon */
                            ps_codec->apf_recon[func_idx](pi2_src_v, pu1_pred_v,
                                                          pu1_dst_v, src_strd,
                                                          pred_strd, dst_strd,
                                                          zero_cols_v);
                        }
                        else
                        {
                            /* Updating number of transform types */
                            STATS_UPDATE_CODED_TRANS(e_trans_type, c_idx, 0);

                            /* iQuant , iTrans and Recon */
                            if((0 == coeff_type_v))
                            {
                                ps_codec->apf_itrans_recon[func_idx](pi2_src_v,
                                                                     pi2_tmp,
                                                                     pu1_pred_v,
                                                                     pu1_dst_v,
                                                                     src_strd,
                                                                     pred_strd,
                                                                     dst_strd,
                                                                     zero_cols_v,
                                                                     zero_rows_v);
                            }
                            else  /* DC only */
                            {
                                STATS_UPDATE_CODED_TRANS(e_trans_type, c_idx, 1);
                                ps_codec->apf_itrans_recon_dc[c_idx](pu1_pred_v, pu1_dst_v,
                                                                     pred_strd, dst_strd,
                                                                     log2_trans_size,
                                                                     i2_coeff_value_v);
                            }
                        }
                    }
                }
            }

            /* Neighbor availability inside CTB */
            /* 1 bit per 4x4. Indicates whether that 4x4 block has been reconstructed (available) */
            /* Used for neighbor availability in intra pred */
            if(c_idx == 0)
            {
                WORD32 i;
                WORD32 trans_in_min_tu;
                UWORD32 cur_tu_in_bits;
                UWORD32 cur_tu_avail_flag;

                trans_in_min_tu = trans_size / MIN_TU_SIZE;
                cur_tu_in_bits = (1 << trans_in_min_tu) - 1;
                cur_tu_in_bits = cur_tu_in_bits << (32 - trans_in_min_tu);

                cur_tu_avail_flag = cur_tu_in_bits >> (ps_tu->b4_pos_x + 1);

                for(i = 0; i < trans_in_min_tu; i++)
                    au4_intra_nbr_avail[1 + ps_tu->b4_pos_y + i] |=
                                    cur_tu_avail_flag;
            }
        }
    }
    ps_proc->pv_tu_coeff_data = pu1_tu_coeff_data;

    return ps_proc->i4_ctb_tu_cnt;
}

