/*
* Copyright (c) 2017-2021, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
//!
//! \file     codechal_encode_hevc_g12.cpp
//! \brief    HEVC dual-pipe encoder for GEN12.
//!

#include "codechal_encode_hevc_g12.h"
#include "codechal_encode_csc_ds_g12.h"
#include "codechal_mmc_encode_hevc_g12.h"
#include "codechal_encode_wp_g12.h"
#include "codechal_kernel_header_g12.h"
#include "codechal_kernel_hme_g12.h"
#include "codechal_debug.h"
#if defined(ENABLE_KERNELS) && !defined(_FULL_OPEN_SOURCE)
#include "igcodeckrn_g12.h"
#endif
#include "codeckrnheader.h"
#include "mhw_vdbox_hcp_g12_X.h"
#include "mhw_vdbox_g12_X.h"
#include "mhw_mi_g12_X.h"
#include "mhw_render_g12_X.h"
#include "cm_queue_rt.h"
#include "codechal_debug.h"

//! \cond SKIP_DOXYGEN
// CRECOST: multiplies lambda by the m_modeBits[lcu][mode][slice] entry and the
// m_modeBitsScale[mode][slice] entry, converts the product to uint32_t and passes it to
// Map44LutValue() together with 0x8F as the second argument.
// Both macros expand to expressions that reference m_modeBits and m_modeBitsScale, so they
// can only be used where those names are in scope.
#define CRECOST(lambda, mode, lcu, slice) (Map44LutValue((uint32_t)((lambda) * (m_modeBits[(lcu)][(mode)][(slice)]) * (m_modeBitsScale[(mode)][(slice)])), 0x8F))
// RDEBITS62: multiplies the same two table entries (without lambda), converts the product
// to float and passes it to GetU62ModeBits().
#define RDEBITS62(mode, lcu, slice) (GetU62ModeBits((float)((m_modeBits[(lcu)][(mode)][(slice)]) * (m_modeBitsScale[(mode)][(slice)]))))
//! \endcond

//!
//! \brief    Choose the GPU context creation options used by this encoder.
//! \details  If context-based virtual-engine scheduling is supported by the OS interface,
//!           an enhanced creation-options object is allocated into m_gpuCtxCreatOpt;
//!           otherwise the work is delegated to CodechalEncoderState::SetGpuCtxCreatOption().
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, else fail reason
//!
MOS_STATUS CodechalEncHevcStateG12::SetGpuCtxCreatOption()
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    CODECHAL_ENCODE_FUNCTION_ENTER;

    if (MOS_VE_CTXBASEDSCHEDULING_SUPPORTED(m_osInterface))
    {
        // Context-based scheduling: the enhanced option block is required.
        m_gpuCtxCreatOpt = MOS_New(MOS_GPUCTX_CREATOPTIONS_ENHANCED);
        CODECHAL_ENCODE_CHK_NULL_RETURN(m_gpuCtxCreatOpt);
        return eStatus;
    }

    // No context-based scheduling: fall back to the common encoder behaviour.
    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodechalEncoderState::SetGpuCtxCreatOption());

    return eStatus;
}

//!
//! \brief    Add the HCP pipe mode select command to a command buffer.
//! \details  An MI_VD_CONTROL_STATE command with the initialization flag set is added
//!           first, followed by HCP_PIPE_MODE_SELECT built from SetHcpPipeModeSelectParams().
//! \param    [in] cmdBuffer
//!           Command buffer the commands are added to
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, else fail reason
//!
MOS_STATUS CodechalEncHevcStateG12::AddHcpPipeModeSelectCmd(MOS_COMMAND_BUFFER *cmdBuffer)
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    // MI_VD_CONTROL_STATE has to precede HCP_PIPE_SELECT so that the pipe is initialized.
    MHW_MI_VD_CONTROL_STATE_PARAMS vdCtrlParams;
    MOS_ZeroMemory(&vdCtrlParams, sizeof(vdCtrlParams));
    vdCtrlParams.initialization = true;

    MhwMiInterfaceG12 *miInterfaceG12 = static_cast<MhwMiInterfaceG12 *>(m_miInterface);
    CODECHAL_ENCODE_CHK_STATUS_RETURN(miInterfaceG12->AddMiVdControlStateCmd(cmdBuffer, &vdCtrlParams));

    // The parameter block is fully reset inside SetHcpPipeModeSelectParams().
    MHW_VDBOX_PIPE_MODE_SELECT_PARAMS_G12 pipeModeParams;
    SetHcpPipeModeSelectParams(pipeModeParams);
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_hcpInterface->AddHcpPipeModeSelectCmd(cmdBuffer, &pipeModeParams));

    return eStatus;
}

//!
//! \brief    Fill in the HCP pipe mode select parameters.
//! \details  The argument is down-cast to MHW_VDBOX_PIPE_MODE_SELECT_PARAMS_G12 and
//!           value-initialized before the base class fills in the common fields, so the
//!           object passed in must really be the G12 structure.
//! \param    [in,out] vdboxPipeModeSelectParams
//!           Parameter block to reset and populate
//!
void CodechalEncHevcStateG12::SetHcpPipeModeSelectParams(MHW_VDBOX_PIPE_MODE_SELECT_PARAMS &vdboxPipeModeSelectParams)
{
    MHW_VDBOX_PIPE_MODE_SELECT_PARAMS_G12 &g12Params =
        static_cast<MHW_VDBOX_PIPE_MODE_SELECT_PARAMS_G12 &>(vdboxPipeModeSelectParams);
    g12Params = {};
    CodechalEncodeHevcBase::SetHcpPipeModeSelectParams(vdboxPipeModeSelectParams);

    const bool multiPipe = (m_numPipe > 1);

    g12Params.pakPiplnStrmoutEnabled = m_pakPiplStrmOutEnable;
    g12Params.pakFrmLvlStrmoutEnable = (m_brcEnabled && multiPipe);

    if (!multiPipe)
    {
        // Single VDBOX: legacy work mode.
        g12Params.MultiEngineMode = MHW_VDBOX_HCP_MULTI_ENGINE_MODE_FE_LEGACY;
        g12Params.PipeWorkMode    = MHW_VDBOX_HCP_PIPE_WORK_MODE_LEGACY;
        return;
    }

    // Running in the multiple VDBOX mode: the engine mode depends on the pipe position.
    g12Params.PipeWorkMode = MHW_VDBOX_HCP_PIPE_WORK_MODE_CODEC_BE;
    if (IsFirstPipe())
    {
        g12Params.MultiEngineMode = MHW_VDBOX_HCP_MULTI_ENGINE_MODE_LEFT;
    }
    else if (IsLastPipe())
    {
        g12Params.MultiEngineMode = MHW_VDBOX_HCP_MULTI_ENGINE_MODE_RIGHT;
    }
    else
    {
        g12Params.MultiEngineMode = MHW_VDBOX_HCP_MULTI_ENGINE_MODE_MIDDLE;
    }
}

//!
//! \brief    Fill in the HCP picture state parameters.
//! \param    [in,out] picStateParams
//!           Picture state populated by the base class, then extended with the SSE flag
//!
void CodechalEncHevcStateG12::SetHcpPicStateParams(MHW_VDBOX_HEVC_PIC_STATE &picStateParams)
{
    CODECHAL_ENCODE_FUNCTION_ENTER;

    // Common fields are provided by the HEVC base implementation.
    CodechalEncodeHevcBase::SetHcpPicStateParams(picStateParams);

    // Propagate the per-picture SSE decision to the picture state.
    picStateParams.sseEnabledInVmeEncode = m_sseEnabled;
}

//!
//! \brief    Add the HCP surface state commands for source, recon and reference surfaces.
//! \details  When MMC is enabled and the picture is not an I picture, the reference
//!           surface parameters carry one bit per frame store id describing whether the
//!           reference is compressed (refsMmcEnable) and whether it uses render
//!           compression (refsMmcType).
//! \param    [in] cmdBuffer
//!           Command buffer the commands are added to
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, else fail reason
//!
MOS_STATUS CodechalEncHevcStateG12::AddHcpSurfaceStateCmds(MOS_COMMAND_BUFFER *cmdBuffer)
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    // m_mmcState is dereferenced below; fail cleanly instead of crashing.
    CODECHAL_ENCODE_CHK_NULL_RETURN(m_mmcState);

    MHW_VDBOX_SURFACE_PARAMS srcSurfaceParams;
    SetHcpSrcSurfaceParams(srcSurfaceParams);
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_hcpInterface->AddHcpSurfaceCmd(cmdBuffer, &srcSurfaceParams));

    MHW_VDBOX_SURFACE_PARAMS reconSurfaceParams;
    SetHcpReconSurfaceParams(reconSurfaceParams);
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_hcpInterface->AddHcpSurfaceCmd(cmdBuffer, &reconSurfaceParams));

    // Add the surface state for reference picture, GEN12 HW change
    MHW_VDBOX_SURFACE_PARAMS refSurfaceParams;
    SetHcpRefSurfaceParams(refSurfaceParams);

    if (m_mmcState->IsMmcEnabled())
    {
        refSurfaceParams.refsMmcEnable       = 0;
        refSurfaceParams.refsMmcType         = 0;
        refSurfaceParams.dwCompressionFormat = 0;

        //add for B frame support
        if (m_pictureCodingType != I_TYPE)
        {
            for (uint8_t i = 0; i < CODEC_MAX_NUM_REF_FRAME_HEVC; i++)
            {
                // Only references that are valid and used by the current picture contribute.
                if (!m_picIdx[i].bValid || !m_currUsedRefPic[i])
                {
                    continue;
                }

                uint8_t idx = m_picIdx[i].ucPicIdx;
                // NOTE(review): frameStoreId is used as a shift count below; this assumes
                // m_refIdxMapping[i] holds a small, valid id for every used reference - confirm.
                uint8_t frameStoreId = m_refIdxMapping[i];

                CODECHAL_ENCODE_CHK_NULL_RETURN(m_refList[idx]);
                PMOS_SURFACE refReconSurface = const_cast<PMOS_SURFACE>(&m_refList[idx]->sRefReconBuffer);

                MOS_MEMCOMP_STATE mmcState = MOS_MEMCOMP_DISABLED;
                CODECHAL_ENCODE_CHK_STATUS_RETURN(m_mmcState->GetSurfaceMmcState(refReconSurface, &mmcState));

                const bool renderCompressed = (mmcState == MOS_MEMCOMP_RC);
                const bool compressed       = renderCompressed || (mmcState == MOS_MEMCOMP_MC);

                if (renderCompressed)
                {
                    refSurfaceParams.refsMmcType |= (1 << frameStoreId);
                }

                if (compressed)
                {
                    refSurfaceParams.refsMmcEnable |= (1 << frameStoreId);
                    // The compression format of the last compressed reference visited is kept.
                    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_mmcState->GetSurfaceMmcFormat(refReconSurface, &refSurfaceParams.dwCompressionFormat));
                }
            }
        }
    }

    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_hcpInterface->AddHcpSurfaceCmd(cmdBuffer, &refSurfaceParams));

    return eStatus;
}

//!
//! \brief    Add the HCP picture state command to a command buffer.
//! \param    [in] cmdBuffer
//!           Command buffer the command is added to
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, else fail reason
//!
MOS_STATUS CodechalEncHevcStateG12::AddHcpPictureStateCmd(MOS_COMMAND_BUFFER *cmdBuffer)
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    // Build the G12 picture state and hand it to the HCP interface.
    MHW_VDBOX_HEVC_PIC_STATE_G12 hevcPicState;
    SetHcpPicStateParams(hevcPicState);
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_hcpInterface->AddHcpPicStateCmd(cmdBuffer, &hevcPicState));

    return eStatus;
}

//!
//! \brief    Re-describe a YUY2/Y210/Y216 surface as its variant format.
//! \details  The surface format is switched to Format_YUY2V (8 bit) or Format_Y216V
//!           (10 bit), its width/height are set to the original frame size and the U/V
//!           plane offsets are placed right after dwHeight rows of luma.
//!           A surface that already carries one of the two variant formats is treated as
//!           converted and left untouched, so the function can safely be called again on
//!           the same surface for both bit depths.
//! \param    [in,out] surface
//!           Surface description to update
//! \param    [in] is10Bit
//!           true selects Format_Y216V, false selects Format_YUY2V
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if the surface was updated or already converted,
//!           MOS_STATUS_INVALID_PARAMETER for an unsupported format or a surface that is
//!           too small
//!
MOS_STATUS CodechalEncHevcStateG12::UpdateYUY2SurfaceInfo(
    MOS_SURFACE &surface,
    bool         is10Bit)
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    CODECHAL_ENCODE_FUNCTION_ENTER;

    // Both variant formats are produced by this function; either one means the surface
    // has been updated before.
    if (surface.Format == Format_YUY2V || surface.Format == Format_Y216V)
    {
        return eStatus;
    }

    if (surface.Format != Format_YUY2 &&
        surface.Format != Format_Y210 &&
        surface.Format != Format_Y216)
    {
        eStatus = MOS_STATUS_INVALID_PARAMETER;
        return eStatus;
    }

    // The incoming surface must be at least half the frame width and twice the frame height.
    if (surface.dwWidth < m_oriFrameWidth / 2 || surface.dwHeight < m_oriFrameHeight * 2)
    {
        eStatus = MOS_STATUS_INVALID_PARAMETER;
        return eStatus;
    }

    surface.Format   = is10Bit ? Format_Y216V : Format_YUY2V;
    surface.dwWidth  = m_oriFrameWidth;
    surface.dwHeight = m_oriFrameHeight;

    surface.YPlaneOffset.iSurfaceOffset = 0;
    surface.YPlaneOffset.iXOffset       = 0;
    surface.YPlaneOffset.iYOffset       = 0;

    // U and V share the same location: directly after dwHeight rows of dwPitch bytes.
    const uint32_t chromaSurfaceOffset = surface.dwHeight * surface.dwPitch;

    surface.UPlaneOffset.iSurfaceOffset = chromaSurfaceOffset;
    surface.UPlaneOffset.iXOffset       = 0;
    surface.UPlaneOffset.iYOffset       = surface.dwHeight;

    surface.VPlaneOffset.iSurfaceOffset = chromaSurfaceOffset;
    surface.VPlaneOffset.iXOffset       = 0;
    surface.VPlaneOffset.iYOffset       = surface.dwHeight;

    return eStatus;
}

//!
//! \brief    Per-picture initialization on top of CodechalEncHevcState::InitializePicture().
//! \details  Decides whether SSE and PAK pipeline stream-out are used for this picture,
//!           disables GPU weighted prediction, then sets up tile data and the
//!           size-dependent resources.
//! \param    [in] params
//!           Encoder parameters for the current picture
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, else fail reason
//!
MOS_STATUS CodechalEncHevcStateG12::InitializePicture(const EncoderParams &params)
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    CODECHAL_ENCODE_FUNCTION_ENTER;

    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodechalEncHevcState::InitializePicture(params));

    if (m_resolutionChanged)
    {
        ResizeBufferOffset();
    }

    // only 420 format support SSE output
    // see TDR in scalability case, disable SSE for now before HW confirm the capability.
    const bool sseUsable =
        m_sseSupported &&
        (m_hevcSeqParams->chroma_format_idc == HCP_CHROMA_FORMAT_YUV420) &&
        (m_numPipe == 1);
    m_sseEnabled = sseUsable;

    // PAK pipeline stream-out is needed for SSE, or for BRC when more than one pipe is used.
    m_pakPiplStrmOutEnable = m_sseEnabled || (m_brcEnabled && m_numPipe > 1);

    // for HEVC VME, HUC based WP is not supported.
    m_hevcPicParams->bEnableGPUWeightedPrediction = false;

    CODECHAL_ENCODE_CHK_STATUS_RETURN(SetTileData(m_tileParams, params.dwBitstreamSize));
    CODECHAL_ENCODE_CHK_STATUS_RETURN(AllocateTileStatistics());
    CODECHAL_ENCODE_CHK_STATUS_RETURN(AllocateResourcesVariableSize());

    return eStatus;
}

//!
//! \brief    Set up picture-level state on top of CodechalEncHevcState::SetPictureStructs().
//! \details  Adds the extra BRC panic-mode PAK pass when applicable, records the virtual
//!           engine batch buffer index and, for 4:2:2 input with 4:2:2 output, updates the
//!           surface description of the recon surface and of every used reference.
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, else fail reason
//!
MOS_STATUS CodechalEncHevcStateG12::SetPictureStructs()
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    CODECHAL_ENCODE_FUNCTION_ENTER;

    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodechalEncHevcState::SetPictureStructs());

    // Panic mode is not supported with Min/Max QP: switch it off when Min/Max QP is on.
    if (m_minMaxQpControlEnabled)
    {
        m_enableFramePanicMode = false;
    }

    // This is an additional (the 5th) PAK pass for BRC panic mode. Enabled for the single pipe case only.
    const bool panicPassNeeded =
        m_brcEnabled &&
        m_enableFramePanicMode &&
        !m_hevcSeqParams->DisableHRDConformance &&
        (m_hevcPicParams->CodingType != I_TYPE) &&
        (m_numPipe == 1);
    if (panicPassNeeded)
    {
        m_numPasses++;
    }

    m_virtualEngineBbIndex = m_currOriginalPic.FrameIdx;

    const bool is422InAndOut =
        (m_chromaFormat == (uint8_t)HCP_CHROMA_FORMAT_YUV422) &&
        (m_outputChromaFormat == (uint8_t)HCP_CHROMA_FORMAT_YUV422);
    if (!is422InAndOut)
    {
        return eStatus;
    }

    // The status returned by UpdateYUY2SurfaceInfo() is not checked in this function.
    uint8_t reconIdx = m_hevcPicParams->CurrReconstructedPic.FrameIdx;
    UpdateYUY2SurfaceInfo(m_refList[reconIdx]->sRefBuffer, m_is10BitHevc);

    if (m_pictureCodingType == I_TYPE)
    {
        // No references to update for an I picture.
        return eStatus;
    }

    for (uint32_t refSlot = 0; refSlot < CODEC_MAX_NUM_REF_FRAME_HEVC; refSlot++)
    {
        if (m_picIdx[refSlot].bValid && m_currUsedRefPic[refSlot])
        {
            uint8_t picIdx = m_picIdx[refSlot].ucPicIdx;
            CODECHAL_ENCODE_ASSERT(picIdx < 127);

            UpdateYUY2SurfaceInfo((m_refList[picIdx]->sRefBuffer), m_is10BitHevc);
        }
    }

    return eStatus;
}

//!
//! \brief    Fill in the MHW kernel parameters for one ENC kernel.
//! \details  Thread count and id count are always set; binding table count, CURBE length
//!           (rounded up to the state heap's CURBE alignment) and block dimensions depend
//!           on the operation and kernel index.
//! \param    [in] encOperation
//!           ENC_MBENC or ENC_BRC
//! \param    [out] kernelParams
//!           Kernel parameters to populate, must not be nullptr
//! \param    [in] idx
//!           Kernel index within the operation (MBENC_*_KRNIDX or CODECHAL_HEVC_BRC_*)
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, MOS_STATUS_INVALID_PARAMETER for an unsupported
//!           operation or index, a null-pointer status if kernelParams or the render
//!           interface is nullptr
//!
MOS_STATUS CodechalEncHevcStateG12::SetKernelParams(
    EncOperation      encOperation,
    MHW_KERNEL_PARAM *kernelParams,
    uint32_t          idx)
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    // Same argument validation as SetBindingTable().
    CODECHAL_ENCODE_CHK_NULL_RETURN(kernelParams);

    auto renderInterface = m_hwInterface->GetRenderInterface();
    CODECHAL_ENCODE_CHK_NULL_RETURN(renderInterface);

    kernelParams->iThreadCount = renderInterface->GetHwCaps()->dwMaxThreads;
    kernelParams->iIdCount     = 1;

    uint32_t curbeAlignment = renderInterface->m_stateHeapInterface->pStateHeapInterface->GetCurbeAlignment();
    switch (encOperation)
    {
    case ENC_MBENC:
    {
        // Both MBENC kernels use the B-frame binding table range; they differ in CURBE
        // layout and block size.
        switch (idx)
        {
        case MBENC_LCU32_KRNIDX:
            kernelParams->iBTCount     = MBENC_B_FRAME_END - MBENC_B_FRAME_BEGIN;
            kernelParams->iCurbeLength = MOS_ALIGN_CEIL(sizeof(MBENC_LCU32_BTI), (size_t)curbeAlignment);
            kernelParams->iBlockWidth  = CODECHAL_HEVC_MAX_LCU_SIZE_G9;
            kernelParams->iBlockHeight = CODECHAL_HEVC_MAX_LCU_SIZE_G9;
            break;

        case MBENC_LCU64_KRNIDX:
            kernelParams->iBTCount     = MBENC_B_FRAME_END - MBENC_B_FRAME_BEGIN;
            kernelParams->iCurbeLength = MOS_ALIGN_CEIL(sizeof(MBENC_LCU64_BTI), (size_t)curbeAlignment);
            kernelParams->iBlockWidth  = CODECHAL_HEVC_MAX_LCU_SIZE_G10;
            kernelParams->iBlockHeight = CODECHAL_HEVC_MAX_LCU_SIZE_G10;
            break;

        default:
            CODECHAL_ENCODE_ASSERTMESSAGE("Unsupported MBENC mode requested");
            return MOS_STATUS_INVALID_PARAMETER;
        }
    }
    break;

    case ENC_BRC:
    {
        switch (idx)
        {
        case CODECHAL_HEVC_BRC_INIT:
        case CODECHAL_HEVC_BRC_RESET:
            kernelParams->iBTCount     = BRC_INIT_RESET_END - BRC_INIT_RESET_BEGIN;
            kernelParams->iCurbeLength = MOS_ALIGN_CEIL(sizeof(BRC_INITRESET_CURBE), (size_t)curbeAlignment);
            kernelParams->iBlockWidth  = CODECHAL_HEVC_FRAME_BRC_BLOCK_SIZE;
            kernelParams->iBlockHeight = CODECHAL_HEVC_FRAME_BRC_BLOCK_SIZE;
            break;

        case CODECHAL_HEVC_BRC_FRAME_UPDATE:
            kernelParams->iBTCount     = BRC_UPDATE_END - BRC_UPDATE_BEGIN;
            kernelParams->iCurbeLength = MOS_ALIGN_CEIL(sizeof(BRCUPDATE_CURBE), (size_t)curbeAlignment);
            kernelParams->iBlockWidth  = CODECHAL_HEVC_FRAME_BRC_BLOCK_SIZE;
            kernelParams->iBlockHeight = CODECHAL_HEVC_FRAME_BRC_BLOCK_SIZE;
            break;

        case CODECHAL_HEVC_BRC_LCU_UPDATE:
            // The LCU update kernel is sized with the same CURBE structure as the frame update kernel.
            kernelParams->iBTCount     = BRC_LCU_UPDATE_END - BRC_LCU_UPDATE_BEGIN;
            kernelParams->iCurbeLength = MOS_ALIGN_CEIL(sizeof(BRCUPDATE_CURBE), (size_t)curbeAlignment);
            kernelParams->iBlockWidth  = CODECHAL_HEVC_LCU_BRC_BLOCK_SIZE;
            kernelParams->iBlockHeight = CODECHAL_HEVC_LCU_BRC_BLOCK_SIZE;
            break;

        default:
            CODECHAL_ENCODE_ASSERTMESSAGE("Unsupported BRC mode requested");
            return MOS_STATUS_INVALID_PARAMETER;
        }
    }
    break;

    default:
        CODECHAL_ENCODE_ASSERTMESSAGE("Unsupported ENC mode requested");
        return MOS_STATUS_INVALID_PARAMETER;
    }

    return eStatus;
}

//!
//! \brief    Fill in the generic binding table description for one ENC kernel.
//! \details  The table is zeroed, its entry count and start offset are chosen from the
//!           operation and kernel index, and the entries are numbered 0..count-1.
//! \param    [in] encOperation
//!           ENC_MBENC or ENC_BRC
//! \param    [out] hevcEncBindingTable
//!           Binding table to populate, must not be nullptr
//! \param    [in] idx
//!           Kernel index within the operation
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, MOS_STATUS_INVALID_PARAMETER for an unsupported
//!           operation or index
//!
MOS_STATUS CodechalEncHevcStateG12::SetBindingTable(
    EncOperation                           encOperation,
    PCODECHAL_ENCODE_BINDING_TABLE_GENERIC hevcEncBindingTable,
    uint32_t                               idx)
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    CODECHAL_ENCODE_CHK_NULL_RETURN(hevcEncBindingTable);

    // The table is cleared up front, also on the error paths below.
    MOS_ZeroMemory(hevcEncBindingTable, sizeof(*hevcEncBindingTable));

    uint32_t startOffset = 0;
    uint32_t numEntries  = 0;

    if (encOperation == ENC_MBENC)
    {
        switch (idx)
        {
        case MBENC_LCU32_KRNIDX:
        case MBENC_LCU64_KRNIDX:
            startOffset = MBENC_B_FRAME_BEGIN;
            numEntries  = MBENC_B_FRAME_END - MBENC_B_FRAME_BEGIN;
            break;

        default:
            CODECHAL_ENCODE_ASSERTMESSAGE("Unsupported MBENC mode requested");
            return MOS_STATUS_INVALID_PARAMETER;
        }
    }
    else if (encOperation == ENC_BRC)
    {
        switch (idx)
        {
        // Init and reset share the same binding table range.
        case CODECHAL_HEVC_BRC_INIT:
        case CODECHAL_HEVC_BRC_RESET:
            startOffset = BRC_INIT_RESET_BEGIN;
            numEntries  = BRC_INIT_RESET_END - BRC_INIT_RESET_BEGIN;
            break;

        case CODECHAL_HEVC_BRC_FRAME_UPDATE:
            startOffset = BRC_UPDATE_BEGIN;
            numEntries  = BRC_UPDATE_END - BRC_UPDATE_BEGIN;
            break;

        case CODECHAL_HEVC_BRC_LCU_UPDATE:
            startOffset = BRC_LCU_UPDATE_BEGIN;
            numEntries  = BRC_LCU_UPDATE_END - BRC_LCU_UPDATE_BEGIN;
            break;

        default:
            CODECHAL_ENCODE_ASSERTMESSAGE("Unsupported BRC mode requested");
            return MOS_STATUS_INVALID_PARAMETER;
        }
    }
    else
    {
        CODECHAL_ENCODE_ASSERTMESSAGE("Unsupported ENC mode requested");
        return MOS_STATUS_INVALID_PARAMETER;
    }

    hevcEncBindingTable->dwNumBindingTableEntries  = numEntries;
    hevcEncBindingTable->dwBindingTableStartOffset = startOffset;

    // Entries are relative indices: 0, 1, 2, ...
    for (uint32_t entry = 0; entry < hevcEncBindingTable->dwNumBindingTableEntries; entry++)
    {
        hevcEncBindingTable->dwBindingTableEntries[entry] = entry;
    }

    return eStatus;
}

//!
//! \brief    Allocate the surfaces and buffers used by the ENC kernels.
//! \details  Every resource is allocated only if it is still null, so resources that
//!           already exist are left untouched. The combined B buffers are zero-filled
//!           right after allocation. Resources in the first section are skipped when MDF
//!           is used (m_useMdf).
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, else fail reason
//!
MOS_STATUS CodechalEncHevcStateG12::AllocateEncResources()
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    CODECHAL_ENCODE_FUNCTION_ENTER;

    // Surfaces used by I & B Kernels
    uint32_t width = 0, height = 0;
    uint32_t size = 0;

    // Memory pool selection for the recon-boundary surface depends on a workaround flag.
    MEDIA_WA_TABLE* waTable = m_osInterface->pfnGetWaTable(m_osInterface);
    uint32_t memType = (MEDIA_IS_WA(waTable, WaForceAllocateLML4)) ? MOS_MEMPOOL_DEVICEMEMORY : 0;

    if (!m_useMdf)
    {
        // Intermediate CU Record surface
        if (Mos_ResourceIsNull(&m_intermediateCuRecordSurfaceLcu32.OsResource))
        {
            width  = m_widthAlignedLcu32;
            height = m_heightAlignedLcu32 >> 1;

            CODECHAL_ENCODE_CHK_STATUS_RETURN(AllocateBuffer2D(
                &m_intermediateCuRecordSurfaceLcu32,
                width,
                height,
                "Intermediate CU record surface",
                MOS_TILE_Y));
        }

        // Scratch Surface for I-kernel
        if (Mos_ResourceIsNull(&m_scratchSurface.OsResource))
        {
            width  = m_widthAlignedLcu32 >> 3;
            height = m_heightAlignedLcu32 >> 5;

            CODECHAL_ENCODE_CHK_STATUS_RETURN(AllocateBuffer2D(
                &m_scratchSurface,
                width,
                height,
                "Scratch surface for I and B Kernels"));
        }

        // CU based QP surface
        if (Mos_ResourceIsNull(&m_16x16QpInputData.OsResource))
        {
            width  = MOS_ALIGN_CEIL(m_picWidthInMb, 64);
            height = MOS_ALIGN_CEIL(m_picHeightInMb, 64);

            CODECHAL_ENCODE_CHK_STATUS_RETURN(AllocateBuffer2D(
                &m_16x16QpInputData,
                width,
                height,
                "16x16 QP Data Input surface"));
        }

        // Surfaces used by B Kernels
        // Enc constant table for B LCU32
        if (Mos_ResourceIsNull(&m_encConstantTableForB.sResource))
        {
            size = m_encConstantDataLutSize;

            CODECHAL_ENCODE_CHK_STATUS_RETURN(AllocateBuffer(
                &m_encConstantTableForB,
                size,
                "Enc Constant Table surface For LCU32/LCU64"));
        }

        //Debug surface
        for (uint32_t i = 0; i < CODECHAL_GET_ARRAY_LENGTH(m_debugSurface); i++)
        {
            if (Mos_ResourceIsNull(&m_debugSurface[i].sResource))
            {
                size = m_debugSurfaceSize;

                CODECHAL_ENCODE_CHK_STATUS_RETURN(AllocateBuffer(
                    &m_debugSurface[i],
                    size,
                    "Kernel debug surface"));
            }
        }
    }

    // LCU Level Input Data
    for (uint32_t i = 0; i < CODECHAL_GET_ARRAY_LENGTH(m_lcuLevelInputDataSurface); i++)
    {
        if (Mos_ResourceIsNull(&m_lcuLevelInputDataSurface[i].OsResource))
        {
            width  = 16 * ((m_widthAlignedMaxLcu >> 6) << 1);
            height = ((m_heightAlignedMaxLcu >> 6) << 1);

            CODECHAL_ENCODE_CHK_STATUS_RETURN(AllocateBuffer2D(
                &m_lcuLevelInputDataSurface[i],
                width,
                height,
                "Lcu Level Data Input surface",
                MOS_TILE_LINEAR));
        }
    }

    m_brcInputForEncKernelBuffer = nullptr;

    //Current Picture Y with Reconstructed boundary pixels
    if (Mos_ResourceIsNull(&m_currPicWithReconBoundaryPix.OsResource))
    {
        width  = m_widthAlignedLcu32;
        height = m_heightAlignedLcu32;

        if (m_isMaxLcu64)
        {
            width  = m_widthAlignedMaxLcu;
            height = m_heightAlignedMaxLcu;
        }

        // The height is scaled by m_alignReconFactor before allocation.
        uint32_t aligned_height = (uint32_t) (height * m_alignReconFactor);
        CODECHAL_ENCODE_CHK_STATUS_RETURN(AllocateSurface(
            &m_currPicWithReconBoundaryPix,
            width,
            aligned_height,
            "Current Picture Y with Reconstructed Boundary Pixels surface",
            memType));
    }

    // Encoder History Input Surface
    if (Mos_ResourceIsNull(&m_encoderHistoryInputBuffer.OsResource))
    {
        width  = 32 * ((m_widthAlignedMaxLcu >> 6) << 1);
        height = ((m_heightAlignedMaxLcu >> 6) << 1);

        CODECHAL_ENCODE_CHK_STATUS_RETURN(AllocateBuffer2D(
            &m_encoderHistoryInputBuffer,
            width,
            height,
            "Encoder History Input surface"));
    }

    // Encoder History Output Surface
    if (Mos_ResourceIsNull(&m_encoderHistoryOutputBuffer.OsResource))
    {
        width  = 32 * ((m_widthAlignedMaxLcu >> 6) << 1);
        height = ((m_heightAlignedMaxLcu >> 6) << 1);

        CODECHAL_ENCODE_CHK_STATUS_RETURN(AllocateBuffer2D(
            &m_encoderHistoryOutputBuffer,
            width,
            height,
            "Encoder History Output surface"));
    }

    if (m_hmeSupported && !m_useMdf)
    {
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_hmeKernel->AllocateResources());
        // BRC Distortion surface
        if (Mos_ResourceIsNull(&m_brcBuffers.sMeBrcDistortionBuffer.OsResource))
        {
            width  = MOS_ALIGN_CEIL((m_downscaledWidthInMb4x << 3), 64);
            height = MOS_ALIGN_CEIL((m_downscaledHeightInMb4x << 2), 8) << 1;

            CODECHAL_ENCODE_CHK_STATUS_RETURN(AllocateBuffer2D(
                &m_brcBuffers.sMeBrcDistortionBuffer,
                width,
                height,
                "Brc Distortion surface Buffer"));
        }

        CODECHAL_ENCODE_CHK_STATUS_RETURN(AllocateMeResources());
    }

    // Combined buffer 1: fixed-size structure, zero-filled after allocation.
    for (uint32_t i = 0; i < CODECHAL_GET_ARRAY_LENGTH(m_encBCombinedBuffer1); i++)
    {
        if (Mos_ResourceIsNull(&m_encBCombinedBuffer1[i].sResource))
        {
            size = sizeof(MBENC_COMBINED_BUFFER1);

            CODECHAL_ENCODE_CHK_STATUS_RETURN(AllocateBuffer(
                &m_encBCombinedBuffer1[i],
                size,
                "Enc B combined buffer1"));

            MOS_LOCK_PARAMS lockFlags;
            MOS_ZeroMemory(&lockFlags, sizeof(MOS_LOCK_PARAMS));
            lockFlags.WriteOnly = 1;
            uint8_t *data       = (uint8_t *)m_osInterface->pfnLockResource(
                m_osInterface,
                &m_encBCombinedBuffer1[i].sResource,
                &lockFlags);
            CODECHAL_ENCODE_CHK_NULL_RETURN(data);

            MOS_ZeroMemory(data, size);

            m_osInterface->pfnUnlockResource(
                m_osInterface,
                &m_encBCombinedBuffer1[i].sResource);
        }
    }

    // Combined buffer 2: cacheline-aligned fixed structure, followed by the history-out
    // area and the thread-task area, both sized per 64x64 LCU.
    for (uint32_t i = 0; i < CODECHAL_GET_ARRAY_LENGTH(m_encBCombinedBuffer2); i++)
    {
        if (Mos_ResourceIsNull(&m_encBCombinedBuffer2[i].sResource))
        {
            uint32_t numLcu64 = m_widthAlignedMaxLcu * m_heightAlignedMaxLcu / 64 / 64;

            // Size of the fixed part, taken from the type (no object needs to be created).
            const uint32_t fixedBufSize =
                static_cast<uint32_t>(MOS_ALIGN_CEIL(sizeof(MBENC_COMBINED_BUFFER2), CODECHAL_CACHELINE_SIZE));

            // The size/offset members are only refreshed when a buffer is actually
            // allocated, so they keep describing the buffers that exist.
            m_historyOutBufferSize = MOS_ALIGN_CEIL(32 * numLcu64, CODECHAL_CACHELINE_SIZE);
            m_threadTaskBufferSize = MOS_ALIGN_CEIL(96 * numLcu64, CODECHAL_CACHELINE_SIZE);

            size = fixedBufSize + m_historyOutBufferSize + m_threadTaskBufferSize;

            m_historyOutBufferOffset = fixedBufSize;
            m_threadTaskBufferOffset = m_historyOutBufferOffset + m_historyOutBufferSize;

            CODECHAL_ENCODE_CHK_STATUS_RETURN(AllocateBuffer(
                &m_encBCombinedBuffer2[i],
                size,
                "Enc B combined buffer2"));

            MOS_LOCK_PARAMS lockFlags;
            MOS_ZeroMemory(&lockFlags, sizeof(MOS_LOCK_PARAMS));
            lockFlags.WriteOnly = 1;
            uint8_t *data       = (uint8_t *)m_osInterface->pfnLockResource(
                m_osInterface,
                &m_encBCombinedBuffer2[i].sResource,
                &lockFlags);
            CODECHAL_ENCODE_CHK_NULL_RETURN(data);

            MOS_ZeroMemory(data, size);

            m_osInterface->pfnUnlockResource(
                m_osInterface,
                &m_encBCombinedBuffer2[i].sResource);
        }
    }

    return eStatus;
}

//!
//! \brief    Release the kernel states, binding tables, surfaces and buffers used by ENC.
//! \details  This is a teardown path: a failure reported by DestroyMEResources() or
//!           FreeMeResources() no longer aborts the function. The remaining resources are
//!           still released and the first failure is returned at the end, so an error in
//!           one step does not leak everything that follows it.
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if every step succeeded, else the first failure status
//!
MOS_STATUS CodechalEncHevcStateG12::FreeEncResources()
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    CODECHAL_ENCODE_FUNCTION_ENTER;

    MOS_DeleteArray(m_mbEncKernelStates);
    m_mbEncKernelStates = nullptr;
    MOS_FreeMemory(m_mbEncKernelBindingTable);
    m_mbEncKernelBindingTable = nullptr;

    MOS_DeleteArray(m_brcKernelStates);
    m_brcKernelStates = nullptr;
    MOS_FreeMemory(m_brcKernelBindingTable);
    m_brcKernelBindingTable = nullptr;

    HmeParams hmeParams;
    MOS_ZeroMemory(&hmeParams, sizeof(hmeParams));
    hmeParams.presMvAndDistortionSumSurface = &m_mvAndDistortionSumSurface.sResource;
    // Remember a failure but keep releasing the remaining resources.
    MOS_STATUS destroyMeStatus = static_cast<MOS_STATUS>(DestroyMEResources(&hmeParams));
    if (destroyMeStatus != MOS_STATUS_SUCCESS)
    {
        CODECHAL_ENCODE_ASSERTMESSAGE("Failed to destroy ME resources.");
        eStatus = destroyMeStatus;
    }

    // Surfaces used by I kernel
    // Release Intermediate CU Record Surface
    m_osInterface->pfnFreeResource(
        m_osInterface,
        &m_intermediateCuRecordSurfaceLcu32.OsResource);

    // Release Scratch Surface for I-kernel
    m_osInterface->pfnFreeResource(
        m_osInterface,
        &m_scratchSurface.OsResource);

    // Release CU based QP surface
    m_osInterface->pfnFreeResource(
        m_osInterface,
        &m_16x16QpInputData.OsResource);

    // Release LCU Level Input Data
    for (uint32_t i = 0; i < CODECHAL_GET_ARRAY_LENGTH(m_lcuLevelInputDataSurface); i++)
    {
        m_osInterface->pfnFreeResource(
            m_osInterface,
            &m_lcuLevelInputDataSurface[i].OsResource);
    }

    // Release Current Picture Y with Reconstructed boundary pixels surface
    m_osInterface->pfnFreeResource(
        m_osInterface,
        &m_currPicWithReconBoundaryPix.OsResource);

    // Release Encoder History Input Data
    m_osInterface->pfnFreeResource(
        m_osInterface,
        &m_encoderHistoryInputBuffer.OsResource);

    // Release Encoder History Output Data
    m_osInterface->pfnFreeResource(
        m_osInterface,
        &m_encoderHistoryOutputBuffer.OsResource);

    // Release Debug surface
    for (uint32_t i = 0; i < CODECHAL_GET_ARRAY_LENGTH(m_debugSurface); i++)
    {
        m_osInterface->pfnFreeResource(
            m_osInterface,
            &m_debugSurface[i].sResource);
    }

    // Surfaces used by B Kernels
    // Enc constant table for B LCU32
    m_osInterface->pfnFreeResource(
        m_osInterface,
        &m_encConstantTableForB.sResource);

    // Same policy as above: record the first failure and continue.
    MOS_STATUS freeMeStatus = static_cast<MOS_STATUS>(FreeMeResources());
    if (freeMeStatus != MOS_STATUS_SUCCESS)
    {
        CODECHAL_ENCODE_ASSERTMESSAGE("Failed to free ME resources.");
        if (eStatus == MOS_STATUS_SUCCESS)
        {
            eStatus = freeMeStatus;
        }
    }

    for (uint32_t i = 0; i < CODECHAL_GET_ARRAY_LENGTH(m_encBCombinedBuffer1); i++)
    {
        m_osInterface->pfnFreeResource(
            m_osInterface,
            &m_encBCombinedBuffer1[i].sResource);
    }

    for (uint32_t i = 0; i < CODECHAL_GET_ARRAY_LENGTH(m_encBCombinedBuffer2); i++)
    {
        m_osInterface->pfnFreeResource(
            m_osInterface,
            &m_encBCombinedBuffer2[i].sResource);
    }

    if (m_swScoreboard)
    {
        MOS_FreeMemory(m_swScoreboard);
        m_swScoreboard = nullptr;
    }

    // The delay resource is only released when a delay count is configured.
    if (m_numDelay)
    {
        m_osInterface->pfnFreeResource(m_osInterface, &m_resDelayMinus);
    }

    return eStatus;
}

//!
//! \brief    Allocate the MV and distortion summation surface used with HME.
//! \details  Nothing is done when the surface already exists. A newly allocated surface
//!           is locked, zero-filled and unlocked.
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, else fail reason
//!
MOS_STATUS CodechalEncHevcStateG12::AllocateMeResources()
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    CODECHAL_ENCODE_FUNCTION_ENTER;

    // Mv and Distortion Summation Surface: already allocated, nothing to do.
    if (!Mos_ResourceIsNull(&m_mvAndDistortionSumSurface.sResource))
    {
        return eStatus;
    }

    uint32_t surfaceBytes = m_mvdistSummationSurfSize;

    CODECHAL_ENCODE_CHK_STATUS_RETURN(AllocateBuffer(
        &m_mvAndDistortionSumSurface,
        surfaceBytes,
        "Mv and Distortion Summation surface"));

    // Initialize the surface to zero for now till HME is updated to output the data into this surface
    MOS_LOCK_PARAMS lockParams;
    MOS_ZeroMemory(&lockParams, sizeof(MOS_LOCK_PARAMS));
    lockParams.WriteOnly = 1;

    uint8_t *mapped = (uint8_t *)m_osInterface->pfnLockResource(
        m_osInterface,
        &m_mvAndDistortionSumSurface.sResource,
        &lockParams);
    CODECHAL_ENCODE_CHK_NULL_RETURN(mapped);

    MOS_ZeroMemory(mapped, surfaceBytes);

    m_osInterface->pfnUnlockResource(
        m_osInterface,
        &m_mvAndDistortionSumSurface.sResource);

    return eStatus;
}

//!
//! \brief    Release the ME BRC distortion buffer.
//! \return   MOS_STATUS
//!           Always MOS_STATUS_SUCCESS
//!
MOS_STATUS CodechalEncHevcStateG12::FreeMeResources()
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    CODECHAL_ENCODE_FUNCTION_ENTER;

    // Only the BRC distortion surface is released here.
    m_osInterface->pfnFreeResource(m_osInterface, &m_brcBuffers.sMeBrcDistortionBuffer.OsResource);

    return eStatus;
}

MOS_STATUS CodechalEncHevcStateG12::AllocatePakResources()
{
    CODECHAL_ENCODE_FUNCTION_ENTER;

    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    uint32_t mvt_size        = MOS_ALIGN_CEIL(((m_frameWidth + 63) >> 6) * ((m_frameHeight + 15) >> 4), 2) * CODECHAL_CACHELINE_SIZE;
    uint32_t mvtb_size       = MOS_ALIGN_CEIL(((m_frameWidth + 31) >> 5) * ((m_frameHeight + 31) >> 5), 2) * CODECHAL_CACHELINE_SIZE;
    m_sizeOfMvTemporalBuffer = MOS_MAX(mvt_size, mvtb_size);

    const uint32_t minLcuSize        = 16;
    const uint32_t picWidthInMinLCU  = MOS_ROUNDUP_DIVIDE(m_frameWidth, minLcuSize);   //assume smallest LCU to get max width
    const uint32_t picHeightInMinLCU = MOS_ROUNDUP_DIVIDE(m_frameHeight, minLcuSize);  //assume smallest LCU to get max height

    MHW_VDBOX_HCP_BUFFER_SIZE_PARAMS hcpBufSizeParam;
    MOS_ZeroMemory(&hcpBufSizeParam, sizeof(hcpBufSizeParam));
    hcpBufSizeParam.ucMaxBitDepth  = m_bitDepth;
    hcpBufSizeParam.ucChromaFormat = m_chromaFormat;
    // We should move the buffer allocation to picture level if the size is dependent on LCU size
    hcpBufSizeParam.dwCtbLog2SizeY = 6;  //assume Max LCU size
    hcpBufSizeParam.dwPicWidth     = MOS_ALIGN_CEIL(m_frameWidth, MAX_LCU_SIZE);
    hcpBufSizeParam.dwPicHeight    = MOS_ALIGN_CEIL(m_frameHeight, MAX_LCU_SIZE);

    MOS_ALLOC_GFXRES_PARAMS allocParamsForBufferLinear;
    MOS_ZeroMemory(&allocParamsForBufferLinear, sizeof(MOS_ALLOC_GFXRES_PARAMS));
    allocParamsForBufferLinear.Type     = MOS_GFXRES_BUFFER;
    allocParamsForBufferLinear.TileType = MOS_TILE_LINEAR;
    allocParamsForBufferLinear.Format   = Format_Buffer;

    // Deblocking Filter Row Store Scratch data surface
    eStatus = (MOS_STATUS)m_hcpInterface->GetHevcBufferSize(
        MHW_VDBOX_HCP_INTERNAL_BUFFER_DBLK_LINE,
        &hcpBufSizeParam);

    if (eStatus != MOS_STATUS_SUCCESS)
    {
        CODECHAL_ENCODE_ASSERTMESSAGE("Failed to get the size for Deblocking Filter Row Store Scratch Buffer.");
        return eStatus;
    }

    allocParamsForBufferLinear.dwBytes  = hcpBufSizeParam.dwBufferSize;
    allocParamsForBufferLinear.pBufName = "DeblockingScratchBuffer";

    eStatus = (MOS_STATUS)m_osInterface->pfnAllocateResource(
        m_osInterface,
        &allocParamsForBufferLinear,
        &m_resDeblockingFilterRowStoreScratchBuffer);

    if (eStatus != MOS_STATUS_SUCCESS)
    {
        CODECHAL_ENCODE_ASSERTMESSAGE("Failed to allocate Deblocking Filter Row Store Scratch Buffer.");
        return eStatus;
    }

    // Deblocking Filter Tile Row Store Scratch data surface
    eStatus = (MOS_STATUS)m_hcpInterface->GetHevcBufferSize(
        MHW_VDBOX_HCP_INTERNAL_BUFFER_DBLK_TILE_LINE,
        &hcpBufSizeParam);

    if (eStatus != MOS_STATUS_SUCCESS)
    {
        CODECHAL_ENCODE_ASSERTMESSAGE("Failed to get the size for Deblocking Filter Tile Row Store Scratch Buffer.");
        return eStatus;
    }

    allocParamsForBufferLinear.dwBytes  = hcpBufSizeParam.dwBufferSize;
    allocParamsForBufferLinear.pBufName = "DeblockingTileRowScratchBuffer";

    eStatus = (MOS_STATUS)m_osInterface->pfnAllocateResource(
        m_osInterface,
        &allocParamsForBufferLinear,
        &m_resDeblockingFilterTileRowStoreScratchBuffer);

    if (eStatus != MOS_STATUS_SUCCESS)
    {
        CODECHAL_ENCODE_ASSERTMESSAGE("Failed to allocate Deblocking Filter Tile Row Store Scratch Buffer.");
        return eStatus;
    }

    // Deblocking Filter Column Row Store Scratch data surface
    eStatus = (MOS_STATUS)m_hcpInterface->GetHevcBufferSize(
        MHW_VDBOX_HCP_INTERNAL_BUFFER_DBLK_TILE_COL,
        &hcpBufSizeParam);

    if (eStatus != MOS_STATUS_SUCCESS)
    {
        CODECHAL_ENCODE_ASSERTMESSAGE("Failed to get the size for Deblocking Filter Tile Column Store Scratch Buffer.");
        return eStatus;
    }

    allocParamsForBufferLinear.dwBytes  = hcpBufSizeParam.dwBufferSize;
    allocParamsForBufferLinear.pBufName = "DeblockingColumnScratchBuffer";

    eStatus = (MOS_STATUS)m_osInterface->pfnAllocateResource(
        m_osInterface,
        &allocParamsForBufferLinear,
        &m_resDeblockingFilterColumnRowStoreScratchBuffer);

    if (eStatus != MOS_STATUS_SUCCESS)
    {
        CODECHAL_ENCODE_ASSERTMESSAGE("Failed to allocate Deblocking Filter Tile Column Row Store Scratch Buffer.");
        return eStatus;
    }

    // Metadata Line buffer
    eStatus = (MOS_STATUS)m_hcpInterface->GetHevcBufferSize(
        MHW_VDBOX_HCP_INTERNAL_BUFFER_META_LINE,
        &hcpBufSizeParam);

    if (eStatus != MOS_STATUS_SUCCESS)
    {
        CODECHAL_ENCODE_ASSERTMESSAGE("Failed to get the size for Metadata Line Buffer.");
        return eStatus;
    }

    allocParamsForBufferLinear.dwBytes  = hcpBufSizeParam.dwBufferSize;
    allocParamsForBufferLinear.pBufName = "MetadataLineBuffer";

    eStatus = (MOS_STATUS)m_osInterface->pfnAllocateResource(
        m_osInterface,
        &allocParamsForBufferLinear,
        &m_resMetadataLineBuffer);

    if (eStatus != MOS_STATUS_SUCCESS)
    {
        CODECHAL_ENCODE_ASSERTMESSAGE("Failed to allocate Metadata Line Buffer.");
        return eStatus;
    }

    // Metadata Tile Line buffer
    eStatus = (MOS_STATUS)m_hcpInterface->GetHevcBufferSize(
        MHW_VDBOX_HCP_INTERNAL_BUFFER_META_TILE_LINE,
        &hcpBufSizeParam);

    if (eStatus != MOS_STATUS_SUCCESS)
    {
        CODECHAL_ENCODE_ASSERTMESSAGE("Failed to get the size for Metadata Tile Line Buffer.");
        return eStatus;
    }

    allocParamsForBufferLinear.dwBytes  = hcpBufSizeParam.dwBufferSize;
    allocParamsForBufferLinear.pBufName = "MetadataTileLineBuffer";

    eStatus = (MOS_STATUS)m_osInterface->pfnAllocateResource(
        m_osInterface,
        &allocParamsForBufferLinear,
        &m_resMetadataTileLineBuffer);

    if (eStatus != MOS_STATUS_SUCCESS)
    {
        CODECHAL_ENCODE_ASSERTMESSAGE("Failed to allocate Metadata Tile Line Buffer.");
        return eStatus;
    }

    // Metadata Tile Column buffer
    eStatus = (MOS_STATUS)m_hcpInterface->GetHevcBufferSize(
        MHW_VDBOX_HCP_INTERNAL_BUFFER_META_TILE_COL,
        &hcpBufSizeParam);

    if (eStatus != MOS_STATUS_SUCCESS)
    {
        CODECHAL_ENCODE_ASSERTMESSAGE("Failed to get the size for Metadata Tile Column Buffer.");
        return eStatus;
    }

    allocParamsForBufferLinear.dwBytes  = hcpBufSizeParam.dwBufferSize;
    allocParamsForBufferLinear.pBufName = "MetadataTileColumnBuffer";

    eStatus = (MOS_STATUS)m_osInterface->pfnAllocateResource(
        m_osInterface,
        &allocParamsForBufferLinear,
        &m_resMetadataTileColumnBuffer);

    if (eStatus != MOS_STATUS_SUCCESS)
    {
        CODECHAL_ENCODE_ASSERTMESSAGE("Failed to allocate Metadata Tile Column Buffer.");
        return eStatus;
    }

    // SAO Line buffer
    eStatus = (MOS_STATUS)m_hcpInterface->GetHevcBufferSize(
        MHW_VDBOX_HCP_INTERNAL_BUFFER_SAO_LINE,
        &hcpBufSizeParam);

    if (eStatus != MOS_STATUS_SUCCESS)
    {
        CODECHAL_ENCODE_ASSERTMESSAGE("Failed to get the size for SAO Line Buffer.");
        return eStatus;
    }

    allocParamsForBufferLinear.dwBytes  = hcpBufSizeParam.dwBufferSize;
    allocParamsForBufferLinear.pBufName = "SaoLineBuffer";

    eStatus = (MOS_STATUS)m_osInterface->pfnAllocateResource(
        m_osInterface,
        &allocParamsForBufferLinear,
        &m_resSaoLineBuffer);

    if (eStatus != MOS_STATUS_SUCCESS)
    {
        CODECHAL_ENCODE_ASSERTMESSAGE("Failed to allocate SAO Line Buffer.");
        return eStatus;
    }

    // SAO Tile Line buffer
    eStatus = (MOS_STATUS)m_hcpInterface->GetHevcBufferSize(
        MHW_VDBOX_HCP_INTERNAL_BUFFER_SAO_TILE_LINE,
        &hcpBufSizeParam);

    if (eStatus != MOS_STATUS_SUCCESS)
    {
        CODECHAL_ENCODE_ASSERTMESSAGE("Failed to get the size for SAO Tile Line Buffer.");
        return eStatus;
    }

    allocParamsForBufferLinear.dwBytes  = hcpBufSizeParam.dwBufferSize;
    allocParamsForBufferLinear.pBufName = "SaoTileLineBuffer";

    eStatus = (MOS_STATUS)m_osInterface->pfnAllocateResource(
        m_osInterface,
        &allocParamsForBufferLinear,
        &m_resSaoTileLineBuffer);

    if (eStatus != MOS_STATUS_SUCCESS)
    {
        CODECHAL_ENCODE_ASSERTMESSAGE("Failed to allocate SAO Tile Line Buffer.");
        return eStatus;
    }

    // SAO Tile Column buffer
    eStatus = (MOS_STATUS)m_hcpInterface->GetHevcBufferSize(
        MHW_VDBOX_HCP_INTERNAL_BUFFER_SAO_TILE_COL,
        &hcpBufSizeParam);

    if (eStatus != MOS_STATUS_SUCCESS)
    {
        CODECHAL_ENCODE_ASSERTMESSAGE("Failed to get the size for SAO Tile Column Buffer.");
        return eStatus;
    }

    allocParamsForBufferLinear.dwBytes  = hcpBufSizeParam.dwBufferSize;
    allocParamsForBufferLinear.pBufName = "SaoTileColumnBuffer";

    eStatus = (MOS_STATUS)m_osInterface->pfnAllocateResource(
        m_osInterface,
        &allocParamsForBufferLinear,
        &m_resSaoTileColumnBuffer);

    if (eStatus != MOS_STATUS_SUCCESS)
    {
        CODECHAL_ENCODE_ASSERTMESSAGE("Failed to allocate SAO Tile Column Buffer.");
        return eStatus;
    }

    // Lcu ILDB StreamOut buffer
    // Allocate the buffer size
    // This is not enabled with HCP_PIPE_MODE_SELECT yet, placeholder here
    allocParamsForBufferLinear.dwBytes  = CODECHAL_CACHELINE_SIZE;
    allocParamsForBufferLinear.pBufName = "LcuILDBStreamOutBuffer";

    eStatus = (MOS_STATUS)m_osInterface->pfnAllocateResource(
        m_osInterface,
        &allocParamsForBufferLinear,
        &m_resLcuIldbStreamOutBuffer);

    if (eStatus != MOS_STATUS_SUCCESS)
    {
        CODECHAL_ENCODE_ASSERTMESSAGE("Failed to allocate LCU ILDB StreamOut Buffer.");
        return eStatus;
    }

    // Lcu Base Address buffer
    // HEVC Encoder Mode: Slice size is written to this buffer when slice size conformance is enabled.
    // 1 CL (= 16 DWs = 64 bytes) per slice * Maximum number of slices in a frame.
    // Align to page for HUC requirement
    uint32_t maxLcu                     = picWidthInMinLCU * picHeightInMinLCU;
    allocParamsForBufferLinear.dwBytes  = MOS_ALIGN_CEIL(maxLcu * CODECHAL_CACHELINE_SIZE, CODECHAL_PAGE_SIZE);
    allocParamsForBufferLinear.pBufName = "LcuBaseAddressBuffer";

    eStatus = (MOS_STATUS)m_osInterface->pfnAllocateResource(
        m_osInterface,
        &allocParamsForBufferLinear,
        &m_resLcuBaseAddressBuffer);

    if (eStatus != MOS_STATUS_SUCCESS)
    {
        CODECHAL_ENCODE_ASSERTMESSAGE("Failed to allocate LCU Base Address Buffer.");
        return eStatus;
    }

    // SAO StreamOut buffer
    // size = MOS_ALIGN_CEIL(picWidthInMinLCU, 4) * 16
    uint32_t size = MOS_ALIGN_CEIL(picWidthInMinLCU, 4) * CODECHAL_HEVC_SAO_STRMOUT_SIZE_PERLCU;
    //extra added size to cover tile enabled case, per tile width aligned to 4.  20: max tile column No.
    size += 3 * 20 * CODECHAL_HEVC_SAO_STRMOUT_SIZE_PERLCU;
    allocParamsForBufferLinear.dwBytes  = size;
    allocParamsForBufferLinear.pBufName = "SaoStreamOutBuffer";

    eStatus = (MOS_STATUS)m_osInterface->pfnAllocateResource(
        m_osInterface,
        &allocParamsForBufferLinear,
        &m_resSaoStreamOutBuffer);

    if (eStatus != MOS_STATUS_SUCCESS)
    {
        CODECHAL_ENCODE_ASSERTMESSAGE("Failed to allocate SAO StreamOut Buffer.");
        return eStatus;
    }

    uint32_t maxTileNumber = (MOS_ALIGN_CEIL(m_frameWidth, CODECHAL_HEVC_MIN_TILE_SIZE) / CODECHAL_HEVC_MIN_TILE_SIZE) *
                             (MOS_ALIGN_CEIL(m_frameHeight, CODECHAL_HEVC_MIN_TILE_SIZE) / CODECHAL_HEVC_MIN_TILE_SIZE);

    MOS_ZeroMemory(&allocParamsForBufferLinear, sizeof(MOS_ALLOC_GFXRES_PARAMS));
    allocParamsForBufferLinear.Type     = MOS_GFXRES_BUFFER;
    allocParamsForBufferLinear.TileType = MOS_TILE_LINEAR;
    allocParamsForBufferLinear.Format   = Format_Buffer;

    // Allocate Frame Statistics Streamout Data Destination Buffer. DW98-100 in HCP pipe buffer address command
    allocParamsForBufferLinear.dwBytes  = m_sizeOfHcpPakFrameStats * maxTileNumber;  //Each tile has 8 cache size bytes of data
    allocParamsForBufferLinear.pBufName = "FrameStatStreamOutBuffer";

    CODECHAL_ENCODE_CHK_STATUS_RETURN((MOS_STATUS)m_osInterface->pfnAllocateResource(
        m_osInterface,
        &allocParamsForBufferLinear,
        &m_resFrameStatStreamOutBuffer));

    // PAK CU Level Streamout Data:   DW57-59 in HCP pipe buffer address command
    // Each CU takes 16 bytes, but each tile needs to be aligned to the cache line
    uint32_t frameWidthInCus            = CODECHAL_GET_WIDTH_IN_BLOCKS(m_frameWidth, CODECHAL_HEVC_MIN_CU_SIZE);
    uint32_t frameHeightInCus           = CODECHAL_GET_WIDTH_IN_BLOCKS(m_frameHeight, CODECHAL_HEVC_MIN_CU_SIZE);
    size                                = MOS_ALIGN_CEIL(frameWidthInCus * frameHeightInCus * 16, CODECHAL_CACHELINE_SIZE);
    allocParamsForBufferLinear.dwBytes  = size;
    allocParamsForBufferLinear.pBufName = "PAK CU Level Streamout Data";

    CODECHAL_ENCODE_CHK_STATUS_RETURN((MOS_STATUS)m_osInterface->pfnAllocateResource(
        m_osInterface,
        &allocParamsForBufferLinear,
        &m_resPakcuLevelStreamoutData.sResource));
    m_resPakcuLevelStreamoutData.dwSize = size;
    CODECHAL_ENCODE_VERBOSEMESSAGE("first allocate cu steam out buffer, size=0x%x.\n", size);

    // Allocate SSE Source Pixel Row Store Buffer. Implementation for each tile column is shown as below:
    //   tileWidthInLCU = ((tileWidthInLCU+3) * BYTES_PER_CACHE_LINE)*(4+4) ; tileWidthInLCU <<= 1; // double the size as RTL treats it as 10 bit data
    // Here, we consider each LCU column is one tile column.

    m_sizeOfSseSrcPixelRowStoreBufferPerLcu = (CODECHAL_CACHELINE_SIZE * (4 + 4)) << 1;                          //size per LCU plus 10-bit
    size                                    = m_sizeOfSseSrcPixelRowStoreBufferPerLcu * (picWidthInMinLCU + 3);  // already aligned to cacheline size
    allocParamsForBufferLinear.dwBytes      = size;
    allocParamsForBufferLinear.pBufName     = "SseSrcPixelRowStoreBuffer";

    CODECHAL_ENCODE_CHK_STATUS_RETURN((MOS_STATUS)m_osInterface->pfnAllocateResource(
        m_osInterface,
        &allocParamsForBufferLinear,
        &m_resSseSrcPixelRowStoreBuffer));

    // SAO Row Store buffer, HSAO
    // Aligned to 4 for each tile column
    uint32_t maxTileColumn              = MOS_ROUNDUP_DIVIDE(m_frameWidth, CODECHAL_HEVC_MIN_TILE_SIZE);
    allocParamsForBufferLinear.dwBytes  = MOS_ALIGN_CEIL(picWidthInMinLCU + 3 * maxTileColumn, 4) * 16;
    allocParamsForBufferLinear.pBufName = "SaoRowStoreBuffer";

    eStatus = (MOS_STATUS)m_osInterface->pfnAllocateResource(
        m_osInterface,
        &allocParamsForBufferLinear,
        &m_SAORowStoreBuffer);

    if (eStatus != MOS_STATUS_SUCCESS)
    {
        CODECHAL_ENCODE_ASSERTMESSAGE("Failed to allocate SAO row store Buffer.");
        return eStatus;
    }

    //HCP scalability Sync buffer
    size                                = CODECHAL_HEVC_MAX_NUM_HCP_PIPE * CODECHAL_CACHELINE_SIZE;
    allocParamsForBufferLinear.dwBytes  = size;
    allocParamsForBufferLinear.pBufName = "GEN12 Hcp scalability Sync buffer ";

    CODECHAL_ENCODE_CHK_STATUS_RETURN((MOS_STATUS)m_osInterface->pfnAllocateResource(
        m_osInterface,
        &allocParamsForBufferLinear,
        &m_resHcpScalabilitySyncBuffer.sResource));
    m_resHcpScalabilitySyncBuffer.dwSize = size;

    // create the tile coding state parameters
    m_tileParams = (PMHW_VDBOX_HCP_TILE_CODING_PARAMS_G12)MOS_AllocAndZeroMemory(sizeof(MHW_VDBOX_HCP_TILE_CODING_PARAMS_G12) * maxTileNumber);

    if (m_enableHWSemaphore)
    {
        // Create the HW sync objects which will be used by each reference frame and BRC in GEN12
        allocParamsForBufferLinear.dwBytes  = sizeof(uint32_t);
        allocParamsForBufferLinear.pBufName = "SemaphoreMemory";

        MOS_LOCK_PARAMS lockFlagsWriteOnly;
        MOS_ZeroMemory(&lockFlagsWriteOnly, sizeof(MOS_LOCK_PARAMS));
        lockFlagsWriteOnly.WriteOnly = 1;

        for (auto i = 0; i < CODECHAL_GET_ARRAY_LENGTH(m_resBrcSemaphoreMem); i++)
        {
            eStatus = (MOS_STATUS)m_osInterface->pfnAllocateResource(
                m_osInterface,
                &allocParamsForBufferLinear,
                &m_resBrcSemaphoreMem[i].sResource);
            m_resBrcSemaphoreMem[i].dwSize = allocParamsForBufferLinear.dwBytes;
            CODECHAL_ENCODE_CHK_STATUS_MESSAGE_RETURN(eStatus, "Cannot create BRC HW Semaphore Memory.");

            uint32_t *data = (uint32_t *)m_osInterface->pfnLockResource(
                m_osInterface,
                &m_resBrcSemaphoreMem[i].sResource,
                &lockFlagsWriteOnly);

            CODECHAL_ENCODE_CHK_NULL_RETURN(data);

            *data = 1;

            CODECHAL_ENCODE_CHK_STATUS_RETURN(m_osInterface->pfnUnlockResource(
                m_osInterface,
                &m_resBrcSemaphoreMem[i].sResource));
        }

        eStatus = (MOS_STATUS)m_osInterface->pfnAllocateResource(
            m_osInterface,
            &allocParamsForBufferLinear,
            &m_resPipeStartSemaMem);
        CODECHAL_ENCODE_CHK_STATUS_MESSAGE_RETURN(eStatus, "Cannot create Scalability pipe start sync HW semaphore.");

        uint32_t *data = (uint32_t *)m_osInterface->pfnLockResource(
            m_osInterface,
            &m_resPipeStartSemaMem,
            &lockFlagsWriteOnly);

        CODECHAL_ENCODE_CHK_NULL_RETURN(data);
        *data = 0;
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_osInterface->pfnUnlockResource(
            m_osInterface,
            &m_resPipeStartSemaMem));

        eStatus = (MOS_STATUS)m_osInterface->pfnAllocateResource(
            m_osInterface,
            &allocParamsForBufferLinear,
            &m_resPipeCompleteSemaMem);
        CODECHAL_ENCODE_CHK_STATUS_MESSAGE_RETURN(eStatus, "Cannot create Scalability pipe completion sync HW semaphore.");

        data = (uint32_t *)m_osInterface->pfnLockResource(
            m_osInterface,
            &m_resPipeCompleteSemaMem,
            &lockFlagsWriteOnly);

        CODECHAL_ENCODE_CHK_NULL_RETURN(data);
        *data = 0;
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_osInterface->pfnUnlockResource(
            m_osInterface,
            &m_resPipeCompleteSemaMem));
    }

    if (m_hucPakStitchEnabled)
    {
        if (Mos_ResourceIsNull(&m_resHucStatus2Buffer))
        {
            // HUC STATUS 2 Buffer for HuC status check in COND_BB_END
            allocParamsForBufferLinear.dwBytes  = sizeof(uint64_t);
            allocParamsForBufferLinear.pBufName = "HUC STATUS 2 Buffer";

            CODECHAL_ENCODE_CHK_STATUS_MESSAGE_RETURN(
                m_osInterface->pfnAllocateResource(
                    m_osInterface,
                    &allocParamsForBufferLinear,
                    &m_resHucStatus2Buffer),
                "%s: Failed to allocate HUC STATUS 2 Buffer\n",
                __FUNCTION__);
        }

        uint8_t *data;

        // Pak stitch DMEM
        allocParamsForBufferLinear.dwBytes  = MOS_ALIGN_CEIL(sizeof(HucPakStitchDmemEncG12), CODECHAL_CACHELINE_SIZE);
        allocParamsForBufferLinear.pBufName = "PAK Stitch Dmem Buffer";
        auto numOfPasses                    = CODECHAL_DP_MAX_NUM_BRC_PASSES;
        for (auto j = 0; j < CODECHAL_ENCODE_RECYCLED_BUFFER_NUM; j++)
        {
            for (auto i = 0; i < numOfPasses; i++)
            {
                eStatus = (MOS_STATUS)m_osInterface->pfnAllocateResource(
                    m_osInterface,
                    &allocParamsForBufferLinear,
                    &m_resHucPakStitchDmemBuffer[j][i]);

                if (eStatus != MOS_STATUS_SUCCESS)
                {
                    CODECHAL_ENCODE_ASSERTMESSAGE("Failed to allocate PAK Stitch Dmem Buffer.");
                    return eStatus;
                }
            }
        }
        // BRC Data Buffer
        allocParamsForBufferLinear.dwBytes  = MOS_ALIGN_CEIL(CODECHAL_CACHELINE_SIZE, CODECHAL_PAGE_SIZE);
        allocParamsForBufferLinear.pBufName = "BRC Data Buffer";

        eStatus = (MOS_STATUS)m_osInterface->pfnAllocateResource(
            m_osInterface,
            &allocParamsForBufferLinear,
            &m_resBrcDataBuffer);

        if (eStatus != MOS_STATUS_SUCCESS)
        {
            CODECHAL_ENCODE_ASSERTMESSAGE("Failed to allocate BRC Data Buffer Buffer.");
            return eStatus;
        }

        MOS_LOCK_PARAMS lockFlags;
        MOS_ZeroMemory(&lockFlags, sizeof(MOS_LOCK_PARAMS));
        lockFlags.WriteOnly = 1;

        data = (uint8_t *)m_osInterface->pfnLockResource(
            m_osInterface,
            &m_resBrcDataBuffer,
            &lockFlags);

        CODECHAL_ENCODE_CHK_NULL_RETURN(data);

        MOS_ZeroMemory(
            data,
            allocParamsForBufferLinear.dwBytes);

        m_osInterface->pfnUnlockResource(m_osInterface, &m_resBrcDataBuffer);
        for (auto i = 0; i < CODECHAL_ENCODE_RECYCLED_BUFFER_NUM; i++)
        {
            for (auto j = 0; j < CODECHAL_HEVC_MAX_NUM_BRC_PASSES; j++)
            {
                // HuC stitching Data buffer
                allocParamsForBufferLinear.dwBytes  = MOS_ALIGN_CEIL(sizeof(HucCommandData), CODECHAL_PAGE_SIZE);
                allocParamsForBufferLinear.pBufName = "HEVC HuC Stitch Data Buffer";
                CODECHAL_ENCODE_CHK_STATUS_RETURN(
                    m_osInterface->pfnAllocateResource(
                        m_osInterface,
                        &allocParamsForBufferLinear,
                        &m_resHucStitchDataBuffer[i][j]));

                MOS_LOCK_PARAMS lockFlagsWriteOnly;
                MOS_ZeroMemory(&lockFlagsWriteOnly, sizeof(MOS_LOCK_PARAMS));
                lockFlagsWriteOnly.WriteOnly = 1;

                uint8_t *pData = (uint8_t *)m_osInterface->pfnLockResource(
                    m_osInterface,
                    &m_resHucStitchDataBuffer[i][j],
                    &lockFlagsWriteOnly);
                CODECHAL_ENCODE_CHK_NULL_RETURN(pData);
                MOS_ZeroMemory(pData, allocParamsForBufferLinear.dwBytes);
                m_osInterface->pfnUnlockResource(m_osInterface, &m_resHucStitchDataBuffer[i][j]);
            }
        }

        //Second level BB for huc stitching cmd
        MOS_ZeroMemory(&m_HucStitchCmdBatchBuffer, sizeof(m_HucStitchCmdBatchBuffer));
        m_HucStitchCmdBatchBuffer.bSecondLevel = true;
        CODECHAL_ENCODE_CHK_STATUS_RETURN(Mhw_AllocateBb(
            m_osInterface,
            &m_HucStitchCmdBatchBuffer,
            nullptr,
            m_hwInterface->m_HucStitchCmdBatchBufferSize));
    }

    // Pak obj and CU records for skip frame
    uint32_t mbCodeSize = m_mbCodeSize + 8 * CODECHAL_CACHELINE_SIZE;  // Must reserve at least 8 cachelines after MI_BATCH_BUFFER_END_CMD since HW prefetch max 8 cachelines from BB everytime

    MOS_ALLOC_GFXRES_PARAMS allocParams;
    MOS_ZeroMemory(&allocParams, sizeof(allocParams));
    allocParams.Type     = MOS_GFXRES_BUFFER;
    allocParams.Format   = Format_Buffer;
    allocParams.TileType = MOS_TILE_LINEAR;
    allocParams.dwBytes  = mbCodeSize;
    allocParams.pBufName = "skipFrameMbCodeSurface";

    eStatus = (MOS_STATUS)m_osInterface->pfnAllocateResource(
        m_osInterface,
        &allocParams,
        &m_skipFrameInfo.m_resMbCodeSkipFrameSurface);
    if (eStatus != MOS_STATUS_SUCCESS)
    {
        CODECHAL_ENCODE_ASSERTMESSAGE("Failed to allocate PAK object buffer for skip frame");
        return eStatus;
    }

    if (m_numDelay)
    {
        allocParamsForBufferLinear.dwBytes  = sizeof(uint32_t);
        allocParamsForBufferLinear.pBufName = "DelayMinusMemory";

        CODECHAL_ENCODE_CHK_STATUS_MESSAGE_RETURN(m_osInterface->pfnAllocateResource(
                                                      m_osInterface,
                                                      &allocParamsForBufferLinear,
                                                      &m_resDelayMinus),
            "Failed to allocate delay minus memory.");

        uint8_t *       data;
        MOS_LOCK_PARAMS lockFlags;
        MOS_ZeroMemory(&lockFlags, sizeof(MOS_LOCK_PARAMS));
        lockFlags.WriteOnly = 1;
        data                = (uint8_t *)m_osInterface->pfnLockResource(
            m_osInterface,
            &m_resDelayMinus,
            &lockFlags);

        CODECHAL_ENCODE_CHK_NULL_RETURN(data);

        MOS_ZeroMemory(data, sizeof(uint32_t));

        m_osInterface->pfnUnlockResource(m_osInterface, &m_resDelayMinus);
    }

    return eStatus;
}

//!
//! \brief    Release the PAK resources held by the GEN12 HEVC encoder state
//! \details  Frees the streamout/row-store/sync buffers, the tile statistics and
//!           tile record buffers, the tile coding parameter array, the virtual
//!           engine batch buffers, the semaphore memories and (when HuC PAK
//!           stitching is enabled) the HuC stitch resources, then delegates to
//!           CodechalEncHevcState::FreePakResources().
//! \return   MOS_STATUS
//!           The status returned by CodechalEncHevcState::FreePakResources()
//!
MOS_STATUS CodechalEncHevcStateG12::FreePakResources()
{
    CODECHAL_ENCODE_FUNCTION_ENTER;

    // Release Frame Statistics Streamout Data Destination Buffer
    m_osInterface->pfnFreeResource(
        m_osInterface,
        &m_resFrameStatStreamOutBuffer);

    // PAK CU Level Stream out buffer
    m_osInterface->pfnFreeResource(
        m_osInterface,
        &m_resPakcuLevelStreamoutData.sResource);

    // Release SSE Source Pixel Row Store Buffer
    m_osInterface->pfnFreeResource(
        m_osInterface,
        &m_resSseSrcPixelRowStoreBuffer);

    // Release Hcp scalability Sync buffer
    m_osInterface->pfnFreeResource(
        m_osInterface,
        &m_resHcpScalabilitySyncBuffer.sResource);

    // PAK slice level stream out buffer
    m_osInterface->pfnFreeResource(
        m_osInterface,
        &m_resPakSliceLevelStreamoutData.sResource);

    // SAO row store buffer
    m_osInterface->pfnFreeResource(
        m_osInterface,
        &m_SAORowStoreBuffer);

    // PAK object / CU record surface used for skip frames
    m_osInterface->pfnFreeResource(
        m_osInterface,
        &m_skipFrameInfo.m_resMbCodeSkipFrameSurface);

    // Tile based statistics and tile record buffers
    for (uint32_t i = 0; i < CODECHAL_GET_ARRAY_LENGTH(m_resTileBasedStatisticsBuffer); i++)
    {
        m_osInterface->pfnFreeResource(m_osInterface, &m_resTileBasedStatisticsBuffer[i].sResource);
    }
    for (uint32_t i = 0; i < CODECHAL_GET_ARRAY_LENGTH(m_tileRecordBuffer); i++)
    {
        m_osInterface->pfnFreeResource(m_osInterface, &m_tileRecordBuffer[i].sResource);
    }
    m_osInterface->pfnFreeResource(m_osInterface, &m_resHuCPakAggregatedFrameStatsBuffer.sResource);

    // Tile coding state parameters; reset the pointer so that a repeated
    // teardown (or a later free) cannot release the same memory twice.
    MOS_FreeMemory(m_tileParams);
    m_tileParams = nullptr;

    if (m_useVirtualEngine)
    {
        for (uint32_t i = 0; i < CODECHAL_NUM_UNCOMPRESSED_SURFACE_HEVC; i++)
        {
            for (uint32_t j = 0; j < CODECHAL_HEVC_MAX_NUM_HCP_PIPE; j++)
            {
                for (uint32_t k = 0; k < CODECHAL_HEVC_MAX_NUM_BRC_PASSES; k++)
                {
                    PMOS_COMMAND_BUFFER cmdBuffer = &m_veBatchBuffer[i][j][k];
                    // A non-null pCmdBase triggers an unlock before the resource is freed
                    if (cmdBuffer->pCmdBase)
                    {
                        m_osInterface->pfnUnlockResource(m_osInterface, &cmdBuffer->OsResource);
                    }
                    m_osInterface->pfnFreeResource(m_osInterface, &cmdBuffer->OsResource);
                }
            }
        }
    }

    for (uint32_t i = 0; i < CODECHAL_GET_ARRAY_LENGTH(m_refSync); i++)
    {
        auto sync = &m_refSync[i];

        if (!Mos_ResourceIsNull(&sync->resSyncObject))
        {
            // If this object has been signaled before, we need to wait to ensure signal-wait is in pair.
            if (sync->uiSemaphoreObjCount || sync->bInUsed)
            {
                MOS_SYNC_PARAMS syncParams  = g_cInitSyncParams;
                syncParams.GpuContext       = m_renderContext;
                syncParams.presSyncResource = &sync->resSyncObject;
                syncParams.uiSemaphoreCount = sync->uiSemaphoreObjCount;
                m_osInterface->pfnEngineWait(m_osInterface, &syncParams);
            }
        }
        m_osInterface->pfnFreeResource(m_osInterface, &sync->resSemaphoreMem.sResource);
    }

    // BRC and scalability pipe start/complete semaphore memories
    for (uint32_t i = 0; i < CODECHAL_GET_ARRAY_LENGTH(m_resBrcSemaphoreMem); i++)
    {
        m_osInterface->pfnFreeResource(m_osInterface, &m_resBrcSemaphoreMem[i].sResource);
    }
    m_osInterface->pfnFreeResource(m_osInterface, &m_resPipeStartSemaMem);
    m_osInterface->pfnFreeResource(m_osInterface, &m_resPipeCompleteSemaMem);

    if (m_hucPakStitchEnabled)
    {
        m_osInterface->pfnFreeResource(m_osInterface, &m_resHucStatus2Buffer);
        m_osInterface->pfnFreeResource(m_osInterface, &m_resBrcDataBuffer);

        for (uint32_t i = 0; i < CODECHAL_ENCODE_RECYCLED_BUFFER_NUM; i++)
        {
            for (uint32_t j = 0; j < CODECHAL_HEVC_MAX_NUM_BRC_PASSES; j++)
            {
                m_osInterface->pfnFreeResource(m_osInterface, &m_resHucPakStitchDmemBuffer[i][j]);
                m_osInterface->pfnFreeResource(m_osInterface, &m_resHucStitchDataBuffer[i][j]);
            }
        }
        // Second level batch buffer used for the HuC stitching command
        Mhw_FreeBb(m_osInterface, &m_HucStitchCmdBatchBuffer, nullptr);
    }

    // NOTE(review): m_resDelayMinus, allocated earlier in this file when m_numDelay
    // is set, is not released in this function - confirm it is freed elsewhere.

    return CodechalEncHevcState::FreePakResources();
}

//!
//! \brief    Look up the header of one kernel inside the combined kernel binary
//!           and compute that kernel's size
//! \param    [in] binary
//!           Pointer to the combined kernel binary; it is read as a
//!           CODECHAL_HEVC_KERNEL_HEADER table
//! \param    [in] operation
//!           Encode operation; ENC_MBENC and ENC_BRC are handled
//! \param    [in] krnStateIdx
//!           Kernel index within the selected operation
//! \param    [out] krnHeader
//!           Receives a copy of the selected CODECHAL_KERNEL_HEADER
//! \param    [in,out] krnSize
//!           On input, the offset used as the end of the kernel when the selected
//!           header is the last table entry; on output, the selected kernel's size
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS on success, MOS_STATUS_INVALID_PARAMETER for an
//!           unsupported operation or kernel index; null arguments are rejected
//!           by CODECHAL_ENCODE_CHK_NULL_RETURN
//!
MOS_STATUS CodechalEncHevcStateG12::GetKernelHeaderAndSize(
    void *       binary,
    EncOperation operation,
    uint32_t     krnStateIdx,
    void *       krnHeader,
    uint32_t *   krnSize)
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    CODECHAL_ENCODE_FUNCTION_ENTER;

    CODECHAL_ENCODE_CHK_NULL_RETURN(binary);
    CODECHAL_ENCODE_CHK_NULL_RETURN(krnHeader);
    CODECHAL_ENCODE_CHK_NULL_RETURN(krnSize);

    auto                    headerTable = static_cast<PCODECHAL_HEVC_KERNEL_HEADER>(binary);
    PCODECHAL_KERNEL_HEADER selected    = nullptr;

    // Pick the table entry that matches the (operation, krnStateIdx) pair
    if (operation == ENC_MBENC)
    {
        if (krnStateIdx == MBENC_LCU32_KRNIDX)
        {
            selected = &headerTable->HEVC_Enc_LCU32;
        }
        else if (krnStateIdx == MBENC_LCU64_KRNIDX)
        {
            selected = &headerTable->HEVC_Enc_LCU64;
        }
        else
        {
            CODECHAL_ENCODE_ASSERTMESSAGE("Unsupported MBENC mode requested");
            return MOS_STATUS_INVALID_PARAMETER;
        }
    }
    else if (operation == ENC_BRC)
    {
        if (krnStateIdx == CODECHAL_HEVC_BRC_INIT)
        {
            selected = &headerTable->HEVC_brc_init;
        }
        else if (krnStateIdx == CODECHAL_HEVC_BRC_RESET)
        {
            selected = &headerTable->HEVC_brc_reset;
        }
        else if (krnStateIdx == CODECHAL_HEVC_BRC_FRAME_UPDATE)
        {
            selected = &headerTable->HEVC_brc_update;
        }
        else if (krnStateIdx == CODECHAL_HEVC_BRC_LCU_UPDATE)
        {
            selected = &headerTable->HEVC_brc_lcuqp;
        }
        else
        {
            CODECHAL_ENCODE_ASSERTMESSAGE("Unsupported BRC mode requested, krnStateIdx=%d", krnStateIdx);
            return MOS_STATUS_INVALID_PARAMETER;
        }
    }
    else
    {
        CODECHAL_ENCODE_ASSERTMESSAGE("Unsupported ENC mode requested");
        return MOS_STATUS_INVALID_PARAMETER;
    }

    // Hand a copy of the selected header back to the caller
    *static_cast<PCODECHAL_KERNEL_HEADER>(krnHeader) = *selected;

    // The kernel ends where the next table entry starts; for the last entry
    // (HEVC_brc_lcuqp) the caller-supplied *krnSize is used as the end offset.
    const PCODECHAL_KERNEL_HEADER following = selected + 1;
    const PCODECHAL_KERNEL_HEADER tableEnd  = &(headerTable->HEVC_brc_lcuqp) + 1;

    uint32_t endOffset = *krnSize;
    if (following < tableEnd)
    {
        endOffset = following->KernelStartPointer << MHW_KERNEL_OFFSET_SHIFT;
    }

    *krnSize = endOffset - (selected->KernelStartPointer << MHW_KERNEL_OFFSET_SHIFT);

    return eStatus;
}

//!
//! \brief    Create and initialize the MBENC kernel states
//! \details  Allocates MBENC_NUM_KRN kernel states and binding tables, then for
//!           each kernel index resolves its header and size from m_kernelBinary,
//!           fills in kernel parameters and the binding table, computes the SSH
//!           and binding table sizes and initializes the ISH entry.
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, else the first failure status
//!
MOS_STATUS CodechalEncHevcStateG12::InitKernelStateMbEnc()
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    CODECHAL_ENCODE_FUNCTION_ENTER;

    PMHW_STATE_HEAP_INTERFACE stateHeapInterface = m_hwInterface->GetRenderInterface()->m_stateHeapInterface;
    // The state heap interface is dereferenced for every kernel below
    CODECHAL_ENCODE_CHK_NULL_RETURN(stateHeapInterface);

    m_numMbEncEncKrnStates = MBENC_NUM_KRN;

    m_mbEncKernelStates =
        MOS_NewArray(MHW_KERNEL_STATE, m_numMbEncEncKrnStates);
    CODECHAL_ENCODE_CHK_NULL_RETURN(m_mbEncKernelStates);

    m_mbEncKernelBindingTable = (PCODECHAL_ENCODE_BINDING_TABLE_GENERIC)MOS_AllocAndZeroMemory(
        sizeof(GenericBindingTable) * m_numMbEncEncKrnStates);
    CODECHAL_ENCODE_CHK_NULL_RETURN(m_mbEncKernelBindingTable);

    PMHW_KERNEL_STATE kernelStatePtr = m_mbEncKernelStates;

    for (uint32_t krnStateIdx = 0; krnStateIdx < m_numMbEncEncKrnStates; krnStateIdx++)
    {
        // kernelSize goes in as the combined binary size and comes back as this kernel's size
        auto                   kernelSize = m_combinedKernelSize;
        CODECHAL_KERNEL_HEADER currKrnHeader;
        CODECHAL_ENCODE_CHK_STATUS_RETURN(GetKernelHeaderAndSize(
            m_kernelBinary,
            ENC_MBENC,
            krnStateIdx,
            &currKrnHeader,
            &kernelSize));

        CODECHAL_ENCODE_CHK_STATUS_RETURN(SetKernelParams(
            ENC_MBENC,
            &kernelStatePtr->KernelParams,
            krnStateIdx));

        CODECHAL_ENCODE_CHK_STATUS_RETURN(SetBindingTable(
            ENC_MBENC,
            &m_mbEncKernelBindingTable[krnStateIdx],
            krnStateIdx));

        // Locate this kernel inside the combined binary (set once; the original repeated these three assignments)
        kernelStatePtr->dwCurbeOffset        = stateHeapInterface->pStateHeapInterface->GetSizeofCmdInterfaceDescriptorData();
        kernelStatePtr->KernelParams.pBinary = m_kernelBinary + (currKrnHeader.KernelStartPointer << MHW_KERNEL_OFFSET_SHIFT);
        kernelStatePtr->KernelParams.iSize   = kernelSize;

        CODECHAL_ENCODE_CHK_STATUS_RETURN(stateHeapInterface->pfnCalculateSshAndBtSizesRequested(
            stateHeapInterface,
            kernelStatePtr->KernelParams.iBTCount,
            &kernelStatePtr->dwSshSize,
            &kernelStatePtr->dwBindingTableSize));

        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_hwInterface->MhwInitISH(stateHeapInterface, kernelStatePtr));

        kernelStatePtr++;
    }

    return eStatus;
}

//!
//! \brief    Allocate and initialize the BRC kernel states and their binding tables.
//! \details  Allocates CODECHAL_HEVC_BRC_NUM kernel states and binding tables, then for
//!           every kernel index starting at 1 resolves the kernel header/size from the
//!           combined kernel binary, fills in kernel parameters and the binding table,
//!           computes SSH/BT sizes and initializes the ISH entry. Index 0
//!           (BRC_COARSE_INTRA) is left untouched.
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, else fail reason
//!
MOS_STATUS CodechalEncHevcStateG12::InitKernelStateBrc()
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    CODECHAL_ENCODE_FUNCTION_ENTER;

    PMHW_STATE_HEAP_INTERFACE stateHeapInterface = m_hwInterface->GetRenderInterface()->m_stateHeapInterface;
    m_numBrcKrnStates                            = CODECHAL_HEVC_BRC_NUM;

    m_brcKernelStates = MOS_NewArray(MHW_KERNEL_STATE, m_numBrcKrnStates);
    CODECHAL_ENCODE_CHK_NULL_RETURN(m_brcKernelStates);

    m_brcKernelBindingTable = (PCODECHAL_ENCODE_BINDING_TABLE_GENERIC)MOS_AllocAndZeroMemory(
        sizeof(GenericBindingTable) * m_numBrcKrnStates);
    // The binding table array is indexed inside the loop below, so an allocation
    // failure must be reported here instead of being dereferenced later.
    CODECHAL_ENCODE_CHK_NULL_RETURN(m_brcKernelBindingTable);

    // KrnStateIdx initialization starts at 1: BRC_COARSE_INTRA (index 0) is skipped because
    // Gen11 does not support it as a BRC kernel. It is part of the Combined Common Kernel.
    for (uint32_t krnStateIdx = 1; krnStateIdx < m_numBrcKrnStates; krnStateIdx++)
    {
        // Kernel state and binding table share the same index.
        PMHW_KERNEL_STATE kernelStatePtr = &m_brcKernelStates[krnStateIdx];

        auto                   kernelSize = m_combinedKernelSize;
        CODECHAL_KERNEL_HEADER currKrnHeader;

        CODECHAL_ENCODE_CHK_STATUS_RETURN(GetKernelHeaderAndSize(
            m_kernelBinary,
            ENC_BRC,
            krnStateIdx,
            &currKrnHeader,
            (uint32_t *)&kernelSize));

        CODECHAL_ENCODE_CHK_STATUS_RETURN(SetKernelParams(
            ENC_BRC,
            &kernelStatePtr->KernelParams,
            krnStateIdx));

        CODECHAL_ENCODE_CHK_STATUS_RETURN(SetBindingTable(
            ENC_BRC,
            &m_brcKernelBindingTable[krnStateIdx],
            krnStateIdx));

        kernelStatePtr->dwCurbeOffset        = stateHeapInterface->pStateHeapInterface->GetSizeofCmdInterfaceDescriptorData();
        kernelStatePtr->KernelParams.pBinary = m_kernelBinary + (currKrnHeader.KernelStartPointer << MHW_KERNEL_OFFSET_SHIFT);
        kernelStatePtr->KernelParams.iSize   = kernelSize;

        CODECHAL_ENCODE_CHK_STATUS_RETURN(stateHeapInterface->pfnCalculateSshAndBtSizesRequested(
            stateHeapInterface,
            kernelStatePtr->KernelParams.iBTCount,
            &kernelStatePtr->dwSshSize,
            &kernelStatePtr->dwBindingTableSize));

        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_hwInterface->MhwInitISH(stateHeapInterface, kernelStatePtr));
    }

    return eStatus;
}

//!
//! \brief    Derive the BRC frame level of the current picture and store it in
//!           m_currFrameBrcLevel.
//! \details  For a hierarchical GOP (HierarchicalFlag set, 1 < GopRefDist <= 8) the level
//!           comes from HierarchLevelPlus1 when it is non-zero, otherwise from the picture
//!           coding type (random access) or the I vs. P/LDB split (low delay). For all
//!           other GOPs the level is I for intra pictures and the flat B/LDB level otherwise.
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, MOS_STATUS_INVALID_PARAMETER when the
//!           hierarchy level or coding type cannot be mapped
//!
MOS_STATUS CodechalEncHevcStateG12::GetFrameBrcLevel()
{
    CODECHAL_ENCODE_FUNCTION_ENTER;

    const bool isIntraPicture = (m_pictureCodingType == I_TYPE);

    // If L0/L1 both point to the previous frame it is LDB, otherwise it is level 1 RA B.
    const auto flatBLevel = m_lowDelay ? HEVC_BRC_FRAME_TYPE_P_OR_LB : HEVC_BRC_FRAME_TYPE_B;

    // I and P/LDB pictures share the same hierarchy level.
    const auto intraOrLdbLevel = isIntraPicture ? HEVC_BRC_FRAME_TYPE_I : HEVC_BRC_FRAME_TYPE_P_OR_LB;

    const bool hierarchicalGop =
        m_hevcSeqParams->HierarchicalFlag &&
        m_hevcSeqParams->GopRefDist > 1 &&
        m_hevcSeqParams->GopRefDist <= 8;

    if (!hierarchicalGop)  // Flat B
    {
        m_currFrameBrcLevel = isIntraPicture ? HEVC_BRC_FRAME_TYPE_I : flatBLevel;
        return MOS_STATUS_SUCCESS;
    }

    if (m_hevcPicParams->HierarchLevelPlus1 > 0)  // LDB or RAB
    {
        const std::map<int, HEVC_BRC_FRAME_TYPE> levelByHierarchy{
            {1, intraOrLdbLevel},
            {2, HEVC_BRC_FRAME_TYPE_B},
            {3, HEVC_BRC_FRAME_TYPE_B1},
            {4, HEVC_BRC_FRAME_TYPE_B2}};

        const auto match    = levelByHierarchy.find(m_hevcPicParams->HierarchLevelPlus1);
        m_currFrameBrcLevel = (match != levelByHierarchy.end()) ? match->second : HEVC_BRC_FRAME_TYPE_INVALID;

        // Reject an unmapped HierarchLevelPlus1, and low-delay pictures at the B2 level.
        const bool unsupportedLdbLevel = m_hevcSeqParams->LowDelayMode && m_currFrameBrcLevel == HEVC_BRC_FRAME_TYPE_B2;
        if ((m_currFrameBrcLevel == HEVC_BRC_FRAME_TYPE_INVALID) || unsupportedLdbLevel)
        {
            CODECHAL_ENCODE_ASSERTMESSAGE("HEVC_BRC_FRAME_TYPE_INVALID or LBD picture doesn't support Level 4\n");
            return MOS_STATUS_INVALID_PARAMETER;
        }

        return MOS_STATUS_SUCCESS;
    }

    if (m_hevcSeqParams->LowDelayMode)  // Low Delay mode: Flat case
    {
        m_currFrameBrcLevel = intraOrLdbLevel;
        return MOS_STATUS_SUCCESS;
    }

    // RA B: the level follows the picture coding type.
    const std::map<int, HEVC_BRC_FRAME_TYPE> levelByCodingType{
        {I_TYPE, HEVC_BRC_FRAME_TYPE_I},
        {P_TYPE, HEVC_BRC_FRAME_TYPE_P_OR_LB},
        {B_TYPE, flatBLevel},
        {B1_TYPE, HEVC_BRC_FRAME_TYPE_B1},
        {B2_TYPE, HEVC_BRC_FRAME_TYPE_B2}};

    const auto match    = levelByCodingType.find(m_pictureCodingType);
    m_currFrameBrcLevel = (match != levelByCodingType.end()) ? match->second : HEVC_BRC_FRAME_TYPE_INVALID;

    if (m_currFrameBrcLevel == HEVC_BRC_FRAME_TYPE_INVALID)
    {
        CODECHAL_ENCODE_ASSERTMESSAGE("Invalid CodingType\n");
        return MOS_STATUS_INVALID_PARAMETER;
    }

    return MOS_STATUS_SUCCESS;
}

//!
//! \brief    Compute the largest binding table count needed by any kernel phase.
//! \details  Each kernel's binding table count is aligned up to the state heap's BT index
//!           alignment, summed per phase, and the maximum over the four phases is returned.
//! \return   uint32_t
//!           Maximum aligned binding table count over all phases
//!
uint32_t CodechalEncHevcStateG12::GetMaxBtCount()
{
    const uint16_t alignment = m_hwInterface->GetRenderInterface()->m_stateHeapInterface->pStateHeapInterface->GetBtIdxAlignment();

    // Phase 1: BRC Init kernel
    const uint32_t brcInitPhase = MOS_ALIGN_CEIL(m_brcKernelStates[CODECHAL_HEVC_BRC_INIT].KernelParams.iBTCount, alignment);

    // Phase 2: SwScoreboard kernel ...
    uint32_t preEncPhase = MOS_ALIGN_CEIL(m_swScoreboardState->GetBTCount(), alignment);

    // ... plus Csc+Ds+Conversion kernel
    preEncPhase += MOS_ALIGN_CEIL(m_cscDsState->GetBTCount(), alignment);

    // ... plus Intra Distortion kernel, when present
    if (m_intraDistKernel)
    {
        preEncPhase += MOS_ALIGN_CEIL(m_intraDistKernel->GetBTCount(), alignment);
    }

    // ... plus HME 4x, 16x, 32x kernels (three instances), when present
    if (m_hmeKernel)
    {
        preEncPhase += (MOS_ALIGN_CEIL(m_hmeKernel->GetBTCount(), alignment) * 3);
    }

    // ... plus Weighted prediction kernel
    preEncPhase += MOS_ALIGN_CEIL(m_wpState->GetBTCount(), alignment);

    // Phases 3 and 4 both contain the BRC LCU update and BRC frame update kernels;
    // they differ only in the MbEnc kernel (LCU32 vs. LCU64).
    const uint32_t brcUpdateCount =
        MOS_ALIGN_CEIL(m_brcKernelStates[CODECHAL_HEVC_BRC_LCU_UPDATE].KernelParams.iBTCount, alignment) +
        MOS_ALIGN_CEIL(m_brcKernelStates[CODECHAL_HEVC_BRC_FRAME_UPDATE].KernelParams.iBTCount, alignment);

    const uint32_t lcu32Phase = brcUpdateCount +
                                MOS_ALIGN_CEIL(m_mbEncKernelStates[MBENC_LCU32_KRNIDX].KernelParams.iBTCount, alignment);
    const uint32_t lcu64Phase = brcUpdateCount +
                                MOS_ALIGN_CEIL(m_mbEncKernelStates[MBENC_LCU64_KRNIDX].KernelParams.iBTCount, alignment);

    const uint32_t phaseCounts[] = {brcInitPhase, preEncPhase, lcu32Phase, lcu64Phase};

    uint32_t largest = 0;
    for (const uint32_t phaseCount : phaseCounts)
    {
        if (phaseCount > largest)
        {
            largest = phaseCount;
        }
    }

    return largest;
}

//!
//! \brief    Scaled-dimension calculation hook.
//! \details  This implementation performs no calculation and always reports success.
//! \return   MOS_STATUS
//!           Always MOS_STATUS_SUCCESS
//!
MOS_STATUS CodechalEncHevcStateG12::CalcScaledDimensions()
{
    return MOS_STATUS_SUCCESS;
}

//!
//! \brief    Report the maximum number of VME reference frames for each list.
//! \param    [out] maxNumRef0
//!           Receives m_maxNumVmeL0Ref
//! \param    [out] maxNumRef1
//!           Receives m_maxNumVmeL1Ref
//!
void CodechalEncHevcStateG12::GetMaxRefFrames(uint8_t &maxNumRef0, uint8_t &maxNumRef1)
{
    maxNumRef0 = m_maxNumVmeL0Ref;
    maxNumRef1 = m_maxNumVmeL1Ref;
}

//!
//! \brief    Fill in the encode status report for a frame.
//! \details  Frames that used at most one VDBOX are delegated to the base class.
//!           Otherwise the per-tile size records are read to accumulate the bitstream
//!           size and the average QP. Unless tile stitching is done by HW, the per-tile
//!           bitstream segments are then compacted in place into one contiguous bitstream
//!           and the tile records are cleared.
//!           Every resource locked here is unlocked on all exit paths.
//! \param    [in, out] encodeStatus
//!           Encode status of the frame, must not be nullptr
//! \param    [in, out] encodeStatusReport
//!           Status report to fill in, must not be nullptr
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success (including the "incomplete" case, which is
//!           flagged through CodecStatus), MOS_STATUS_INVALID_FILE_SIZE when sizes or tile
//!           records are inconsistent with the bitstream buffer, else fail reason
//!
MOS_STATUS CodechalEncHevcStateG12::GetStatusReport(
    EncodeStatus *      encodeStatus,
    EncodeStatusReport *encodeStatusReport)
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    CODECHAL_ENCODE_FUNCTION_ENTER;

    CODECHAL_ENCODE_CHK_NULL_RETURN(encodeStatus);
    CODECHAL_ENCODE_CHK_NULL_RETURN(encodeStatusReport);

    if (encodeStatusReport->UsedVdBoxNumber <= 1)
    {
        return CodechalEncodeHevcBase::GetStatusReport(encodeStatus, encodeStatusReport);
    }

    // Scope guard: unlocks a successfully locked resource when it goes out of scope, so
    // that none of the early returns below (explicit or via CHK macros) leaves it locked.
    struct ScopedResourceUnlock
    {
        PMOS_INTERFACE osInterface;
        PMOS_RESOURCE  resource;

        ~ScopedResourceUnlock()
        {
            if (osInterface && resource)
            {
                osInterface->pfnUnlockResource(osInterface, resource);
            }
        }
    };

    PCODECHAL_ENCODE_BUFFER tileSizeStatusReport = &m_tileRecordBuffer[encodeStatusReport->CurrOriginalPic.FrameIdx];

    MOS_LOCK_PARAMS lockFlags;
    MOS_ZeroMemory(&lockFlags, sizeof(MOS_LOCK_PARAMS));
    CODECHAL_ENCODE_CHK_NULL_RETURN(m_osInterface);
    HCPPakHWTileSizeRecord_G12 *tileStatusReport = (HCPPakHWTileSizeRecord_G12 *)m_osInterface->pfnLockResource(
        m_osInterface,
        &tileSizeStatusReport->sResource,
        &lockFlags);
    CODECHAL_ENCODE_CHK_NULL_RETURN(tileStatusReport);
    ScopedResourceUnlock tileRecordUnlock{m_osInterface, &tileSizeStatusReport->sResource};

    encodeStatusReport->CodecStatus                                      = CODECHAL_STATUS_SUCCESSFUL;
    encodeStatusReport->PanicMode                                        = false;
    encodeStatusReport->AverageQp                                        = 0;
    encodeStatusReport->QpY                                              = 0;
    encodeStatusReport->SuggestedQpYDelta                                = 0;
    encodeStatusReport->NumberPasses                                     = 1;
    encodeStatusReport->bitstreamSize                                    = 0;
    encodeStatus->ImageStatusCtrlOfLastBRCPass.hcpCumulativeFrameDeltaQp = 0;

    uint32_t totalCU = 0;
    double   sumQp   = 0.0;
    for (uint32_t i = 0; i < encodeStatusReport->NumberTilesInFrame; i++)
    {
        if (tileStatusReport[i].Length == 0)
        {
            // A tile without a recorded length means the frame is not finished yet.
            // The tile records are left untouched; only the lock is released.
            encodeStatusReport->CodecStatus = CODECHAL_STATUS_INCOMPLETE;
            return eStatus;
        }

        encodeStatusReport->bitstreamSize += tileStatusReport[i].Length;
        totalCU += (m_tileParams[i].TileHeightInMinCbMinus1 + 1) * (m_tileParams[i].TileWidthInMinCbMinus1 + 1);
        sumQp += tileStatusReport[i].Hcp_Qp_Status_Count;
    }

    encodeStatusReport->NumberPasses = (uint8_t)encodeStatus->dwNumberPasses + 1;
    CODECHAL_ENCODE_VERBOSEMESSAGE("BRC Scalability Mode Exectued PAK Pass number: %d.\n", encodeStatusReport->NumberPasses);

    if (encodeStatusReport->bitstreamSize == 0 ||
        encodeStatusReport->bitstreamSize > m_bitstreamUpperBound)
    {
        encodeStatusReport->CodecStatus   = CODECHAL_STATUS_ERROR;
        encodeStatusReport->bitstreamSize = 0;
        CODECHAL_ENCODE_ASSERTMESSAGE("Bit-stream size exceeds upper bound!");
        return MOS_STATUS_INVALID_FILE_SIZE;
    }

    if (m_sseEnabled)
    {
        CODECHAL_ENCODE_CHK_STATUS_RETURN(CalculatePSNR(encodeStatus, encodeStatusReport));
    }

    CODECHAL_ENCODE_CHK_COND_RETURN(totalCU == 0, "Invalid totalCU count");
    encodeStatusReport->QpY = encodeStatusReport->AverageQp =
        (uint8_t)((sumQp / (double)totalCU) / 4.0);  // due to TU is 4x4 and there are 4 TUs in one CU

    if (m_enableTileStitchByHW)
    {
        // No SW stitching needed; the tile records are left as they are.
        return eStatus;
    }

    // SW stitching: gather every tile's bytes into a temporary buffer, then copy the
    // compacted stream back to the start of the bitstream buffer.
    CODECHAL_ENCODE_CHK_NULL_RETURN(encodeStatus->encodeStatusReport.pCurrRefList);
    CODEC_REF_LIST currRefList = *(encodeStatus->encodeStatusReport.pCurrRefList);
    MOS_ZeroMemory(&lockFlags, sizeof(MOS_LOCK_PARAMS));
    // NOTE(review): the buffer is locked ReadOnly although it is written below - confirm
    // this is intended for the target OS back-ends.
    lockFlags.ReadOnly = 1;
    uint8_t *bitstream = (uint8_t *)m_osInterface->pfnLockResource(
        m_osInterface,
        &currRefList.resBitstreamBuffer,
        &lockFlags);
    CODECHAL_ENCODE_CHK_NULL_RETURN(bitstream);
    ScopedResourceUnlock bitstreamUnlock{m_osInterface, &currRefList.resBitstreamBuffer};

    uint8_t *tempBsBuffer = (uint8_t *)MOS_AllocAndZeroMemory(encodeStatusReport->bitstreamSize);
    CODECHAL_ENCODE_CHK_NULL_RETURN(tempBsBuffer);

    uint8_t *bufPtr         = tempBsBuffer;
    uint32_t bytesRemaining = encodeStatusReport->bitstreamSize;  // free space left in tempBsBuffer
    for (uint32_t i = 0; i < encodeStatusReport->NumberTilesInFrame; i++)
    {
        // 64-bit arithmetic so that neither the offset scaling nor offset + len can wrap.
        const uint64_t offset = static_cast<uint64_t>(m_tileParams[i].BitstreamByteOffset) * CODECHAL_CACHELINE_SIZE;
        const uint32_t len    = tileStatusReport[i].Length;

        // The source range must lie inside the bitstream buffer and the destination
        // range inside the temporary buffer; otherwise the tile record is corrupt.
        if ((offset + len > m_bitstreamUpperBound) || (len > bytesRemaining))
        {
            CODECHAL_ENCODE_ASSERTMESSAGE("Error: Tile offset and length exceed the bitstream buffer bounds");
            encodeStatusReport->CodecStatus   = CODECHAL_STATUS_ERROR;
            encodeStatusReport->bitstreamSize = 0;
            MOS_FreeMemory(tempBsBuffer);
            return MOS_STATUS_INVALID_FILE_SIZE;
        }

        MOS_SecureMemcpy(bufPtr, len, &bitstream[offset], len);
        bufPtr += len;
        bytesRemaining -= len;
    }

    MOS_SecureMemcpy(bitstream, encodeStatusReport->bitstreamSize, tempBsBuffer, encodeStatusReport->bitstreamSize);
    MOS_ZeroMemory(&bitstream[encodeStatusReport->bitstreamSize],
        m_bitstreamUpperBound - encodeStatusReport->bitstreamSize);

    MOS_FreeMemory(tempBsBuffer);

    // clean-up the tile status report buffer
    MOS_ZeroMemory(tileStatusReport, sizeof(tileStatusReport[0]) * encodeStatusReport->NumberTilesInFrame);

    // Both locked resources are released by the scope guards (bitstream first, then tile records).
    return eStatus;
}

//!
//! \brief    (Re)allocate the frame-dependent PAK CU level streamout buffer.
//! \details  Does nothing unless tiles are enabled and PAK pipeline streamout is on.
//!           The required size is the sum over all tiles of 16 bytes per CU, with the
//!           running total aligned up to the cache line size after each tile. The buffer
//!           is allocated when absent and reallocated when the current one is too small.
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, else fail reason
//!
MOS_STATUS CodechalEncHevcStateG12::AllocateResourcesVariableSize()
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    CODECHAL_ENCODE_FUNCTION_ENTER;

    if (!m_hevcPicParams->tiles_enabled_flag || !m_pakPiplStrmOutEnable)
    {
        return eStatus;
    }

    // PAK CU Level Streamout Data:   DW57-59 in HCP pipe buffer address command
    // One CU takes 16 bytes, but each tile needs to be aligned to the cache line.
    const uint32_t tileColumnCount = m_hevcPicParams->num_tile_columns_minus1 + 1;
    const uint32_t tileRowCount    = m_hevcPicParams->num_tile_rows_minus1 + 1;
    const uint32_t tileCount       = tileRowCount * tileColumnCount;

    // Tiles are visited in raster order, i.e. by increasing index into m_tileParams.
    uint32_t requiredBytes = 0;
    for (uint32_t tileIdx = 0; tileIdx < tileCount; tileIdx++)
    {
        const uint32_t heightInCus = m_tileParams[tileIdx].TileHeightInMinCbMinus1 + 1;
        const uint32_t widthInCus  = m_tileParams[tileIdx].TileWidthInMinCbMinus1 + 1;

        requiredBytes += (widthInCus * heightInCus * 16);
        requiredBytes = MOS_ALIGN_CEIL(requiredBytes, CODECHAL_CACHELINE_SIZE);
    }

    const bool bufferExists = !Mos_ResourceIsNull(&m_resPakcuLevelStreamoutData.sResource);
    if (bufferExists && requiredBytes <= m_resPakcuLevelStreamoutData.dwSize)
    {
        // The existing buffer is large enough.
        return eStatus;
    }

    if (bufferExists)
    {
        m_osInterface->pfnFreeResource(m_osInterface, &m_resPakcuLevelStreamoutData.sResource);
    }

    MOS_ALLOC_GFXRES_PARAMS linearBufferParams;
    MOS_ZeroMemory(&linearBufferParams, sizeof(MOS_ALLOC_GFXRES_PARAMS));
    linearBufferParams.Type     = MOS_GFXRES_BUFFER;
    linearBufferParams.TileType = MOS_TILE_LINEAR;
    linearBufferParams.Format   = Format_Buffer;
    linearBufferParams.dwBytes  = requiredBytes;
    linearBufferParams.pBufName = "PAK CU Level Streamout Data";

    CODECHAL_ENCODE_CHK_STATUS_RETURN((MOS_STATUS)m_osInterface->pfnAllocateResource(
        m_osInterface,
        &linearBufferParams,
        &m_resPakcuLevelStreamoutData.sResource));
    m_resPakcuLevelStreamoutData.dwSize = requiredBytes;
    CODECHAL_ENCODE_VERBOSEMESSAGE("reallocate cu steam out buffer, size=0x%x.\n", requiredBytes);

    return eStatus;
}

//!
//! \brief    Build the picture-level part of the PAK command buffer.
//! \details  In order, this function: decides the task-phase boundaries, verifies the
//!           command buffer size, emits the prolog, synchronizes the start of all pipes
//!           (multi-pipe only), emits the BRC conditional batch buffer end and image
//!           status register handling for non-first BRC passes, writes the previous
//!           frame's sync tag, starts the status report on the first pipe, clears the
//!           per-pipe BRC semaphore, and finally adds the HCP picture-level commands
//!           (pipe mode select, surface states, pipe buffer addresses, indirect object
//!           base address, FQM/QM, picture state and optional RDOQ state).
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, else fail reason
//!
MOS_STATUS CodechalEncHevcStateG12::ExecutePictureLevel()
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    // With single task phase the phase spans all passes: it starts on the first pass and
    // ends on the last one. Otherwise every pass is a phase of its own.
    m_firstTaskInPhase = m_singleTaskPhaseSupported ? IsFirstPass() : true;
    m_lastTaskInPhase  = m_singleTaskPhaseSupported ? IsLastPass() : true;

    PerfTagSetting perfTag;
    CODECHAL_ENCODE_SET_PERFTAG_INFO(perfTag, CODECHAL_ENCODE_PERFTAG_CALL_PAK_ENGINE);

    CODECHAL_ENCODE_CHK_STATUS_RETURN(VerifyCommandBufferSize());

    if (!m_singleTaskPhaseSupportedInPak)
    {
        // Command buffer or patch list size are too small and so we cannot submit multiple pass of PAKs together
        m_firstTaskInPhase = true;
        m_lastTaskInPhase  = true;
    }

    // Reject a VDBOX index beyond what the MFX interface reports as its maximum.
    if (m_vdboxIndex > m_mfxInterface->GetMaxVdboxIndex())
    {
        CODECHAL_ENCODE_ASSERTMESSAGE("ERROR - vdbox index exceed the maximum");
        eStatus = MOS_STATUS_INVALID_PARAMETER;
        return eStatus;
    }

    MOS_COMMAND_BUFFER cmdBuffer;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(GetCommandBuffer(&cmdBuffer));

    if ((!m_singleTaskPhaseSupported) || m_firstTaskInPhase)
    {
        // Send command buffer header at the beginning (OS dependent)
        // frame tracking tag is only added in the last command buffer header
        bool bRequestFrameTracking = m_singleTaskPhaseSupported ? m_firstTaskInPhase : m_lastTaskInPhase;

        CODECHAL_ENCODE_CHK_STATUS_RETURN(SendPrologWithFrameTracking(&cmdBuffer, bRequestFrameTracking));
    }

    // clean-up per VDBOX semaphore memory
    int32_t currentPipe = GetCurrentPipe();
    if (currentPipe < 0)
    {
        eStatus = MOS_STATUS_INVALID_PARAMETER;
        return eStatus;
    }

    // Multi-pipe start synchronization. Done on every pass when single task phase is
    // not supported, otherwise only on the first pass.
    if (m_numPipe >= 2 &&
        ((m_singleTaskPhaseSupported && IsFirstPass()) ||
            !m_singleTaskPhaseSupported))
    {
        // The watchdog timer is stopped for the duration of the semaphore wait and
        // restarted once the wait is over (see AddWatchdogTimerStartCmd below).
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_miInterface->AddWatchdogTimerStopCmd(&cmdBuffer));
        //HW Semaphore cmd to make sure all pipes start encode at the same time
        CODECHAL_ENCODE_CHK_STATUS_RETURN(SendMIAtomicCmd(&m_resPipeStartSemaMem, 1, MHW_MI_ATOMIC_INC, &cmdBuffer));
        CODECHAL_ENCODE_CHK_STATUS_RETURN(SendHWWaitCommand(
            &m_resPipeStartSemaMem,
            &cmdBuffer,
            m_numPipe));

        // Program some placeholder cmds to resolve the hazard between BEs sync
        MHW_MI_STORE_DATA_PARAMS dataParams;
        dataParams.pOsResource      = &m_resDelayMinus;
        dataParams.dwResourceOffset = 0;
        dataParams.dwValue          = 0xDE1A;
        for (uint32_t i = 0; i < m_numDelay; i++)
        {
            CODECHAL_ENCODE_CHK_STATUS_RETURN(m_miInterface->AddMiStoreDataImmCmd(
                &cmdBuffer,
                &dataParams));
        }

        //clean HW semaphore memory
        CODECHAL_ENCODE_CHK_STATUS_RETURN(SendMIAtomicCmd(&m_resPipeStartSemaMem, 1, MHW_MI_ATOMIC_DEC, &cmdBuffer));

        //Start Watchdog Timer
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_miInterface->AddWatchdogTimerStartCmd(&cmdBuffer));

        //To help test media reset, this hw semaphore wait will never be reached.
        if (m_enableTestMediaReset)
        {
            CODECHAL_ENCODE_CHK_STATUS_RETURN(SendHWWaitCommand(
                &m_resPipeStartSemaMem,
                &cmdBuffer,
                m_numPipe + 2));
        }
    }

    if (m_brcEnabled && !IsFirstPass())  // Only the regular BRC passes have the conditional batch buffer end
    {
        // Ensure the previous PAK BRC pass is done, mainly for pipes other than pipe0.
        if (m_singleTaskPhaseSupported && m_numPipe >= 2 &&
            !Mos_ResourceIsNull(&m_resBrcSemaphoreMem[currentPipe].sResource))
        {
            CODECHAL_ENCODE_CHK_STATUS_RETURN(
                SendHWWaitCommand(
                    &m_resBrcSemaphoreMem[currentPipe].sResource,
                    &cmdBuffer,
                    1));
        }

        // Insert conditional batch buffer end
        MHW_MI_CONDITIONAL_BATCH_BUFFER_END_PARAMS miConditionalBatchBufferEndParams;
        MOS_ZeroMemory(
            &miConditionalBatchBufferEndParams,
            sizeof(MHW_MI_CONDITIONAL_BATCH_BUFFER_END_PARAMS));
        // Offset of the current frame's encode status entry inside the status buffer.
        uint32_t baseOffset = (m_encodeStatusBuf.wCurrIndex * m_encodeStatusBuf.dwReportSize) +
                              sizeof(uint32_t) * 2;  // pEncodeStatus is offset by 2 DWs in the resource       ;

        // The condition is read from the HuC status mask in BRC scalability mode and from
        // the image status mask otherwise; both live in the encode status buffer.
        if (m_hucPakStitchEnabled && m_numPipe >= 2)  //BRC scalability
        {
            CODECHAL_ENCODE_ASSERT((m_encodeStatusBuf.dwHuCStatusMaskOffset & 7) == 0);  // Make sure uint64_t aligned
            CODECHAL_ENCODE_ASSERT((m_encodeStatusBuf.dwHuCStatusMaskOffset + sizeof(uint32_t)) == m_encodeStatusBuf.dwHuCStatusRegOffset);

            miConditionalBatchBufferEndParams.presSemaphoreBuffer = &m_encodeStatusBuf.resStatusBuffer;
            miConditionalBatchBufferEndParams.dwOffset            = baseOffset + m_encodeStatusBuf.dwHuCStatusMaskOffset;
        }
        else
        {
            CODECHAL_ENCODE_ASSERT((m_encodeStatusBuf.dwImageStatusMaskOffset & 7) == 0);  // Make sure uint64_t aligned
            CODECHAL_ENCODE_ASSERT((m_encodeStatusBuf.dwImageStatusMaskOffset + sizeof(uint32_t)) == m_encodeStatusBuf.dwImageStatusCtrlOffset);

            miConditionalBatchBufferEndParams.presSemaphoreBuffer = &m_encodeStatusBuf.resStatusBuffer;
            miConditionalBatchBufferEndParams.dwOffset            = baseOffset + m_encodeStatusBuf.dwImageStatusMaskOffset;
        }

        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_miInterface->AddMiConditionalBatchBufferEndCmd(
            &cmdBuffer,
            &miConditionalBatchBufferEndParams));

        auto                             mmioRegisters = m_hcpInterface->GetMmioRegisters(m_vdboxIndex);
        MHW_MI_STORE_REGISTER_MEM_PARAMS miStoreRegMemParams;
        MHW_MI_COPY_MEM_MEM_PARAMS       miCpyMemMemParams;
        if (m_hucPakStitchEnabled && m_numPipe >= 2)
        {
            // Write back the HCP image control register with HUC PAK Int Kernel output
            MHW_MI_LOAD_REGISTER_MEM_PARAMS miLoadRegMemParams;
            MOS_ZeroMemory(&miLoadRegMemParams, sizeof(miLoadRegMemParams));
            miLoadRegMemParams.presStoreBuffer = &m_resBrcDataBuffer;
            miLoadRegMemParams.dwOffset        = CODECHAL_OFFSETOF(PakIntegrationBrcData, HCP_ImageStatusControl);
            miLoadRegMemParams.dwRegister      = mmioRegisters->hcpEncImageStatusCtrlRegOffset;
            CODECHAL_ENCODE_CHK_STATUS_RETURN(m_miInterface->AddMiLoadRegisterMemCmd(&cmdBuffer, &miLoadRegMemParams));

            // Only the first pipe publishes the value to the BRC PAK statistics buffer
            // and to the encode status buffer.
            if (IsFirstPipe())
            {
                MOS_ZeroMemory(&miCpyMemMemParams, sizeof(miCpyMemMemParams));
                miCpyMemMemParams.presSrc     = &m_resBrcDataBuffer;
                miCpyMemMemParams.dwSrcOffset = CODECHAL_OFFSETOF(PakIntegrationBrcData, HCP_ImageStatusControl);
                miCpyMemMemParams.presDst     = &m_brcBuffers.resBrcPakStatisticBuffer[m_brcBuffers.uiCurrBrcPakStasIdxForWrite];
                miCpyMemMemParams.dwDstOffset = CODECHAL_OFFSETOF(CODECHAL_ENCODE_HEVC_PAK_STATS_BUFFER, HCP_IMAGE_STATUS_CONTROL_FOR_LAST_PASS);
                CODECHAL_ENCODE_CHK_STATUS_RETURN(m_miInterface->AddMiCopyMemMemCmd(&cmdBuffer, &miCpyMemMemParams));

                MOS_ZeroMemory(&miStoreRegMemParams, sizeof(miStoreRegMemParams));
                miStoreRegMemParams.presStoreBuffer = &m_encodeStatusBuf.resStatusBuffer;
                miStoreRegMemParams.dwOffset        = baseOffset + m_encodeStatusBuf.dwImageStatusCtrlOfLastBRCPassOffset;
                miStoreRegMemParams.dwRegister      = mmioRegisters->hcpEncImageStatusCtrlRegOffset;
                CODECHAL_ENCODE_CHK_STATUS_RETURN(m_miInterface->AddMiStoreRegisterMemCmd(&cmdBuffer, &miStoreRegMemParams));
            }
        }
        else
        {
            // Write back the HCP image control register for RC6 may clean it out
            MHW_MI_LOAD_REGISTER_MEM_PARAMS miLoadRegMemParams;
            MOS_ZeroMemory(&miLoadRegMemParams, sizeof(miLoadRegMemParams));
            miLoadRegMemParams.presStoreBuffer = &m_encodeStatusBuf.resStatusBuffer;
            miLoadRegMemParams.dwOffset        = baseOffset + m_encodeStatusBuf.dwImageStatusCtrlOffset;
            miLoadRegMemParams.dwRegister      = mmioRegisters->hcpEncImageStatusCtrlRegOffset;
            CODECHAL_ENCODE_CHK_STATUS_RETURN(m_miInterface->AddMiLoadRegisterMemCmd(&cmdBuffer, &miLoadRegMemParams));

            // Store the restored register into the BRC PAK statistics buffer ...
            MOS_ZeroMemory(&miStoreRegMemParams, sizeof(miStoreRegMemParams));
            miStoreRegMemParams.presStoreBuffer = &m_brcBuffers.resBrcPakStatisticBuffer[m_brcBuffers.uiCurrBrcPakStasIdxForWrite];
            miStoreRegMemParams.dwOffset        = CODECHAL_OFFSETOF(CODECHAL_ENCODE_HEVC_PAK_STATS_BUFFER, HCP_IMAGE_STATUS_CONTROL_FOR_LAST_PASS);
            miStoreRegMemParams.dwRegister      = mmioRegisters->hcpEncImageStatusCtrlRegOffset;
            CODECHAL_ENCODE_CHK_STATUS_RETURN(m_miInterface->AddMiStoreRegisterMemCmd(&cmdBuffer, &miStoreRegMemParams));

            // ... and into the "last BRC pass" slot of the encode status buffer.
            MOS_ZeroMemory(&miStoreRegMemParams, sizeof(miStoreRegMemParams));
            miStoreRegMemParams.presStoreBuffer = &m_encodeStatusBuf.resStatusBuffer;
            miStoreRegMemParams.dwOffset        = baseOffset + m_encodeStatusBuf.dwImageStatusCtrlOfLastBRCPassOffset;
            miStoreRegMemParams.dwRegister      = mmioRegisters->hcpEncImageStatusCtrlRegOffset;
            CODECHAL_ENCODE_CHK_STATUS_RETURN(m_miInterface->AddMiStoreRegisterMemCmd(&cmdBuffer, &miStoreRegMemParams));
        }
    }

    if (IsFirstPipe() && IsFirstPass() && m_osInterface->bTagResourceSync)
    {
        // This is a short term solution to solve the sync tag issue: the sync tag write for PAK is inserted at the end of 2nd pass PAK BB
        // which may be skipped in multi-pass PAK enabled case. The idea here is to insert the previous frame's tag at the beginning
        // of the BB and keep the current frame's tag at the end of the BB. There will be a delay for tag update but it should be fine
        // as long as Dec/VP/Enc won't depend on this PAK so soon.

        PMOS_RESOURCE globalGpuContextSyncTagBuffer = nullptr;
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_osInterface->pfnGetGpuStatusBufferResource(
            m_osInterface,
            globalGpuContextSyncTagBuffer));
        CODECHAL_ENCODE_CHK_NULL_RETURN(globalGpuContextSyncTagBuffer);

        MHW_MI_STORE_DATA_PARAMS params;
        params.pOsResource      = globalGpuContextSyncTagBuffer;
        params.dwResourceOffset = m_osInterface->pfnGetGpuStatusTagOffset(m_osInterface, m_osInterface->CurrentGpuContextOrdinal);
        uint32_t value          = m_osInterface->pfnGetGpuStatusTag(m_osInterface, m_osInterface->CurrentGpuContextOrdinal);
        // Previous frame's tag: current tag minus one, clamped at zero.
        params.dwValue          = (value > 0) ? (value - 1) : 0;
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_miInterface->AddMiStoreDataImmCmd(&cmdBuffer, &params));
    }

    // The status report is started by the first pipe only.
    if (IsFirstPipe())
    {
        CODECHAL_ENCODE_CHK_STATUS_RETURN(StartStatusReport(&cmdBuffer, CODECHAL_NUM_MEDIA_STATES));
    }

    if (m_numPipe >= 2)
    {
        // clean up hw semaphore for BRC PAK pass sync, used only in single task phase.
        if (m_singleTaskPhaseSupported &&
            m_brcEnabled &&
            !Mos_ResourceIsNull(&m_resBrcSemaphoreMem[currentPipe].sResource))
        {
            MHW_MI_STORE_DATA_PARAMS storeDataParams;
            MOS_ZeroMemory(&storeDataParams, sizeof(storeDataParams));
            storeDataParams.pOsResource      = &m_resBrcSemaphoreMem[currentPipe].sResource;
            storeDataParams.dwResourceOffset = 0;
            storeDataParams.dwValue          = 0;

            CODECHAL_ENCODE_CHK_STATUS_RETURN(m_miInterface->AddMiStoreDataImmCmd(
                &cmdBuffer,
                &storeDataParams));
        }
    }

    // HCP picture-level state commands.
    CODECHAL_ENCODE_CHK_STATUS_RETURN(AddHcpPipeModeSelectCmd(&cmdBuffer));

    CODECHAL_ENCODE_CHK_STATUS_RETURN(AddHcpSurfaceStateCmds(&cmdBuffer));

    CODECHAL_ENCODE_CHK_STATUS_RETURN(AddHcpPipeBufAddrCmd(&cmdBuffer));

    MHW_VDBOX_IND_OBJ_BASE_ADDR_PARAMS indObjBaseAddrParams;
    SetHcpIndObjBaseAddrParams(indObjBaseAddrParams);
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_hcpInterface->AddHcpIndObjBaseAddrCmd(&cmdBuffer, &indObjBaseAddrParams));

    MHW_VDBOX_QM_PARAMS fqmParams, qmParams;
    SetHcpQmStateParams(fqmParams, qmParams);
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_hcpInterface->AddHcpFqmStateCmd(&cmdBuffer, &fqmParams));
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_hcpInterface->AddHcpQmStateCmd(&cmdBuffer, &qmParams));

    if (m_brcEnabled)
    {
        // With BRC the HCP_PIC_STATE command is not added directly: a second level batch
        // buffer start jumps into the BRC image states write buffer, which holds one
        // BRC_IMG_STATE_SIZE_PER_PASS_G12 sized slot per pass.
        uint32_t picStateCmdOffset;
        if (m_hucPakStitchEnabled && m_numPipe >= 2)
        {
            //for non first PAK pass, always use the 2nd HCP PIC STATE cmd buffer
            picStateCmdOffset = IsFirstPass() ? 0 : 1;
        }
        else
        {
            picStateCmdOffset = GetCurrentPass();
        }

        MOS_RESOURCE &brcHcpStateWriteBuffer = m_brcBuffers.resBrcImageStatesWriteBuffer[m_currRecycledBufIdx];
        if (IsPanicModePass())
        {
            // BRC kernel supports only 4 BrcImageStates read/write buffers.
            // So for panic PAK pass use HCP_PIC_STATE command from previous PAK pass.
            // NOTE(review): assumes the panic pass is never pass 0, otherwise this unsigned
            // offset would wrap - confirm against IsPanicModePass().
            picStateCmdOffset -= 1;
        }

        MHW_BATCH_BUFFER batchBuffer;
        MOS_ZeroMemory(&batchBuffer, sizeof(batchBuffer));
        batchBuffer.OsResource   = brcHcpStateWriteBuffer;
        batchBuffer.dwOffset     = picStateCmdOffset * BRC_IMG_STATE_SIZE_PER_PASS_G12;
        batchBuffer.bSecondLevel = true;

        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_miInterface->AddMiBatchBufferStartCmd(
            &cmdBuffer,
            &batchBuffer));
    }
    else
    {
        CODECHAL_ENCODE_CHK_STATUS_RETURN(AddHcpPictureStateCmd(&cmdBuffer));
    }

    // Send HEVC_VP9_RDOQ_STATE command
    if (m_hevcRdoqEnabled)
    {
        MHW_VDBOX_HEVC_PIC_STATE picStateParams;
        SetHcpPicStateParams(picStateParams);

        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_hcpInterface->AddHcpHevcVp9RdoqStateCmd(&cmdBuffer, &picStateParams));
    }

    CODECHAL_ENCODE_CHK_STATUS_RETURN(ReturnCommandBuffer(&cmdBuffer));
    return eStatus;
}

//!
//! \brief    Fill in the slice state fields that are common to all slices of a picture.
//! \details  Runs the CodechalEncHevcState implementation first, then sets the rounding
//!           values, the weighted prediction flag, the pipe count and the data buffer.
//! \param    [in, out] sliceState
//!           Slice state to fill in; it is accessed as MHW_VDBOX_HEVC_SLICE_STATE_G12,
//!           so the caller must pass an object of that type
//!
void CodechalEncHevcStateG12::SetHcpSliceStateCommonParams(
    MHW_VDBOX_HEVC_SLICE_STATE &sliceState)
{
    CodechalEncHevcState::SetHcpSliceStateCommonParams(sliceState);

    sliceState.RoundingIntra = m_roundingIntraInUse;
    sliceState.RoundingInter = m_roundingInterInUse;

    // Weighted prediction is in use for P slices with weighted_pred_flag set and for
    // B slices with weighted_bipred_flag set.
    const bool weightedPSlice =
        (m_hevcSliceParams->slice_type == CODECHAL_HEVC_P_SLICE) && m_hevcPicParams->weighted_pred_flag;
    const bool weightedBSlice =
        (m_hevcSliceParams->slice_type == CODECHAL_HEVC_B_SLICE) && m_hevcPicParams->weighted_bipred_flag;
    sliceState.bWeightedPredInUse = weightedPSlice || weightedBSlice;

    static_cast<MHW_VDBOX_HEVC_SLICE_STATE_G12 &>(sliceState).dwNumPipe = m_numPipe;

    // A panic mode pass reads from the skip-frame MB code surface instead of the regular one.
    if (IsPanicModePass())
    {
        sliceState.presDataBuffer = &m_skipFrameInfo.m_resMbCodeSkipFrameSurface;
    }
    else
    {
        sliceState.presDataBuffer = &m_resMbCodeSurface;
    }
}

// Fills the per-slice part of the slice state for slice `slcCount`, which
// belongs to the tile at index `idx` of `tileCodingParams`.
//
// sliceState        slice state to update (must really be a MHW_VDBOX_HEVC_SLICE_STATE_G12,
//                   since it is down-cast below)
// slcData           per-slice data array, indexed by slcCount
// slcCount          index of the slice within the picture
// tileCodingParams  tile coding parameter array, indexed by idx
// lastSliceInTile   true when this slice is the last one of its tile
// idx               index of the tile that contains the slice
void CodechalEncHevcStateG12::SetHcpSliceStateParams(
    MHW_VDBOX_HEVC_SLICE_STATE &          sliceState,
    PCODEC_ENCODER_SLCDATA                slcData,
    uint16_t                              slcCount,
    PMHW_VDBOX_HCP_TILE_CODING_PARAMS_G12 tileCodingParams,
    bool                                  lastSliceInTile,
    uint32_t                              idx)
{
    auto &sliceParams   = m_hevcSliceParams[slcCount];
    auto &sliceData     = slcData[slcCount];
    auto &tileParams    = tileCodingParams[idx];
    auto &sliceStateG12 = static_cast<MHW_VDBOX_HEVC_SLICE_STATE_G12 &>(sliceState);

    // Where the slice lives in the input buffers.
    sliceState.pEncodeHevcSliceParams    = &sliceParams;
    sliceState.dwDataBufferOffset        = sliceData.CmdOffset;
    sliceState.dwOffset                  = sliceData.SliceOffset;
    sliceState.dwLength                  = sliceData.BitSize;
    sliceState.uiSkipEmulationCheckCount = sliceData.SkipEmulationByteCount;

    // Position of the slice within the picture, the tile and the pass sequence.
    sliceState.dwSliceIndex              = (uint32_t)slcCount;
    sliceState.bInsertBeforeSliceHeaders = (slcCount == 0);
    sliceState.bLastSlice                = (slcCount == m_numSlices - 1);
    sliceState.bLastSliceInTile          = lastSliceInTile;
    sliceState.bLastSliceInTileColumn    = (bool)lastSliceInTile & tileParams.IsLastTileofColumn;
    sliceState.bFirstPass                = IsFirstPass();
    sliceState.bLastPass                 = IsLastPass();

    // SAO flags are taken from the slice only when SAO is enabled for the sequence.
    sliceState.bSaoLumaFlag   = (m_hevcSeqParams->SAO_enabled_flag) ? sliceParams.slice_sao_luma_flag : 0;
    sliceState.bSaoChromaFlag = (m_hevcSeqParams->SAO_enabled_flag) ? sliceParams.slice_sao_chroma_flag : 0;

    // Gen12-only tile information.
    sliceStateG12.pTileCodingParams = &tileParams;
    sliceStateG12.dwTileID          = idx;

    // Deblocking controls come straight from the slice parameters.
    sliceState.DeblockingFilterDisable = sliceParams.slice_deblocking_filter_disable_flag;
    sliceState.TcOffsetDiv2            = sliceParams.tc_offset_div2;
    sliceState.BetaOffsetDiv2          = sliceParams.beta_offset_div2;

    CalcTransformSkipParameters(sliceState.EncodeHevcTransformSkipParams);
}

// Describes a copy from the current tile record buffer to the bitstream buffer
// and hands it to the CP interface's SetCpCopy() together with `cmdBuffer`.
// Returns a null-pointer status (via the check macros) when the HW or CP
// interface is missing, otherwise whatever SetCpCopy() reports.
MOS_STATUS CodechalEncHevcStateG12::SetMfxVideoCopyCmdParams(
    PMOS_COMMAND_BUFFER cmdBuffer)
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    CODECHAL_ENCODE_CHK_NULL_RETURN(m_hwInterface);

    MhwCpInterface *cpInterface = m_hwInterface->GetCpInterface();
    CODECHAL_ENCODE_CHK_NULL_RETURN(cpInterface);

    // The tile record buffer is selected by the current virtual engine batch buffer index.
    const uint32_t tileRecordIdx = m_virtualEngineBbIndex;

    MHW_CP_COPY_PARAMS copyParams;
    MOS_ZeroMemory(&copyParams, sizeof(copyParams));
    copyParams.presSrc       = &m_tileRecordBuffer[tileRecordIdx].sResource;
    copyParams.presDst       = &m_resBitstreamBuffer;
    copyParams.size          = m_hwInterface->m_tileRecordSize;
    copyParams.lengthOfTable = (uint8_t)(m_numTiles);
    copyParams.isEncodeInUse = true;

    CODECHAL_ENCODE_CHK_STATUS_RETURN(cpInterface->SetCpCopy(m_osInterface, cmdBuffer, &copyParams));

    return eStatus;
}

// Slice-level entry point: pictures with tiles enabled go through
// EncTileLevel(), all others through the base-class slice-level path.
// In PAK-only test mode the PAK commands and CU records are loaded from file first.
MOS_STATUS CodechalEncHevcStateG12::ExecuteSliceLevel()
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    CODECHAL_ENCODE_FUNCTION_ENTER;

    CODECHAL_ENCODE_CHK_NULL_RETURN(m_slcData);

    if (m_pakOnlyTest)
    {
        CODECHAL_ENCODE_CHK_STATUS_RETURN(LoadPakCommandAndCuRecordFromFile());
    }

    if (m_hevcPicParams->tiles_enabled_flag)
    {
        // Tiled picture: Gen12-specific per-tile programming.
        CODECHAL_ENCODE_CHK_STATUS_RETURN(EncTileLevel());
    }
    else
    {
        // No tiles: reuse the generic implementation.
        CODECHAL_ENCODE_CHK_STATUS_RETURN(CodechalEncHevcState::ExecuteSliceLevel());
    }

    return eStatus;
}

// Builds the PAK command buffer for a tiled picture on the current pipe and
// pass and, when this is the last task of the phase, submits it.
//
// Outline of what the code below does:
//   1. For every tile handled by this pipe: emit HCP_TILE_CODING, then the
//      slice-level commands of each slice IsSliceInTile() reports as inside it.
//   2. Optionally insert end-of-sequence/stream (last pipe only), then emit
//      VD_PIPELINE_FLUSH and MI_FLUSH_DW.
//   3. Atomically increment m_resPipeCompleteSemaMem. The first pipe then waits
//      on that semaphore with m_numPipe as the reference value, clears it,
//      optionally runs the HW stitch path, and reads back SSE/HCP/BRC statistics.
//   4. Dump, return and (if applicable) submit the command buffer, handle the
//      render-context wait and the reference-sync signal.
//   5. After the last pipe of the last pass, advance the per-frame bookkeeping.
//
// Returns MOS_STATUS_INVALID_PARAMETER when the current pipe or pass is
// negative or when a tile contains no slice; failures of the called helpers
// are returned through the CODECHAL_ENCODE_CHK_STATUS_RETURN macro.
MOS_STATUS CodechalEncHevcStateG12::EncTileLevel()
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    CODECHAL_ENCODE_FUNCTION_ENTER;

    int32_t currentPipe = GetCurrentPipe();
    int32_t currentPass = GetCurrentPass();

    if (currentPipe < 0 || currentPass < 0)
    {
        CODECHAL_ENCODE_ASSERTMESSAGE("Invalid pipe number or pass number");
        return MOS_STATUS_INVALID_PARAMETER;
    }

    // Fields shared by all slices are filled once; per-slice fields are set inside the loop.
    MHW_VDBOX_HEVC_SLICE_STATE_G12 sliceState;
    SetHcpSliceStateCommonParams(sliceState);

    MOS_COMMAND_BUFFER cmdBuffer;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(GetCommandBuffer(&cmdBuffer));

    uint32_t numTileColumns = m_hevcPicParams->num_tile_columns_minus1 + 1;
    uint32_t numTileRows    = m_hevcPicParams->num_tile_rows_minus1 + 1;

    for (uint32_t tileRow = 0; tileRow < numTileRows; tileRow++)
    {
        for (uint32_t tileCol = 0; tileCol < numTileColumns; tileCol++)
        {
            PCODEC_ENCODER_SLCDATA slcData = m_slcData;
            uint32_t               slcCount, idx, sliceNumInTile = 0;

            // Tiles are indexed in raster order (row-major).
            idx = tileRow * numTileColumns + tileCol;

            // In multi-pipe mode each pipe only encodes the tile column whose index equals its pipe index.
            if ((m_numPipe > 1) && (tileCol != currentPipe))
            {
                continue;
            }

            // HCP_TILE_CODING command
            CODECHAL_ENCODE_CHK_STATUS_RETURN(
                static_cast<MhwVdboxHcpInterfaceG12 *>(m_hcpInterface)->AddHcpTileCodingCmd(&cmdBuffer, &m_tileParams[idx]));

            // Walk all slices of the picture and program those that belong to this tile.
            for (slcCount = 0; slcCount < m_numSlices; slcCount++)
            {
                bool lastSliceInTile = false, sliceInTile = false;

                CODECHAL_ENCODE_CHK_STATUS_RETURN(IsSliceInTile(slcCount,
                    &m_tileParams[idx],
                    &sliceInTile,
                    &lastSliceInTile));

                if (!sliceInTile)
                {
                    continue;
                }

                if (IsFirstPass())
                {
                    // The slice's command offset is derived from the number of LCUs in all
                    // preceding slices, scaled by the PAK object size (in DWORDs) times sizeof(uint32_t).
                    uint32_t startLcu = 0;
                    for (uint32_t ii = 0; ii < slcCount; ii++)
                    {
                        startLcu += m_hevcSliceParams[ii].NumLCUsInSlice;
                    }
                    slcData[slcCount].CmdOffset = startLcu * (m_hwInterface->GetHcpInterface()->GetHcpPakObjSize()) * sizeof(uint32_t);
                }

                SetHcpSliceStateParams(sliceState, slcData, (uint16_t)slcCount, m_tileParams, lastSliceInTile, idx);

                CODECHAL_ENCODE_CHK_STATUS_RETURN(SendHwSliceEncodeCommand(&cmdBuffer, &sliceState));

                sliceNumInTile++;
            }  // end of slice

            if (0 == sliceNumInTile)
            {
                // One tile must have at least one slice
                CODECHAL_ENCODE_ASSERT(false);
                eStatus = MOS_STATUS_INVALID_PARAMETER;
                return eStatus;
            }
        }  // end of tile column loop
    }      // end of tile row loop

    // Insert end of sequence/stream if set
    if ((m_lastPicInStream || m_lastPicInSeq) && IsLastPipe())
    {
        MHW_VDBOX_PAK_INSERT_PARAMS pakInsertObjectParams;
        MOS_ZeroMemory(&pakInsertObjectParams, sizeof(pakInsertObjectParams));
        pakInsertObjectParams.bLastPicInSeq    = m_lastPicInSeq;
        pakInsertObjectParams.bLastPicInStream = m_lastPicInStream;
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_hcpInterface->AddHcpPakInsertObject(&cmdBuffer, &pakInsertObjectParams));
    }

    // Send VD_PIPELINE_FLUSH command
    MHW_VDBOX_VD_PIPE_FLUSH_PARAMS vdPipelineFlushParams;
    MOS_ZeroMemory(&vdPipelineFlushParams, sizeof(vdPipelineFlushParams));
    vdPipelineFlushParams.Flags.bWaitDoneHEVC           = 1;
    vdPipelineFlushParams.Flags.bFlushHEVC              = 1;
    vdPipelineFlushParams.Flags.bWaitDoneVDCmdMsgParser = 1;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_vdencInterface->AddVdPipelineFlushCmd(&cmdBuffer, &vdPipelineFlushParams));

    // Send MI_FLUSH command
    MHW_MI_FLUSH_DW_PARAMS flushDwParams;
    MOS_ZeroMemory(&flushDwParams, sizeof(flushDwParams));
    flushDwParams.bVideoPipelineCacheInvalidate = true;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_miInterface->AddMiFlushDwCmd(&cmdBuffer, &flushDwParams));

    // HW semaphore command: every pipe increments the shared counter once its encode commands are queued
    CODECHAL_ENCODE_CHK_STATUS_RETURN(SendMIAtomicCmd(&m_resPipeCompleteSemaMem, 1, MHW_MI_ATOMIC_INC, &cmdBuffer));

    if (IsFirstPipe())
    {
        // first pipe needs to ensure all other pipes are ready
        // (the wait is issued with m_numPipe as the value to compare against)
        CODECHAL_ENCODE_CHK_STATUS_RETURN(SendHWWaitCommand(
            &m_resPipeCompleteSemaMem,
            &cmdBuffer,
            m_numPipe));

        // clear the HW semaphore memory so it can be reused
        MHW_MI_STORE_DATA_PARAMS storeDataParams;
        MOS_ZeroMemory(&storeDataParams, sizeof(storeDataParams));
        storeDataParams.pOsResource = &m_resPipeCompleteSemaMem;
        storeDataParams.dwValue     = 0;
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_miInterface->AddMiStoreDataImmCmd(
            &cmdBuffer,
            &storeDataParams));

        // Use HW stitch commands only in the scalable mode
        if (m_numPipe > 1 && m_enableTileStitchByHW)
        {
            // call the PAK integration kernel in the scalability case
            if (m_hucPakStitchEnabled)
            {
                CODECHAL_ENCODE_CHK_STATUS_RETURN(HucPakIntegrate(&cmdBuffer));
#if 0  // Need to enable this code once Gen12 becomes open source \
       // 2nd level BB buffer for stitching cmd                   \
       // current location to add cmds in 2nd level batch buffer
                m_HucStitchCmdBatchBuffer.iCurrent = 0;
                // reset starting location (offset) executing 2nd level batch buffer for each frame & each pass
                m_HucStitchCmdBatchBuffer.dwOffset = 0;
                CODECHAL_ENCODE_CHK_STATUS_RETURN(m_miInterface->AddMiBatchBufferStartCmd(&cmdBuffer, &m_HucStitchCmdBatchBuffer));
                // This wait cmd is needed to make sure copy command is done as suggested by HW folk in encode cases
                CODECHAL_ENCODE_CHK_STATUS_RETURN(m_miInterface->AddMfxWaitCmd(&cmdBuffer, nullptr, m_osInterface->osCpInterface->IsCpEnabled() ? true : false));
#endif
            }
            CODECHAL_ENCODE_CHK_STATUS_RETURN(SetMfxVideoCopyCmdParams(&cmdBuffer));
        }

        CODECHAL_ENCODE_CHK_STATUS_RETURN(ReadSseStatistics(&cmdBuffer));

        CODECHAL_ENCODE_CHK_STATUS_RETURN(EndStatusReport(&cmdBuffer, CODECHAL_NUM_MEDIA_STATES));

        if (m_numPipe <= 1)  // single pipe mode can read the info from MMIO register. Otherwise, we have to use the tile size statistic buffer
        {
            CODECHAL_ENCODE_CHK_STATUS_RETURN(ReadHcpStatus(&cmdBuffer));

            // BRC PAK statistics different for each pass
            if (m_brcEnabled)
            {
                CODECHAL_ENCODE_CHK_STATUS_RETURN(ReadBrcPakStats(&cmdBuffer));
            }
        }
        else
        {  // scalability mode
            if (m_brcEnabled)
            {
                // MMIO register is not used in the scalability BRC case; all information is in the TileSizeRecord stream-out buffer
                CODECHAL_ENCODE_CHK_STATUS_RETURN(ReadBrcPakStatisticsForScalability(&cmdBuffer));
            }
            else
            {
                CODECHAL_ENCODE_CHK_STATUS_RETURN(ReadHcpStatus(&cmdBuffer));
            }
        }

#if (_DEBUG || _RELEASE_INTERNAL)
        // This is to support the BRC scalability test to match with single pipe. Will be removed later after enhanced BRC scalability is enabled.
        if (m_brcEnabled && m_forceSinglePakPass)
        {
            CODECHAL_ENCODE_CHK_STATUS_RETURN(ResetImgCtrlRegInPAKStatisticsBuffer(&cmdBuffer));
        }
#endif

        if (m_singleTaskPhaseSupported &&
            m_brcEnabled && m_numPipe >= 2 && !IsLastPass())
        {
            // Signal HW semaphore for the BRC dependency (i.e., next BRC pass waits for the current BRC pass)
            // Each allocated per-pipe BRC semaphore is set to 1.
            for (auto i = 0; i < m_numPipe; i++)
            {
                if (!Mos_ResourceIsNull(&m_resBrcSemaphoreMem[i].sResource))
                {
                    MOS_ZeroMemory(&storeDataParams, sizeof(storeDataParams));
                    storeDataParams.pOsResource = &m_resBrcSemaphoreMem[i].sResource;
                    storeDataParams.dwValue     = 1;

                    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_miInterface->AddMiStoreDataImmCmd(
                        &cmdBuffer,
                        &storeDataParams));
                }
            }
        }
    }

    // Final MI_FLUSH_DW with all parameters zeroed (no cache invalidate this time).
    MOS_ZeroMemory(&flushDwParams, sizeof(flushDwParams));
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_miInterface->AddMiFlushDwCmd(&cmdBuffer, &flushDwParams));

    // The batch buffer end is only added when single task phase is off or this is the last task of the phase.
    if (!m_singleTaskPhaseSupported || m_lastTaskInPhase)
    {
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_miInterface->AddMiBatchBufferEnd(&cmdBuffer, nullptr));
    }

    // Debug builds dump the command buffer under a name that carries the current pass number.
    std::string pakPassName = "PAK_PASS" + std::to_string(static_cast<uint32_t>(m_currPass));
    CODECHAL_DEBUG_TOOL(
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpCmdBuffer(
            &cmdBuffer,
            CODECHAL_NUM_MEDIA_STATES,
            pakPassName.data()));)

    CODECHAL_ENCODE_CHK_STATUS_RETURN(ReturnCommandBuffer(&cmdBuffer));

    // First pipe, first pass: make the video context wait on the render-context sync object, if one exists.
    if (IsFirstPipe() &&
        (m_pakOnlyTest == 0) &&  // In the PAK only test, no need to wait for ENC's completion
        IsFirstPass() &&
        !Mos_ResourceIsNull(&m_resSyncObjectRenderContextInUse))
    {
        MOS_SYNC_PARAMS syncParams  = g_cInitSyncParams;
        syncParams.GpuContext       = m_videoContext;
        syncParams.presSyncResource = &m_resSyncObjectRenderContextInUse;

        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_osInterface->pfnEngineWait(m_osInterface, &syncParams));
    }

    // Submission follows the same condition as the batch buffer end above.
    if (!m_singleTaskPhaseSupported || m_lastTaskInPhase)
    {
        bool nullRendering = m_videoContextUsesNullHw;

        CODECHAL_ENCODE_CHK_STATUS_RETURN(SubmitCommandBuffer(&cmdBuffer, nullRendering));

        CODECHAL_DEBUG_TOOL(
            CODECHAL_ENCODE_CHK_STATUS_RETURN(DumpHucDebugOutputBuffers());
            CODECHAL_ENCODE_CHK_STATUS_RETURN(DumpPakOutput());
            if (m_mmcState) {
                m_mmcState->UpdateUserFeatureKey(&m_reconSurface);
            })

        // Last pipe of the last pass: signal the current reference's sync object when signalling is requested.
        if ((IsLastPipe()) &&
            (IsLastPass()) &&
            m_signalEnc &&
            m_currRefSync &&
            !Mos_ResourceIsNull(&m_currRefSync->resSyncObject))
        {
            // signal semaphore
            MOS_SYNC_PARAMS syncParams;
            syncParams                  = g_cInitSyncParams;
            syncParams.GpuContext       = m_videoContext;
            syncParams.presSyncResource = &m_currRefSync->resSyncObject;

            CODECHAL_ENCODE_CHK_STATUS_RETURN(m_osInterface->pfnEngineSignal(m_osInterface, &syncParams));
            m_currRefSync->uiSemaphoreObjCount++;
            m_currRefSync->bInUsed = true;
        }
    }

    // Reset parameters for next PAK execution
    if (IsLastPipe() && IsLastPass())
    {
        if (!m_singleTaskPhaseSupported)
        {
            m_osInterface->pfnResetPerfBufferID(m_osInterface);
        }

        // Rotate to the next PAK slice batch buffer.
        m_currPakSliceIdx = (m_currPakSliceIdx + 1) % CODECHAL_HEVC_NUM_PAK_SLICE_BATCH_BUFFERS;

        // With parallel BRC the PAK statistics write index cycles through the recycled buffers.
        if (m_hevcSeqParams->ParallelBRC)
        {
            m_brcBuffers.uiCurrBrcPakStasIdxForWrite =
                (m_brcBuffers.uiCurrBrcPakStasIdxForWrite + 1) % CODECHAL_ENCODE_RECYCLED_BUFFER_NUM;
        }

        m_newPpsHeader = 0;
        m_newSeqHeader = 0;
        m_frameNum++;
    }

    return eStatus;
}

// Chooses how many pipes (VDBOXes) encode the current picture and records the
// result in m_numPipe / m_numUsedVdbox and, when present, the scalability state.
// The number of tiles in the frame is refreshed as well.
MOS_STATUS CodechalEncHevcStateG12::DecideEncodingPipeNumber()
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    CODECHAL_ENCODE_FUNCTION_ENTER;

    // The virtual engine interface is always used, for single pipe and scalability mode alike.
    m_useVirtualEngine = true;

    const uint8_t tileColumnCount = m_hevcPicParams->num_tile_columns_minus1 + 1;

    // Start from one pipe per available VDBOX, then reconcile with the tile column count.
    m_numPipe = m_numVdbox;
    if (tileColumnCount > m_numPipe)
    {
        // More tile columns than pipes: fall back to a single pipe.
        m_numPipe = 1;
    }
    else if (tileColumnCount < m_numPipe)
    {
        // Fewer tile columns than pipes: use one pipe per column when the count is in 1..4;
        // otherwise the tile column count is invalid and the single VDBOX mode is used.
        const bool columnCountUsable = (tileColumnCount >= 1) && (tileColumnCount <= 4);
        m_numPipe                    = columnCountUsable ? tileColumnCount : 1;
    }

    // Unless scalability is forced, resolutions below 4K always go with a single pipe.
    if (!m_forceScalability &&
        (m_frameWidth * m_frameHeight < ENCODE_HEVC_4K_PIC_WIDTH * ENCODE_HEVC_4K_PIC_HEIGHT))
    {
        m_numPipe = 1;
    }

    m_numUsedVdbox       = m_numPipe;
    m_numberTilesInFrame = (m_hevcPicParams->num_tile_rows_minus1 + 1) * (m_hevcPicParams->num_tile_columns_minus1 + 1);

    if (m_scalabilityState)
    {
        // The GPU context is created / re-used with this many pipes.
        m_scalabilityState->ucScalablePipeNum = m_numPipe;
    }

    return eStatus;
}

// Validates the current encode settings against what this platform path
// accepts and derives a few dependent settings:
//   - decides the pipe count and, with context-based scheduling, checks
//     whether the GPU context has to be re-created;
//   - rejects frames larger than 16K and undersized YUY2 recon surfaces for 4:2:2;
//   - downgrades TargetUsage 7 to 4 for VDENC 4:4:4;
//   - sets the RDOQ intra TU threshold from TargetUsage.
MOS_STATUS CodechalEncHevcStateG12::PlatformCapabilityCheck()
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    CODECHAL_ENCODE_FUNCTION_ENTER;

    CODECHAL_ENCODE_CHK_STATUS_RETURN(DecideEncodingPipeNumber());

    if (MOS_VE_CTXBASEDSCHEDULING_SUPPORTED(m_osInterface))
    {
        CODECHAL_ENCODE_CHK_STATUS_RETURN(CodechalEncodeScalability_ChkGpuCtxReCreation(this, m_scalabilityState, (PMOS_GPUCTX_CREATOPTIONS_ENHANCED)m_gpuCtxCreatOpt));
    }

    // Resolution limit.
    if (m_frameWidth * m_frameHeight > ENCODE_HEVC_MAX_16K_PIC_WIDTH * ENCODE_HEVC_MAX_16K_PIC_HEIGHT)
    {
        eStatus = MOS_STATUS_INVALID_PARAMETER;
        CODECHAL_ENCODE_CHK_STATUS_MESSAGE_RETURN(eStatus, "Frame resolution greater than 16k not supported");
    }

    // Speed mode (TargetUsage 7) is replaced by normal mode (4) for VDENC 4:4:4.
    if (m_vdencEnabled && m_chromaFormat == HCP_CHROMA_FORMAT_YUV444 && m_hevcSeqParams->TargetUsage == 7)
    {
        CODECHAL_ENCODE_ASSERTMESSAGE("Speed mode is not supported in VDENC 444, resetting TargetUsage to Normal mode\n");
        m_hevcSeqParams->TargetUsage = 4;
    }

    // For 4:2:2 in and out with a YUY2 recon surface, the surface must be at least
    // twice the original height and half the original width.
    const bool yuy2Recon422 =
        ((uint8_t)HCP_CHROMA_FORMAT_YUV422 == m_chromaFormat) &&
        ((uint8_t)HCP_CHROMA_FORMAT_YUV422 == m_outputChromaFormat) &&
        (Format_YUY2 == m_reconSurface.Format);
    if (yuy2Recon422 &&
        (m_reconSurface.dwHeight < m_oriFrameHeight * 2 ||
            m_reconSurface.dwWidth < m_oriFrameWidth / 2))
    {
        return MOS_STATUS_INVALID_PARAMETER;
    }

    // set RDOQ Intra blocks Threshold for Gen11+
    m_rdoqIntraTuThreshold = 0;
    if (m_hevcRdoqEnabled)
    {
        switch (m_hevcSeqParams->TargetUsage)
        {
        case 1:
            m_rdoqIntraTuThreshold = 0xffff;
            break;
        case 4:
            // One tenth of the picture size in MBs, capped at 0xffff.
            m_rdoqIntraTuThreshold = m_picWidthInMb * m_picHeightInMb;
            m_rdoqIntraTuThreshold = MOS_MIN(m_rdoqIntraTuThreshold / 10, 0xffff);
            break;
        default:
            // Other target usages keep the threshold at 0.
            break;
        }
    }

    return eStatus;
}

// Reports whether `surface` has a color format this function treats as supported.
// Only NV12 with a Y-major tile type yields true. The other recognised formats
// (YUY2, YUYV, A8R8G8B8, P010, P016, Y210, Y216) yield false without an assert;
// any remaining format yields false and raises an assert message.
// A null surface also yields false.
bool CodechalEncHevcStateG12::CheckSupportedFormat(PMOS_SURFACE surface)
{
    CODECHAL_ENCODE_FUNCTION_ENTER;

    if (surface == nullptr)
    {
        CODECHAL_ENCODE_ASSERTMESSAGE("Invalid (nullptr) Pointer.");
        return false;
    }

    switch (surface->Format)
    {
    case Format_NV12:
        return IS_Y_MAJOR_TILE_FORMAT(surface->TileType);

    case Format_YUY2:
    case Format_YUYV:
    case Format_A8R8G8B8:
    case Format_P010:
    case Format_P016:
    case Format_Y210:
    case Format_Y216:
        // Known formats, but not reported as supported here.
        return false;

    default:
        CODECHAL_ENCODE_ASSERTMESSAGE("Input surface color format = %d not supported!", surface->Format);
        return false;
    }
}

// Determines m_numVdbox. Scalability is treated as disabled unless the
// ENCODE_DISABLE_SCALABILITY user feature key is read successfully with a zero
// value; only then is the enabled VDBOX count taken from the GT system info.
// Returns a null-pointer status (via the check macro) when no GT system info is available.
MOS_STATUS CodechalEncHevcStateG12::GetSystemPipeNumberCommon()
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    CODECHAL_ENCODE_FUNCTION_ENTER;

    MOS_USER_FEATURE_VALUE_DATA userFeatureData;
    MOS_ZeroMemory(&userFeatureData, sizeof(userFeatureData));

    const MOS_STATUS keyReadStatus = MOS_UserFeature_ReadValue_ID(
        nullptr,
        __MEDIA_USER_FEATURE_VALUE_ENCODE_DISABLE_SCALABILITY,
        &userFeatureData,
        m_osInterface->pOsContext);

    // Default is "disabled"; a successfully read key overrides it.
    bool scalabilityDisabled = true;
    if (keyReadStatus == MOS_STATUS_SUCCESS)
    {
        scalabilityDisabled = (userFeatureData.i32Data != 0);
    }

    MEDIA_SYSTEM_INFO *gtSystemInfo = m_osInterface->pfnGetGtSystemInfo(m_osInterface);
    CODECHAL_ENCODE_CHK_NULL_RETURN(gtSystemInfo);

    if (scalabilityDisabled)
    {
        m_numVdbox = 1;
    }
    else
    {
        // Both VE mode and media solo mode should be able to get the VDBOX number via the same interface
        m_numVdbox = (uint8_t)(gtSystemInfo->VDBoxInfo.NumberOfVDBoxEnabled);
    }

    return eStatus;
}

// Appends the HuC PAK integration kernel sequence to `cmdBuffer`:
// IMEM state, pipe mode select, DMEM state, virtual address regions, the
// HUC_STATUS2 mask and register capture, HuC start, a VD pipeline flush and
// MI flush, and finally the HUC_STATUS mask and register capture into the
// current entry of the encode status buffer.
//
// cmdBuffer  command buffer to append to; must not be null.
//
// Returns a null-pointer status when cmdBuffer or one of the required HW
// interfaces / the MMIO register table is missing, an error when m_vdboxIndex
// exceeds the maximum VDBOX index, or the first failure reported by any of the
// command-adding helpers.
MOS_STATUS CodechalEncHevcStateG12::HucPakIntegrate(
    PMOS_COMMAND_BUFFER cmdBuffer)
{
    CODECHAL_ENCODE_FUNCTION_ENTER;

    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    CODECHAL_ENCODE_CHK_NULL_RETURN(cmdBuffer);

    // Validate every interface that is dereferenced below before any command is emitted,
    // so that a failure cannot leave a half-built sequence in the command buffer.
    CODECHAL_ENCODE_CHK_NULL_RETURN(m_hwInterface);
    CODECHAL_ENCODE_CHK_NULL_RETURN(m_hwInterface->GetMfxInterface());
    CODECHAL_ENCODE_CHK_NULL_RETURN(m_hwInterface->GetVdencInterface());

    auto hucInterface = m_hwInterface->GetHucInterface();
    CODECHAL_ENCODE_CHK_NULL_RETURN(hucInterface);

    CODECHAL_ENCODE_CHK_COND_RETURN(
        (m_vdboxIndex > m_hwInterface->GetMfxInterface()->GetMaxVdboxIndex()),
        "ERROR - vdbox index exceed the maximum");

    auto mmioRegisters = hucInterface->GetMmioRegisters(m_vdboxIndex);
    CODECHAL_ENCODE_CHK_NULL_RETURN(mmioRegisters);

    // BRC (other than ICQ) uses the BRC flavour of DMEM/regions; everything else uses the CQP flavour.
    const bool useBrcSetup = m_brcEnabled && m_hevcSeqParams->RateControlMethod != RATECONTROL_ICQ;

    // load kernel from WOPCM into L2 storage RAM
    MHW_VDBOX_HUC_IMEM_STATE_PARAMS imemParams;
    MOS_ZeroMemory(&imemParams, sizeof(imemParams));
    imemParams.dwKernelDescriptor = VDBOX_HUC_PAK_INTEGRATION_KERNEL_DESCRIPTOR;

    CODECHAL_ENCODE_CHK_STATUS_RETURN(hucInterface->AddHucImemStateCmd(cmdBuffer, &imemParams));

    // pipe mode select
    MHW_VDBOX_PIPE_MODE_SELECT_PARAMS pipeModeSelectParams;
    pipeModeSelectParams.Mode = m_mode;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(hucInterface->AddHucPipeModeSelectCmd(cmdBuffer, &pipeModeSelectParams));

    // DMEM set
    MHW_VDBOX_HUC_DMEM_STATE_PARAMS dmemParams;
    if (useBrcSetup)
    {
        CODECHAL_ENCODE_CHK_STATUS_RETURN(SetDmemHuCPakIntegrate(&dmemParams));
    }
    else
    {
        CODECHAL_ENCODE_CHK_STATUS_RETURN(SetDmemHuCPakIntegrateCqp(&dmemParams));
    }
    CODECHAL_ENCODE_CHK_STATUS_RETURN(hucInterface->AddHucDmemStateCmd(cmdBuffer, &dmemParams));

    // Virtual address regions
    MHW_VDBOX_HUC_VIRTUAL_ADDR_PARAMS virtualAddrParams;
    if (useBrcSetup)
    {
        CODECHAL_ENCODE_CHK_STATUS_RETURN(SetRegionsHuCPakIntegrate(&virtualAddrParams));
    }
    else
    {
        CODECHAL_ENCODE_CHK_STATUS_RETURN(SetRegionsHuCPakIntegrateCqp(&virtualAddrParams));
    }
    CODECHAL_ENCODE_CHK_STATUS_RETURN(hucInterface->AddHucVirtualAddrStateCmd(cmdBuffer, &virtualAddrParams));

    // Write HUC_STATUS2 mask - bit 6 - valid IMEM loaded
    MHW_MI_STORE_DATA_PARAMS storeDataParams;
    MOS_ZeroMemory(&storeDataParams, sizeof(storeDataParams));
    storeDataParams.pOsResource      = &m_resHucStatus2Buffer;
    storeDataParams.dwResourceOffset = 0;
    storeDataParams.dwValue          = hucInterface->GetHucStatus2ImemLoadedMask();
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_miInterface->AddMiStoreDataImmCmd(cmdBuffer, &storeDataParams));

    // Store HUC_STATUS2 register (second DWORD of the same buffer)
    MHW_MI_STORE_REGISTER_MEM_PARAMS storeRegParams;
    MOS_ZeroMemory(&storeRegParams, sizeof(storeRegParams));
    storeRegParams.presStoreBuffer = &m_resHucStatus2Buffer;
    storeRegParams.dwOffset        = sizeof(uint32_t);
    storeRegParams.dwRegister      = mmioRegisters->hucStatus2RegOffset;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_miInterface->AddMiStoreRegisterMemCmd(cmdBuffer, &storeRegParams));

    CODECHAL_ENCODE_CHK_STATUS_RETURN(hucInterface->AddHucStartCmd(cmdBuffer, true));

    // wait Huc completion (use HEVC bit for now)
    MHW_VDBOX_VD_PIPE_FLUSH_PARAMS vdPipeFlushParams;
    MOS_ZeroMemory(&vdPipeFlushParams, sizeof(vdPipeFlushParams));
    vdPipeFlushParams.Flags.bFlushHEVC    = 1;
    vdPipeFlushParams.Flags.bWaitDoneHEVC = 1;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_hwInterface->GetVdencInterface()->AddVdPipelineFlushCmd(cmdBuffer, &vdPipeFlushParams));

    // Flush the engine to ensure memory written out
    MHW_MI_FLUSH_DW_PARAMS flushDwParams;
    MOS_ZeroMemory(&flushDwParams, sizeof(flushDwParams));
    flushDwParams.bVideoPipelineCacheInvalidate = true;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_miInterface->AddMiFlushDwCmd(cmdBuffer, &flushDwParams));

    // Refer to the member directly instead of copying the whole status buffer descriptor:
    // only a few fields are read and the resource is passed on by address.
    EncodeStatusBuffer &encodeStatusBuf = m_encodeStatusBuf;

    uint32_t baseOffset =
        (encodeStatusBuf.wCurrIndex * encodeStatusBuf.dwReportSize) + sizeof(uint32_t) * 2;  // pEncodeStatus is offset by 2 DWs in the resource

    // Write HUC_STATUS mask
    MOS_ZeroMemory(&storeDataParams, sizeof(storeDataParams));
    storeDataParams.pOsResource      = &encodeStatusBuf.resStatusBuffer;
    storeDataParams.dwResourceOffset = baseOffset + encodeStatusBuf.dwHuCStatusMaskOffset;
    storeDataParams.dwValue          = hucInterface->GetHucStatusReEncodeMask();
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_miInterface->AddMiStoreDataImmCmd(
        cmdBuffer,
        &storeDataParams));

    // store HUC_STATUS register
    MOS_ZeroMemory(&storeRegParams, sizeof(storeRegParams));
    storeRegParams.presStoreBuffer = &encodeStatusBuf.resStatusBuffer;
    storeRegParams.dwOffset        = baseOffset + encodeStatusBuf.dwHuCStatusRegOffset;
    storeRegParams.dwRegister      = mmioRegisters->hucStatusRegOffset;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_miInterface->AddMiStoreRegisterMemCmd(
        cmdBuffer,
        &storeRegParams));

    return eStatus;
}

MOS_STATUS CodechalEncHevcStateG12::Initialize(CodechalSetting *settings)
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    CODECHAL_ENCODE_FUNCTION_ENTER;

    MOS_USER_FEATURE_VALUE_DATA userFeatureData;
    MOS_STATUS                  statusKey = MOS_STATUS_SUCCESS;

#if (_DEBUG || _RELEASE_INTERNAL)
    char stringData[MOS_USER_CONTROL_MAX_DATA_SIZE];
    MOS_ZeroMemory(&userFeatureData, sizeof(userFeatureData));
    userFeatureData.StringData.pStringData = stringData;
    statusKey                              = MOS_UserFeature_ReadValue_ID(
        nullptr,
        __MEDIA_USER_FEATURE_VALUE_HEVC_ENCODE_PAK_ONLY_ID,
        &userFeatureData,
        m_osInterface->pOsContext);

    if (statusKey == MOS_STATUS_SUCCESS && userFeatureData.StringData.uSize > 0)
    {
        MOS_SecureStrcpy(m_pakOnlyDataFolder,
            sizeof(m_pakOnlyDataFolder) / sizeof(m_pakOnlyDataFolder[0]),
            stringData);

        uint32_t len = strlen(m_pakOnlyDataFolder);
        if (m_pakOnlyDataFolder[len - 1] == '\\')
        {
            m_pakOnlyDataFolder[len - 1] = 0;
        }

        m_pakOnlyTest = true;
        // PAK only mode does not need to init any kernel
    }

    MOS_ZeroMemory(&userFeatureData, sizeof(userFeatureData));
    userFeatureData.StringData.pStringData = stringData;
    statusKey                              = MOS_UserFeature_ReadValue_ID(
        nullptr,
        __MEDIA_USER_FEATURE_VALUE_HEVC_ENCODE_LOAD_KERNEL_INPUT_ID,
        &userFeatureData,
        m_osInterface->pOsContext);

    if (statusKey == MOS_STATUS_SUCCESS && userFeatureData.StringData.uSize > 0)
    {
        MOS_SecureStrcpy(m_loadKernelInputDataFolder,
            sizeof(m_loadKernelInputDataFolder) / sizeof(m_loadKernelInputDataFolder[0]),
            stringData);

        uint32_t len = strlen(m_loadKernelInputDataFolder);
        if (m_loadKernelInputDataFolder[len - 1] == '\\')
        {
            m_loadKernelInputDataFolder[len - 1] = 0;
        }
        m_loadKernelInput = true;
    }
#endif

    // Common initialization
    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodechalEncHevcState::Initialize(settings));

    m_numDelay                              = 15;  //Value suggested by HW team.
    m_bmeMethodTable                        = (uint8_t *)m_meMethod;
    m_b4XMeDistortionBufferSupported        = true;
    m_brcBuffers.dwBrcConstantSurfaceWidth  = HEVC_BRC_CONSTANT_SURFACE_WIDTH_G9;
    m_brcBuffers.dwBrcConstantSurfaceHeight = HEVC_BRC_CONSTANT_SURFACE_HEIGHT_G10;
    m_brcHistoryBufferSize                  = HEVC_BRC_HISTORY_BUFFER_SIZE_G12;
    m_maxNumSlicesSupported                 = CODECHAL_HEVC_MAX_NUM_SLICES_LVL_6;
    m_brcBuffers.dwBrcHcpPicStateSize       = BRC_IMG_STATE_SIZE_PER_PASS_G12 * CODECHAL_ENCODE_BRC_MAXIMUM_NUM_PASSES;

    MOS_ZeroMemory(&userFeatureData, sizeof(userFeatureData));
    MOS_UserFeature_ReadValue_ID(
        nullptr,
        __MEDIA_USER_FEATURE_VALUE_SINGLE_TASK_PHASE_ENABLE_ID,
        &userFeatureData,
        m_osInterface->pOsContext);
    m_singleTaskPhaseSupported = (userFeatureData.i32Data) ? true : false;

    // Max ConcurrentGroup used in the ENC kernel
    m_numberConcurrentGroup = 4;

    m_sizeOfHcpPakFrameStats = 9 * CODECHAL_CACHELINE_SIZE;  //Frame statistics occupying 9 caceline on gen12

    // Max Subthread number used in the ENC kernel
    m_numberEncKernelSubThread = 3;

    if (m_numberEncKernelSubThread > m_hevcThreadTaskDataNum)
    {
        m_numberEncKernelSubThread = m_hevcThreadTaskDataNum;  // support up to 2 sub-threads in one LCU64x64
    }

    MOS_ZeroMemory(&userFeatureData, sizeof(userFeatureData));
    MOS_UserFeature_ReadValue_ID(
        nullptr,
        __MEDIA_USER_FEATURE_VALUE_HEVC_ENCODE_26Z_ENABLE_ID,
        &userFeatureData,
        m_osInterface->pOsContext);
    m_enable26WalkingPattern = (userFeatureData.i32Data) ? false : true;

    MOS_ZeroMemory(&userFeatureData, sizeof(userFeatureData));
    MOS_UserFeature_ReadValue_ID(
        nullptr,
        __MEDIA_USER_FEATURE_VALUE_HEVC_ENCODE_RDOQ_ENABLE_ID,
        &userFeatureData,
        m_osInterface->pOsContext);
    m_hevcRdoqEnabled = userFeatureData.i32Data ? true : false;

    MOS_ZeroMemory(&userFeatureData, sizeof(userFeatureData));
    MOS_UserFeature_ReadValue_ID(
        nullptr,
        __MEDIA_USER_FEATURE_VALUE_HEVC_VME_ENCODE_SSE_ENABLE_ID,
        &userFeatureData,
        m_osInterface->pOsContext);
    m_sseSupported = userFeatureData.i32Data ? true : false;

    // Overriding the defaults here with 32 aligned dimensions
    // 2x Scaling WxH
    m_downscaledWidth2x =
        CODECHAL_GET_2xDS_SIZE_32ALIGNED(m_frameWidth);
    m_downscaledHeight2x =
        CODECHAL_GET_2xDS_SIZE_32ALIGNED(m_frameHeight);

    // HME Scaling WxH
    m_downscaledWidth4x =
        CODECHAL_GET_4xDS_SIZE_32ALIGNED(m_frameWidth);
    m_downscaledHeight4x =
        CODECHAL_GET_4xDS_SIZE_32ALIGNED(m_frameHeight);
    m_downscaledWidthInMb4x =
        CODECHAL_GET_WIDTH_IN_MACROBLOCKS(m_downscaledWidth4x);
    m_downscaledHeightInMb4x =
        CODECHAL_GET_WIDTH_IN_MACROBLOCKS(m_downscaledHeight4x);

    // SuperHME Scaling WxH
    m_downscaledWidth16x =
        CODECHAL_GET_4xDS_SIZE_32ALIGNED(m_downscaledWidth4x);
    m_downscaledHeight16x =
        CODECHAL_GET_4xDS_SIZE_32ALIGNED(m_downscaledHeight4x);
    m_downscaledWidthInMb16x =
        CODECHAL_GET_WIDTH_IN_MACROBLOCKS(m_downscaledWidth16x);
    m_downscaledHeightInMb16x =
        CODECHAL_GET_WIDTH_IN_MACROBLOCKS(m_downscaledHeight16x);

    // UltraHME Scaling WxH
    m_downscaledWidth32x =
        CODECHAL_GET_2xDS_SIZE_32ALIGNED(m_downscaledWidth16x);
    m_downscaledHeight32x =
        CODECHAL_GET_2xDS_SIZE_32ALIGNED(m_downscaledHeight16x);
    m_downscaledWidthInMb32x =
        CODECHAL_GET_WIDTH_IN_MACROBLOCKS(m_downscaledWidth32x);
    m_downscaledHeightInMb32x =
        CODECHAL_GET_WIDTH_IN_MACROBLOCKS(m_downscaledHeight32x);

    // disable MMCD if we enable Codechal dump. Because dump code changes the surface state from compressed to uncompressed,
    // this causes mis-match issue between dump is enabled or disabled.
    CODECHAL_DEBUG_TOOL(
        if (m_mmcState && m_debugInterface && m_debugInterface->m_dbgCfgHead){
            //m_mmcState->SetMmcDisabled();
        })

    CODECHAL_ENCODE_CHK_STATUS_RETURN(GetSystemPipeNumberCommon());

    if (MOS_VE_SUPPORTED(m_osInterface))
    {
        m_scalabilityState = (PCODECHAL_ENCODE_SCALABILITY_STATE)MOS_AllocAndZeroMemory(sizeof(CODECHAL_ENCODE_SCALABILITY_STATE));
        CODECHAL_ENCODE_CHK_NULL_RETURN(m_scalabilityState);
        //scalability initialize
        CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalEncodeScalability_InitializeState(m_scalabilityState, m_hwInterface));
    }

    MOS_ZeroMemory(&userFeatureData, sizeof(userFeatureData));
    statusKey = MOS_UserFeature_ReadValue_ID(
        nullptr,
        __MEDIA_USER_FEATURE_VALUE_HEVC_ENCODE_ENABLE_HW_STITCH,
        &userFeatureData,
        m_osInterface->pOsContext);
    m_enableTileStitchByHW = userFeatureData.i32Data ? true : false;

    statusKey = MOS_STATUS_SUCCESS;
    MOS_ZeroMemory(&userFeatureData, sizeof(userFeatureData));
    statusKey = MOS_UserFeature_ReadValue_ID(
        nullptr,
        __MEDIA_USER_FEATURE_VALUE_HEVC_ENCODE_ENABLE_HW_SEMAPHORE,
        &userFeatureData,
        m_osInterface->pOsContext);
    m_enableHWSemaphore = userFeatureData.i32Data ? true : false;

    MOS_ZeroMemory(&userFeatureData, sizeof(userFeatureData));
    statusKey = MOS_UserFeature_ReadValue_ID(
        nullptr,
        __MEDIA_USER_FEATURE_VALUE_HEVC_ENCODE_ENABLE_WP_SUPPORT_ID,
        &userFeatureData,
        m_osInterface->pOsContext);
    m_weightedPredictionSupported = userFeatureData.i32Data ? true : false;

#if (_DEBUG || _RELEASE_INTERNAL)
    MOS_ZeroMemory(&userFeatureData, sizeof(userFeatureData));
    statusKey = MOS_UserFeature_ReadValue_ID(
        nullptr,
        __MEDIA_USER_FEATURE_VALUE_HEVC_ENCODE_ENABLE_VE_DEBUG_OVERRIDE,
        &userFeatureData,
        m_osInterface->pOsContext);
    m_kmdVeOveride.Value = (uint64_t)userFeatureData.i64Data;

    MOS_ZeroMemory(&userFeatureData, sizeof(userFeatureData));
    MOS_UserFeature_ReadValue_ID(
        nullptr,
        __MEDIA_USER_FEATURE_VALUE_HEVC_VME_FORCE_SCALABILITY_ID,
        &userFeatureData,
        m_osInterface->pOsContext);
    m_forceScalability = userFeatureData.i32Data ? true : false;

    MOS_ZeroMemory(&userFeatureData, sizeof(userFeatureData));
    statusKey = MOS_UserFeature_ReadValue_ID(
        nullptr,
        __MEDIA_USER_FEATURE_VALUE_HEVC_VME_DISABLE_PANIC_MODE_ID,
        &userFeatureData,
        m_osInterface->pOsContext);
    if (statusKey == MOS_STATUS_SUCCESS)
    {
        m_enableFramePanicMode = userFeatureData.i32Data ? false : true;
    }

    MOS_ZeroMemory(&userFeatureData, sizeof(userFeatureData));
    MOS_UserFeature_ReadValue_ID(
        nullptr,
        __MEDIA_USER_FEATURE_VALUE_HEVC_VME_BRC_LTR_INTERVAL_ID,
        &userFeatureData,
        m_osInterface->pOsContext);
    m_ltrInterval = (uint32_t)(userFeatureData.i32Data);

    MOS_ZeroMemory(&userFeatureData, sizeof(userFeatureData));
    MOS_UserFeature_ReadValue_ID(
        nullptr,
        __MEDIA_USER_FEATURE_VALUE_HEVC_VME_BRC_LTR_DISABLE_ID,
        &userFeatureData,
        m_osInterface->pOsContext);
    m_enableBrcLTR = (userFeatureData.i32Data) ? false : true;
#endif

    if (m_codecFunction != CODECHAL_FUNCTION_PAK)
    {
        MOS_ZeroMemory(&userFeatureData, sizeof(userFeatureData));
        MOS_UserFeature_ReadValue_ID(
            nullptr,
            __MEDIA_USER_FEATURE_VALUE_HEVC_ENCODE_ME_ENABLE_ID,
            &userFeatureData,
            m_osInterface->pOsContext);
        m_hmeSupported = (userFeatureData.i32Data) ? true : false;

        MOS_ZeroMemory(&userFeatureData, sizeof(userFeatureData));
        MOS_UserFeature_ReadValue_ID(
            nullptr,
            __MEDIA_USER_FEATURE_VALUE_HEVC_ENCODE_16xME_ENABLE_ID,
            &userFeatureData,
            m_osInterface->pOsContext);
        m_16xMeSupported = (userFeatureData.i32Data) ? true : false;

        MOS_ZeroMemory(&userFeatureData, sizeof(userFeatureData));
        MOS_UserFeature_ReadValue_ID(
            nullptr,
            __MEDIA_USER_FEATURE_VALUE_HEVC_ENCODE_32xME_ENABLE_ID,
            &userFeatureData,
            m_osInterface->pOsContext);
        // Keeping UHME by Default ON for Gen12
        m_32xMeSupported = (userFeatureData.i32Data) ? false : true;

        MOS_ZeroMemory(&userFeatureData, sizeof(userFeatureData));
        MOS_UserFeature_ReadValue_ID(
            nullptr,
            __MEDIA_USER_FEATURE_VALUE_HEVC_NUM_THREADS_PER_LCU_ID,
            &userFeatureData,
            m_osInterface->pOsContext);
        m_totalNumThreadsPerLcu = (uint16_t)userFeatureData.i32Data;

        if (m_totalNumThreadsPerLcu < m_minThreadsPerLcuB || m_totalNumThreadsPerLcu > m_maxThreadsPerLcuB)
        {
            return MOS_STATUS_INVALID_PARAMETER;
        }
    }

    if (m_frameWidth < 128 || m_frameHeight < 128)
    {
        m_16xMeSupported = false;
        m_32xMeSupported = false;
    }
    else if (m_frameWidth < 512 || m_frameHeight < 512)
    {
        m_32xMeSupported = false;
    }

    return eStatus;
}

void CodechalEncHevcStateG12::LoadCosts(uint8_t sliceType, uint8_t qp)
{
    if (sliceType >= CODECHAL_HEVC_NUM_SLICE_TYPES)
    {
        CODECHAL_ENCODE_ASSERTMESSAGE("Invalid slice type");
        sliceType = CODECHAL_HEVC_I_SLICE;
    }

    double  qpScale   = 0.60;
    int32_t qpMinus12 = qp - 12;
    double  lambda    = sqrt(qpScale * pow(2.0, MOS_MAX(0, qpMinus12) / 3.0));
    uint8_t lcuIdx    = ((m_hevcSeqParams->log2_max_coding_block_size_minus3 + 3) == 6) ? 1 : 0;
    m_lambdaRD        = (uint16_t)(qpScale * pow(2.0, MOS_MAX(0, qpMinus12) / 3.0) * 4 + 0.5);

    m_modeCostCre[LUTCREMODE_INTRA_32X32]       = CRECOST(lambda, LUTMODEBITS_INTRA_32X32, lcuIdx, sliceType);
    m_modeCostCre[LUTCREMODE_INTRA_16X16]       = CRECOST(lambda, LUTMODEBITS_INTRA_16X16, lcuIdx, sliceType);
    m_modeCostCre[LUTCREMODE_INTRA_8X8]         = CRECOST(lambda, LUTMODEBITS_INTRA_8X8, lcuIdx, sliceType);
    m_modeCostCre[LUTCREMODE_INTRA_CHROMA]      = CRECOST(lambda, LUTMODEBITS_INTRA_CHROMA, lcuIdx, sliceType);
    m_modeCostCre[LUTCREMODE_INTER_32X32]       = CRECOST(lambda, LUTMODEBITS_INTER_32X32, lcuIdx, sliceType);
    m_modeCostCre[LUTCREMODE_INTER_32X16]       = CRECOST(lambda, LUTMODEBITS_INTER_32X16, lcuIdx, sliceType);
    m_modeCostCre[LUTCREMODE_INTER_16X16]       = CRECOST(lambda, LUTMODEBITS_INTER_16X16, lcuIdx, sliceType);
    m_modeCostCre[LUTCREMODE_INTER_16X8]        = CRECOST(lambda, LUTMODEBITS_INTER_16X8, lcuIdx, sliceType);
    m_modeCostCre[LUTCREMODE_INTER_8X8]         = CRECOST(lambda, LUTMODEBITS_INTER_8X8, lcuIdx, sliceType);
    m_modeCostCre[LUTCREMODE_INTER_BIDIR]       = CRECOST(lambda, LUTMODEBITS_INTER_BIDIR, lcuIdx, sliceType);
    m_modeCostCre[LUTCREMODE_INTER_SKIP]        = CRECOST(lambda, LUTMODEBITS_INTER_SKIP, lcuIdx, sliceType);
    m_modeCostCre[LUTCREMODE_INTRA_NONDC_32X32] = CRECOST(lambda, LUTMODEBITS_INTRA_NONDC_32X32, lcuIdx, sliceType);
    m_modeCostCre[LUTCREMODE_INTRA_NONDC_16X16] = CRECOST(lambda, LUTMODEBITS_INTRA_NONDC_16X16, lcuIdx, sliceType);
    m_modeCostCre[LUTCREMODE_INTRA_NONDC_8X8]   = CRECOST(lambda, LUTMODEBITS_INTRA_NONDC_8X8, lcuIdx, sliceType);
    m_modeCostCre[LUTCREMODE_INTRA_NONPRED]     = CRECOST(lambda, LUTMODEBITS_INTRA_MPM, lcuIdx, sliceType);

    m_modeCostRde[LUTRDEMODE_INTRA_64X64]       = RDEBITS62(LUTMODEBITS_INTRA_64X64, lcuIdx, sliceType);
    m_modeCostRde[LUTRDEMODE_INTRA_32X32]       = RDEBITS62(LUTMODEBITS_INTRA_32X32, lcuIdx, sliceType);
    m_modeCostRde[LUTRDEMODE_INTRA_16X16]       = RDEBITS62(LUTMODEBITS_INTRA_16X16, lcuIdx, sliceType);
    m_modeCostRde[LUTRDEMODE_INTRA_8X8]         = RDEBITS62(LUTMODEBITS_INTRA_8X8, lcuIdx, sliceType);
    m_modeCostRde[LUTRDEMODE_INTRA_NXN]         = RDEBITS62(LUTMODEBITS_INTRA_NXN, lcuIdx, sliceType);
    m_modeCostRde[LUTRDEMODE_INTRA_MPM]         = RDEBITS62(LUTMODEBITS_INTRA_MPM, lcuIdx, sliceType);
    m_modeCostRde[LUTRDEMODE_INTRA_DC_32X32]    = RDEBITS62(LUTMODEBITS_INTRA_DC_32X32, lcuIdx, sliceType);
    m_modeCostRde[LUTRDEMODE_INTRA_DC_8X8]      = RDEBITS62(LUTMODEBITS_INTRA_DC_8X8, lcuIdx, sliceType);
    m_modeCostRde[LUTRDEMODE_INTRA_NONDC_32X32] = RDEBITS62(LUTMODEBITS_INTRA_NONDC_32X32, lcuIdx, sliceType);
    m_modeCostRde[LUTRDEMODE_INTRA_NONDC_8X8]   = RDEBITS62(LUTMODEBITS_INTRA_NONDC_8X8, lcuIdx, sliceType);
    m_modeCostRde[LUTRDEMODE_INTER_BIDIR]       = RDEBITS62(LUTMODEBITS_INTER_BIDIR, lcuIdx, sliceType);
    m_modeCostRde[LUTRDEMODE_INTER_REFID]       = RDEBITS62(LUTMODEBITS_INTER_REFID, lcuIdx, sliceType);
    m_modeCostRde[LUTRDEMODE_SKIP_64X64]        = RDEBITS62(LUTMODEBITS_SKIP_64X64, lcuIdx, sliceType);
    m_modeCostRde[LUTRDEMODE_SKIP_32X32]        = RDEBITS62(LUTMODEBITS_SKIP_32X32, lcuIdx, sliceType);
    m_modeCostRde[LUTRDEMODE_SKIP_16X16]        = RDEBITS62(LUTMODEBITS_SKIP_16X16, lcuIdx, sliceType);
    m_modeCostRde[LUTRDEMODE_SKIP_8X8]          = RDEBITS62(LUTMODEBITS_SKIP_8X8, lcuIdx, sliceType);
    m_modeCostRde[LUTRDEMODE_MERGE_64X64]       = RDEBITS62(LUTMODEBITS_MERGE_64X64, lcuIdx, sliceType);
    m_modeCostRde[LUTRDEMODE_MERGE_32X32]       = RDEBITS62(LUTMODEBITS_MERGE_32X32, lcuIdx, sliceType);
    m_modeCostRde[LUTRDEMODE_MERGE_16X16]       = RDEBITS62(LUTMODEBITS_MERGE_16X16, lcuIdx, sliceType);
    m_modeCostRde[LUTRDEMODE_MERGE_8X8]         = RDEBITS62(LUTMODEBITS_MERGE_8X8, lcuIdx, sliceType);
    m_modeCostRde[LUTRDEMODE_INTER_32X32]       = RDEBITS62(LUTMODEBITS_INTER_32X32, lcuIdx, sliceType);
    m_modeCostRde[LUTRDEMODE_INTER_32X16]       = RDEBITS62(LUTMODEBITS_INTER_32X16, lcuIdx, sliceType);
    m_modeCostRde[LUTRDEMODE_INTER_16X16]       = RDEBITS62(LUTMODEBITS_INTER_16X16, lcuIdx, sliceType);
    m_modeCostRde[LUTRDEMODE_INTER_16X8]        = RDEBITS62(LUTMODEBITS_INTER_16X8, lcuIdx, sliceType);
    m_modeCostRde[LUTRDEMODE_INTER_8X8]         = RDEBITS62(LUTMODEBITS_INTER_8X8, lcuIdx, sliceType);
    m_modeCostRde[LUTRDEMODE_TU_DEPTH_0]        = RDEBITS62(LUTMODEBITS_TU_DEPTH_0, lcuIdx, sliceType);
    m_modeCostRde[LUTRDEMODE_TU_DEPTH_1]        = RDEBITS62(LUTMODEBITS_TU_DEPTH_1, lcuIdx, sliceType);

    for (uint8_t i = 0; i < 8; i++)
    {
        m_modeCostRde[LUTRDEMODE_CBF + i] = RDEBITS62(LUTMODEBITS_CBF + i, lcuIdx, sliceType);
    }
}

// ------------------------------------------------------------------------------
//| Purpose:    Setup curbe for HEVC MbEnc B Kernels.
//|             Builds an MBENC_CURBE from the sequence/picture/slice parameters
//|             and encoder state, writes it into combined buffer 1 of the
//|             current recycled buffer index, clears the thread task area of
//|             combined buffer 2 when more than one kernel sub-thread is used,
//|             uploads the ENC constant table while m_initEncConstTable is set,
//|             and adds the binding table indices to the DSH region of the
//|             LCU64 or LCU32 MbEnc kernel state.
//| Return:     MOS_STATUS_SUCCESS on success, otherwise the status produced by
//|             the failing CODECHAL_ENCODE_CHK_* check (null resource lock,
//|             null m_mbEncKernelStates, or AddData failure)
//------------------------------------------------------------------------------
MOS_STATUS CodechalEncHevcStateG12::SetCurbeMbEncBKernel()
{
    uint32_t        curIdx = m_currRecycledBufIdx;
    MOS_LOCK_PARAMS lockFlags;
    MOS_STATUS      eStatus = MOS_STATUS_SUCCESS;

    // Column index into m_tuSettings: TU 1 -> 0, TU 4 -> 1, TU 6/7 -> 2
    uint8_t tuMapping = ((m_hevcSeqParams->TargetUsage) / 3) % 3;

    // Initialize the CURBE data
    MBENC_CURBE curbe;

    if (m_hevcSeqParams->RateControlMethod == RATECONTROL_CQP)
    {
        curbe.QPType    = QP_TYPE_CONSTANT;
        curbe.ROIEnable = m_hevcPicParams->NumROI ? true : false;
    }
    else
    {
        curbe.QPType = m_lcuBrcEnabled ? QP_TYPE_CU_LEVEL : QP_TYPE_FRAME;
    }

    // TU based settings
    curbe.EnableCu64Check        = m_tuSettings[EnableCu64CheckTuParam][tuMapping];
    curbe.MaxNumIMESearchCenter  = m_tuSettings[MaxNumIMESearchCenterTuParam][tuMapping];
    curbe.MaxTransformDepthInter = m_tuSettings[Log2TUMaxDepthInterTuParam][tuMapping];
    curbe.MaxTransformDepthIntra = m_tuSettings[Log2TUMaxDepthIntraTuParam][tuMapping];
    curbe.Dynamic64Order         = m_tuSettings[Dynamic64OrderTuParam][tuMapping];
    curbe.DynamicOrderTh         = m_tuSettings[DynamicOrderThTuParam][tuMapping];
    curbe.Dynamic64Enable        = m_tuSettings[Dynamic64EnableTuParam][tuMapping];
    curbe.Dynamic64Th            = m_tuSettings[Dynamic64ThTuParam][tuMapping];
    curbe.IncreaseExitThresh     = m_tuSettings[IncreaseExitThreshTuParam][tuMapping];
    curbe.IntraSpotCheck         = m_tuSettings[IntraSpotCheckFlagTuParam][tuMapping];
    curbe.Fake32Enable           = m_tuSettings[Fake32EnableTuParam][tuMapping];

    curbe.FrameWidthInSamples  = m_frameWidth;
    curbe.FrameHeightInSamples = m_frameHeight;

    curbe.Log2MaxCUSize = m_hevcSeqParams->log2_max_coding_block_size_minus3 + 3;
    curbe.Log2MinCUSize = m_hevcSeqParams->log2_min_coding_block_size_minus3 + 3;
    curbe.Log2MaxTUSize = m_hevcSeqParams->log2_max_transform_block_size_minus2 + 2;
    curbe.Log2MinTUSize = m_hevcSeqParams->log2_min_transform_block_size_minus2 + 2;

    curbe.ChromaFormatType = m_hevcSeqParams->chroma_format_idc;

    curbe.TUDepthControl = curbe.MaxTransformDepthInter;

    // Frame QP is passed to the kernel as magnitude plus a separate sign flag
    int32_t sliceQp   = m_hevcSliceParams->slice_qp_delta + m_hevcPicParams->QpY;
    curbe.FrameQP     = abs(sliceQp);
    curbe.FrameQPSign = (sliceQp > 0) ? 0 : 1;

#if 0  // no need in the optimized kernel because kernel does the table look-up
    LoadCosts(CODECHAL_HEVC_B_SLICE, (uint8_t)sliceQp);
    curbe.DW4_ModeIntra32x32Cost = m_modeCostCre[LUTCREMODE_INTRA_32X32];
    curbe.DW4_ModeIntraNonDC32x32Cost = m_modeCostCre[LUTCREMODE_INTRA_NONDC_32X32];

    curbe.DW5_ModeIntra16x16Cost = m_modeCostCre[LUTCREMODE_INTRA_16X16];
    curbe.DW5_ModeIntraNonDC16x16Cost = m_modeCostCre[LUTCREMODE_INTRA_NONDC_16X16];
    curbe.DW5_ModeIntra8x8Cost = m_modeCostCre[LUTCREMODE_INTRA_8X8];
    curbe.DW5_ModeIntraNonDC8x8Cost = m_modeCostCre[LUTCREMODE_INTRA_NONDC_8X8];

    curbe.DW6_ModeIntraNonPred = m_modeCostCre[LUTCREMODE_INTRA_NONPRED];

    curbe.DW7_ChromaIntraModeCost = m_modeCostCre[LUTCREMODE_INTRA_CHROMA];

    curbe.DW12_IntraModeCostMPM = m_modeCostRde[LUTRDEMODE_INTRA_MPM];

    curbe.DW13_IntraTUDept0Cost = m_modeCostRde[LUTRDEMODE_TU_DEPTH_0];
    curbe.DW13_IntraTUDept1Cost = m_modeCostRde[LUTRDEMODE_TU_DEPTH_1];

    curbe.DW14_IntraTU4x4CBFCost = m_modeCostRde[LUTRDEMODE_INTRA_CBF_4X4];
    curbe.DW14_IntraTU8x8CBFCost = m_modeCostRde[LUTRDEMODE_INTRA_CBF_8X8];
    curbe.DW14_IntraTU16x16CBFCost = m_modeCostRde[LUTRDEMODE_INTRA_CBF_16X16];
    curbe.DW14_IntraTU32x32CBFCost = m_modeCostRde[LUTRDEMODE_INTRA_CBF_32X32];
    curbe.DW15_LambdaRD = (uint16_t)m_lambdaRD;
    curbe.DW17_IntraNonDC8x8Penalty = m_modeCostRde[LUTRDEMODE_INTRA_NONDC_8X8];
    curbe.DW17_IntraNonDC32x32Penalty = m_modeCostRde[LUTRDEMODE_INTRA_NONDC_32X32];
#endif

    curbe.NumofColumnTile = m_hevcPicParams->num_tile_columns_minus1 + 1;
    curbe.NumofRowTile    = m_hevcPicParams->num_tile_rows_minus1 + 1;
    curbe.HMEFlag         = m_hmeSupported ? 3 : 0;

    curbe.MaxRefIdxL0  = CODECHAL_ENCODE_HEVC_NUM_MAX_VME_L0_REF_G10 - 1;
    curbe.MaxRefIdxL1  = CODECHAL_ENCODE_HEVC_NUM_MAX_VME_L1_REF_G10 - 1;
    curbe.MaxBRefIdxL0 = CODECHAL_ENCODE_HEVC_NUM_MAX_VME_L0_REF_G10 - 1;

    // Check whether Last Frame is I frame or not.
    // The history buffer is skipped for the first frame, when the current picture
    // is an I picture, or when the previous picture was an I picture. The coding
    // type of the current picture is what must be compared against I_TYPE here
    // (not the picture height in macroblocks).
    if (m_frameNum == 0 || m_pictureCodingType == I_TYPE || (m_frameNum && m_lastPictureCodingType == I_TYPE))
    {
        // This is the flag to notify kernel not to use the history buffer
        curbe.LastFrameIsIntra = true;
    }
    else
    {
        curbe.LastFrameIsIntra = false;
    }

    curbe.SliceType             = PicCodingTypeToSliceType(m_hevcPicParams->CodingType);
    curbe.TemporalMvpEnableFlag = m_hevcSliceParams->slice_temporal_mvp_enable_flag;
    curbe.CollocatedFromL0Flag  = m_hevcSliceParams->collocated_from_l0_flag;
    curbe.theSameRefList        = m_sameRefList;
    curbe.IsLowDelay            = m_lowDelay;
    curbe.NumRefIdxL0           = m_hevcSliceParams->num_ref_idx_l0_active_minus1 + 1;
    curbe.NumRefIdxL1           = m_hevcSliceParams->num_ref_idx_l1_active_minus1 + 1;

    if (m_hevcSeqParams->TargetUsage == 1)
    {
        // MaxNumMergeCand C Model uses 4 for TU1,
        // for quality consideration, make sure not larger than the value from App as it will be used in PAK
        curbe.MaxNumMergeCand = MOS_MIN(m_hevcSliceParams->MaxNumMergeCand, 4);
    }
    else
    {
        // MaxNumMergeCand C Model uses 2 for TU4 and TU7,
        // for quality consideration, make sure not larger than the value from App as it will be used in PAK
        curbe.MaxNumMergeCand = MOS_MIN(m_hevcSliceParams->MaxNumMergeCand, 2);
    }

    // Temporal distances of the first four references in each list; kept locally
    // as well because the minimum distance is needed for the cost table selection below.
    int32_t tbRefListL0[CODECHAL_ENCODE_HEVC_NUM_MAX_VME_L0_REF_G10] = {0}, tbRefListL1[CODECHAL_ENCODE_HEVC_NUM_MAX_VME_L1_REF_G10] = {0};
    curbe.FwdPocNumber_L0_mTb_0 = tbRefListL0[0] = ComputeTemporalDifferent(m_hevcSliceParams->RefPicList[0][0]);
    curbe.BwdPocNumber_L1_mTb_0 = tbRefListL1[0] = ComputeTemporalDifferent(m_hevcSliceParams->RefPicList[1][0]);
    curbe.FwdPocNumber_L0_mTb_1 = tbRefListL0[1] = ComputeTemporalDifferent(m_hevcSliceParams->RefPicList[0][1]);
    curbe.BwdPocNumber_L1_mTb_1 = tbRefListL1[1] = ComputeTemporalDifferent(m_hevcSliceParams->RefPicList[1][1]);

    curbe.FwdPocNumber_L0_mTb_2 = tbRefListL0[2] = ComputeTemporalDifferent(m_hevcSliceParams->RefPicList[0][2]);
    curbe.BwdPocNumber_L1_mTb_2 = tbRefListL1[2] = ComputeTemporalDifferent(m_hevcSliceParams->RefPicList[1][2]);
    curbe.FwdPocNumber_L0_mTb_3 = tbRefListL0[3] = ComputeTemporalDifferent(m_hevcSliceParams->RefPicList[0][3]);
    curbe.BwdPocNumber_L1_mTb_3 = tbRefListL1[3] = ComputeTemporalDifferent(m_hevcSliceParams->RefPicList[1][3]);

    curbe.RefFrameWinHeight = m_frameHeight;
    curbe.RefFrameWinWidth  = m_frameWidth;

    // Hard coding for now from Gen10HEVC_TU4_default.par
    curbe.RoundingInter      = (m_roundingInter + 1) << 4;  // Should be an input from par(slice state)
    curbe.RoundingIntra      = (m_roundingIntra + 1) << 4;  // Should be an input from par(slice state)
    curbe.RDEQuantRoundValue = (m_roundingInter + 1) << 4;

    // gopP is 0 when GopRefDist is 0 (avoids the division by zero)
    uint32_t gopP = (m_hevcSeqParams->GopRefDist) ? ((m_hevcSeqParams->GopPicSize - 1) / m_hevcSeqParams->GopRefDist) : 0;
    uint32_t gopB = m_hevcSeqParams->GopPicSize - 1 - gopP;

    curbe.CostScalingForRA = 1;  // default setting

    // get the min distance between current pic and ref pics
    uint32_t minPocDist     = 255;
    uint32_t costTableIndex = 0;
    if (curbe.CostScalingForRA == 1)
    {
        // NOTE(review): these loops index tbRefListL0/L1 up to NumRefIdxL0/L1, which
        // assumes the active reference counts never exceed the array sizes - TODO confirm.
        for (uint8_t ref = 0; ref < curbe.NumRefIdxL0; ref++)
        {
            if ((uint32_t)abs(tbRefListL0[ref]) < minPocDist)
                minPocDist = abs(tbRefListL0[ref]);
        }
        for (uint8_t ref = 0; ref < curbe.NumRefIdxL1; ref++)
        {
            if ((uint32_t)abs(tbRefListL1[ref]) < minPocDist)
                minPocDist = abs(tbRefListL1[ref]);
        }

        // Only the listed distances select a non-zero cost table entry
        if (gopB == 4)
        {
            if (minPocDist == 1 || minPocDist == 2 || minPocDist == 4)
                costTableIndex = minPocDist;
        }
        if (gopB == 8)
        {
            if (minPocDist == 1 || minPocDist == 2 || minPocDist == 4 || minPocDist == 8)
                costTableIndex = minPocDist + 3;
        }
    }

    curbe.CostTableIndex = costTableIndex;

    // the following fields are needed by the new optimized kernel in v052417
    curbe.Log2ParallelMergeLevel  = m_hevcPicParams->log2_parallel_merge_level_minus2 + 2;
    curbe.MaxIntraRdeIter         = 1;
    curbe.CornerNeighborPixel     = 0;
    curbe.IntraNeighborAvailFlags = 0;
    curbe.SubPelMode              = 3;  // quarter-pel search
    curbe.InterSADMeasure         = 2;  // Haar transform
    curbe.IntraSADMeasure         = 2;  // Haar transform
    curbe.IntraPrediction         = 0;  // enable 32x32, 16x16, and 8x8 luma intra prediction
    curbe.RefIDCostMode           = 1;  // 0: AVC and 1: linear method
    curbe.TUBasedCostSetting      = 0;
    curbe.ConcurrentGroupNum      = m_numberConcurrentGroup;
    // Set when the minimum coding block size is 64
    curbe.WaveFrontSplitVQFix     = ((1 << (m_hevcSeqParams->log2_min_coding_block_size_minus3 + 3)) == 64) ? 1 : 0;
    curbe.NumofUnitInWaveFront    = m_numWavefrontInOneRegion;
    curbe.LoadBalenceEnable       = 0;  // when this flag is false, kernel does not use LoadBalance (or MBENC_B_FRAME_CONCURRENT_TG_DATA) buffer
    curbe.ThreadNumber            = MOS_MIN(2, m_numberEncKernelSubThread);
    curbe.Pic_init_qp_B           = m_hevcSliceParams->slice_qp_delta + m_hevcPicParams->QpY;
    curbe.Pic_init_qp_P           = m_hevcSliceParams->slice_qp_delta + m_hevcPicParams->QpY;
    curbe.Pic_init_qp_I           = m_hevcSliceParams->slice_qp_delta + m_hevcPicParams->QpY;
    curbe.WaveFrontSplitsEnable   = (m_numberConcurrentGroup == 1) ? false : true;
    curbe.SuperHME                = m_16xMeSupported;
    curbe.UltraHME                = m_32xMeSupported;
    curbe.PerBFrameQPOffset       = 0;

    switch (m_hevcSeqParams->TargetUsage)
    {
    case 1:
        curbe.Degree45          = 0;
        curbe.Break12Dependency = 0;
        break;
    case 4:
    default:
        curbe.Degree45          = 1;
        curbe.Break12Dependency = 1;
        break;
    }

    // NOTE(review): the masked PicFlags value is shifted as-is, so bit i of the result
    // is set only if PICTURE_LONG_TERM_REFERENCE is bit 0 - confirm against the flag
    // definition and the layout the kernel expects.
    curbe.LongTermReferenceFlags_L0 = 0;
    for (uint32_t i = 0; i < curbe.NumRefIdxL0; i++)
    {
        curbe.LongTermReferenceFlags_L0 |= (m_hevcSliceParams->RefPicList[0][i].PicFlags & PICTURE_LONG_TERM_REFERENCE) << i;
    }
    curbe.LongTermReferenceFlags_L1 = 0;
    for (uint32_t i = 0; i < curbe.NumRefIdxL1; i++)
    {
        curbe.LongTermReferenceFlags_L1 |= (m_hevcSliceParams->RefPicList[1][i].PicFlags & PICTURE_LONG_TERM_REFERENCE) << i;
    }

    curbe.Stepping           = 0;
    curbe.Cu64SkipCheckOnly  = 0;
    curbe.Cu642Nx2NCheckOnly = 0;
    curbe.EnableCu64AmpCheck = 1;
    curbe.IntraSpeedMode     = 0;  // 35 mode
    curbe.DisableIntraNxN    = 0;

    // Revision 0 hardware overrides several of the settings chosen above
    if (m_hwInterface->GetPlatform().usRevId == 0)
    {
        curbe.Stepping               = 1;
        curbe.TUDepthControl         = 1;
        curbe.MaxTransformDepthInter = 1;
        curbe.MaxTransformDepthIntra = 0;
        //buf->curbe.EnableCu64Check       = 1;
        curbe.Cu64SkipCheckOnly  = 0;
        curbe.Cu642Nx2NCheckOnly = 1;
        curbe.EnableCu64AmpCheck = 0;
        curbe.IntraSpeedMode     = 0;  // 35 mode
        curbe.DisableIntraNxN    = 1;
        curbe.MaxNumMergeCand    = 1;
    }

    // Copy the finished CURBE into the first combined buffer
    MOS_ZeroMemory(&lockFlags, sizeof(MOS_LOCK_PARAMS));
    lockFlags.WriteOnly = 1;
    auto buf            = (PMBENC_COMBINED_BUFFER1)m_osInterface->pfnLockResource(
        m_osInterface,
        &m_encBCombinedBuffer1[curIdx].sResource,
        &lockFlags);
    CODECHAL_ENCODE_CHK_NULL_RETURN(buf);

    if (curbe.Degree45)
    {
        MOS_ZeroMemory(&buf->concurrent, sizeof(buf->concurrent));
    }
    buf->Curbe = curbe;

    m_osInterface->pfnUnlockResource(
        m_osInterface,
        &m_encBCombinedBuffer1[curIdx].sResource);

    // clean-up the thread dependency buffer in the second combined buffer
    if (m_numberEncKernelSubThread > 1)
    {
        MOS_ZeroMemory(&lockFlags, sizeof(MOS_LOCK_PARAMS));
        lockFlags.WriteOnly = 1;
        auto data           = (uint8_t *)m_osInterface->pfnLockResource(
            m_osInterface,
            &m_encBCombinedBuffer2[curIdx].sResource,
            &lockFlags);
        CODECHAL_ENCODE_CHK_NULL_RETURN(data);

        MOS_ZeroMemory(&data[m_threadTaskBufferOffset], m_threadTaskBufferSize);

        m_osInterface->pfnUnlockResource(
            m_osInterface,
            &m_encBCombinedBuffer2[curIdx].sResource);
    }

    if (m_initEncConstTable)
    {
        // Initialize the Enc Constant Table surface
        MOS_ZeroMemory(&lockFlags, sizeof(MOS_LOCK_PARAMS));
        lockFlags.WriteOnly = 1;

        auto data = (uint8_t *)m_osInterface->pfnLockResource(
            m_osInterface,
            &m_encConstantTableForB.sResource,
            &lockFlags);
        CODECHAL_ENCODE_CHK_NULL_RETURN(data);

        // NOTE(review): the MOS_SecureMemcpy result is not checked; a failed copy would
        // still clear m_initEncConstTable below - confirm this is intended.
        if (m_isMaxLcu64)
        {
            MOS_SecureMemcpy(data, m_encConstantTableForB.dwSize, (const void *)m_encLcu64ConstantDataLut, sizeof(m_encLcu64ConstantDataLut));
        }
        else
        {
            MOS_SecureMemcpy(data, m_encConstantTableForB.dwSize, (const void *)m_encLcu32ConstantDataLut, sizeof(m_encLcu32ConstantDataLut));
        }

        m_osInterface->pfnUnlockResource(
            m_osInterface,
            &m_encConstantTableForB.sResource);
        m_initEncConstTable = false;
    }

    // binding table index
    MBENC_COMBINED_BTI params;
    if (m_isMaxLcu64)
    {
        for (uint32_t i = 0; i < MAX_MULTI_FRAME_NUMBER; i++)
        {
            params.BTI_LCU64.Combined1DSurIndexMF1[i]           = MBENC_B_FRAME_ENCODER_COMBINED_BUFFER1;
            params.BTI_LCU64.Combined1DSurIndexMF2[i]           = MBENC_B_FRAME_ENCODER_COMBINED_BUFFER2;
            params.BTI_LCU64.VMEInterPredictionSurfIndexMF[i]   = MBENC_B_FRAME_VME_PRED_CURR_PIC_IDX0;
            params.BTI_LCU64.SrcSurfIndexMF[i]                  = MBENC_B_FRAME_CURR_Y;
            params.BTI_LCU64.SrcReconSurfIndexMF[i]             = MBENC_B_FRAME_CURR_Y_WITH_RECON_BOUNDARY_PIX;
            params.BTI_LCU64.CURecordSurfIndexMF[i]             = MBENC_B_FRAME_ENC_CU_RECORD;
            params.BTI_LCU64.PAKObjectSurfIndexMF[i]            = MBENC_B_FRAME_PAK_OBJ;
            params.BTI_LCU64.CUPacketSurfIndexMF[i]             = MBENC_B_FRAME_PAK_CU_RECORD;
            params.BTI_LCU64.SWScoreBoardSurfIndexMF[i]         = MBENC_B_FRAME_SW_SCOREBOARD;
            params.BTI_LCU64.QPCU16SurfIndexMF[i]               = MBENC_B_FRAME_CU_QP_DATA;
            params.BTI_LCU64.LCULevelDataSurfIndexMF[i]         = MBENC_B_FRAME_LCU_LEVEL_DATA_INPUT;
            params.BTI_LCU64.TemporalMVSurfIndexMF[i]           = MBENC_B_FRAME_COLOCATED_CU_MV_DATA;
            params.BTI_LCU64.HmeDataSurfIndexMF[i]              = MBENC_B_FRAME_HME_MOTION_PREDICTOR_DATA;
            params.BTI_LCU64.VME2XInterPredictionSurfIndexMF[i] = MBENC_B_FRAME_VME_PRED_FOR_2X_DS_CURR;
        }
        params.BTI_LCU64.DebugSurfIndexMF[0]  = MBENC_B_FRAME_DEBUG_SURFACE;
        params.BTI_LCU64.DebugSurfIndexMF[1]  = MBENC_B_FRAME_DEBUG_SURFACE1;
        params.BTI_LCU64.DebugSurfIndexMF[2]  = MBENC_B_FRAME_DEBUG_SURFACE2;
        params.BTI_LCU64.DebugSurfIndexMF[3]  = MBENC_B_FRAME_DEBUG_SURFACE3;
        params.BTI_LCU64.HEVCCnstLutSurfIndex = MBENC_B_FRAME_ENC_CONST_TABLE;
        params.BTI_LCU64.LoadBalenceSurfIndex = MBENC_B_FRAME_CONCURRENT_TG_DATA;
    }
    else
    {
        for (uint32_t i = 0; i < MAX_MULTI_FRAME_NUMBER; i++)
        {
            params.BTI_LCU32.Combined1DSurIndexMF1[i]         = MBENC_B_FRAME_ENCODER_COMBINED_BUFFER1;
            params.BTI_LCU32.Combined1DSurIndexMF2[i]         = MBENC_B_FRAME_ENCODER_COMBINED_BUFFER2;
            params.BTI_LCU32.VMEInterPredictionSurfIndexMF[i] = MBENC_B_FRAME_VME_PRED_CURR_PIC_IDX0;
            params.BTI_LCU32.SrcSurfIndexMF[i]                = MBENC_B_FRAME_CURR_Y;
            params.BTI_LCU32.SrcReconSurfIndexMF[i]           = MBENC_B_FRAME_CURR_Y_WITH_RECON_BOUNDARY_PIX;
            params.BTI_LCU32.CURecordSurfIndexMF[i]           = MBENC_B_FRAME_ENC_CU_RECORD;
            params.BTI_LCU32.PAKObjectSurfIndexMF[i]          = MBENC_B_FRAME_PAK_OBJ;
            params.BTI_LCU32.CUPacketSurfIndexMF[i]           = MBENC_B_FRAME_PAK_CU_RECORD;
            params.BTI_LCU32.SWScoreBoardSurfIndexMF[i]       = MBENC_B_FRAME_SW_SCOREBOARD;
            params.BTI_LCU32.QPCU16SurfIndexMF[i]             = MBENC_B_FRAME_CU_QP_DATA;
            params.BTI_LCU32.LCULevelDataSurfIndexMF[i]       = MBENC_B_FRAME_LCU_LEVEL_DATA_INPUT;
            params.BTI_LCU32.TemporalMVSurfIndexMF[i]         = MBENC_B_FRAME_COLOCATED_CU_MV_DATA;
            params.BTI_LCU32.HmeDataSurfIndexMF[i]            = MBENC_B_FRAME_HME_MOTION_PREDICTOR_DATA;
        }
        params.BTI_LCU32.DebugSurfIndexMF[0]  = MBENC_B_FRAME_DEBUG_SURFACE;
        params.BTI_LCU32.DebugSurfIndexMF[1]  = MBENC_B_FRAME_DEBUG_SURFACE1;
        params.BTI_LCU32.DebugSurfIndexMF[2]  = MBENC_B_FRAME_DEBUG_SURFACE2;
        params.BTI_LCU32.DebugSurfIndexMF[3]  = MBENC_B_FRAME_DEBUG_SURFACE3;
        params.BTI_LCU32.HEVCCnstLutSurfIndex = MBENC_B_FRAME_ENC_CONST_TABLE;
        params.BTI_LCU32.LoadBalenceSurfIndex = MBENC_B_FRAME_CONCURRENT_TG_DATA;
    }

    // Publish the binding table indices at the CURBE offset of the selected kernel state
    CODECHAL_ENCODE_CHK_NULL_RETURN(m_mbEncKernelStates);
    PMHW_KERNEL_STATE kernelState = m_isMaxLcu64 ? &m_mbEncKernelStates[MBENC_LCU64_KRNIDX] : &m_mbEncKernelStates[MBENC_LCU32_KRNIDX];
    CODECHAL_ENCODE_CHK_STATUS_RETURN(kernelState->m_dshRegion.AddData(
        &params,
        kernelState->dwCurbeOffset,
        sizeof(params)));

    return eStatus;
}

// ------------------------------------------------------------------------------
//| Purpose:    Setup curbe for HEVC BrcInitReset Kernel
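//| Return:     MOS_STATUS (MOS_STATUS_INVALID_PARAMETER for a kernel index that is not init/reset or for zero VBV settings)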
//------------------------------------------------------------------------------
MOS_STATUS CodechalEncHevcStateG12::SetCurbeBrcInitReset(
    CODECHAL_HEVC_BRC_KRNIDX brcKrnIdx)
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    CODECHAL_ENCODE_CHK_NULL_RETURN(m_brcKernelStates);

    if (brcKrnIdx != CODECHAL_HEVC_BRC_INIT && brcKrnIdx != CODECHAL_HEVC_BRC_RESET)
    {
        CODECHAL_ENCODE_ASSERTMESSAGE("Brc kernel requested is not init or reset\n");
        return MOS_STATUS_INVALID_PARAMETER;
    }

    // Initialize the CURBE data
    BRC_INITRESET_CURBE curbe = m_brcInitResetCurbeInit;

    uint32_t profileLevelMaxFrame = GetProfileLevelMaxFrameSize();

    if (m_hevcSeqParams->RateControlMethod == RATECONTROL_CBR ||
        m_hevcSeqParams->RateControlMethod == RATECONTROL_VBR ||
        m_hevcSeqParams->RateControlMethod == RATECONTROL_AVBR)
    {
        if (m_hevcSeqParams->InitVBVBufferFullnessInBit == 0)
        {
            CODECHAL_ENCODE_ASSERTMESSAGE("Initial VBV Buffer Fullness is zero\n");
            return MOS_STATUS_INVALID_PARAMETER;
        }

        if (m_hevcSeqParams->VBVBufferSizeInBit == 0)
        {
            CODECHAL_ENCODE_ASSERTMESSAGE("VBV buffer size in bits is zero\n");
            return MOS_STATUS_INVALID_PARAMETER;
        }
    }

    curbe.DW0_ProfileLevelMaxFrame = profileLevelMaxFrame;
    curbe.DW1_InitBufFull          = m_hevcSeqParams->InitVBVBufferFullnessInBit;
    curbe.DW2_BufSize              = m_hevcSeqParams->VBVBufferSizeInBit;
    curbe.DW3_TargetBitRate        = m_hevcSeqParams->TargetBitRate * CODECHAL_ENCODE_BRC_KBPS;  //DDI in Kbits
    curbe.DW4_MaximumBitRate       = m_hevcSeqParams->MaxBitRate * CODECHAL_ENCODE_BRC_KBPS;
    curbe.DW5_MinimumBitRate       = 0;
    curbe.DW6_FrameRateM           = m_hevcSeqParams->FrameRate.Numerator;
    curbe.DW7_FrameRateD           = m_hevcSeqParams->FrameRate.Denominator;
    curbe.DW8_BRCFlag              = BRCINIT_IGNORE_PICTURE_HEADER_SIZE;  // always ignore the picture header size set in BRC Update curbe;

    if (m_hevcPicParams->NumROI)
    {
        curbe.DW8_BRCFlag |= BRCINIT_DISABLE_MBBRC;  // BRC ROI need disable MBBRC logic in LcuBrc Kernel
    }
    else
    {
        curbe.DW8_BRCFlag |= (m_lcuBrcEnabled) ? 0 : BRCINIT_DISABLE_MBBRC;
    }

    curbe.DW8_BRCFlag |= (m_brcEnabled && m_numPipe > 1) ? BRCINIT_USEHUCBRC : 0;
    // For non-ICQ, ACQP Buffer always set to 1
    curbe.DW25_ACQPBuffer        = 1;
    curbe.DW25_SlidingWindowSize = m_slidingWindowSize;

    if (m_hevcSeqParams->RateControlMethod == RATECONTROL_CBR)
    {
        curbe.DW4_MaximumBitRate = curbe.DW3_TargetBitRate;
        curbe.DW8_BRCFlag |= BRCINIT_ISCBR;
    }
    else if (m_hevcSeqParams->RateControlMethod == RATECONTROL_VBR)
    {
        if (curbe.DW4_MaximumBitRate < curbe.DW3_TargetBitRate)
        {
            curbe.DW4_MaximumBitRate = 2 * curbe.DW3_TargetBitRate;
        }
        curbe.DW8_BRCFlag |= BRCINIT_ISVBR;
    }
    else if (m_hevcSeqParams->RateControlMethod == RATECONTROL_AVBR)
    {
        curbe.DW8_BRCFlag |= BRCINIT_ISAVBR;
        // For AVBR, max bitrate = target bitrate,
        curbe.DW3_TargetBitRate  = m_hevcSeqParams->TargetBitRate * CODECHAL_ENCODE_BRC_KBPS;  //DDI in Kbits
        curbe.DW4_MaximumBitRate = m_hevcSeqParams->TargetBitRate * CODECHAL_ENCODE_BRC_KBPS;
    }
    else if (m_hevcSeqParams->RateControlMethod == RATECONTROL_ICQ)
    {
        curbe.DW8_BRCFlag |= BRCINIT_ISICQ;
        curbe.DW25_ACQPBuffer = m_hevcSeqParams->ICQQualityFactor;
    }
    else if (m_hevcSeqParams->RateControlMethod == RATECONTROL_VCM)
    {
        curbe.DW4_MaximumBitRate = curbe.DW3_TargetBitRate;
        curbe.DW8_BRCFlag |= BRCINIT_ISVCM;
    }
    else if (m_hevcSeqParams->RateControlMethod == RATECONTROL_CQP)
    {
        curbe.DW8_BRCFlag = BRCINIT_ISCQP;
    }
    else if (m_hevcSeqParams->RateControlMethod == RATECONTROL_QVBR)
    {
        if (curbe.DW4_MaximumBitRate < curbe.DW3_TargetBitRate)
        {
            curbe.DW4_MaximumBitRate = curbe.DW3_TargetBitRate;  // Use max bit rate for HRD compliance
        }
        curbe.DW8_BRCFlag = curbe.DW8_BRCFlag | BRCINIT_ISQVBR | BRCINIT_ISVBR;  // We need to make sure that VBR is used for QP determination.
        // use ICQQualityFactor to determine the larger Qp for each MB
        curbe.DW25_ACQPBuffer = m_hevcSeqParams->ICQQualityFactor;
    }
    curbe.DW9_FrameWidth       = m_oriFrameWidth;
    curbe.DW10_FrameHeight     = m_oriFrameHeight;
    curbe.DW10_AVBRAccuracy    = m_usAvbrAccuracy;
    curbe.DW11_AVBRConvergence = m_usAvbrConvergence;
    curbe.DW12_NumberSlice     = m_numSlices;

    /**********************************************************************
    In case of non-HB/BPyramid Structure
    BRC_Param_A = GopP
    BRC_Param_B = GopB
    In case of HB/BPyramid GOP Structure
    BRC_Param_A, BRC_Param_B, BRC_Param_C, BRC_Param_D are
    BRC Parameters set as follows as per CModel equation
    ***********************************************************************/
    // BPyramid GOP
    if (m_HierchGopBRCEnabled)
    {
        curbe.DW8_BRCGopP   = ((m_hevcSeqParams->GopPicSize + m_hevcSeqParams->GopRefDist - 1) / m_hevcSeqParams->GopRefDist);
        curbe.DW9_BRCGopB   = curbe.DW8_BRCGopP;
        curbe.DW13_BRCGopB1 = curbe.DW8_BRCGopP * 2;
        curbe.DW14_BRCGopB2 = ((m_hevcSeqParams->GopPicSize) - (curbe.DW8_BRCGopP) - (curbe.DW13_BRCGopB1) - (curbe.DW9_BRCGopB));
        // B1 Level GOP
        if (m_hevcSeqParams->GopRefDist <= 4 || curbe.DW14_BRCGopB2 == 0)
        {
            curbe.DW14_MaxBRCLevel = 3;
        }
        // B2 Level GOP
        else
        {
            curbe.DW14_MaxBRCLevel = 4;
        }
    }
    // For Regular GOP - No BPyramid
    else
    {
        curbe.DW14_MaxBRCLevel = 1;
        curbe.DW8_BRCGopP      = (m_hevcSeqParams->GopRefDist) ? ((m_hevcSeqParams->GopPicSize - 1) / m_hevcSeqParams->GopRefDist) : 0;
        curbe.DW9_BRCGopB      = m_hevcSeqParams->GopPicSize - 1 - curbe.DW8_BRCGopP;
    }

    // Set dynamic thresholds
    double inputBitsPerFrame = (double)((double)curbe.DW4_MaximumBitRate * (double)curbe.DW7_FrameRateD);
    inputBitsPerFrame        = (double)(inputBitsPerFrame / curbe.DW6_FrameRateM);

    if (curbe.DW2_BufSize < (uint32_t)inputBitsPerFrame * 4)
    {
        curbe.DW2_BufSize = (uint32_t)inputBitsPerFrame * 4;
    }

    if (curbe.DW1_InitBufFull == 0)
    {
        curbe.DW1_InitBufFull = 7 * curbe.DW2_BufSize / 8;
    }
    if (curbe.DW1_InitBufFull < (uint32_t)(inputBitsPerFrame * 2))
    {
        curbe.DW1_InitBufFull = (uint32_t)(inputBitsPerFrame * 2);
    }
    if (curbe.DW1_InitBufFull > curbe.DW2_BufSize)
    {
        curbe.DW1_InitBufFull = curbe.DW2_BufSize;
    }

    if (m_hevcSeqParams->RateControlMethod == RATECONTROL_AVBR)
    {
        // For AVBR, Buffer size =  2*Bitrate, InitVBV = 0.75 * BufferSize
        curbe.DW2_BufSize     = 2 * m_hevcSeqParams->TargetBitRate * CODECHAL_ENCODE_BRC_KBPS;
        curbe.DW1_InitBufFull = (uint32_t)(0.75 * curbe.DW2_BufSize);
    }

    if (m_hevcSeqParams->FrameSizeTolerance == EFRAMESIZETOL_EXTREMELY_LOW)
    {
        curbe.DW15_LongTermInterval = 0;  // no LTR for low delay brc
    }
    else
    {
        curbe.DW15_LongTermInterval = (m_enableBrcLTR && m_ltrInterval) ? m_ltrInterval : m_enableBrcLTR ? HEVC_BRC_LONG_TERM_REFRENCE_FLAG : 0;
    }

    double bpsRatio = ((double)inputBitsPerFrame / ((double)(curbe.DW2_BufSize) / 30));
    bpsRatio        = (bpsRatio < 0.1) ? 0.1 : (bpsRatio > 3.5) ? 3.5 : bpsRatio;

    curbe.DW19_DeviationThreshold0_PBframe = (uint32_t)(-50 * pow(0.90, bpsRatio));
    curbe.DW19_DeviationThreshold1_PBframe = (uint32_t)(-50 * pow(0.66, bpsRatio));
    curbe.DW19_DeviationThreshold2_PBframe = (uint32_t)(-50 * pow(0.46, bpsRatio));
    curbe.DW19_DeviationThreshold3_PBframe = (uint32_t)(-50 * pow(0.3, bpsRatio));

    curbe.DW20_DeviationThreshold4_PBframe = (uint32_t)(50 * pow(0.3, bpsRatio));
    curbe.DW20_DeviationThreshold5_PBframe = (uint32_t)(50 * pow(0.46, bpsRatio));
    curbe.DW20_DeviationThreshold6_PBframe = (uint32_t)(50 * pow(0.7, bpsRatio));
    curbe.DW20_DeviationThreshold7_PBframe = (uint32_t)(50 * pow(0.9, bpsRatio));

    curbe.DW21_DeviationThreshold0_VBRcontrol = (uint32_t)(-50 * pow(0.9, bpsRatio));
    curbe.DW21_DeviationThreshold1_VBRcontrol = (uint32_t)(-50 * pow(0.7, bpsRatio));
    curbe.DW21_DeviationThreshold2_VBRcontrol = (uint32_t)(-50 * pow(0.5, bpsRatio));
    curbe.DW21_DeviationThreshold3_VBRcontrol = (uint32_t)(-50 * pow(0.3, bpsRatio));

    curbe.DW22_DeviationThreshold4_VBRcontrol = (uint32_t)(100 * pow(0.4, bpsRatio));
    curbe.DW22_DeviationThreshold5_VBRcontrol = (uint32_t)(100 * pow(0.5, bpsRatio));
    curbe.DW22_DeviationThreshold6_VBRcontrol = (uint32_t)(100 * pow(0.75, bpsRatio));
    curbe.DW22_DeviationThreshold7_VBRcontrol = (uint32_t)(100 * pow(0.9, bpsRatio));

    curbe.DW23_DeviationThreshold0_Iframe = (uint32_t)(-50 * pow(0.8, bpsRatio));
    curbe.DW23_DeviationThreshold1_Iframe = (uint32_t)(-50 * pow(0.6, bpsRatio));
    curbe.DW23_DeviationThreshold2_Iframe = (uint32_t)(-50 * pow(0.34, bpsRatio));
    curbe.DW23_DeviationThreshold3_Iframe = (uint32_t)(-50 * pow(0.2, bpsRatio));

    curbe.DW24_DeviationThreshold4_Iframe = (uint32_t)(50 * pow(0.2, bpsRatio));
    curbe.DW24_DeviationThreshold5_Iframe = (uint32_t)(50 * pow(0.4, bpsRatio));
    curbe.DW24_DeviationThreshold6_Iframe = (uint32_t)(50 * pow(0.66, bpsRatio));
    curbe.DW24_DeviationThreshold7_Iframe = (uint32_t)(50 * pow(0.9, bpsRatio));

    if (m_hevcSeqParams->HierarchicalFlag && !m_hevcSeqParams->LowDelayMode &&
        (m_hevcSeqParams->GopRefDist == 4 || m_hevcSeqParams->GopRefDist == 8))
    {
        curbe.DW26_RandomAccess = true;
    }
    else
    {
        curbe.DW26_RandomAccess = false;
    }

    if (m_brcInit)
    {
        m_dBrcInitCurrentTargetBufFullInBits = curbe.DW1_InitBufFull;
    }

    m_brcInitResetBufSizeInBits      = curbe.DW2_BufSize;
    m_dBrcInitResetInputBitsPerFrame = inputBitsPerFrame;

    PMHW_KERNEL_STATE kernelState = &m_brcKernelStates[brcKrnIdx];
    CODECHAL_ENCODE_CHK_STATUS_RETURN(kernelState->m_dshRegion.AddData(
        &curbe,
        kernelState->dwCurbeOffset,
        sizeof(curbe)));

    return eStatus;
}

// ------------------------------------------------------------------------------
//| Purpose:    Setup curbe for HEVC BrcUpdate Kernel
//| Params:     brcKrnIdx - BRC kernel to program; must be
//|                         CODECHAL_HEVC_BRC_FRAME_UPDATE or CODECHAL_HEVC_BRC_LCU_UPDATE
//| Return:     MOS_STATUS_SUCCESS on success;
//|             MOS_STATUS_INVALID_PARAMETER for any other kernel index;
//|             otherwise the status reported by the failing CODECHAL_ENCODE_CHK_* check
//| Notes:      Adjusts m_dBrcInitCurrentTargetBufFullInBits and may update m_isFrameLTR.
//------------------------------------------------------------------------------
MOS_STATUS CodechalEncHevcStateG12::SetCurbeBrcUpdate(
    CODECHAL_HEVC_BRC_KRNIDX brcKrnIdx)
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    const bool isUpdateKernel =
        (brcKrnIdx == CODECHAL_HEVC_BRC_FRAME_UPDATE) || (brcKrnIdx == CODECHAL_HEVC_BRC_LCU_UPDATE);
    if (!isUpdateKernel)
    {
        CODECHAL_ENCODE_ASSERTMESSAGE("Brc kernel requested is not frame update or LCU update\n");
        return MOS_STATUS_INVALID_PARAMETER;
    }

    CODECHAL_ENCODE_CHK_NULL_RETURN(m_brcKernelStates);

    // Start from the default curbe image and override the per-frame fields
    BRCUPDATE_CURBE updateCurbe = m_brcUpdateCurbeInit;

    // When the running target fullness exceeds the buffer size recorded at BRC
    // init/reset, reduce it by one buffer size and report that through the flag.
    const double bufSizeInBits  = (double)m_brcInitResetBufSizeInBits;
    const bool   exceedsBufSize = (m_dBrcInitCurrentTargetBufFullInBits > bufSizeInBits);
    if (exceedsBufSize)
    {
        m_dBrcInitCurrentTargetBufFullInBits -= bufSizeInBits;
    }
    updateCurbe.DW5_TargetSize_Flag = exceedsBufSize ? 1 : 0;

    if (m_numSkipFrames)
    {
        // pass num/size of skipped frames to update BRC
        updateCurbe.DW6_NumSkippedFrames     = m_numSkipFrames;
        updateCurbe.DW15_SizeOfSkippedFrames = m_sizeSkipFrames;

        // account for skipped frame in calculating CurrentTargetBufFullInBits
        m_dBrcInitCurrentTargetBufFullInBits += m_dBrcInitResetInputBitsPerFrame * m_numSkipFrames;
    }

    updateCurbe.DW0_TargetSize  = (uint32_t)(m_dBrcInitCurrentTargetBufFullInBits);
    updateCurbe.DW1_FrameNumber = m_storeData - 1;  // Check if we can remove this (set to 0)

    // BRC PAK statistic buffer from last frame, the encoded size includes header already.
    // in BRC Initreset kernel, curbe DW8_BRCFlag will always ignore picture header size, so no need to set picture header size here.
    updateCurbe.DW2_PictureHeaderSize = 0;
    updateCurbe.DW5_CurrFrameBrcLevel = m_currFrameBrcLevel;
    updateCurbe.DW5_MaxNumPAKs        = m_hwInterface->GetMfxInterface()->GetBrcNumPakPasses();

    if (m_hevcSeqParams->RateControlMethod == RATECONTROL_CQP)
    {
        updateCurbe.DW6_CqpValue = m_hevcPicParams->QpY + m_hevcSliceParams->slice_qp_delta;
    }

    if (m_hevcPicParams->NumROI)
    {
        // Exactly one of the two ROI enables is set, selected by whether BRC is on
        if (m_brcEnabled)
        {
            updateCurbe.DW6_ROIEnable    = false;
            updateCurbe.DW6_BRCROIEnable = true;
        }
        else
        {
            updateCurbe.DW6_ROIEnable    = true;
            updateCurbe.DW6_BRCROIEnable = false;
        }
        updateCurbe.DW6_RoiRatio = CalculateROIRatio();
    }

    // Frame size tolerance selects sliding window (LOW) or low delay (EXTREMELY_LOW)
    const bool lowDelayBrc = (m_hevcSeqParams->FrameSizeTolerance == EFRAMESIZETOL_EXTREMELY_LOW);

    updateCurbe.DW6_SlidingWindowEnable = (m_hevcSeqParams->FrameSizeTolerance == EFRAMESIZETOL_LOW);
    updateCurbe.DW6_LowDelayEnable      = lowDelayBrc;
    updateCurbe.DW16_UserMaxFrameSize   = GetProfileLevelMaxFrameSize();
    updateCurbe.DW14_ParallelMode       = m_hevcSeqParams->ParallelBRC;

    if (m_hevcSeqParams->RateControlMethod == RATECONTROL_AVBR)
    {
        // Start frames for the global adjustment scale with the AVBR convergence value
        const double convergenceDivisor = (double)150;

        updateCurbe.DW3_StartGAdjFrame0 = (uint32_t)((10 * m_usAvbrConvergence) / convergenceDivisor);
        updateCurbe.DW3_StartGAdjFrame1 = (uint32_t)((50 * m_usAvbrConvergence) / convergenceDivisor);
        updateCurbe.DW4_StartGAdjFrame2 = (uint32_t)((100 * m_usAvbrConvergence) / convergenceDivisor);
        updateCurbe.DW4_StartGAdjFrame3 = (uint32_t)((150 * m_usAvbrConvergence) / convergenceDivisor);

        // Rate ratio thresholds are spread around 100 in proportion to the AVBR accuracy
        const double accuracyScale = m_usAvbrAccuracy / (double)30;

        updateCurbe.DW11_gRateRatioThreshold0 = (uint32_t)((100 - accuracyScale * (100 - 40)));
        updateCurbe.DW11_gRateRatioThreshold1 = (uint32_t)((100 - accuracyScale * (100 - 75)));
        updateCurbe.DW12_gRateRatioThreshold2 = (uint32_t)((100 - accuracyScale * (100 - 97)));
        updateCurbe.DW12_gRateRatioThreshold3 = (uint32_t)((100 + accuracyScale * (103 - 100)));
        updateCurbe.DW12_gRateRatioThreshold4 = (uint32_t)((100 + accuracyScale * (125 - 100)));
        updateCurbe.DW12_gRateRatioThreshold5 = (uint32_t)((100 + accuracyScale * (160 - 100)));
    }

    if (!lowDelayBrc)
    {
        m_isFrameLTR = (CodecHal_PictureIsLongTermRef(m_currReconstructedPic));
        if (m_enableBrcLTR && m_isFrameLTR)
        {
            updateCurbe.DW17_LongTerm_Current = 1;
        }
        else
        {
            updateCurbe.DW17_LongTerm_Current = 0;
        }
    }
    else
    {
        updateCurbe.DW17_LongTerm_Current = 0;  // no LTR for low delay brc
    }

    // Copy the finished curbe into the selected kernel's dynamic state heap region
    PMHW_KERNEL_STATE brcKernelState = &m_brcKernelStates[brcKrnIdx];
    CODECHAL_ENCODE_CHK_STATUS_RETURN(brcKernelState->m_dshRegion.AddData(
        &updateCurbe,
        brcKernelState->dwCurbeOffset,
        sizeof(updateCurbe)));

    return eStatus;
}

//!
//! \brief    Program the surface states used by the I-frame MbEnc kernel
//! \details  Always uses the LCU32 MbEnc kernel state and binding table
//!           (MBENC_LCU32_KRNIDX). Surfaces are bound in binding-table order:
//!           combined buffer 1, the VME current picture followed by eight dummy
//!           VME references (the raw input surface), source Y/UV, current picture
//!           with reconstructed boundary pixels, Enc CU record, PAK object and
//!           CU packet regions of the MB code surface, SW scoreboard, scratch,
//!           CU16 QP data, LCU level data, constant table, BRC combined Enc
//!           parameters and the kernel debug surface.
//! \param    [in] cmdBuffer
//!           Command buffer handed to CodecHalSetRcsSurfaceState for every surface
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, else the status reported by the
//!           failing CODECHAL_ENCODE_CHK_* check
//!
MOS_STATUS CodechalEncHevcStateG12::SendMbEncSurfacesIKernel(
    PMOS_COMMAND_BUFFER cmdBuffer)
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    // Validate every pointer member that is indexed or dereferenced below, matching
    // the checks SendMbEncSurfacesBKernel performs before touching the kernel tables.
    CODECHAL_ENCODE_CHK_NULL_RETURN(m_mbEncKernelStates);
    CODECHAL_ENCODE_CHK_NULL_RETURN(m_mbEncKernelBindingTable);
    CODECHAL_ENCODE_CHK_NULL_RETURN(m_swScoreboardState);
    CODECHAL_ENCODE_CHK_NULL_RETURN(m_brcInputForEncKernelBuffer);

    const uint32_t                         mbenc_I_KRNIDX = MBENC_LCU32_KRNIDX;
    uint32_t                               startBTI       = 0;
    CODECHAL_SURFACE_CODEC_PARAMS          surfaceCodecParams;
    PMOS_SURFACE                           inputSurface = m_rawSurfaceToEnc;
    PMHW_KERNEL_STATE                      kernelState  = &m_mbEncKernelStates[mbenc_I_KRNIDX];
    PCODECHAL_ENCODE_BINDING_TABLE_GENERIC bindingTable = &m_mbEncKernelBindingTable[mbenc_I_KRNIDX];

    // Combined 1D buffer 1, which contains regular kernel curbe and concurrent map
    startBTI = MBENC_B_FRAME_ENCODER_COMBINED_BUFFER1;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParams1D(
        &surfaceCodecParams,
        &m_encBCombinedBuffer1[m_currRecycledBufIdx].sResource,
        m_encBCombinedBuffer1[m_currRecycledBufIdx].dwSize,
        0,
        m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_SURFACE_ELLC_LLC_ONLY].Value,
        bindingTable->dwBindingTableEntries[startBTI++],
        false));

    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
        m_hwInterface,
        cmdBuffer,
        &surfaceCodecParams,
        kernelState));

    CODECHAL_DEBUG_TOOL(
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpBuffer(
            &m_encBCombinedBuffer1[m_currRecycledBufIdx].sResource,
            CodechalDbgAttr::attrOutput,
            "Hevc_CombinedBuffer1",
            m_encBCombinedBuffer1[m_currRecycledBufIdx].dwSize,
            0,
            CODECHAL_MEDIA_STATE_HEVC_I_MBENC)););

    // VME surfaces
    // From here on the binding table entries are consumed sequentially starting at 0,
    // until startBTI is explicitly repositioned for the constant table further down.
    startBTI = 0;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParamsVME(
        &surfaceCodecParams,
        inputSurface,
        m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_SURFACE_CURR_ENCODE].Value,
        bindingTable->dwBindingTableEntries[startBTI++]));

    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
        m_hwInterface,
        cmdBuffer,
        &surfaceCodecParams,
        kernelState));

    // Programming dummy surfaces even if not used (VME requirement), currently setting to input surface
    for (int32_t surface_idx = 0; surface_idx < 8; surface_idx++)
    {
        CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParamsVME(
            &surfaceCodecParams,
            inputSurface,
            m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_SURFACE_REF_ENCODE].Value,
            bindingTable->dwBindingTableEntries[startBTI++]));

        CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
            m_hwInterface,
            cmdBuffer,
            &surfaceCodecParams,
            kernelState));
    }

    //Source Y and UV
    CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParams2D(
        &surfaceCodecParams,
        inputSurface,
        m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_SURFACE_CURR_ENCODE].Value,
        bindingTable->dwBindingTableEntries[startBTI++],
        m_verticalLineStride,
        false));

    // The UV plane takes the binding table entry right after the Y plane
    surfaceCodecParams.bUseUVPlane = true;

    surfaceCodecParams.dwUVBindingTableOffset = bindingTable->dwBindingTableEntries[startBTI++];
    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
        m_hwInterface,
        cmdBuffer,
        &surfaceCodecParams,
        kernelState));

    CODECHAL_DEBUG_TOOL(
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpYUVSurface(
            inputSurface,
            CodechalDbgAttr::attrEncodeRawInputSurface,
            "MbEnc_Input_SrcSurf")));
    // Current Y with reconstructed boundary pixels
    CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParams2D(
        &surfaceCodecParams,
        &m_currPicWithReconBoundaryPix,
        m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_PAK_OBJECT_ENCODE].Value,
        bindingTable->dwBindingTableEntries[startBTI++],
        m_verticalLineStride,
        true));

    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
        m_hwInterface,
        cmdBuffer,
        &surfaceCodecParams,
        kernelState));

    // Enc CU Record
    CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParams2D(
        &surfaceCodecParams,
        &m_intermediateCuRecordSurfaceLcu32,
        m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_PAK_OBJECT_ENCODE].Value,
        bindingTable->dwBindingTableEntries[startBTI++],
        m_verticalLineStride,
        true));

    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
        m_hwInterface,
        cmdBuffer,
        &surfaceCodecParams,
        kernelState));

    // PAK object command surface: first m_mvOffset bytes of the MB code surface
    CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParams1D(
        &surfaceCodecParams,
        &m_resMbCodeSurface,
        m_mvOffset,
        0,
        m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_PAK_OBJECT_ENCODE].Value,
        bindingTable->dwBindingTableEntries[startBTI++],
        true));

    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
        m_hwInterface,
        cmdBuffer,
        &surfaceCodecParams,
        kernelState));

    // CU packet for PAK surface: remainder of the MB code surface after m_mvOffset
    CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParams1D(
        &surfaceCodecParams,
        &m_resMbCodeSurface,
        m_mbCodeSize - m_mvOffset,
        m_mvOffset,
        m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_PAK_OBJECT_ENCODE].Value,
        bindingTable->dwBindingTableEntries[startBTI++],
        true));

    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
        m_hwInterface,
        cmdBuffer,
        &surfaceCodecParams,
        kernelState));

    //Software scoreboard surface
    CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParams2D(
        &surfaceCodecParams,
        m_swScoreboardState->GetCurSwScoreboardSurface(),
        m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_SURFACE_ELLC_LLC_ONLY].Value,
        bindingTable->dwBindingTableEntries[startBTI++],
        m_verticalLineStride,
        true));

    surfaceCodecParams.bUse32UINTSurfaceFormat = true;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
        m_hwInterface,
        cmdBuffer,
        &surfaceCodecParams,
        kernelState));

    // Scratch surface for Internal Use Only
    CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParams2D(
        &surfaceCodecParams,
        &m_scratchSurface,
        m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_SURFACE_ELLC_LLC_ONLY].Value,
        bindingTable->dwBindingTableEntries[startBTI++],
        m_verticalLineStride,
        true));

    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
        m_hwInterface,
        cmdBuffer,
        &surfaceCodecParams,
        kernelState));

    // CU 16x16 QP data input surface
    CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParams2D(
        &surfaceCodecParams,
        &m_brcBuffers.sBrcMbQpBuffer,
        m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_SURFACE_ELLC_LLC_ONLY].Value,
        bindingTable->dwBindingTableEntries[startBTI++],
        m_verticalLineStride,
        false));

    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
        m_hwInterface,
        cmdBuffer,
        &surfaceCodecParams,
        kernelState));

    // Lcu level data input
    CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParams2D(
        &surfaceCodecParams,
        &m_lcuLevelInputDataSurface[m_currRecycledBufIdx],
        m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_SURFACE_ELLC_LLC_ONLY].Value,
        bindingTable->dwBindingTableEntries[startBTI++],
        m_verticalLineStride,
        false));

    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
        m_hwInterface,
        cmdBuffer,
        &surfaceCodecParams,
        kernelState));

    // Enc I Constant Table surface // CostLUT Buf
    // Note: the I kernel binds the constant table resource named for the B kernel.
    startBTI = MBENC_I_FRAME_ENC_CONST_TABLE;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParams1D(
        &surfaceCodecParams,
        &m_encConstantTableForB.sResource,
        m_encConstantTableForB.dwSize,
        0,
        m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_SURFACE_ELLC_LLC_ONLY].Value,
        bindingTable->dwBindingTableEntries[startBTI++],
        false));

    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
        m_hwInterface,
        cmdBuffer,
        &surfaceCodecParams,
        kernelState));

#if 0
    // Concurrent Thread Group Data surface
    startBTI = MBENC_I_FRAME_CONCURRENT_TG_DATA;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParams1D(
        &surfaceCodecParams,
        &resConcurrentThreadGroupData.sResource,
        resConcurrentThreadGroupData.dwSize,
        0,
        m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_SURFACE_ELLC_LLC_ONLY].Value,
        bindingTable->dwBindingTableEntries[startBTI++],
        false));

    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
        m_hwInterface,
        cmdBuffer,
        &surfaceCodecParams,
        kernelState));
#endif

    // Brc Combined Enc parameter surface
    startBTI = MBENC_I_FRAME_BRC_COMBINED_ENC_PARAMETER_SURFACE;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParams1D(
        &surfaceCodecParams,
        &m_brcInputForEncKernelBuffer->sResource,
        HEVC_FRAMEBRC_BUF_CONST_SIZE,
        0,
        m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_SURFACE_ELLC_LLC_ONLY].Value,
        bindingTable->dwBindingTableEntries[startBTI++],
        false));

    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
        m_hwInterface,
        cmdBuffer,
        &surfaceCodecParams,
        kernelState));

    // Kernel debug surface
    startBTI = MBENC_I_FRAME_DEBUG_DUMP;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParams1D(
        &surfaceCodecParams,
        &m_debugSurface[0].sResource,
        m_debugSurface[0].dwSize,
        0,
        m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_SURFACE_ELLC_LLC_ONLY].Value,
        bindingTable->dwBindingTableEntries[startBTI++],
        false));

    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
        m_hwInterface,
        cmdBuffer,
        &surfaceCodecParams,
        kernelState));

    return eStatus;
}

MOS_STATUS CodechalEncHevcStateG12::SendMbEncSurfacesBKernel(
    PMOS_COMMAND_BUFFER cmdBuffer)
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    CODECHAL_ENCODE_CHK_NULL_RETURN(m_mbEncKernelStates);
    PMHW_KERNEL_STATE kernelState = m_isMaxLcu64 ? &m_mbEncKernelStates[MBENC_LCU64_KRNIDX] : &m_mbEncKernelStates[MBENC_LCU32_KRNIDX];

    CODECHAL_ENCODE_CHK_NULL_RETURN(m_mbEncKernelBindingTable);
    PCODECHAL_ENCODE_BINDING_TABLE_GENERIC bindingTable = m_isMaxLcu64 ? &m_mbEncKernelBindingTable[MBENC_LCU64_KRNIDX] : &m_mbEncKernelBindingTable[MBENC_LCU32_KRNIDX];

    PMOS_SURFACE                  inputSurface = m_rawSurfaceToEnc;
    uint32_t                      startBTI     = MBENC_B_FRAME_VME_PRED_CURR_PIC_IDX0;
    CODECHAL_SURFACE_CODEC_PARAMS surfaceCodecParams;

    // Combined 1D buffer 1, which contains regular kernel curbe and concurrent map
    startBTI = MBENC_B_FRAME_ENCODER_COMBINED_BUFFER1;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParams1D(
        &surfaceCodecParams,
        &m_encBCombinedBuffer1[m_currRecycledBufIdx].sResource,
        m_encBCombinedBuffer1[m_currRecycledBufIdx].dwSize,
        0,
        m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_SURFACE_ELLC_LLC_ONLY].Value,
        bindingTable->dwBindingTableEntries[startBTI++],
        false));

    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
        m_hwInterface,
        cmdBuffer,
        &surfaceCodecParams,
        kernelState));

    CODECHAL_DEBUG_TOOL(
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpBuffer(
            &m_encBCombinedBuffer1[m_currRecycledBufIdx].sResource,
            CodechalDbgAttr::attrOutput,
            "Hevc_CombinedBuffer1",
            m_encBCombinedBuffer1[m_currRecycledBufIdx].dwSize,
            0,
            CODECHAL_MEDIA_STATE_HEVC_B_MBENC)););
    // Combined 1D buffer 2, which contains non fixed sizes of buffers
    startBTI = MBENC_B_FRAME_ENCODER_COMBINED_BUFFER2;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParams1D(
        &surfaceCodecParams,
        &m_encBCombinedBuffer2[m_currRecycledBufIdx].sResource,
        m_encBCombinedBuffer2[m_currRecycledBufIdx].dwSize,
        0,
        m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_SURFACE_ELLC_LLC_ONLY].Value,
        bindingTable->dwBindingTableEntries[startBTI++],
        false));
    surfaceCodecParams.bRawSurface = true;

    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
        m_hwInterface,
        cmdBuffer,
        &surfaceCodecParams,
        kernelState));

    CODECHAL_DEBUG_TOOL(
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpBuffer(
            &m_encBCombinedBuffer2[m_currRecycledBufIdx].sResource,
            CodechalDbgAttr::attrOutput,
            "Hevc_CombinedBuffer2",
            m_encBCombinedBuffer2[m_currRecycledBufIdx].dwSize,
            0,
            CODECHAL_MEDIA_STATE_HEVC_B_MBENC)););
    // VME surfaces
    startBTI = MBENC_B_FRAME_VME_PRED_CURR_PIC_IDX0;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParamsVME(
        &surfaceCodecParams,
        inputSurface,
        m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_SURFACE_CURR_ENCODE].Value,
        bindingTable->dwBindingTableEntries[startBTI++]));

    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
        m_hwInterface,
        cmdBuffer,
        &surfaceCodecParams,
        kernelState));

    for (int32_t surface_idx = 0; surface_idx < 4; surface_idx++)
    {
        int32_t       ll     = 0;
        CODEC_PICTURE refPic = m_hevcSliceParams->RefPicList[ll][surface_idx];
        if (!CodecHal_PictureIsInvalid(refPic) &&
            !CodecHal_PictureIsInvalid(m_hevcPicParams->RefFrameList[refPic.FrameIdx]))
        {
            int32_t      idx = m_hevcPicParams->RefFrameList[refPic.FrameIdx].FrameIdx;
            PMOS_SURFACE refSurfacePtr;
            if (surface_idx == 0 && m_useWeightedSurfaceForL0)
            {
                refSurfacePtr = m_wpState->GetWPOutputPicList(CODEC_WP_OUTPUT_L0_START + surface_idx);
            }
            else
            {
                refSurfacePtr = &m_refList[idx]->sRefBuffer;
            }

            // Picture Y VME
            CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParamsVME(
                &surfaceCodecParams,
                refSurfacePtr,
                m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_SURFACE_REF_ENCODE].Value,
                bindingTable->dwBindingTableEntries[startBTI++]));

            CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
                m_hwInterface,
                cmdBuffer,
                &surfaceCodecParams,
                kernelState));

            CODECHAL_DEBUG_TOOL(
                m_debugInterface->m_refIndex = (uint16_t)refPic.FrameIdx;
                std::string refSurfName      = "RefSurf" + std::to_string(static_cast<uint32_t>(m_debugInterface->m_refIndex));
                CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpYUVSurface(
                    &m_refList[idx]->sRefBuffer,
                    CodechalDbgAttr::attrReferenceSurfaces,
                    refSurfName.data())));
        }
        else
        {
            // Providing Dummy surface as per VME requirement.
            CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParamsVME(
                &surfaceCodecParams,
                inputSurface,
                m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_SURFACE_REF_ENCODE].Value,
                bindingTable->dwBindingTableEntries[startBTI++]));

            CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
                m_hwInterface,
                cmdBuffer,
                &surfaceCodecParams,
                kernelState));
        }

        ll     = 1;
        refPic = m_hevcSliceParams->RefPicList[ll][surface_idx];
        if (!CodecHal_PictureIsInvalid(refPic) &&
            !CodecHal_PictureIsInvalid(m_hevcPicParams->RefFrameList[refPic.FrameIdx]))
        {
            int32_t      idx = m_hevcPicParams->RefFrameList[refPic.FrameIdx].FrameIdx;
            PMOS_SURFACE refSurfacePtr;
            if (surface_idx == 0 && m_useWeightedSurfaceForL1)
            {
                refSurfacePtr = m_wpState->GetWPOutputPicList(CODEC_WP_OUTPUT_L1_START + surface_idx);
            }
            else
            {
                refSurfacePtr = &m_refList[idx]->sRefBuffer;
            }

            // Picture Y VME
            CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParamsVME(
                &surfaceCodecParams,
                refSurfacePtr,
                m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_SURFACE_REF_ENCODE].Value,
                bindingTable->dwBindingTableEntries[startBTI++]));

            CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
                m_hwInterface,
                cmdBuffer,
                &surfaceCodecParams,
                kernelState));

            CODECHAL_DEBUG_TOOL(
                m_debugInterface->m_refIndex = (uint16_t)refPic.FrameIdx;
                std::string refSurfName      = "RefSurf" + std::to_string(static_cast<uint32_t>(m_debugInterface->m_refIndex));
                CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpYUVSurface(
                    &m_refList[idx]->sRefBuffer,
                    CodechalDbgAttr::attrReferenceSurfaces,
                    refSurfName.data())));
        }
        else
        {
            // Providing Dummy surface as per VME requirement.
            CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParamsVME(
                &surfaceCodecParams,
                inputSurface,
                m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_SURFACE_REF_ENCODE].Value,
                bindingTable->dwBindingTableEntries[startBTI++]));

            CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
                m_hwInterface,
                cmdBuffer,
                &surfaceCodecParams,
                kernelState));
        }
    }

    //Source Y and UV
    startBTI = MBENC_B_FRAME_CURR_Y;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParams2D(
        &surfaceCodecParams,
        inputSurface,
        m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_SURFACE_CURR_ENCODE].Value,
        bindingTable->dwBindingTableEntries[startBTI++],
        m_verticalLineStride,
        false));

    surfaceCodecParams.bUseUVPlane = true;

    surfaceCodecParams.dwUVBindingTableOffset = bindingTable->dwBindingTableEntries[startBTI];
    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
        m_hwInterface,
        cmdBuffer,
        &surfaceCodecParams,
        kernelState));

    CODECHAL_DEBUG_TOOL(
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpYUVSurface(
            inputSurface,
            CodechalDbgAttr::attrEncodeRawInputSurface,
            "MbEnc_Input_SrcSurf")));

    // Current Y with reconstructed boundary pixels
    startBTI = MBENC_B_FRAME_CURR_Y_WITH_RECON_BOUNDARY_PIX;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParams2D(
        &surfaceCodecParams,
        &m_currPicWithReconBoundaryPix,
        m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_PAK_OBJECT_ENCODE].Value,
        bindingTable->dwBindingTableEntries[startBTI],
        m_verticalLineStride,
        true));

    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
        m_hwInterface,
        cmdBuffer,
        &surfaceCodecParams,
        kernelState));

    // Enc CU Record
    startBTI = MBENC_B_FRAME_ENC_CU_RECORD;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParams2D(
        &surfaceCodecParams,
        &m_intermediateCuRecordSurfaceLcu32,
        m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_PAK_OBJECT_ENCODE].Value,
        bindingTable->dwBindingTableEntries[startBTI],
        0,
        true));

    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
        m_hwInterface,
        cmdBuffer,
        &surfaceCodecParams,
        kernelState));

    // PAK object command surface
    startBTI = MBENC_B_FRAME_PAK_OBJ;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParams1D(
        &surfaceCodecParams,
        &m_resMbCodeSurface,
        m_mvOffset,
        0,
        m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_PAK_OBJECT_ENCODE].Value,
        bindingTable->dwBindingTableEntries[startBTI],
        true));

    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
        m_hwInterface,
        cmdBuffer,
        &surfaceCodecParams,
        kernelState));

    // CU packet for PAK surface
    startBTI = MBENC_B_FRAME_PAK_CU_RECORD;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParams1D(
        &surfaceCodecParams,
        &m_resMbCodeSurface,
        m_mbCodeSize - m_mvOffset,
        m_mvOffset,
        m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_PAK_OBJECT_ENCODE].Value,
        bindingTable->dwBindingTableEntries[startBTI],
        true));

    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
        m_hwInterface,
        cmdBuffer,
        &surfaceCodecParams,
        kernelState));

    //Software scoreboard surface
    startBTI = MBENC_B_FRAME_SW_SCOREBOARD;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParams2D(
        &surfaceCodecParams,
        m_swScoreboardState->GetCurSwScoreboardSurface(),
        m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_SURFACE_ELLC_LLC_ONLY].Value,
        bindingTable->dwBindingTableEntries[startBTI],
        m_verticalLineStride,
        true));

    surfaceCodecParams.bUse32UINTSurfaceFormat = true;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
        m_hwInterface,
        cmdBuffer,
        &surfaceCodecParams,
        kernelState));

    // Scratch surface for Internal Use Only
    startBTI = MBENC_B_FRAME_SCRATCH_SURFACE;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParams2D(
        &surfaceCodecParams,
        &m_scratchSurface,
        m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_SURFACE_ELLC_LLC_ONLY].Value,
        bindingTable->dwBindingTableEntries[startBTI],
        m_verticalLineStride,
        true));

    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
        m_hwInterface,
        cmdBuffer,
        &surfaceCodecParams,
        kernelState));

    // CU 16x16 QP data input surface
    startBTI = MBENC_B_FRAME_CU_QP_DATA;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParams2D(
        &surfaceCodecParams,
        &m_brcBuffers.sBrcMbQpBuffer,
        m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_SURFACE_ELLC_LLC_ONLY].Value,
        bindingTable->dwBindingTableEntries[startBTI],
        m_verticalLineStride,
        false));

    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
        m_hwInterface,
        cmdBuffer,
        &surfaceCodecParams,
        kernelState));

    // Lcu level data input
    startBTI = MBENC_B_FRAME_LCU_LEVEL_DATA_INPUT;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParams2D(
        &surfaceCodecParams,
        &m_lcuLevelInputDataSurface[m_currRecycledBufIdx],
        m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_SURFACE_ELLC_LLC_ONLY].Value,
        bindingTable->dwBindingTableEntries[startBTI],
        m_verticalLineStride,
        false));

    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
        m_hwInterface,
        cmdBuffer,
        &surfaceCodecParams,
        kernelState));

    // Enc B 32x32 Constant Table surface
    startBTI = MBENC_B_FRAME_ENC_CONST_TABLE;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParams1D(
        &surfaceCodecParams,
        &m_encConstantTableForB.sResource,
        m_encConstantTableForB.dwSize,
        0,
        m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_SURFACE_ELLC_LLC_ONLY].Value,
        bindingTable->dwBindingTableEntries[startBTI],
        false));

    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
        m_hwInterface,
        cmdBuffer,
        &surfaceCodecParams,
        kernelState));

    // Colocated CU Motion Vector Data surface
    startBTI                    = MBENC_B_FRAME_COLOCATED_CU_MV_DATA;
    uint8_t mbCodeIdxForTempMVP = 0xFF;
    if (m_hevcPicParams->CollocatedRefPicIndex != 0xFF && m_hevcPicParams->CollocatedRefPicIndex < CODEC_MAX_NUM_REF_FRAME_HEVC)
    {
        uint8_t frameIdx = m_hevcPicParams->RefFrameList[m_hevcPicParams->CollocatedRefPicIndex].FrameIdx;

        mbCodeIdxForTempMVP = m_refList[frameIdx]->ucScalingIdx;
    }

    if (m_pictureCodingType == I_TYPE)
    {
        // No temoporal MVP in the I frame
        m_hevcSliceParams->slice_temporal_mvp_enable_flag = false;
    }
    else
    {
        if (mbCodeIdxForTempMVP == 0xFF && m_hevcSliceParams->slice_temporal_mvp_enable_flag)
        {
            // Temporal reference MV index is invalid and so disable the temporal MVP
            CODECHAL_ENCODE_ASSERT(false);
            m_hevcSliceParams->slice_temporal_mvp_enable_flag = false;
        }
    }

    if (mbCodeIdxForTempMVP == 0xFF)
    {
        startBTI++;
    }
    else
    {
        CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParams1D(
            &surfaceCodecParams,
            m_trackedBuf->GetMvTemporalBuffer(mbCodeIdxForTempMVP),
            m_sizeOfMvTemporalBuffer,
            0,
            m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_SURFACE_MV_DATA_ENCODE].Value,
            bindingTable->dwBindingTableEntries[startBTI++],
            false));

        CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
            m_hwInterface,
            cmdBuffer,
            &surfaceCodecParams,
            kernelState));
    }

    startBTI = MBENC_B_FRAME_HME_MOTION_PREDICTOR_DATA;

    // HME motion predictor data
    if (m_hmeEnabled)
    {
        CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParams2D(
            &surfaceCodecParams,
            m_hmeKernel->GetSurface(CodechalKernelHme::SurfaceId::me4xMvDataBuffer),
            m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_SURFACE_MV_DATA_ENCODE].Value,
            bindingTable->dwBindingTableEntries[startBTI++],
            m_verticalLineStride,
            false));

        CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
            m_hwInterface,
            cmdBuffer,
            &surfaceCodecParams,
            kernelState));
    }
    else
    {
        startBTI++;
    }

    // Brc Combined Enc parameter surface
    startBTI = MBENC_B_FRAME_BRC_COMBINED_ENC_PARAMETER_SURFACE;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParams1D(
        &surfaceCodecParams,
        &m_brcInputForEncKernelBuffer->sResource,
        HEVC_FRAMEBRC_BUF_CONST_SIZE,
        0,
        m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_SURFACE_ELLC_LLC_ONLY].Value,
        bindingTable->dwBindingTableEntries[startBTI++],
        false));

    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
        m_hwInterface,
        cmdBuffer,
        &surfaceCodecParams,
        kernelState));

    startBTI = MBENC_B_FRAME_VME_PRED_FOR_2X_DS_CURR;
    if (m_isMaxLcu64)
    {
        PMOS_SURFACE currScaledSurface2x = m_trackedBuf->Get2xDsSurface(CODEC_CURR_TRACKED_BUFFER);

        //VME 2X Inter prediction surface for current frame
        CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParamsVME(
            &surfaceCodecParams,
            currScaledSurface2x,
            m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_SURFACE_CURR_ENCODE].Value,
            bindingTable->dwBindingTableEntries[startBTI++]));

        CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
            m_hwInterface,
            cmdBuffer,
            &surfaceCodecParams,
            kernelState));

        CODECHAL_DEBUG_TOOL(
            CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpYUVSurface(
                currScaledSurface2x,
                CodechalDbgAttr::attrReferenceSurfaces,
                "2xScaledSurf")));

        // RefFrame's 2x DS surface
        for (int32_t surface_idx = 0; surface_idx < 4; surface_idx++)
        {
            int32_t       ll     = 0;
            CODEC_PICTURE refPic = m_hevcSliceParams->RefPicList[ll][surface_idx];
            if (!CodecHal_PictureIsInvalid(refPic) &&
                !CodecHal_PictureIsInvalid(m_hevcPicParams->RefFrameList[refPic.FrameIdx]))
            {
                int32_t idx = m_hevcPicParams->RefFrameList[refPic.FrameIdx].FrameIdx;

                // Picture Y VME
                CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParamsVME(
                    &surfaceCodecParams,
                    m_trackedBuf->Get2xDsSurface(m_refList[idx]->ucScalingIdx),
                    m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_SURFACE_REF_ENCODE].Value,
                    bindingTable->dwBindingTableEntries[startBTI++]));

                CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
                    m_hwInterface,
                    cmdBuffer,
                    &surfaceCodecParams,
                    kernelState));

                CODECHAL_DEBUG_TOOL(
                    m_debugInterface->m_refIndex = (uint16_t)refPic.FrameIdx;
                    std::string refSurfName      = "Ref2xScaledSurf" + std::to_string(static_cast<uint32_t>(m_debugInterface->m_refIndex));
                    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpYUVSurface(
                        m_trackedBuf->Get2xDsSurface(m_refList[idx]->ucScalingIdx),
                        CodechalDbgAttr::attrReferenceSurfaces,
                        refSurfName.data())));
            }
            else
            {
                // Providing Dummy surface as per VME requirement.
                CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParamsVME(
                    &surfaceCodecParams,
                    currScaledSurface2x,
                    m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_SURFACE_REF_ENCODE].Value,
                    bindingTable->dwBindingTableEntries[startBTI++]));

                CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
                    m_hwInterface,
                    cmdBuffer,
                    &surfaceCodecParams,
                    kernelState));
            }

            ll     = 1;
            refPic = m_hevcSliceParams->RefPicList[ll][surface_idx];
            if (!CodecHal_PictureIsInvalid(refPic) &&
                !CodecHal_PictureIsInvalid(m_hevcPicParams->RefFrameList[refPic.FrameIdx]))
            {
                int32_t idx = m_hevcPicParams->RefFrameList[refPic.FrameIdx].FrameIdx;

                // Picture Y VME
                CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParamsVME(
                    &surfaceCodecParams,
                    m_trackedBuf->Get2xDsSurface(m_refList[idx]->ucScalingIdx),
                    m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_SURFACE_REF_ENCODE].Value,
                    bindingTable->dwBindingTableEntries[startBTI++]));

                CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
                    m_hwInterface,
                    cmdBuffer,
                    &surfaceCodecParams,
                    kernelState));

                CODECHAL_DEBUG_TOOL(
                    m_debugInterface->m_refIndex = (uint16_t)refPic.FrameIdx;
                    std::string refSurfName      = "Ref2xScaledSurf" + std::to_string(static_cast<uint32_t>(m_debugInterface->m_refIndex));
                    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpYUVSurface(
                        m_trackedBuf->Get2xDsSurface(m_refList[idx]->ucScalingIdx),
                        CodechalDbgAttr::attrReferenceSurfaces,
                        refSurfName.data())));
            }
            else
            {
                // Providing Dummy surface as per VME requirement.
                CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParamsVME(
                    &surfaceCodecParams,
                    currScaledSurface2x,
                    m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_SURFACE_REF_ENCODE].Value,
                    bindingTable->dwBindingTableEntries[startBTI++]));

                CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
                    m_hwInterface,
                    cmdBuffer,
                    &surfaceCodecParams,
                    kernelState));
            }
        }
    }

    // Encoder History Input Buffer
    startBTI = MBENC_B_FRAME_ENCODER_HISTORY_INPUT_BUFFER;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParams2D(
        &surfaceCodecParams,
        &m_encoderHistoryInputBuffer,
        m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_SURFACE_ELLC_LLC_ONLY].Value,
        bindingTable->dwBindingTableEntries[startBTI++],
        m_verticalLineStride,
        true));

    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
        m_hwInterface,
        cmdBuffer,
        &surfaceCodecParams,
        kernelState));

    // Encoder History Output Buffer
    startBTI = MBENC_B_FRAME_ENCODER_HISTORY_OUTPUT_BUFFER;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParams2D(
        &surfaceCodecParams,
        &m_encoderHistoryOutputBuffer,
        m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_SURFACE_ELLC_LLC_ONLY].Value,
        bindingTable->dwBindingTableEntries[startBTI++],
        m_verticalLineStride,
        true));

    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
        m_hwInterface,
        cmdBuffer,
        &surfaceCodecParams,
        kernelState));

    // Kernel debug surface
    startBTI = MBENC_B_FRAME_DEBUG_SURFACE;
    for (uint32_t i = 0; i < CODECHAL_GET_ARRAY_LENGTH(m_debugSurface); i++, startBTI++)
    {
        CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParams1D(
            &surfaceCodecParams,
            &m_debugSurface[i].sResource,
            m_debugSurface[i].dwSize,
            0,
            m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_SURFACE_ELLC_LLC_ONLY].Value,
            bindingTable->dwBindingTableEntries[startBTI],
            false));

        CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
            m_hwInterface,
            cmdBuffer,
            &surfaceCodecParams,
            kernelState));
    }

    return eStatus;
}

//!
//! \brief    Bind the surfaces of the BRC init / reset kernel
//! \details  Two binding table entries are programmed, in order: the BRC
//!           history buffer and the BRC distortion surface.
//! \param    [in] cmdBuffer
//!           Command buffer forwarded to CodecHalSetRcsSurfaceState
//! \param    [in] krnIdx
//!           Kernel selector; only CODECHAL_HEVC_BRC_INIT and
//!           CODECHAL_HEVC_BRC_RESET are accepted
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, MOS_STATUS_INVALID_PARAMETER for
//!           any other kernel index, else the first failing status
//!
MOS_STATUS CodechalEncHevcStateG12::SendBrcInitResetSurfaces(
    PMOS_COMMAND_BUFFER      cmdBuffer,
    CODECHAL_HEVC_BRC_KRNIDX krnIdx)
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    const bool isInitOrReset = (krnIdx == CODECHAL_HEVC_BRC_INIT) || (krnIdx == CODECHAL_HEVC_BRC_RESET);
    if (!isInitOrReset)
    {
        CODECHAL_ENCODE_ASSERTMESSAGE("Brc kernel requested is not init or reset\n");
        return MOS_STATUS_INVALID_PARAMETER;
    }

    // Both lookups are indexed by the (now validated) kernel index
    PMHW_KERNEL_STATE                      brcKernelState  = &m_brcKernelStates[krnIdx];
    PCODECHAL_ENCODE_BINDING_TABLE_GENERIC brcBindingTable = &m_brcKernelBindingTable[krnIdx];
    CODECHAL_SURFACE_CODEC_PARAMS          surfaceParams;
    uint32_t                               bti = 0;

    // Entry 0: BRC History Buffer
    CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParams1D(
        &surfaceParams,
        &m_brcBuffers.resBrcHistoryBuffer,
        m_brcHistoryBufferSize,
        0,
        m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_SURFACE_ELLC_LLC_ONLY].Value,
        brcBindingTable->dwBindingTableEntries[bti],
        true));
    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
        m_hwInterface,
        cmdBuffer,
        &surfaceParams,
        brcKernelState));
    bti++;

    // Entry 1: BRC Distortion Surface
    CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParams2D(
        &surfaceParams,
        m_brcDistortion,
        m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_SURFACE_BRC_ME_DISTORTION_ENCODE].Value,
        brcBindingTable->dwBindingTableEntries[bti],
        0,
        true));
    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
        m_hwInterface,
        cmdBuffer,
        &surfaceParams,
        brcKernelState));

    return eStatus;
}

//!
//! \brief    Fill the BRC constant data surface
//! \details  Locks the surface for writing and copies two tables into it, one
//!           after the other: g_cInit_HEVC_BRC_QP_ADJUST, then the lambda /
//!           mode cost table selected by m_isMaxLcu64. Each copied row is
//!           dwWidth bytes long and the destination advances by dwPitch per
//!           row. Copying of a table stops once less than one full row of
//!           input or of surface space remains.
//! \param    [in] brcConstantData
//!           Surface to fill; must be non-null and have a non-zero dwWidth
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, MOS_STATUS_INVALID_PARAMETER for a
//!           zero-width surface, else the failing status (null parameter,
//!           failed lock or failed copy)
//!
MOS_STATUS CodechalEncHevcStateG12::SetupBrcConstantTable(
    PMOS_SURFACE brcConstantData)
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    CODECHAL_ENCODE_FUNCTION_ENTER;

    CODECHAL_ENCODE_CHK_NULL_RETURN(brcConstantData);

    const uint32_t rowBytes = brcConstantData->dwWidth;
    const uint32_t rowPitch = brcConstantData->dwPitch;

    // With a zero row width neither remaining-size counter could ever drop
    // below the row width, so the copy loops below would never terminate.
    // Reject the surface before it gets locked.
    if (rowBytes == 0)
    {
        CODECHAL_ENCODE_ASSERTMESSAGE("Brc constant data surface has zero width\n");
        return MOS_STATUS_INVALID_PARAMETER;
    }

    MOS_LOCK_PARAMS lockFlags;
    MOS_ZeroMemory(&lockFlags, sizeof(MOS_LOCK_PARAMS));
    lockFlags.WriteOnly = 1;
    uint8_t *outputData = (uint8_t *)m_osInterface->pfnLockResource(m_osInterface, &brcConstantData->OsResource, &lockFlags);
    CODECHAL_ENCODE_CHK_NULL_RETURN(outputData);

    // Remaining writable bytes. It starts as dwHeight * dwPitch and only ever
    // shrinks by dwPitch, so it stays a multiple of the pitch and the
    // subtraction in the loop cannot wrap.
    uint32_t outputSize = brcConstantData->dwHeight * rowPitch;

    // Copies whole rows from a linear table into the pitched surface. The
    // write position (outputData / outputSize) is shared between calls so that
    // the second table lands directly after the first one.
    auto copyRows = [&](const uint8_t *inputData, uint32_t inputSize) -> MOS_STATUS {
        while ((inputSize >= rowBytes) && (outputSize >= rowBytes))
        {
            MOS_STATUS copyStatus = MOS_SecureMemcpy(outputData, outputSize, inputData, rowBytes);
            if (copyStatus != MOS_STATUS_SUCCESS)
            {
                return copyStatus;
            }
            outputData += rowPitch;
            outputSize -= rowPitch;
            inputData += rowBytes;
            inputSize -= rowBytes;
        }
        return MOS_STATUS_SUCCESS;
    };

    // 576-byte of Qp adjust table
    eStatus = copyRows(
        reinterpret_cast<const uint8_t *>(g_cInit_HEVC_BRC_QP_ADJUST),
        sizeof(g_cInit_HEVC_BRC_QP_ADJUST));

    //lambda and mode cost
    if (eStatus == MOS_STATUS_SUCCESS)
    {
        if (m_isMaxLcu64)
        {
            eStatus = copyRows(
                reinterpret_cast<const uint8_t *>(m_brcLcu64x64LambdaModeCostInit),
                sizeof(m_brcLcu64x64LambdaModeCostInit));
        }
        else
        {
            eStatus = copyRows(
                reinterpret_cast<const uint8_t *>(m_brcLcu32x32LambdaModeCostInit),
                sizeof(m_brcLcu32x32LambdaModeCostInit));
        }
    }

    // Release the mapping on every path, including a failed copy
    m_osInterface->pfnUnlockResource(m_osInterface, &brcConstantData->OsResource);

    if (eStatus != MOS_STATUS_SUCCESS)
    {
        CODECHAL_ENCODE_ASSERTMESSAGE("Failed to copy Brc constant data\n");
    }

    return eStatus;
}

//!
//! \brief    Bind the surfaces of the BRC frame update kernel
//! \details  Before any surface is bound, the HCP picture state is written
//!           into the BRC image-state read buffer of the current recycled
//!           buffer index and the BRC constant data surface is refreshed via
//!           SetupBrcConstantTable. Binding table entries are then programmed
//!           in this order: history buffer, previous PAK statistics,
//!           HCP_PIC_STATE read, HCP_PIC_STATE write, (one entry skipped for
//!           the combined ENC-parameter buffer), distortion surface, constant
//!           data surface, pixel MB statistics, MV / distortion summation.
//! \param    [in] cmdBuffer
//!           Command buffer forwarded to CodecHalSetRcsSurfaceState
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, else the first failing status
//!
MOS_STATUS CodechalEncHevcStateG12::SendBrcFrameUpdateSurfaces(
    PMOS_COMMAND_BUFFER cmdBuffer)
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    // Fill HCP_IMG_STATE so that BRC kernel can use it to generate the write buffer for PAK
    PMOS_RESOURCE imgStateReadBuf = &m_brcBuffers.resBrcImageStatesReadBuffer[m_currRecycledBufIdx];

    MHW_VDBOX_HEVC_PIC_STATE picState;
    picState.pHevcEncSeqParams     = m_hevcSeqParams;
    picState.pHevcEncPicParams     = m_hevcPicParams;
    picState.bUseVDEnc             = m_vdencEnabled ? 1 : 0;
    picState.brcNumPakPasses       = m_mfxInterface->GetBrcNumPakPasses();
    picState.sseEnabledInVmeEncode = m_sseEnabled;
    picState.rhodomainRCEnable     = m_brcEnabled && (m_numPipe > 1);
    // SAO is reported only when the sequence enables it and the slice uses it for luma or chroma
    if (m_hevcSeqParams->SAO_enabled_flag)
    {
        picState.bSAOEnable = (m_hevcSliceParams->slice_sao_luma_flag || m_hevcSliceParams->slice_sao_chroma_flag);
    }
    else
    {
        picState.bSAOEnable = 0;
    }
    picState.bTransformSkipEnable = m_hevcPicParams->transform_skip_enabled_flag;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_hcpInterface->AddHcpHevcPicBrcBuffer(imgStateReadBuf, &picState));

    PMOS_SURFACE constDataSurface = &m_brcBuffers.sBrcConstantDataBuffer[m_currRecycledBufIdx];
    CODECHAL_ENCODE_CHK_STATUS_RETURN(SetupBrcConstantTable(constDataSurface));

    PCODECHAL_ENCODE_BINDING_TABLE_GENERIC updateBindingTable = &m_brcKernelBindingTable[CODECHAL_HEVC_BRC_FRAME_UPDATE];
    PMHW_KERNEL_STATE                      updateKernelState  = &m_brcKernelStates[CODECHAL_HEVC_BRC_FRAME_UPDATE];
    CODECHAL_SURFACE_CODEC_PARAMS          surfaceParams;

    // Cacheability values shared by the surfaces bound below
    const auto llcOnlyCacheCtrl    = m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_SURFACE_ELLC_LLC_ONLY].Value;
    const auto distortionCacheCtrl = m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_SURFACE_BRC_ME_DISTORTION_ENCODE].Value;

    // Running index into the binding table; entries are consumed strictly in order
    uint32_t bti = 0;

    // BRC History Buffer
    CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParams1D(
        &surfaceParams,
        &m_brcBuffers.resBrcHistoryBuffer,
        m_brcHistoryBufferSize,
        0,
        llcOnlyCacheCtrl,
        updateBindingTable->dwBindingTableEntries[bti],
        true));
    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
        m_hwInterface,
        cmdBuffer,
        &surfaceParams,
        updateKernelState));
    bti++;

    // BRC Prev PAK statistics output buffer
    CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParams1D(
        &surfaceParams,
        &m_brcBuffers.resBrcPakStatisticBuffer[m_brcBuffers.uiCurrBrcPakStasIdxForRead],
        m_hevcBrcPakStatisticsSize,
        0,
        llcOnlyCacheCtrl,
        updateBindingTable->dwBindingTableEntries[bti],
        false));
    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
        m_hwInterface,
        cmdBuffer,
        &surfaceParams,
        updateKernelState));
    bti++;

    // BRC HCP_PIC_STATE read
    CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParams1D(
        &surfaceParams,
        imgStateReadBuf,
        m_brcBuffers.dwBrcHcpPicStateSize,
        0,
        llcOnlyCacheCtrl,
        updateBindingTable->dwBindingTableEntries[bti],
        false));
    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
        m_hwInterface,
        cmdBuffer,
        &surfaceParams,
        updateKernelState));
    bti++;

    // BRC HCP_PIC_STATE write
    CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParams1D(
        &surfaceParams,
        &m_brcBuffers.resBrcImageStatesWriteBuffer[m_currRecycledBufIdx],
        m_brcBuffers.dwBrcHcpPicStateSize,
        0,
        llcOnlyCacheCtrl,
        updateBindingTable->dwBindingTableEntries[bti],
        true));
    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
        m_hwInterface,
        cmdBuffer,
        &surfaceParams,
        updateKernelState));
    bti++;

    // Combined ENC-parameter buffer: nothing is bound here, its entry is only skipped
    bti++;

    // BRC Distortion Surface
    CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParams2D(
        &surfaceParams,
        m_brcDistortion,
        distortionCacheCtrl,
        updateBindingTable->dwBindingTableEntries[bti],
        0,
        true));
    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
        m_hwInterface,
        cmdBuffer,
        &surfaceParams,
        updateKernelState));
    bti++;

    // BRC Data Surface
    CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParams2D(
        &surfaceParams,
        constDataSurface,
        llcOnlyCacheCtrl,
        updateBindingTable->dwBindingTableEntries[bti],
        0,
        false));
    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
        m_hwInterface,
        cmdBuffer,
        &surfaceParams,
        updateKernelState));
    bti++;

    // Pixel MB Statistics surface
    CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParams1D(
        &surfaceParams,
        &m_resMbStatsBuffer,
        m_hwInterface->m_avcMbStatBufferSize,
        0,
        llcOnlyCacheCtrl,
        updateBindingTable->dwBindingTableEntries[bti],
        false));
    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
        m_hwInterface,
        cmdBuffer,
        &surfaceParams,
        updateKernelState));
    bti++;

    // Mv and Distortion summation surface
    CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParams1D(
        &surfaceParams,
        &m_mvAndDistortionSumSurface.sResource,
        m_mvAndDistortionSumSurface.dwSize,
        0,
        llcOnlyCacheCtrl,
        updateBindingTable->dwBindingTableEntries[bti],
        false));
    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
        m_hwInterface,
        cmdBuffer,
        &surfaceParams,
        updateKernelState));

    // Dumps of the kernel inputs; everything inside the macro argument is debug tooling
    CODECHAL_DEBUG_TOOL(
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpBuffer(
            &m_mvAndDistortionSumSurface.sResource,
            CodechalDbgAttr::attrInput,
            "MvDistSum",
            m_mvAndDistortionSumSurface.dwSize,
            0,
            CODECHAL_MEDIA_STATE_BRC_UPDATE));

        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpBuffer(
            &m_brcBuffers.resBrcImageStatesReadBuffer[m_currRecycledBufIdx],
            CodechalDbgAttr::attrInput,
            "ImgStateRead",
            BRC_IMG_STATE_SIZE_PER_PASS * m_hwInterface->GetMfxInterface()->GetBrcNumPakPasses(),
            0,
            CODECHAL_MEDIA_STATE_BRC_UPDATE));

        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpSurface(
            &m_brcBuffers.sBrcConstantDataBuffer[m_currRecycledBufIdx],
            CodechalDbgAttr::attrInput,
            "ConstData",
            CODECHAL_MEDIA_STATE_BRC_UPDATE));

        // PAK statistics buffer is only dumped for BrcUpdate kernel input
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpBuffer(
            &m_brcBuffers.resBrcPakStatisticBuffer[m_brcBuffers.uiCurrBrcPakStasIdxForRead],
            CodechalDbgAttr::attrInput,
            "PakStats",
            HEVC_BRC_PAK_STATISTCS_SIZE,
            0,
            CODECHAL_MEDIA_STATE_BRC_UPDATE));

        // HEVC maintains a ptr to its own distortion surface, as it may be a couple different surfaces
        if (m_brcDistortion)
        {
            CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpBuffer(
                &m_brcDistortion->OsResource,
                CodechalDbgAttr::attrInput,
                "BrcDist_BeforeFrameBRC",
                m_brcBuffers.sMeBrcDistortionBuffer.dwPitch * m_brcBuffers.sMeBrcDistortionBuffer.dwHeight,
                m_brcBuffers.dwMeBrcDistortionBottomFieldOffset,
                CODECHAL_MEDIA_STATE_BRC_UPDATE));
        }

        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpBuffer(
            &m_brcBuffers.resBrcHistoryBuffer,
            CodechalDbgAttr::attrInput,
            "HistoryRead_beforeFramBRC",
            m_brcHistoryBufferSize,
            0,
            CODECHAL_MEDIA_STATE_BRC_UPDATE));

        if (m_brcBuffers.pMbEncKernelStateInUse)
        {
            CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpCurbe(
                CODECHAL_MEDIA_STATE_BRC_UPDATE,
                m_brcBuffers.pMbEncKernelStateInUse));
        }

        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpBuffer(
            &m_resMbStatsBuffer,
            CodechalDbgAttr::attrInput,
            "MBStatsSurf",
            m_hwInterface->m_avcMbStatBufferSize,
            0,
            CODECHAL_MEDIA_STATE_BRC_UPDATE));)

    return eStatus;
}

// Binds the surfaces of the BRC LCU-update kernel into its binding table.
//
// Binding-table entries are consumed in order. With BRC enabled the first three
// entries receive the BRC history buffer, the distortion surface and the MB
// statistics buffer; with BRC disabled those three entries are skipped. The MB QP
// surface and the ROI surface are always bound in the following two entries.
//
// cmdBuffer is forwarded to CodecHalSetRcsSurfaceState for every surface.
// Returns the first failing status, or MOS_STATUS_SUCCESS.
MOS_STATUS CodechalEncHevcStateG12::SendBrcLcuUpdateSurfaces(
    PMOS_COMMAND_BUFFER cmdBuffer)
{
    PMHW_KERNEL_STATE                      lcuUpdateKernel = &m_brcKernelStates[CODECHAL_HEVC_BRC_LCU_UPDATE];
    PCODECHAL_ENCODE_BINDING_TABLE_GENERIC lcuUpdateTable  = &m_brcKernelBindingTable[CODECHAL_HEVC_BRC_LCU_UPDATE];
    CODECHAL_SURFACE_CODEC_PARAMS          surfaceParams;
    uint32_t                               btSlot = 0;

    // Programs the surface state for whatever surfaceParams currently describes
    auto commitSurface = [&]() -> MOS_STATUS {
        return CodecHalSetRcsSurfaceState(
            m_hwInterface,
            cmdBuffer,
            &surfaceParams,
            lcuUpdateKernel);
    };

    if (!m_brcEnabled)
    {
        // CQP ROI: the three BRC-only entries stay unbound
        btSlot += 3;
    }
    else
    {
        // BRC History Buffer
        CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParams1D(
            &surfaceParams,
            &m_brcBuffers.resBrcHistoryBuffer,
            m_brcHistoryBufferSize,
            0,
            m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_SURFACE_ELLC_LLC_ONLY].Value,
            lcuUpdateTable->dwBindingTableEntries[btSlot++],
            true));
        CODECHAL_ENCODE_CHK_STATUS_RETURN(commitSurface());

        // BRC Distortion Surface - Intra or Inter
        CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParams2D(
            &surfaceParams,
            m_brcDistortion,
            m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_SURFACE_BRC_ME_DISTORTION_ENCODE].Value,
            lcuUpdateTable->dwBindingTableEntries[btSlot++],
            0,
            true));
        CODECHAL_ENCODE_CHK_STATUS_RETURN(commitSurface());

        // Pixel MB Statistics surface
        CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParams1D(
            &surfaceParams,
            &m_resMbStatsBuffer,
            m_hwInterface->m_avcMbStatBufferSize,
            0,
            m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_SURFACE_ELLC_LLC_ONLY].Value,
            lcuUpdateTable->dwBindingTableEntries[btSlot++],
            false));
        CODECHAL_ENCODE_CHK_STATUS_RETURN(commitSurface());
    }

    // MB QP surface
    CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParams2D(
        &surfaceParams,
        &m_brcBuffers.sBrcMbQpBuffer,
        m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_SURFACE_ELLC_LLC_ONLY].Value,
        lcuUpdateTable->dwBindingTableEntries[btSlot++],
        0,
        true));
    CODECHAL_ENCODE_CHK_STATUS_RETURN(commitSurface());

    // ROI surface
    CODECHAL_ENCODE_CHK_STATUS_RETURN(InitSurfaceCodecParams2D(
        &surfaceParams,
        &m_brcBuffers.sBrcRoiSurface,
        m_hwInterface->GetCacheabilitySettings()[MOS_CODEC_RESOURCE_USAGE_SURFACE_BRC_ROI_ENCODE].Value,
        lcuUpdateTable->dwBindingTableEntries[btSlot++],
        0,
        false));
    CODECHAL_ENCODE_CHK_STATUS_RETURN(commitSurface());

    return MOS_STATUS_SUCCESS;
}

// Fills walkerParams for the MBEnc media-object walker.
//
// walkerParams is zeroed first, then the walker mode is taken from walkerCodecParams
// and both loop execution counts are set to 0xFFFF before InitMediaObjectWalker is
// called with the computed thread-space dimensions.
//
// With more than one concurrent group the walker height is derived from both
// resolution components, scaled by the number of kernel sub-threads, plus one;
// otherwise it is the Y resolution scaled by the number of kernel sub-threads.
//
// Returns MOS_STATUS_NULL_POINTER (via the check macro) for a null argument,
// otherwise the status of InitMediaObjectWalker.
MOS_STATUS CodechalEncHevcStateG12::GetCustomDispatchPattern(
    PMHW_WALKER_PARAMS            walkerParams,
    PCODECHAL_WALKER_CODEC_PARAMS walkerCodecParams)
{
    CODECHAL_ENCODE_CHK_NULL_RETURN(walkerParams);
    CODECHAL_ENCODE_CHK_NULL_RETURN(walkerCodecParams);

    MOS_ZeroMemory(walkerParams, sizeof(*walkerParams));
    walkerParams->WalkerMode            = (MHW_WALKER_MODE)walkerCodecParams->WalkerMode;
    walkerParams->dwLocalLoopExecCount  = 0xFFFF;  // MAX VALUE
    walkerParams->dwGlobalLoopExecCount = 0xFFFF;  // MAX VALUE

    // the thread-space sizing below is copied from the kernel ULT
    const uint32_t spaceWidth    = walkerCodecParams->dwResolutionX;
    const uint32_t spaceHeight   = walkerCodecParams->dwResolutionY;
    const uint32_t numGroups     = m_numberConcurrentGroup;
    const uint32_t verticalScale = m_numberEncKernelSubThread;

    uint32_t walkerHeight = spaceHeight;
    if (numGroups > 1)
    {
        walkerHeight = spaceWidth + (spaceWidth + spaceHeight + numGroups - 2) / numGroups;
        walkerHeight *= verticalScale;
        walkerHeight += 1;
    }
    else
    {
        walkerHeight *= verticalScale;
    }

    const uint32_t localLoopExecCount = m_degree45Needed ? (2 * m_numWavefrontInOneRegion + 1) : m_numWavefrontInOneRegion;

    return InitMediaObjectWalker(spaceWidth,
        walkerHeight,
        numGroups - 1,
        m_swScoreboardState->GetDependencyPattern(),
        m_numberEncKernelSubThread - 1,
        localLoopExecCount,
        *walkerParams);
}

// Builds the per-LCU slice/tile table of the current frame and writes it to
// lcuLevelInputDataSurfaceParam, one pitch-strided surface row per LCU row.
//
// Every LCU covered by a slice records the slice start/end LCU index, slice id,
// tile id and the tile start/end coordinates (in LCU units). LCUs that no slice
// covers are written as zero. In the tiled case each slice is assumed to lie
// entirely inside one tile and its LCUs are walked row by row inside that tile;
// without tiles the walk wraps at the frame width.
//
// Returns:
//   MOS_STATUS_NULL_POINTER       - m_tileParams is null, the surface resource is
//                                   null, the scratch allocation or the lock failed
//   MOS_STATUS_INVALID_PARAMETER  - a slice's start address / LCU count would place
//                                   an LCU outside of the frame
//   status of IsSliceInTile       - when that call fails
//   MOS_STATUS_SUCCESS            - otherwise
MOS_STATUS CodechalEncHevcStateG12::GenerateLcuLevelData(MOS_SURFACE &lcuLevelInputDataSurfaceParam)
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    CODECHAL_ENCODE_FUNCTION_ENTER;

    CODECHAL_ENCODE_CHK_NULL_RETURN(m_tileParams);

    // Without a destination surface nothing can be written, so fail before allocating
    if (Mos_ResourceIsNull(&lcuLevelInputDataSurfaceParam.OsResource))
    {
        CODECHAL_ENCODE_ASSERTMESSAGE("Null pointer exception\n");
        return MOS_STATUS_NULL_POINTER;
    }

    const uint32_t numTileColumns = m_hevcPicParams->num_tile_columns_minus1 + 1;
    const uint32_t numTileRows    = m_hevcPicParams->num_tile_rows_minus1 + 1;

    // Min-CB units are converted to LCU units with a rounding-up shift
    const uint32_t shift    = m_hevcSeqParams->log2_max_coding_block_size_minus3 - m_hevcSeqParams->log2_min_coding_block_size_minus3;
    const uint32_t residual = (1 << shift) - 1;

    const uint32_t frameWidthInLcu  = (m_hevcSeqParams->wFrameWidthInMinCbMinus1 + 1 + residual) >> shift;
    const uint32_t frameHeightInLcu = (m_hevcSeqParams->wFrameHeightInMinCbMinus1 + 1 + residual) >> shift;

    // One zero-initialised, row-major scratch table: entry (x, y) lives at y * frameWidthInLcu + x.
    // A single allocation means every exit path needs exactly one MOS_FreeMemory.
    const size_t    lcuCount = static_cast<size_t>(frameWidthInLcu) * frameHeightInLcu;
    PLCU_LEVEL_DATA lcuInfo  = (PLCU_LEVEL_DATA)MOS_AllocMemory(sizeof(LCU_LEVEL_DATA) * lcuCount);
    CODECHAL_ENCODE_CHK_NULL_RETURN(lcuInfo);
    MOS_ZeroMemory(lcuInfo, sizeof(LCU_LEVEL_DATA) * lcuCount);

    // Stamps 'entry' on every LCU of slice slcIdx. The walk starts at the slice's
    // segment address and wraps from rowEndX back to rowStartX on the next LCU row.
    // Returns false, without writing, as soon as the cursor is outside of the frame.
    auto stampSlice = [&](uint32_t slcIdx, const LCU_LEVEL_DATA &entry, uint32_t rowStartX, uint32_t rowEndX) -> bool {
        const uint32_t sliceStartLcu = m_hevcSliceParams[slcIdx].slice_segment_address;
        const uint32_t numLcus       = m_hevcSliceParams[slcIdx].NumLCUsInSlice;
        uint32_t       lcuX          = sliceStartLcu % frameWidthInLcu;
        uint32_t       lcuY          = sliceStartLcu / frameWidthInLcu;

        for (uint32_t i = 0; i < numLcus; i++)
        {
            if (lcuX >= frameWidthInLcu || lcuY >= frameHeightInLcu)
            {
                return false;
            }
            lcuInfo[static_cast<size_t>(lcuY) * frameWidthInLcu + lcuX] = entry;

            lcuX++;
            if (lcuX >= rowEndX)
            {
                lcuX = rowStartX;
                lcuY++;
            }
        }
        return true;
    };

    // Fills the scratch table; kept in one place so that the caller owns the only cleanup path
    auto populate = [&]() -> MOS_STATUS {
        if (numTileColumns > 1 || numTileRows > 1)
        {
            // Tiling case. This assumes that the entire Slice is contained within a Tile
            for (uint32_t tileRow = 0; tileRow < numTileRows; tileRow++)
            {
                for (uint32_t tileCol = 0; tileCol < numTileColumns; tileCol++)
                {
                    const uint32_t                       tileId      = tileRow * numTileColumns + tileCol;
                    MHW_VDBOX_HCP_TILE_CODING_PARAMS_G12 currentTile = m_tileParams[tileId];

                    const uint32_t tileColumnWidth = (currentTile.TileWidthInMinCbMinus1 + 1 + residual) >> shift;
                    const uint32_t tileRowHeight   = (currentTile.TileHeightInMinCbMinus1 + 1 + residual) >> shift;

                    // startLCU counts the LCUs of all slices preceding the current one
                    uint32_t startLCU = 0;
                    for (uint32_t slcCount = 0; slcCount < m_numSlices; slcCount++)
                    {
                        const uint32_t numLcusInSlice  = m_hevcSliceParams[slcCount].NumLCUsInSlice;
                        bool           lastSliceInTile = false;
                        bool           sliceInTile     = false;

                        const MOS_STATUS tileStatus = (MOS_STATUS)IsSliceInTile(slcCount,
                            &currentTile,
                            &sliceInTile,
                            &lastSliceInTile);
                        if (tileStatus != MOS_STATUS_SUCCESS)
                        {
                            return tileStatus;
                        }

                        if (sliceInTile)
                        {
                            LCU_LEVEL_DATA entry       = {};
                            entry.SliceStartLcuIndex   = (uint16_t)startLCU;
                            entry.SliceEndLcuIndex     = (uint16_t)(startLCU + numLcusInSlice);  // this should be next slice start index
                            entry.SliceId              = (uint16_t)slcCount;
                            entry.TileId               = (uint16_t)tileId;
                            entry.TileStartCoordinateX = (uint16_t)currentTile.TileStartLCUX;
                            entry.TileStartCoordinateY = (uint16_t)currentTile.TileStartLCUY;
                            entry.TileEndCoordinateX   = (uint16_t)(currentTile.TileStartLCUX + tileColumnWidth);
                            entry.TileEndCoordinateY   = (uint16_t)(currentTile.TileStartLCUY + tileRowHeight);

                            if (!stampSlice(slcCount, entry, currentTile.TileStartLCUX, currentTile.TileStartLCUX + tileColumnWidth))
                            {
                                return MOS_STATUS_INVALID_PARAMETER;
                            }
                        }
                        startLCU += numLcusInSlice;
                    }
                }
            }
            return MOS_STATUS_SUCCESS;
        }

        // Non-tiling case: the whole frame acts as the single tile
        uint32_t startLCU = 0;
        for (uint32_t slcCount = 0; slcCount < m_numSlices; slcCount++)
        {
            const uint32_t numLcusInSlice = m_hevcSliceParams[slcCount].NumLCUsInSlice;

            LCU_LEVEL_DATA entry       = {};
            entry.SliceStartLcuIndex   = (uint16_t)startLCU;
            entry.SliceEndLcuIndex     = (uint16_t)(startLCU + numLcusInSlice);  // this should be next slice start index
            entry.SliceId              = (uint16_t)slcCount;
            entry.TileId               = 0;
            entry.TileStartCoordinateX = 0;
            entry.TileStartCoordinateY = 0;
            entry.TileEndCoordinateX   = (uint16_t)frameWidthInLcu;
            entry.TileEndCoordinateY   = (uint16_t)frameHeightInLcu;

            if (!stampSlice(slcCount, entry, 0, frameWidthInLcu))
            {
                return MOS_STATUS_INVALID_PARAMETER;
            }
            startLCU += numLcusInSlice;
        }
        return MOS_STATUS_SUCCESS;
    };

    eStatus = populate();
    if (eStatus != MOS_STATUS_SUCCESS)
    {
        MOS_FreeMemory(lcuInfo);
        CODECHAL_ENCODE_ASSERTMESSAGE("Failed to build the LCU level data from the slice/tile parameters\n");
        return eStatus;
    }

    // Write LCU Info to the surface
    MOS_LOCK_PARAMS lockFlags;
    MOS_ZeroMemory(&lockFlags, sizeof(MOS_LOCK_PARAMS));
    lockFlags.WriteOnly = 1;
    uint8_t *surfaceRow = (uint8_t *)m_osInterface->pfnLockResource(
        m_osInterface,
        &lcuLevelInputDataSurfaceParam.OsResource,
        &lockFlags);
    if (surfaceRow == nullptr)
    {
        MOS_FreeMemory(lcuInfo);
        CODECHAL_ENCODE_ASSERTMESSAGE("Failed to lock the LCU level input data surface\n");
        return MOS_STATUS_NULL_POINTER;
    }

    // Rows are contiguous in the scratch table but dwPitch bytes apart in the surface
    const LCU_LEVEL_DATA *tableRow = lcuInfo;
    for (uint32_t lcuY = 0; lcuY < frameHeightInLcu; lcuY++)
    {
        PLCU_LEVEL_DATA dstRow = (PLCU_LEVEL_DATA)surfaceRow;
        for (uint32_t lcuX = 0; lcuX < frameWidthInLcu; lcuX++)
        {
            dstRow[lcuX] = tableRow[lcuX];
        }
        tableRow += frameWidthInLcu;
        surfaceRow += lcuLevelInputDataSurfaceParam.dwPitch;
    }

    m_osInterface->pfnUnlockResource(
        m_osInterface,
        &lcuLevelInputDataSurfaceParam.OsResource);

    // Freeing the temporarily allocated memory
    MOS_FreeMemory(lcuInfo);
    return eStatus;
}

// Helper class to describe quadtree node.
// A node is a square block of (CTB size >> level) pixels whose top-left corner is
// at (m_x, m_y). Build() splits a node into four equal children whenever the block
// straddles the right or the bottom picture edge, and recurses into the children.
class QuadTreeNode
{
    friend class QuadTree;

public:
    // ctb         - root node of the CTB this block belongs to
    // x, y        - top-left corner of the block
    // level       - split depth, 0 for the CTB itself
    // ctbLog2Size - log2 of the CTB size; the block size is (1 << ctbLog2Size) >> level
    QuadTreeNode(const QuadTreeNode *ctb, uint32_t x, uint32_t y, uint32_t level, uint32_t ctbLog2Size)
        : m_ctb(ctb),
          m_x(x),
          m_y(y),
          m_level(level),
          m_size((1 << ctbLog2Size) >> level),
          m_ctbLog2Size(ctbLog2Size)
    {
    }

protected:
    // Recursively splits this block while it crosses the picWidth x picHeight boundary
    void Build(uint32_t picWidth, uint32_t picHeight)
    {
        if (!DoesBlockCrossCodedPicture(picWidth, picHeight))
        {
            return;
        }

        CreateCUs();
        for (QuadTreeNode &child : m_childBlocks)
        {
            child.Build(picWidth, picHeight);
        }
    }

    // Appends the four quadrants in the order top-left, top-right, bottom-left, bottom-right
    void CreateCUs()
    {
        const uint32_t half       = m_size / 2;
        const uint32_t childLevel = m_level + 1;
        const uint32_t rightX     = m_x + half;
        const uint32_t bottomY    = m_y + half;

        m_childBlocks.emplace_back(m_ctb, m_x, m_y, childLevel, m_ctbLog2Size);
        m_childBlocks.emplace_back(m_ctb, rightX, m_y, childLevel, m_ctbLog2Size);
        m_childBlocks.emplace_back(m_ctb, m_x, bottomY, childLevel, m_ctbLog2Size);
        m_childBlocks.emplace_back(m_ctb, rightX, bottomY, childLevel, m_ctbLog2Size);
    }

    // True when the block starts inside the w x h picture but extends past its
    // right edge, or starts above the bottom edge but extends past it
    bool DoesBlockCrossCodedPicture(uint32_t w, uint32_t h) const
    {
        const bool crossesRight  = (m_x < w) && ((m_x + m_size) > w);
        const bool crossesBottom = (m_y < h) && ((m_y + m_size) > h);
        return crossesRight || crossesBottom;
    }

public:
    const QuadTreeNode *      m_ctb         = nullptr;  // the root of CTB
    const uint32_t            m_x           = 0;        // left coordinate of the block
    const uint32_t            m_y           = 0;        // top coordinate of the block
    const uint32_t            m_level       = 0;        // split depth, 0 = CTB
    const uint32_t            m_size        = 0;        // block edge length
    const uint32_t            m_ctbLog2Size = 0;        // log2 of the CTB edge length
    std::vector<QuadTreeNode> m_childBlocks = {};       // empty for a leaf, otherwise four quadrants
};

// Quadtree rooted at one CTB. After BuildQuadTree() the public CUs list holds
// pointers to the leaf blocks that lie completely inside the picture.
class QuadTree : public QuadTreeNode
{
public:
    // x, y - top-left corner of the CTB; ctbLog2Size - log2 of the CTB size
    QuadTree(uint32_t x, uint32_t y, uint32_t ctbLog2Size)
        : QuadTreeNode(this, x, y, 0, ctbLog2Size)
    {
    }

    // Build quadtree in the way none of the blocks crosses picture boundary,
    // then collect the leaves that are inside the picture into CUs
    void BuildQuadTree(uint32_t width, uint32_t height)
    {
        m_picWidth  = width;
        m_picHeight = height;

        Build(width, height);

        CUs.reserve(64);
        FillCuList(*this, CUs);
    }

    // Sets the split flags of pakObj.DW1 for blk and, recursively, for all of its
    // descendants. A block without children contributes nothing. Level 0 sets the
    // single level-0 flag, level 1 sets one bit per split quadrant, level 2 sets one
    // bit in the field that belongs to its level-1 parent quadrant. Deeper levels
    // set no flag of their own.
    static void GetSplitFlags(const QuadTreeNode &blk, HcpPakObjectG12 &pakObj)
    {
        if (blk.m_childBlocks.empty())  // Block doesn't have splits
        {
            return;
        }

        // Raster index (0..3) of the (1 << log2CbSize)-sized quadrant of the block at
        // (x0, y0) that contains (x, y)
        auto quadrantOf = [](uint32_t x0, uint32_t y0, uint32_t x, uint32_t y, uint32_t log2CbSize) {
            auto const nCbS = (1 << log2CbSize);
            return (x - x0) / nCbS + (y - y0) / nCbS * 2;
        };

        if (blk.m_level == 0)
        {
            pakObj.DW1.Split_flag_level0 = 1;
        }
        else if (blk.m_level == 1)
        {
            auto const quadrant = quadrantOf(blk.m_ctb->m_x, blk.m_ctb->m_y, blk.m_x, blk.m_y, blk.m_ctbLog2Size - 1);
            pakObj.DW1.Split_flag_level1 |= 1 << quadrant;
        }
        else if (blk.m_level == 2)
        {
            // First locate the level-1 quadrant that contains the block ...
            auto const parentQuadrant = quadrantOf(blk.m_ctb->m_x, blk.m_ctb->m_y, blk.m_x, blk.m_y, blk.m_ctbLog2Size - 1);
            auto const parentSize     = (1 << (blk.m_ctbLog2Size - 1));
            auto const parentX        = blk.m_ctb->m_x + parentSize * (parentQuadrant % 2);
            auto const parentY        = blk.m_ctb->m_y + parentSize * (parentQuadrant / 2);
            // ... then the block's position inside that quadrant
            auto const quadrant = quadrantOf(parentX, parentY, blk.m_x, blk.m_y, blk.m_ctbLog2Size - 2);

            switch (parentQuadrant)
            {
            case 0:
                pakObj.DW1.Split_flag_level2_level1part0 |= 1 << quadrant;
                break;
            case 1:
                pakObj.DW1.Split_flag_level2_level1part1 |= 1 << quadrant;
                break;
            case 2:
                pakObj.DW1.Split_flag_level2_level1part2 |= 1 << quadrant;
                break;
            case 3:
                pakObj.DW1.Split_flag_level2_level1part3 |= 1 << quadrant;
                break;
            }
        }

        for (const QuadTreeNode &child : blk.m_childBlocks)
        {
            GetSplitFlags(child, pakObj);
        }
    }

protected:
    // Prepare a list of CU inside a coded picture boundary: a leaf is appended when it
    // lies fully inside the picture, any other node is searched through its children
    void FillCuList(const QuadTreeNode &cu, std::vector<const QuadTreeNode *> &list)
    {
        const bool isLeaf        = cu.m_childBlocks.empty();
        const bool insidePicture = ((cu.m_x + cu.m_size) <= m_picWidth) && ((cu.m_y + cu.m_size) <= m_picHeight);

        if (isLeaf && insidePicture)
        {
            list.push_back(&cu);
            return;
        }

        for (const QuadTreeNode &child : cu.m_childBlocks)
        {
            FillCuList(child, list);
        }
    }

    uint32_t m_picWidth  = 0;  // picture width given to BuildQuadTree
    uint32_t m_picHeight = 0;  // picture height given to BuildQuadTree

public:
    std::vector<const QuadTreeNode *> CUs = {};  // leaves inside the picture, filled by BuildQuadTree
};

// Fills skipframeInfo.m_resMbCodeSkipFrameSurface with PAK objects and CU records
// that encode every CTB of every slice as skipped inter CUs, and stores the slice
// count in skipframeInfo.numSlices.
//
// The surface is zeroed over m_mbCodeSize + 8 cache lines. PAK objects start at the
// beginning of the surface, CU records at offset m_mvOffset; each CTB owns a fixed
// run of maxNumCuInCtb CU records. CTBs that cross the right and/or bottom picture
// edge take their split flags and CU list from a quadtree prepared for that edge;
// all other CTBs are written as a single CU of CTB size.
//
// All validation and quadtree preparation happens before the surface is locked, so
// no early return can leave the resource locked.
//
// Returns MOS_STATUS_SUCCESS, the status of the dimension checks, MOS_STATUS_NULL_POINTER
// if the lock fails, or (debug builds) the status of the buffer dump.
MOS_STATUS CodechalEncHevcStateG12::GenerateSkipFrameMbCodeSurface(SkipFrameInfo &skipframeInfo)
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;
    CODECHAL_ENCODE_FUNCTION_ENTER;

    const uint32_t mbCodeSize  = m_mbCodeSize + 8 * CODECHAL_CACHELINE_SIZE;
    const uint32_t ctbLog2Size = m_hevcSeqParams->log2_max_coding_block_size_minus3 + 3;

    auto const ctbSize        = 1 << ctbLog2Size;
    auto const maxNumCuInCtb  = (ctbSize / CODECHAL_HEVC_MIN_CU_SIZE) * (ctbSize / CODECHAL_HEVC_MIN_CU_SIZE);
    auto const picWidthInCtb  = MOS_ROUNDUP_DIVIDE(m_frameWidth, ctbSize);
    auto const picHeightInCtb = MOS_ROUNDUP_DIVIDE(m_frameHeight, ctbSize);
    CODECHAL_ENCODE_CHK_COND_RETURN(picWidthInCtb <= 0, "Invalid m_frameWidth");
    CODECHAL_ENCODE_CHK_COND_RETURN(picHeightInCtb <= 0, "Invalid m_frameHeight");

    // Exclusive end position (in CTBs) of every tile column. A zero width marks the
    // column that runs to the picture edge; entries after it stay zero.
    const uint32_t        numTileColumns = m_hevcPicParams->num_tile_columns_minus1 + 1;
    std::vector<uint32_t> tileColumnEnd(numTileColumns, 0);
    for (uint32_t i = 0; i < numTileColumns; i++)
    {
        if (m_hevcPicParams->tile_column_width[i] == 0)
        {
            tileColumnEnd[i] = picWidthInCtb;
            break;
        }

        const uint32_t columnStart = (i == 0) ? 0 : tileColumnEnd[i - 1];
        tileColumnEnd[i]           = columnStart + m_hevcPicParams->tile_column_width[i];
    }

    // Prepare CTB splits for corner cases:
    // Last column
    QuadTree lastColumnCtb((picWidthInCtb - 1) * ctbSize, 0, ctbLog2Size);
    lastColumnCtb.BuildQuadTree(m_frameWidth, m_frameHeight);

    // Last row
    QuadTree lastRowCtb(0, (picHeightInCtb - 1) * ctbSize, ctbLog2Size);
    lastRowCtb.BuildQuadTree(m_frameWidth, m_frameHeight);

    // Right bottom CTB
    QuadTree lastColRowCtb((picWidthInCtb - 1) * ctbSize, (picHeightInCtb - 1) * ctbSize, ctbLog2Size);
    lastColRowCtb.BuildQuadTree(m_frameWidth, m_frameHeight);

    // Smallest l with (1 << l) >= x
    auto CeilLog2 = [](uint32_t x) {
        auto l = 0;
        while (x > (1U << l)) l++;
        return l;
    };

    MOS_LOCK_PARAMS lockFlags = {};
    lockFlags.WriteOnly       = 1;
    uint8_t *data             = (uint8_t *)m_osInterface->pfnLockResource(m_osInterface, &skipframeInfo.m_resMbCodeSkipFrameSurface, &lockFlags);
    CODECHAL_ENCODE_CHK_NULL_RETURN(data);
    MOS_ZeroMemory(data, mbCodeSize);

    auto pakObjData = (HcpPakObjectG12 *)data;
    auto cuData     = (EncodeHevcCuDataG12 *)(data + m_mvOffset);

    for (uint32_t slcIdx = 0; slcIdx < m_numSlices; ++slcIdx)
    {
        const uint32_t sliceFirstCtbIdx = m_hevcSliceParams[slcIdx].slice_segment_address;
        const uint32_t numCtbsInSlice   = m_hevcSliceParams[slcIdx].NumLCUsInSlice;

        uint32_t ctbXAddr  = sliceFirstCtbIdx % picWidthInCtb;
        uint32_t ctbYAddr  = sliceFirstCtbIdx / picWidthInCtb;
        uint32_t tileStart = 0;
        uint32_t tileEnd   = 0;
        for (uint32_t i = 0; i < numTileColumns; i++)
        {
            //Determine what tile slice belongs to
            if (ctbXAddr < tileColumnEnd[i])
            {
                tileEnd   = tileColumnEnd[i];
                tileStart = (i == 0) ? 0 : tileColumnEnd[i - 1];
                break;
            }
        }

        for (uint32_t ctbIdxInSlice = 0; ctbIdxInSlice < numCtbsInSlice; ++ctbIdxInSlice, ++pakObjData, ++ctbXAddr)
        {
            // Wrap to the first CTB of the next row inside the tile column
            if (ctbXAddr >= tileEnd)
            {
                ctbYAddr++;
                ctbXAddr = tileStart;
            }
            pakObjData->DW0.Type                    = 0x03;
            pakObjData->DW0.Opcode                  = 0x27;
            pakObjData->DW0.SubOp                   = 0x21;
            pakObjData->DW0.DwordLength             = 0x3;
            pakObjData->DW2.Current_LCU_X_Addr      = ctbXAddr;
            pakObjData->DW2.Current_LCU_Y_Addr      = ctbYAddr;
            pakObjData->DW4.LCUForceZeroCoeff       = 1;  // Force skip CUs
            pakObjData->DW4.Disable_SAO_On_LCU_Flag = 1;

            // Pick the prepared quadtree matching the picture edge(s) this CTB crosses;
            // nullptr means the CTB lies fully inside the picture.
            const bool bCtbCrossRightPicBoundary  = (ctbXAddr + 1) * ctbSize > m_frameWidth;
            const bool bCtbCrossBottomPicBoundary = (ctbYAddr + 1) * ctbSize > m_frameHeight;

            const QuadTree *boundaryCtb = nullptr;
            if (bCtbCrossRightPicBoundary && bCtbCrossBottomPicBoundary)
            {
                boundaryCtb = &lastColRowCtb;
            }
            else if (bCtbCrossRightPicBoundary)
            {
                boundaryCtb = &lastColumnCtb;
            }
            else if (bCtbCrossBottomPicBoundary)
            {
                boundaryCtb = &lastRowCtb;
            }

            uint32_t nCUs = 1;  // For regular CTB, CU splits are not needed. All level values are zero
            if (boundaryCtb != nullptr)
            {
                QuadTree::GetSplitFlags(*boundaryCtb, *pakObjData);
                nCUs = static_cast<uint32_t>(boundaryCtb->CUs.size());
            }
            pakObjData->DW1.CU_count_minus1 = nCUs - 1;

            if (ctbIdxInSlice == (numCtbsInSlice - 1))
            {
                pakObjData->DW1.LastCtbOfTileFlag = pakObjData->DW1.LastCtbOfSliceFlag = 1;
                pakObjData->DW5                                                        = 0x05000000;  // Add batch buffer end flag
            }

            // Fill CU records
            for (uint32_t cuIdx = 0; cuIdx < nCUs; ++cuIdx, ++cuData)
            {
                cuData->DW7_CuPredMode = 1;  // Inter

                // Note that this can work only for B slices.
                // If P slice support appears, we need to have the 2nd skipFrameMbCodeSurface
                // When panic mode is triggered backwards reference only should be used
                cuData->DW7_InterPredIdcMv0 = 0;
                cuData->DW7_InterPredIdcMv1 = 0;

                if (boundaryCtb != nullptr)
                {
                    cuData->DW7_CuSize = CeilLog2(boundaryCtb->CUs[cuIdx]->m_size) - 3;
                }
                else
                {
                    cuData->DW7_CuSize = m_hevcSeqParams->log2_max_coding_block_size_minus3;
                }

                if (cuData->DW7_CuSize == 3)  // 64x64
                {
                    cuData->DW5_TuSize        = 0xff;  // 4 TUs 32x32
                    cuData->DW6_TuCountMinus1 = 3;
                }
                else if (cuData->DW7_CuSize == 2)  // 32x32
                {
                    cuData->DW5_TuSize = 3;  // 1 TU 32x32
                }
                else if (cuData->DW7_CuSize == 1)  // 16x16
                {
                    cuData->DW5_TuSize = 2;  // 1 TU 16x16
                }
                else  // 8x8
                {
                    cuData->DW5_TuSize = 1;  // 1 TU 8x8
                }
            }
            cuData += (maxNumCuInCtb - nCUs);  // Shift to CUs of next CTB
        }
    }
    m_osInterface->pfnUnlockResource(m_osInterface, &skipframeInfo.m_resMbCodeSkipFrameSurface);

    skipframeInfo.numSlices = m_numSlices;

    #if USE_CODECHAL_DEBUG_TOOL
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpBuffer(
        &skipframeInfo.m_resMbCodeSkipFrameSurface,
        CodechalDbgAttr::attrInput,
        "SkipFrameSurface",
        mbCodeSize,
        0,
        CODECHAL_MEDIA_STATE_BRC_UPDATE));
    #endif

    return eStatus;
}

// Writes the slice / thread-group LCU coordinates into concurrentThreadGroupData.
//
// The resource is locked for writing, zeroed over its iSize bytes, and for every
// slice the X/Y LCU position of its slice_segment_address is stored in the start
// and end fields of the record at the beginning of the buffer. The record pointer
// is not advanced, so after the loop the record describes the last slice.
//
// Returns MOS_STATUS_NULL_POINTER when the resource is null or cannot be locked,
// MOS_STATUS_SUCCESS otherwise.
MOS_STATUS CodechalEncHevcStateG12::GenerateConcurrentThreadGroupData(MOS_RESOURCE &concurrentThreadGroupData)
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    CODECHAL_ENCODE_FUNCTION_ENTER;

    if (Mos_ResourceIsNull(&concurrentThreadGroupData))
    {
        CODECHAL_ENCODE_ASSERTMESSAGE("Null pointer exception\n");
        return MOS_STATUS_NULL_POINTER;
    }

    MOS_LOCK_PARAMS lockFlags;
    MOS_ZeroMemory(&lockFlags, sizeof(MOS_LOCK_PARAMS));
    lockFlags.WriteOnly                            = 1;
    PCONCURRENT_THREAD_GROUP_DATA concurrentTgData = (PCONCURRENT_THREAD_GROUP_DATA)m_osInterface->pfnLockResource(
        m_osInterface,
        &concurrentThreadGroupData,
        &lockFlags);
    CODECHAL_ENCODE_CHK_NULL_RETURN(concurrentTgData);

    MOS_ZeroMemory(concurrentTgData, concurrentThreadGroupData.iSize);

    // Frame width converted from min-CB units to LCU units, rounding up
    const uint32_t shift           = m_hevcSeqParams->log2_max_coding_block_size_minus3 - m_hevcSeqParams->log2_min_coding_block_size_minus3;
    const uint32_t residual        = (1 << shift) - 1;
    const uint32_t frameWidthInLCU = (m_hevcSeqParams->wFrameWidthInMinCbMinus1 + 1 + residual) >> shift;

    // Currently only using one thread group for each slice. Extend it to multiple soon.
    //
    // The previous loop header advanced a running LCU count with
    // m_hevcSliceParams[slcCount] *after* incrementing slcCount, which read one
    // element past the slice array on the last iteration. That count only fed a
    // value that was never used, so it is removed and the stored data is unchanged.
    for (uint32_t slcCount = 0; slcCount < m_numSlices; slcCount++)
    {
        const uint32_t sliceStartLcu  = m_hevcSliceParams[slcCount].slice_segment_address;
        const uint16_t sliceStartLcuX = (uint16_t)(sliceStartLcu % frameWidthInLCU);
        const uint16_t sliceStartLcuY = (uint16_t)(sliceStartLcu / frameWidthInLCU);

        // NOTE(review): the end coordinates have always been derived from the slice
        // START address (not from start + NumLCUsInSlice). This looks like a
        // copy-paste slip, but the kernel's expectation is not visible here, so the
        // written values are kept as they were - confirm against the kernel interface.
        const uint16_t sliceEndLcuX = sliceStartLcuX;
        const uint16_t sliceEndLcuY = sliceStartLcuY;

        concurrentTgData->CurrSliceStartLcuX = sliceStartLcuX;
        concurrentTgData->CurrSliceStartLcuY = sliceStartLcuY;

        concurrentTgData->CurrSliceEndLcuX = sliceEndLcuX;
        concurrentTgData->CurrSliceEndLcuY = sliceEndLcuY;

        concurrentTgData->CurrTgStartLcuX = sliceStartLcuX;
        concurrentTgData->CurrTgStartLcuY = sliceStartLcuY;

        concurrentTgData->CurrTgEndLcuX = sliceEndLcuX;
        concurrentTgData->CurrTgEndLcuY = sliceEndLcuY;
    }

    m_osInterface->pfnUnlockResource(
        m_osInterface,
        &concurrentThreadGroupData);

    return eStatus;
}

MOS_STATUS CodechalEncHevcStateG12::EncodeMbEncKernel(
    CODECHAL_MEDIA_STATE_TYPE encFunctionType)
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    PerfTagSetting perfTag;
    CODECHAL_ENCODE_SET_PERFTAG_INFO(perfTag, CODECHAL_ENCODE_PERFTAG_CALL_MBENC_KERNEL);

    // Initialize DSH kernel state
    PMHW_KERNEL_STATE            kernelState;
    CODECHAL_WALKER_CODEC_PARAMS walkerCodecParams;
    CODECHAL_WALKER_DEGREE       walkerDegree;
    MHW_WALKER_PARAMS            walkerParams;
    uint32_t                     walkerResolutionX, walkerResolutionY;
    bool                         customDispatchPattern = true;
    uint16_t                     totalThreadNumPerLcu  = 1;

    if (m_hevcPicParams->CodingType == I_TYPE)
    {
        encFunctionType = CODECHAL_MEDIA_STATE_HEVC_I_MBENC;
    }
    else
    {
        encFunctionType = m_isMaxLcu64 ? CODECHAL_MEDIA_STATE_HEVC_LCU64_B_MBENC : CODECHAL_MEDIA_STATE_HEVC_B_MBENC;
    }

    if (m_isMaxLcu64)
    {
        kernelState = &m_mbEncKernelStates[MBENC_LCU64_KRNIDX];
        if (m_hevcSeqParams->TargetUsage == 1)
        {
            walkerResolutionX = MOS_ALIGN_CEIL(m_frameWidth, MAX_LCU_SIZE) >> 6;
            walkerResolutionY = MOS_ALIGN_CEIL(m_frameHeight, MAX_LCU_SIZE) >> 6;
        }
        else
        {
            walkerResolutionX = 2 * (MOS_ALIGN_CEIL(m_frameWidth, MAX_LCU_SIZE) >> 6);
            walkerResolutionY = 2 * (MOS_ALIGN_CEIL(m_frameHeight, MAX_LCU_SIZE) >> 6);
        }
    }
    else
    {
        kernelState       = &m_mbEncKernelStates[MBENC_LCU32_KRNIDX];
        walkerResolutionX = MOS_ALIGN_CEIL(m_frameWidth, 32) >> 5;
        walkerResolutionY = MOS_ALIGN_CEIL(m_frameHeight, 32) >> 5;
    }

    MOS_ZeroMemory(&walkerCodecParams, sizeof(walkerCodecParams));
    walkerCodecParams.WalkerMode             = m_walkerMode;
    walkerCodecParams.dwResolutionX          = walkerResolutionX;
    walkerCodecParams.dwResolutionY          = walkerResolutionY;
    walkerCodecParams.dwNumSlices            = m_numSlices;
    walkerCodecParams.usTotalThreadNumPerLcu = totalThreadNumPerLcu;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(GetCustomDispatchPattern(&walkerParams, &walkerCodecParams));

    // If Single Task Phase is not enabled, use BT count for the kernel state.
    if (m_firstTaskInPhase == true || !m_singleTaskPhaseSupported)
    {
        uint32_t maxBtCount = m_singleTaskPhaseSupported ? m_maxBtCount : kernelState->KernelParams.iBTCount;
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_stateHeapInterface->pfnRequestSshSpaceForCmdBuf(
            m_stateHeapInterface,
            maxBtCount));
        m_vmeStatesSize = m_hwInterface->GetKernelLoadCommandSize(maxBtCount);
        CODECHAL_ENCODE_CHK_STATUS_RETURN(VerifySpaceAvailable());
    }

    // Set up the DSH/SSH as normal
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_hwInterface->AssignDshAndSshSpace(
        m_stateHeapInterface,
        kernelState,
        false,
        0,
        false,
        m_storeData));

    MHW_INTERFACE_DESCRIPTOR_PARAMS idParams;
    MOS_ZeroMemory(&idParams, sizeof(idParams));
    idParams.pKernelState = kernelState;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_stateHeapInterface->pfnSetInterfaceDescriptor(
        m_stateHeapInterface,
        1,
        &idParams));

    // Generate Lcu Level Data
    CODECHAL_ENCODE_CHK_STATUS_RETURN(GenerateLcuLevelData(m_lcuLevelInputDataSurface[m_currRecycledBufIdx]));

    // Generate Concurrent Thread Group Data
    if (m_swScoreboardState->GetDependencyPattern() == dependencyWavefront26Degree ||
        m_swScoreboardState->GetDependencyPattern() == dependencyWavefront26ZDegree ||
        m_swScoreboardState->GetDependencyPattern() == dependencyWavefront26XDegree)
    {
        // Generate Concurrent Thread Group Data
        uint32_t curIdx = m_currRecycledBufIdx;

        CODECHAL_ENCODE_CHK_STATUS_RETURN(GenerateConcurrentThreadGroupData(m_encBCombinedBuffer1[curIdx].sResource));
    }
    else
    {
        // For 45D walking pattern, kernel generates the concurrent thread group by itself. No need for driver to generate.
    }

    // setup curbe
    CODECHAL_ENCODE_CHK_STATUS_RETURN(SetCurbeMbEncBKernel());

    CODECHAL_DEBUG_TOOL(
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpKernelRegion(
            encFunctionType,
            MHW_DSH_TYPE,
            kernelState));

        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpCurbe(
            encFunctionType,
            kernelState));
        //CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHal_DbgDumpHEVCMbEncCurbeG12(
        //m_debugInterface,
        //encFunctionType,
        //&m_encBCombinedBuffer1[m_currRecycledBufIdx].sResource));

        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpKernelRegion(
            encFunctionType,
            MHW_ISH_TYPE,
            kernelState));)

    MOS_COMMAND_BUFFER cmdBuffer;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_osInterface->pfnGetCommandBuffer(m_osInterface, &cmdBuffer, 0));

    SendKernelCmdsParams sendKernelCmdsParams = SendKernelCmdsParams();
    sendKernelCmdsParams.EncFunctionType      = encFunctionType;
    sendKernelCmdsParams.pKernelState         = kernelState;
    // TO DO : Remove scoreboard from VFE STATE Command
    sendKernelCmdsParams.bEnableCustomScoreBoard = false;
    sendKernelCmdsParams.pCustomScoreBoard       = nullptr;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(SendGenericKernelCmds(&cmdBuffer, &sendKernelCmdsParams));

    // Add binding table
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_stateHeapInterface->pfnSetBindingTable(
        m_stateHeapInterface,
        kernelState));

    // send surfaces
    CODECHAL_ENCODE_CHK_STATUS_RETURN(SendMbEncSurfacesBKernel(&cmdBuffer));

    CODECHAL_DEBUG_TOOL(
        if (m_pictureCodingType == I_TYPE) {
            CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpSurface(
                &m_lcuLevelInputDataSurface[m_currRecycledBufIdx],
                CodechalDbgAttr::attrOutput,
                "HEVC_I_MBENC_LcuLevelData_In",
                CODECHAL_MEDIA_STATE_HEVC_I_MBENC));
        } else {
            CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpSurface(
                &m_lcuLevelInputDataSurface[m_currRecycledBufIdx],
                CodechalDbgAttr::attrOutput,
                "HEVC_B_MBENC_LcuLevelData_In",
                CODECHAL_MEDIA_STATE_HEVC_B_MBENC));
        })

    if ((encFunctionType == CODECHAL_MEDIA_STATE_HEVC_B_MBENC) || (encFunctionType == CODECHAL_MEDIA_STATE_HEVC_LCU64_B_MBENC))
    {
        CODECHAL_DEBUG_TOOL(CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpBuffer(
            &m_encConstantTableForB.sResource,
            "HEVC_B_MBENC_ConstantData_In",
            CodechalDbgAttr::attrOutput,
            m_encConstantTableForB.dwSize,
            0,
            encFunctionType)));
    }

    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_hwInterface->GetRenderInterface()->AddMediaObjectWalkerCmd(
        &cmdBuffer,
        &walkerParams));

    CODECHAL_ENCODE_CHK_STATUS_RETURN(EndStatusReport(&cmdBuffer, encFunctionType));

    // Add dump for MBEnc surface state heap here
    CODECHAL_DEBUG_TOOL(
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpKernelRegion(
            encFunctionType,
            MHW_SSH_TYPE,
            kernelState));)

    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_stateHeapInterface->pfnSubmitBlocks(
        m_stateHeapInterface,
        kernelState));

    if (!m_singleTaskPhaseSupported || m_lastTaskInPhase)
    {
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_stateHeapInterface->pfnUpdateGlobalCmdBufId(
            m_stateHeapInterface));
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_hwInterface->GetMiInterface()->AddMiBatchBufferEnd(
            &cmdBuffer,
            nullptr));
    }

    CODECHAL_DEBUG_TOOL(CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpCmdBuffer(
        &cmdBuffer,
        encFunctionType,
        nullptr)));

    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_hwInterface->UpdateSSEuForCmdBuffer(&cmdBuffer, m_singleTaskPhaseSupported, m_lastTaskInPhase));

    m_osInterface->pfnReturnCommandBuffer(m_osInterface, &cmdBuffer, 0);

    if (!m_singleTaskPhaseSupported || m_lastTaskInPhase)
    {
        m_osInterface->pfnSubmitCommandBuffer(m_osInterface, &cmdBuffer, m_renderContextUsesNullHw);
        m_lastTaskInPhase = false;
    }

    CODECHAL_DEBUG_TOOL(
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpBuffer(
            &m_debugSurface[0].sResource,
            CodechalDbgAttr::attrOutput,
            "DebugDataSurface_Out0",
            m_debugSurface[0].dwSize,
            0,
            encFunctionType));
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpBuffer(
            &m_debugSurface[1].sResource,
            CodechalDbgAttr::attrOutput,
            "DebugDataSurface_Out1",
            m_debugSurface[1].dwSize,
            0,
            encFunctionType));
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpBuffer(
            &m_debugSurface[2].sResource,
            CodechalDbgAttr::attrOutput,
            "DebugDataSurface_Out2",
            m_debugSurface[2].dwSize,
            0,
            encFunctionType));
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpBuffer(
            &m_debugSurface[3].sResource,
            CodechalDbgAttr::attrOutput,
            "DebugDataSurface_Out3",
            m_debugSurface[3].dwSize,
            0,
            encFunctionType)););

#if 0  // the dump should be done in the GetStatusReport. However, if ENC causes PAK hangs-up, there is no way to get them.
    {
        CODECHAL_DEBUG_TOOL(
            CODEC_REF_LIST      currRefList;

        currRefList = *(pRefList[m_currReconstructedPic.FrameIdx]);
        currRefList.RefPic = m_currOriginalPic;

        m_debugInterface->CurrPic = m_currOriginalPic;
        m_debugInterface->dwBufferDumpFrameNum = m_storeData;
        m_debugInterface->wFrameType = m_pictureCodingType;

        //CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHal_DbgDumpEncodeMbEncMbPakOutput(
        //    m_debugInterface,
        //    this,
        //    &currRefList,
        //    (m_codecFunction != CODECHAL_FUNCTION_HYBRIDPAK) ?
        //    CODECHAL_MEDIA_STATE_ENC_NORMAL : CODECHAL_MEDIA_STATE_HYBRID_PAK_P2));
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpBuffer(
            &currRefList.resRefMbCodeBuffer,
            CodechalDbgAttr::attrOutput,
            "MbCode",
            m_picWidthInMb * m_frameFieldHeightInMb*64,
            CodecHal_PictureIsBottomField(currRefList.RefPic) ? m_frameFieldHeightInMb * m_picWidthInMb * 64 : 0,
            (m_codecFunction != CODECHAL_FUNCTION_HYBRIDPAK) ?
            CODECHAL_MEDIA_STATE_ENC_NORMAL : CODECHAL_MEDIA_STATE_HYBRID_PAK_P2));

        if (m_mvDataSize)
        {
            CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpBuffer(
                &currRefList.resRefMvDataBuffer,
                CodechalDbgAttr::attrOutput,
                "MbData",
                m_picWidthInMb * m_frameFieldHeightInMb * (32 * 4),
                CodecHal_PictureIsBottomField(currRefList.RefPic) ? MOS_ALIGN_CEIL(m_frameFieldHeightInMb * m_picWidthInMb * (32 * 4), 0x1000) : 0,
                (m_codecFunction != CODECHAL_FUNCTION_HYBRIDPAK) ?
                CODECHAL_MEDIA_STATE_ENC_NORMAL : CODECHAL_MEDIA_STATE_HYBRID_PAK_P2));
        }
        if (CodecHalIsFeiEncode(m_codecFunction))
        {
            CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpBuffer(
                &m_resDistortionBuffer,
                CodechalDbgAttr::attrOutput,
                "DistortionSurf",
                m_picWidthInMb * m_frameFieldHeightInMb * 48,
                CodecHal_PictureIsBottomField(currRefList.RefPic) ? MOS_ALIGN_CEIL(m_frameFieldHeightInMb * m_picWidthInMb * 48, 0x1000) : 0,
                (m_codecFunction != CODECHAL_FUNCTION_HYBRIDPAK) ?
                CODECHAL_MEDIA_STATE_ENC_NORMAL : CODECHAL_MEDIA_STATE_HYBRID_PAK_P2));
        }

        )

            CODECHAL_DEBUG_TOOL(
                CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHal_DbgDumpEncodeCombineBuffer(
                    this,
                    &m_encBCombinedBuffer2[m_currRecycledBufIdx].sResource,
                    m_encBCombinedBuffer2[m_currRecycledBufIdx].dwSize,
                    (const char*)"_Hevc_CombinedBuffer2",
                    false));
        );

        // Dump SW scoreboard surface - Output of MBENC
        CODECHAL_DEBUG_TOOL(
            CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHal_DbgDumpHevcEncodeSwScoreboardSurface(
                m_debugInterface,
                m_swScoreboardState->GetCurSwScoreboardSurface(), false));
        );

        CODECHAL_DEBUG_TOOL(
            CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHal_DbgDumpEncodeCombineBuffer(
                this,
                &m_encConstantTableForB.sResource,
                m_encConstantTableForB.dwSize,
                (const char*)"_Hevc_EncConstantTable",
                true));
        );

        CODECHAL_DEBUG_TOOL(
            CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHal_DbgDumpEncodeCombineBuffer(
                this,
                &m_debugSurface[0].sResource,
                m_debugSurface[0].dwSize,
                (const char*)"_Hevc_DebugDump0",
                true));
        );

        CODECHAL_DEBUG_TOOL(
            CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHal_DbgDumpEncodeCombineBuffer(
                this,
                &m_debugSurface[1].sResource,
                m_debugSurface[1].dwSize,
                (const char*)"_Hevc_DebugDump1",
                true));
        );

        CODECHAL_DEBUG_TOOL(
            CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHal_DbgDumpEncodeCombineBuffer(
                this,
                &m_debugSurface[2].sResource,
                m_debugSurface[2].dwSize,
                (const char*)"_Hevc_DebugDump2",
                true));
        );

        CODECHAL_DEBUG_TOOL(
            CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHal_DbgDumpEncodeCombineBuffer(
                this,
                &m_debugSurface[3].sResource,
                m_debugSurface[3].dwSize,
                (const char*)"_Hevc_DebugDump3",
                true));
        );

        CODECHAL_DEBUG_TOOL(
            CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpYUVSurface(
                &m_currPicWithReconBoundaryPix,
                CodechalDbgAttr::attrReconstructedSurface,
                "ReconSurf")));
    }
#endif

    return eStatus;
}

//!
//! \brief    Programs the BRC Init or BRC Reset kernel into the render command buffer
//! \details  Picks the init or reset kernel state according to m_brcInit, sets up
//!           DSH/SSH space, the interface descriptor, CURBE and surfaces, adds a
//!           single media object, and submits the command buffer when single task
//!           phase is not supported or this is the last task in the phase.
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, else fail reason (including a failed
//!           command buffer submission)
//!
MOS_STATUS CodechalEncHevcStateG12::EncodeBrcInitResetKernel()
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    CODECHAL_ENCODE_FUNCTION_ENTER;

    CODECHAL_ENCODE_CHK_NULL_RETURN(m_brcKernelStates);

    // m_brcInit selects the init kernel; otherwise the reset kernel is used
    CODECHAL_HEVC_BRC_KRNIDX brcKrnIdx = m_brcInit ? CODECHAL_HEVC_BRC_INIT : CODECHAL_HEVC_BRC_RESET;

    // Initialize DSH kernel state
    PMHW_KERNEL_STATE kernelState = &m_brcKernelStates[brcKrnIdx];

    // SSH space is requested once per phase (using m_maxBtCount) when single task phase
    // is supported; otherwise it is requested per kernel using this kernel's own BT count.
    if (m_firstTaskInPhase == true || !m_singleTaskPhaseSupported)
    {
        uint32_t maxBtCount = m_singleTaskPhaseSupported ? m_maxBtCount : kernelState->KernelParams.iBTCount;
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_stateHeapInterface->pfnRequestSshSpaceForCmdBuf(
            m_stateHeapInterface,
            maxBtCount));
        m_vmeStatesSize = m_hwInterface->GetKernelLoadCommandSize(maxBtCount);
        CODECHAL_ENCODE_CHK_STATUS_RETURN(VerifySpaceAvailable());
    }

    // Set up the DSH/SSH as normal
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_hwInterface->AssignDshAndSshSpace(
        m_stateHeapInterface,
        kernelState,
        false,
        0,
        false,
        m_storeData));

    MHW_INTERFACE_DESCRIPTOR_PARAMS idParams;
    MOS_ZeroMemory(&idParams, sizeof(idParams));
    idParams.pKernelState = kernelState;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_stateHeapInterface->pfnSetInterfaceDescriptor(
        m_stateHeapInterface,
        1,
        &idParams));

    // Setup curbe for BrcInitReset kernel
    CODECHAL_ENCODE_CHK_STATUS_RETURN(SetCurbeBrcInitReset(
        brcKrnIdx));

    CODECHAL_MEDIA_STATE_TYPE encFunctionType = CODECHAL_MEDIA_STATE_BRC_INIT_RESET;
    CODECHAL_DEBUG_TOOL(
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpKernelRegion(
            encFunctionType,
            MHW_DSH_TYPE,
            kernelState));
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpCurbe(
            encFunctionType,
            kernelState));
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpKernelRegion(
            encFunctionType,
            MHW_ISH_TYPE,
            kernelState));)

    MOS_COMMAND_BUFFER cmdBuffer;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_osInterface->pfnGetCommandBuffer(m_osInterface, &cmdBuffer, 0));

    SendKernelCmdsParams sendKernelCmdsParams = SendKernelCmdsParams();
    sendKernelCmdsParams.EncFunctionType      = encFunctionType;
    sendKernelCmdsParams.pKernelState         = kernelState;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(SendGenericKernelCmds(&cmdBuffer, &sendKernelCmdsParams));

    // Add binding table
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_stateHeapInterface->pfnSetBindingTable(
        m_stateHeapInterface,
        kernelState));

    // Send surfaces for BrcInitReset Kernel
    CODECHAL_ENCODE_CHK_STATUS_RETURN(SendBrcInitResetSurfaces(&cmdBuffer, brcKrnIdx));

    // The kernel is launched as a single media object carrying zero-filled inline data
    MHW_MEDIA_OBJECT_PARAMS mediaObjectParams;
    MOS_ZeroMemory(&mediaObjectParams, sizeof(mediaObjectParams));

    MediaObjectInlineData mediaObjectInlineData;
    MOS_ZeroMemory(&mediaObjectInlineData, sizeof(mediaObjectInlineData));
    mediaObjectParams.pInlineData      = &mediaObjectInlineData;
    mediaObjectParams.dwInlineDataSize = sizeof(mediaObjectInlineData);
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_hwInterface->GetRenderInterface()->AddMediaObject(
        &cmdBuffer,
        nullptr,
        &mediaObjectParams));

    CODECHAL_ENCODE_CHK_STATUS_RETURN(EndStatusReport(&cmdBuffer, encFunctionType));

    // Add dump for BrcInitReset surface state heap here
    CODECHAL_DEBUG_TOOL(
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpKernelRegion(
            encFunctionType,
            MHW_SSH_TYPE,
            kernelState));)

    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_stateHeapInterface->pfnSubmitBlocks(
        m_stateHeapInterface,
        kernelState));

    // The batch is only terminated when this kernel closes the command buffer
    if (!m_singleTaskPhaseSupported || m_lastTaskInPhase)
    {
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_stateHeapInterface->pfnUpdateGlobalCmdBufId(
            m_stateHeapInterface));
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_hwInterface->GetMiInterface()->AddMiBatchBufferEnd(
            &cmdBuffer,
            nullptr));
    }

    CODECHAL_DEBUG_TOOL(CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpCmdBuffer(
        &cmdBuffer,
        encFunctionType,
        nullptr)));

    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_hwInterface->UpdateSSEuForCmdBuffer(&cmdBuffer, m_singleTaskPhaseSupported, m_lastTaskInPhase));

    m_osInterface->pfnReturnCommandBuffer(m_osInterface, &cmdBuffer, 0);

    if (!m_singleTaskPhaseSupported || m_lastTaskInPhase)
    {
        // Keep the submission result instead of discarding it, so that a failed submit
        // is reported to the caller. The phase flag is still cleared unconditionally,
        // as it was before, prior to the status being checked.
        const MOS_STATUS submitStatus = m_osInterface->pfnSubmitCommandBuffer(m_osInterface, &cmdBuffer, m_renderContextUsesNullHw);
        m_lastTaskInPhase             = false;
        CODECHAL_ENCODE_CHK_STATUS_RETURN(submitStatus);
    }

    return eStatus;
}

//!
//! \brief    Programs the BRC frame update kernel into the render command buffer
//! \details  Sets up DSH/SSH space, the interface descriptor, CURBE and surfaces for
//!           the CODECHAL_HEVC_BRC_FRAME_UPDATE kernel state, adds a single media
//!           object, and submits the command buffer when single task phase is not
//!           supported or this is the last task in the phase.
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, else fail reason (including a failed
//!           command buffer submission)
//!
MOS_STATUS CodechalEncHevcStateG12::EncodeBrcFrameUpdateKernel()
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    CODECHAL_ENCODE_FUNCTION_ENTER;

    // Same guard as EncodeBrcInitResetKernel: the kernel state array is indexed below
    CODECHAL_ENCODE_CHK_NULL_RETURN(m_brcKernelStates);

    PerfTagSetting perfTag;
    CODECHAL_ENCODE_SET_PERFTAG_INFO(perfTag, CODECHAL_ENCODE_PERFTAG_CALL_BRC_UPDATE);

    CODECHAL_HEVC_BRC_KRNIDX brcKrnIdx = CODECHAL_HEVC_BRC_FRAME_UPDATE;

    // Initialize DSH kernel state
    PMHW_KERNEL_STATE kernelState = &m_brcKernelStates[brcKrnIdx];

    // SSH space is requested once per phase (using m_maxBtCount) when single task phase
    // is supported; otherwise it is requested per kernel using this kernel's own BT count.
    if (m_firstTaskInPhase == true || !m_singleTaskPhaseSupported)
    {
        uint32_t maxBtCount = m_singleTaskPhaseSupported ? m_maxBtCount : kernelState->KernelParams.iBTCount;
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_stateHeapInterface->pfnRequestSshSpaceForCmdBuf(
            m_stateHeapInterface,
            maxBtCount));
        m_vmeStatesSize = m_hwInterface->GetKernelLoadCommandSize(maxBtCount);
        CODECHAL_ENCODE_CHK_STATUS_RETURN(VerifySpaceAvailable());
    }

    // Set up the DSH/SSH as normal
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_hwInterface->AssignDshAndSshSpace(
        m_stateHeapInterface,
        kernelState,
        false,
        0,
        false,
        m_storeData));

    MHW_INTERFACE_DESCRIPTOR_PARAMS idParams;
    MOS_ZeroMemory(&idParams, sizeof(idParams));
    idParams.pKernelState = kernelState;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_stateHeapInterface->pfnSetInterfaceDescriptor(
        m_stateHeapInterface,
        1,
        &idParams));

    // Setup curbe for BrcFrameUpdate kernel
    CODECHAL_ENCODE_CHK_STATUS_RETURN(SetCurbeBrcUpdate(
        brcKrnIdx));

    CODECHAL_MEDIA_STATE_TYPE encFunctionType = CODECHAL_MEDIA_STATE_BRC_UPDATE;
    CODECHAL_DEBUG_TOOL(
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpKernelRegion(
            encFunctionType,
            MHW_DSH_TYPE,
            kernelState));
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpCurbe(
            encFunctionType,
            kernelState));
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpKernelRegion(
            encFunctionType,
            MHW_ISH_TYPE,
            kernelState));)

    MOS_COMMAND_BUFFER cmdBuffer;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_osInterface->pfnGetCommandBuffer(m_osInterface, &cmdBuffer, 0));

    SendKernelCmdsParams sendKernelCmdsParams = SendKernelCmdsParams();
    sendKernelCmdsParams.EncFunctionType      = encFunctionType;
    sendKernelCmdsParams.pKernelState         = kernelState;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(SendGenericKernelCmds(&cmdBuffer, &sendKernelCmdsParams));

    // Add binding table
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_stateHeapInterface->pfnSetBindingTable(
        m_stateHeapInterface,
        kernelState));

    // Send surfaces for BrcFrameUpdate Kernel
    CODECHAL_ENCODE_CHK_STATUS_RETURN(SendBrcFrameUpdateSurfaces(&cmdBuffer));

    // The kernel is launched as a single media object carrying zero-filled inline data
    MHW_MEDIA_OBJECT_PARAMS mediaObjectParams;
    MOS_ZeroMemory(&mediaObjectParams, sizeof(mediaObjectParams));

    MediaObjectInlineData mediaObjectInlineData;
    MOS_ZeroMemory(&mediaObjectInlineData, sizeof(mediaObjectInlineData));
    mediaObjectParams.pInlineData      = &mediaObjectInlineData;
    mediaObjectParams.dwInlineDataSize = sizeof(mediaObjectInlineData);
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_hwInterface->GetRenderInterface()->AddMediaObject(
        &cmdBuffer,
        nullptr,
        &mediaObjectParams));

    CODECHAL_ENCODE_CHK_STATUS_RETURN(EndStatusReport(&cmdBuffer, encFunctionType));

    // Add dump for BrcFrameUpdate surface state heap here
    CODECHAL_DEBUG_TOOL(
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpKernelRegion(
            encFunctionType,
            MHW_SSH_TYPE,
            kernelState));)
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_stateHeapInterface->pfnSubmitBlocks(
        m_stateHeapInterface,
        kernelState));

    // The batch is only terminated when this kernel closes the command buffer
    if (!m_singleTaskPhaseSupported || m_lastTaskInPhase)
    {
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_stateHeapInterface->pfnUpdateGlobalCmdBufId(
            m_stateHeapInterface));
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_hwInterface->GetMiInterface()->AddMiBatchBufferEnd(
            &cmdBuffer,
            nullptr));
    }

    CODECHAL_DEBUG_TOOL(CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpCmdBuffer(
        &cmdBuffer,
        encFunctionType,
        nullptr)));

    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_hwInterface->UpdateSSEuForCmdBuffer(&cmdBuffer, m_singleTaskPhaseSupported, m_lastTaskInPhase));

    m_osInterface->pfnReturnCommandBuffer(m_osInterface, &cmdBuffer, 0);

    if (!m_singleTaskPhaseSupported || m_lastTaskInPhase)
    {
        // Keep the submission result instead of discarding it, so that a failed submit
        // is reported to the caller. The phase flag is still cleared unconditionally,
        // as it was before, prior to the status being checked.
        const MOS_STATUS submitStatus = m_osInterface->pfnSubmitCommandBuffer(m_osInterface, &cmdBuffer, m_renderContextUsesNullHw);
        m_lastTaskInPhase             = false;
        CODECHAL_ENCODE_CHK_STATUS_RETURN(submitStatus);
    }

    return eStatus;
}

//!
//! \brief    Programs the BRC LCU update kernel into the render command buffer
//! \details  Sets up DSH/SSH space, the interface descriptor, CURBE and surfaces for
//!           the CODECHAL_HEVC_BRC_LCU_UPDATE kernel state, sets up the ROI surface
//!           when the picture parameters report any ROI, dispatches the kernel with a
//!           dependency-free media walker, and submits the command buffer when single
//!           task phase is not supported or this is the last task in the phase.
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, else fail reason (including a failed
//!           command buffer submission)
//!
MOS_STATUS CodechalEncHevcStateG12::EncodeBrcLcuUpdateKernel()
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    CODECHAL_ENCODE_FUNCTION_ENTER;

    // Both pointers are dereferenced below; fail cleanly instead of crashing
    CODECHAL_ENCODE_CHK_NULL_RETURN(m_brcKernelStates);
    CODECHAL_ENCODE_CHK_NULL_RETURN(m_hevcPicParams);

    PerfTagSetting perfTag;
    CODECHAL_ENCODE_SET_PERFTAG_INFO(perfTag, CODECHAL_ENCODE_PERFTAG_CALL_BRC_UPDATE_LCU);

    CODECHAL_HEVC_BRC_KRNIDX brcKrnIdx = CODECHAL_HEVC_BRC_LCU_UPDATE;

    // Initialize DSH kernel state
    PMHW_KERNEL_STATE kernelState = &m_brcKernelStates[brcKrnIdx];

    // SSH space is requested once per phase (using m_maxBtCount) when single task phase
    // is supported; otherwise it is requested per kernel using this kernel's own BT count.
    if (m_firstTaskInPhase == true || !m_singleTaskPhaseSupported)
    {
        uint32_t maxBtCount = m_singleTaskPhaseSupported ? m_maxBtCount : kernelState->KernelParams.iBTCount;
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_stateHeapInterface->pfnRequestSshSpaceForCmdBuf(
            m_stateHeapInterface,
            maxBtCount));
        m_vmeStatesSize = m_hwInterface->GetKernelLoadCommandSize(maxBtCount);
        CODECHAL_ENCODE_CHK_STATUS_RETURN(VerifySpaceAvailable());
    }

    // Set up the DSH/SSH as normal
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_hwInterface->AssignDshAndSshSpace(
        m_stateHeapInterface,
        kernelState,
        false,
        0,
        false,
        m_storeData));

    MHW_INTERFACE_DESCRIPTOR_PARAMS idParams;
    MOS_ZeroMemory(&idParams, sizeof(idParams));
    idParams.pKernelState = kernelState;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_stateHeapInterface->pfnSetInterfaceDescriptor(
        m_stateHeapInterface,
        1,
        &idParams));

    // Setup curbe for BrcLcuUpdate kernel (shares SetCurbeBrcUpdate with the frame update kernel)
    CODECHAL_ENCODE_CHK_STATUS_RETURN(SetCurbeBrcUpdate(
        brcKrnIdx));

    CODECHAL_MEDIA_STATE_TYPE encFunctionType = CODECHAL_MEDIA_STATE_MB_BRC_UPDATE;
    CODECHAL_DEBUG_TOOL(
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpKernelRegion(
            encFunctionType,
            MHW_DSH_TYPE,
            kernelState));
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpCurbe(
            encFunctionType,
            kernelState));
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpKernelRegion(
            encFunctionType,
            MHW_ISH_TYPE,
            kernelState));)

    MOS_COMMAND_BUFFER cmdBuffer;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_osInterface->pfnGetCommandBuffer(m_osInterface, &cmdBuffer, 0));

    SendKernelCmdsParams sendKernelCmdsParams = SendKernelCmdsParams();
    sendKernelCmdsParams.EncFunctionType      = encFunctionType;
    sendKernelCmdsParams.pKernelState         = kernelState;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(SendGenericKernelCmds(&cmdBuffer, &sendKernelCmdsParams));

    // Add binding table
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_stateHeapInterface->pfnSetBindingTable(
        m_stateHeapInterface,
        kernelState));

    // The ROI surface is only prepared when the picture carries at least one ROI
    if (m_hevcPicParams->NumROI)
    {
        CODECHAL_ENCODE_CHK_STATUS_RETURN(SetupROISurface());
    }

    // Send surfaces for BrcLcuUpdate Kernel
    CODECHAL_ENCODE_CHK_STATUS_RETURN(SendBrcLcuUpdateSurfaces(&cmdBuffer));

    // Program Media walker: the thread space is the frame size in macroblocks, reduced
    // with MOS_ROUNDUP_SHIFT by 4 bits horizontally and 3 bits vertically.
    uint32_t resolutionX = CODECHAL_GET_WIDTH_IN_MACROBLOCKS(m_frameWidth);
    resolutionX          = MOS_ROUNDUP_SHIFT(resolutionX, 4);
    uint32_t resolutionY = CODECHAL_GET_HEIGHT_IN_MACROBLOCKS(m_frameHeight);
    resolutionY          = MOS_ROUNDUP_SHIFT(resolutionY, 3);
    // Informational trace only: this is not an error condition, so it must not be
    // reported through the assert-message channel.
    CODECHAL_ENCODE_NORMALMESSAGE("LucBRC thread space = %u x %u", resolutionX, resolutionY);

    MHW_WALKER_PARAMS walkerParams;
    MOS_ZeroMemory(&walkerParams, sizeof(walkerParams));

    CODECHAL_WALKER_CODEC_PARAMS walkerCodecParams;
    MOS_ZeroMemory(&walkerCodecParams, sizeof(walkerCodecParams));
    walkerCodecParams.WalkerMode              = m_walkerMode;
    walkerCodecParams.dwResolutionX           = resolutionX;
    walkerCodecParams.dwResolutionY           = resolutionY;
    walkerCodecParams.bNoDependency           = true;
    walkerCodecParams.bGroupIdSelectSupported = m_groupIdSelectSupported;
    walkerCodecParams.ucGroupId               = m_groupId;
    walkerCodecParams.wPictureCodingType      = m_pictureCodingType;
    walkerCodecParams.bUseScoreboard          = false;

    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalInitMediaObjectWalkerParams(
        m_hwInterface,
        &walkerParams,
        &walkerCodecParams));

    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_hwInterface->GetRenderInterface()->AddMediaObjectWalkerCmd(
        &cmdBuffer,
        &walkerParams));

    CODECHAL_ENCODE_CHK_STATUS_RETURN(EndStatusReport(&cmdBuffer, encFunctionType));

    // Add dump for BrcLcuUpdate surface state heap here
    CODECHAL_DEBUG_TOOL(
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpKernelRegion(
            encFunctionType,
            MHW_SSH_TYPE,
            kernelState));)

    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_stateHeapInterface->pfnSubmitBlocks(
        m_stateHeapInterface,
        kernelState));

    // The batch is only terminated when this kernel closes the command buffer
    if (!m_singleTaskPhaseSupported || m_lastTaskInPhase)
    {
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_stateHeapInterface->pfnUpdateGlobalCmdBufId(
            m_stateHeapInterface));
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_hwInterface->GetMiInterface()->AddMiBatchBufferEnd(
            &cmdBuffer,
            nullptr));
    }

    CODECHAL_DEBUG_TOOL(CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpCmdBuffer(
        &cmdBuffer,
        encFunctionType,
        nullptr)));

    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_hwInterface->UpdateSSEuForCmdBuffer(&cmdBuffer, m_singleTaskPhaseSupported, m_lastTaskInPhase));

    m_osInterface->pfnReturnCommandBuffer(m_osInterface, &cmdBuffer, 0);

    if (!m_singleTaskPhaseSupported || m_lastTaskInPhase)
    {
        // Keep the submission result instead of discarding it, so that a failed submit
        // is reported to the caller. The phase flag is still cleared unconditionally,
        // as it was before, prior to the status being checked.
        const MOS_STATUS submitStatus = m_osInterface->pfnSubmitCommandBuffer(m_osInterface, &cmdBuffer, m_renderContextUsesNullHw);
        m_lastTaskInPhase             = false;
        CODECHAL_ENCODE_CHK_STATUS_RETURN(submitStatus);
    }

    return eStatus;
}

MOS_STATUS CodechalEncHevcStateG12::EncodeKernelFunctions()
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    if (m_pakOnlyTest)
    {
        // Skip ENC when PAK only mode is enabled
        return eStatus;
    }

    if (m_pictureCodingType == P_TYPE)
    {
        m_lowDelay = true;
    }

    if (m_hevcPicParams->bUsedAsRef || m_brcEnabled)
    {
        m_currRefSync = &m_refSync[m_currMbCodeIdx];

        // Check if the signal obj has been used before
        if (!m_hevcSeqParams->ParallelBRC && (m_currRefSync->uiSemaphoreObjCount || m_currRefSync->bInUsed))
        {
            MOS_SYNC_PARAMS syncParams  = g_cInitSyncParams;
            syncParams.GpuContext       = m_renderContext;
            syncParams.presSyncResource = &m_currRefSync->resSyncObject;
            syncParams.uiSemaphoreCount = m_currRefSync->uiSemaphoreObjCount;

            CODECHAL_ENCODE_CHK_STATUS_RETURN(m_osInterface->pfnEngineWait(m_osInterface, &syncParams));
            m_currRefSync->uiSemaphoreObjCount = 0;
            m_currRefSync->bInUsed             = false;
        }
    }
    else
    {
        m_currRefSync = nullptr;
    }

    //Reset to use a different performance tag ID
    m_osInterface->pfnResetPerfBufferID(m_osInterface);

    m_firstTaskInPhase = true;
    m_lastTaskInPhase  = false;

    m_brcInputForEncKernelBuffer = &m_encBCombinedBuffer2[m_currRecycledBufIdx];

    // BRC init/reset needs to be called before HME since it will reset the Brc Distortion surface
    // BRC init is called once even for CQP mode when ROI is enabled, hence also checking for first frame flag
    if ((m_brcEnabled && (m_brcInit || m_brcReset)) || (m_firstFrame && m_hevcPicParams->NumROI))
    {
        m_firstTaskInPhase = m_lastTaskInPhase = true;
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_hevcBrcG12->EncodeBrcInitResetKernel());
        m_brcInit = m_brcReset = false;
    }

    m_firstTaskInPhase = true;
    m_lastTaskInPhase  = false;

    CodechalEncodeSwScoreboard::KernelParams swScoreboardKernelParames;
    MOS_ZeroMemory(&swScoreboardKernelParames, sizeof(swScoreboardKernelParames));

    InitSwScoreBoardParams(swScoreboardKernelParames);

    if (m_useSwInitScoreboard)
    {
        SetupSwScoreBoard(&swScoreboardKernelParames);
    }
    else
    {
        // Call SW scoreboard Init kernel used by MBEnc kernel
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_swScoreboardState->Execute(&swScoreboardKernelParames));
    }

    // Dump SW scoreboard surface - Output of SW scoreboard Init Kernel and Input to MBENC
    CODECHAL_DEBUG_TOOL(CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpSurface(
        m_swScoreboardState->GetCurSwScoreboardSurface(),
        CodechalDbgAttr::attrInput,
        "InitSWScoreboard_In",
        CODECHAL_MEDIA_STATE_SW_SCOREBOARD_INIT)));

    // Csc, Downscaling, and/or 10-bit to 8-bit conversion
    CODECHAL_ENCODE_CHK_NULL_RETURN(m_cscDsState);

    CodechalEncodeCscDs::KernelParams cscScalingKernelParams;
    MOS_ZeroMemory(&cscScalingKernelParams, sizeof(cscScalingKernelParams));
    cscScalingKernelParams.bLastTaskInPhaseCSC =
        cscScalingKernelParams.bLastTaskInPhase4xDS = !(m_16xMeSupported || m_hmeEnabled || m_brcEnabled);
    cscScalingKernelParams.bLastTaskInPhase16xDS    = !(m_32xMeSupported || m_hmeEnabled || m_brcEnabled);
    cscScalingKernelParams.bLastTaskInPhase32xDS    = !(m_hmeEnabled || m_brcEnabled);

    CodechalEncodeCscDsG12::HevcExtKernelParams hevcExtCscParams;
    MOS_ZeroMemory(&hevcExtCscParams, sizeof(hevcExtCscParams));

    if (m_isMaxLcu64)
    {
        hevcExtCscParams.bHevcEncHistorySum            = true;
        hevcExtCscParams.bUseLCU32                     = false;
        hevcExtCscParams.presHistoryBuffer             = &m_encBCombinedBuffer2[m_lastRecycledBufIdx].sResource;
        hevcExtCscParams.dwSizeHistoryBuffer           = m_historyOutBufferSize;
        hevcExtCscParams.dwOffsetHistoryBuffer         = m_historyOutBufferOffset;
        hevcExtCscParams.presHistorySumBuffer          = &m_encBCombinedBuffer2[m_currRecycledBufIdx].sResource;
        hevcExtCscParams.dwSizeHistorySumBuffer        = sizeof(MBENC_COMBINED_BUFFER2::ucHistoryInBuffer);
        hevcExtCscParams.dwOffsetHistorySumBuffer      = sizeof(MBENC_COMBINED_BUFFER2::ucBrcCombinedEncBuffer);
        hevcExtCscParams.presMultiThreadTaskBuffer     = &m_encBCombinedBuffer2[m_currRecycledBufIdx].sResource;
        hevcExtCscParams.dwSizeMultiThreadTaskBuffer   = m_threadTaskBufferSize;
        hevcExtCscParams.dwOffsetMultiThreadTaskBuffer = m_threadTaskBufferOffset;
        cscScalingKernelParams.hevcExtParams           = &hevcExtCscParams;
    }
    else
    {
        cscScalingKernelParams.hevcExtParams = nullptr;  // LCU32 does not require history buffers
    }

    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_cscDsState->KernelFunctions(&cscScalingKernelParams));

    if (m_hmeEnabled)
    {
        CODECHAL_ENCODE_CHK_STATUS_RETURN(EncodeMeKernel());
    }
    else if (m_brcEnabled && m_hevcPicParams->CodingType == I_TYPE)
    {
        m_lastTaskInPhase = true;
        CODECHAL_ENCODE_CHK_STATUS_RETURN(EncodeIntraDistKernel());
    }

    // BRC + MbEnc in second task phase
    m_firstTaskInPhase = true;
    m_lastTaskInPhase  = false;

    // Wait for PAK if necessary
    CODECHAL_ENCODE_CHK_STATUS_RETURN(WaitForPak());

    // ROI uses the BRC LCU update kernel, even in CQP.  So we will call it
    if (m_hevcPicParams->NumROI && !m_brcEnabled)
    {
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_hevcBrcG12->EncodeBrcLcuUpdateKernel());
        m_dBrcInitCurrentTargetBufFullInBits += m_dBrcInitResetInputBitsPerFrame;

        CODECHAL_DEBUG_TOOL(
            if (!Mos_ResourceIsNull(&m_brcBuffers.sBrcMbQpBuffer.OsResource)) {
                CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpBuffer(
                    &m_brcBuffers.sBrcMbQpBuffer.OsResource,
                    CodechalDbgAttr::attrOutput,
                    "MbQp",
                    m_brcBuffers.sBrcMbQpBuffer.dwPitch * m_brcBuffers.sBrcMbQpBuffer.dwHeight,
                    m_brcBuffers.dwBrcMbQpBottomFieldOffset,
                    CODECHAL_MEDIA_STATE_BRC_UPDATE));
            } CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpBuffer(&m_brcDistortion->OsResource,
                CodechalDbgAttr::attrInput,
                "BrcDist_AfterLcuBrc",
                m_brcBuffers.sMeBrcDistortionBuffer.dwPitch * m_brcBuffers.sMeBrcDistortionBuffer.dwHeight,
                m_brcBuffers.dwMeBrcDistortionBottomFieldOffset,
                CODECHAL_MEDIA_STATE_BRC_UPDATE));)
    }

    if (m_brcEnabled)
    {
        m_hevcBrcG12->m_brcNumPakPasses = m_hwInterface->GetMfxInterface()->GetBrcNumPakPasses();
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_hevcBrcG12->EncodeBrcFrameUpdateKernel());

        CODECHAL_DEBUG_TOOL(
            CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpBuffer(
                &m_brcDistortion->OsResource,
                CodechalDbgAttr::attrInput,
                "BrcDist_AfterFrameBrc",
                m_brcBuffers.sMeBrcDistortionBuffer.dwPitch * m_brcBuffers.sMeBrcDistortionBuffer.dwHeight,
                m_brcBuffers.dwMeBrcDistortionBottomFieldOffset,
                CODECHAL_MEDIA_STATE_BRC_UPDATE));
            CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpBuffer(
                &m_brcBuffers.resBrcHistoryBuffer,
                CodechalDbgAttr::attrOutput,
                "HistoryWrite",
                m_brcHistoryBufferSize,
                0,
                CODECHAL_MEDIA_STATE_BRC_UPDATE));
            CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpBuffer(
                &m_brcBuffers.resBrcImageStatesWriteBuffer[m_currRecycledBufIdx],
                CodechalDbgAttr::attrOutput,
                "ImgStateWrite",
                BRC_IMG_STATE_SIZE_PER_PASS_G11 * m_hwInterface->GetMfxInterface()->GetBrcNumPakPasses(),
                0,
                CODECHAL_MEDIA_STATE_BRC_UPDATE));)

        CODECHAL_DEBUG_TOOL(
            CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpBuffer(
                &m_brcDistortion->OsResource,
                CodechalDbgAttr::attrInput,
                "BrcDist_AfterFrameBrcUpdate",
                m_brcBuffers.sMeBrcDistortionBuffer.dwPitch * m_brcBuffers.sMeBrcDistortionBuffer.dwHeight,
                m_brcBuffers.dwMeBrcDistortionBottomFieldOffset,
                CODECHAL_MEDIA_STATE_BRC_UPDATE));
            CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpBuffer(
                &m_brcBuffers.resBrcImageStatesWriteBuffer[m_currRecycledBufIdx],
                CodechalDbgAttr::attrOutput,
                "ImgStateWrite",
                BRC_IMG_STATE_SIZE_PER_PASS * m_hwInterface->GetMfxInterface()->GetBrcNumPakPasses(),
                0,
                CODECHAL_MEDIA_STATE_BRC_UPDATE));
            CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpBuffer(
                &m_brcBuffers.resBrcHistoryBuffer,
                CodechalDbgAttr::attrOutput,
                "HistoryWrite",
                m_brcHistoryBufferSize,
                0,
                CODECHAL_MEDIA_STATE_BRC_UPDATE));
            CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpBuffer(
                &m_brcBuffers.sBrcIntraDistortionBuffer.OsResource,
                CodechalDbgAttr::attrOutput,
                "Idistortion",
                m_brcBuffers.sBrcIntraDistortionBuffer.dwWidth * m_brcBuffers.sBrcIntraDistortionBuffer.dwHeight,
                0,
                CODECHAL_MEDIA_STATE_BRC_UPDATE));)

        if (m_lcuBrcEnabled || m_hevcPicParams->NumROI)
        {
            // LCU-based BRC needs to have frame-based one to be call first in order to get HCP_IMG_STATE command result
            CODECHAL_ENCODE_CHK_STATUS_RETURN(m_hevcBrcG12->EncodeBrcLcuUpdateKernel());
            m_dBrcInitCurrentTargetBufFullInBits += m_dBrcInitResetInputBitsPerFrame;
        }
        else
        {
            m_dBrcInitCurrentTargetBufFullInBits += m_dBrcInitResetInputBitsPerFrame;
        }

        CODECHAL_DEBUG_TOOL(
            if (!Mos_ResourceIsNull(&m_brcBuffers.sBrcMbQpBuffer.OsResource)) {
                CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpBuffer(
                    &m_brcBuffers.sBrcMbQpBuffer.OsResource,
                    CodechalDbgAttr::attrOutput,
                    "MbQp",
                    m_brcBuffers.sBrcMbQpBuffer.dwPitch * m_brcBuffers.sBrcMbQpBuffer.dwHeight,
                    m_brcBuffers.dwBrcMbQpBottomFieldOffset,
                    CODECHAL_MEDIA_STATE_BRC_UPDATE));
            } CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpBuffer(&m_brcDistortion->OsResource,
                CodechalDbgAttr::attrInput,
                "BrcDist_AfterLcuBrcUpdate",
                m_brcBuffers.sMeBrcDistortionBuffer.dwPitch * m_brcBuffers.sMeBrcDistortionBuffer.dwHeight,
                m_brcBuffers.dwMeBrcDistortionBottomFieldOffset,
                CODECHAL_MEDIA_STATE_BRC_UPDATE));)
    }

    m_useWeightedSurfaceForL0 = false;
    m_useWeightedSurfaceForL1 = false;

    //currently only support same weightoffset for all slices, and only support Luma weighted prediction
    auto slicetype = m_hevcSliceParams->slice_type;
    if (m_weightedPredictionSupported && !m_feiEnable &&
        ((slicetype == CODECHAL_HEVC_P_SLICE && m_hevcPicParams->weighted_pred_flag) ||
            (slicetype == CODECHAL_HEVC_B_SLICE && m_hevcPicParams->weighted_bipred_flag)))
    {
        uint32_t                      LumaWeightFlag[2] = {0};  //[L0, L1]
        CodechalEncodeWP::SliceParams sliceWPParams;
        MOS_FillMemory((void *)&sliceWPParams, sizeof(sliceWPParams), 0);

        //populate the slice WP parameter structure
        sliceWPParams.luma_log2_weight_denom = m_hevcSliceParams->luma_log2_weight_denom;  // luma weidht denom
        for (auto i = 0; i < 2; i++)
        {
            for (auto j = 0; j < CODEC_MAX_NUM_REF_FRAME_HEVC; j++)
            {
                sliceWPParams.weights[i][j][0][0] = (1 << m_hevcSliceParams->luma_log2_weight_denom) +
                                                    m_hevcSliceParams->delta_luma_weight[i][j];  //Luma weight
                sliceWPParams.weights[i][j][0][1] = m_hevcSliceParams->luma_offset[i][j];        //Luma offset

                if (m_hevcSliceParams->delta_luma_weight[i][j] || m_hevcSliceParams->luma_offset[i][j])
                {
                    LumaWeightFlag[i] |= (1 << j);
                }
            }
        }

        CodechalEncodeWP::KernelParams wpKernelParams;
        MOS_FillMemory((void *)&wpKernelParams, sizeof(wpKernelParams), 0);
        wpKernelParams.useWeightedSurfaceForL0 = &m_useWeightedSurfaceForL0;
        wpKernelParams.useWeightedSurfaceForL1 = &m_useWeightedSurfaceForL1;
        wpKernelParams.slcWPParams             = &sliceWPParams;

        // Weighted Prediction to be applied for L0
        for (auto i = 0; i < (m_hevcSliceParams->num_ref_idx_l0_active_minus1 + 1); i++)
        {
            if ((LumaWeightFlag[LIST_0] & (1 << i)) && (i < CODEC_MAX_FORWARD_WP_FRAME))
            {
                CODEC_PICTURE refPic = m_hevcSliceParams->RefPicList[LIST_0][i];
                if (!CodecHal_PictureIsInvalid(refPic) && m_picIdx[refPic.FrameIdx].bValid)
                {
                    MOS_SURFACE refFrameInput;
                    uint8_t     frameIndex = m_picIdx[refPic.FrameIdx].ucPicIdx;
                    refFrameInput          = m_hevcPicParams->bUseRawPicForRef ? m_refList[frameIndex]->sRefRawBuffer : m_refList[frameIndex]->sRefReconBuffer;

                    //Weighted Prediction for ith forward reference frame
                    wpKernelParams.useRefPicList1 = false;
                    wpKernelParams.wpIndex        = i;
                    wpKernelParams.refFrameInput  = &refFrameInput;
                    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_wpState->Execute(&wpKernelParams));
                }
            }
        }

        // Weighted Predition to be applied for L1
        if (slicetype == CODECHAL_HEVC_B_SLICE && m_hevcPicParams->weighted_bipred_flag)
        {
            for (auto i = 0; i < (m_hevcSliceParams->num_ref_idx_l1_active_minus1 + 1); i++)
            {
                if ((LumaWeightFlag[LIST_1] & (1 << i)) && (i < CODEC_MAX_BACKWARD_WP_FRAME))
                {
                    CODEC_PICTURE refPic = m_hevcSliceParams->RefPicList[LIST_1][i];
                    if (!CodecHal_PictureIsInvalid(refPic) && m_picIdx[refPic.FrameIdx].bValid)
                    {
                        MOS_SURFACE refFrameInput;
                        uint8_t     frameIndex = m_picIdx[refPic.FrameIdx].ucPicIdx;
                        refFrameInput          = m_hevcPicParams->bUseRawPicForRef ? m_refList[frameIndex]->sRefRawBuffer : m_refList[frameIndex]->sRefReconBuffer;

                        //Weighted Prediction for ith backward reference frame
                        wpKernelParams.useRefPicList1 = true;
                        wpKernelParams.wpIndex        = i;
                        wpKernelParams.refFrameInput  = &refFrameInput;
                        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_wpState->Execute(&wpKernelParams));
                    }
                }
            }
        }
    }

    // Reset to use a different performance tag ID
    m_osInterface->pfnResetPerfBufferID(m_osInterface);

    m_lastTaskInPhase = true;

    if (m_hevcPicParams->CodingType == I_TYPE)
    {
        CODECHAL_ENCODE_CHK_STATUS_RETURN(EncodeMbEncKernel(CODECHAL_MEDIA_STATE_HEVC_I_MBENC));
    }
    else
    {
        CODECHAL_ENCODE_CHK_STATUS_RETURN(EncodeMbEncKernel(m_isMaxLcu64 ? CODECHAL_MEDIA_STATE_HEVC_LCU64_B_MBENC : CODECHAL_MEDIA_STATE_HEVC_B_MBENC));
    }

    if (m_brcEnabled && m_enableFramePanicMode && (false == m_hevcSeqParams->DisableHRDConformance) &&
        m_skipFrameInfo.numSlices != m_numSlices)  // 'numSlices != m_numSlices' check is to re-generate surface if slice layout changed from previous frame
    {
        CODECHAL_ENCODE_CHK_STATUS_RETURN(GenerateSkipFrameMbCodeSurface(m_skipFrameInfo));
    }

    // Notify PAK engine once ENC is done
    if (!Mos_ResourceIsNull(&m_resSyncObjectRenderContextInUse))
    {
        MOS_SYNC_PARAMS syncParams = g_cInitSyncParams;
        if (m_useMdf)
        {
            if (!m_computeContextEnabled)
            {
                syncParams.GpuContext = MOS_GPU_CONTEXT_RENDER3;  //MDF uses render3
            }
            else
            {
                syncParams.GpuContext = MOS_GPU_CONTEXT_CM_COMPUTE;
            }
        }
        else
        {
            syncParams.GpuContext = m_renderContext;
        }
        syncParams.presSyncResource = &m_resSyncObjectRenderContextInUse;

        uint32_t old_stream_index  = m_osInterface->streamIndex;
        m_osInterface->streamIndex = static_cast<CmQueueRT *>(m_cmQueue)->StreamIndex();
        CODECHAL_ENCODE_CHK_STATUS_RETURN(
            m_osInterface->pfnEngineSignal(m_osInterface, &syncParams));
        m_osInterface->streamIndex = old_stream_index;
    }

    if (m_brcEnabled)
    {
        if (m_hevcSeqParams->ParallelBRC)
        {
            m_brcBuffers.uiCurrBrcPakStasIdxForRead =
                (m_brcBuffers.uiCurrBrcPakStasIdxForRead + 1) % CODECHAL_ENCODE_RECYCLED_BUFFER_NUM;
        }
    }

    CODECHAL_DEBUG_TOOL(
        uint8_t       index;
        CODEC_PICTURE refPic;
        if (m_useWeightedSurfaceForL0) {
            refPic = m_hevcSliceParams->RefPicList[LIST_0][0];
            index  = m_hevcPicParams->RefFrameList[refPic.FrameIdx].FrameIdx;

            CODECHAL_DEBUG_TOOL(CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpYUVSurface(
                &m_refList[index]->sRefBuffer,
                CodechalDbgAttr::attrReferenceSurfaces,
                "WP_In_L0")));

            CODECHAL_DEBUG_TOOL(CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpYUVSurface(
                m_wpState->GetWPOutputPicList(CODEC_WP_OUTPUT_L0_START + 0),
                CodechalDbgAttr::attrReferenceSurfaces,
                "WP_Out_L0")));
        } if (m_useWeightedSurfaceForL1) {
            refPic = m_hevcSliceParams->RefPicList[LIST_1][0];
            index  = m_hevcPicParams->RefFrameList[refPic.FrameIdx].FrameIdx;

            CODECHAL_DEBUG_TOOL(CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpYUVSurface(
                &m_refList[index]->sRefBuffer,
                CodechalDbgAttr::attrReferenceSurfaces,
                "WP_In_L1")));

            CODECHAL_DEBUG_TOOL(CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpYUVSurface(
                m_wpState->GetWPOutputPicList(CODEC_WP_OUTPUT_L1_START + 0),
                CodechalDbgAttr::attrReferenceSurfaces,
                "WP_Out_L1")));
        })

    m_lastPictureCodingType = m_pictureCodingType;
    m_lastRecycledBufIdx    = m_currRecycledBufIdx;

    return eStatus;
}

//!
//! \brief    Runs the intra-distortion kernel on the current frame's 4x down-scaled surface.
//! \details  Builds the CURBE and surface parameters from encoder state and hands them to
//!           m_intraDistKernel. The distortion output is written to m_brcDistortion.
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if the kernel call succeeds, else the failing status.
//!
MOS_STATUS CodechalEncHevcStateG12::EncodeIntraDistKernel()
{
    // CURBE: dimensions of the 4x down-scaled picture, in macroblocks
    CodechalKernelIntraDist::CurbeParam intraDistCurbe;
    intraDistCurbe.downScaledWidthInMb4x  = m_downscaledWidthInMb4x;
    intraDistCurbe.downScaledHeightInMb4x = m_downscaledHeightInMb4x;

    // Surfaces: the same 4x down-scaled tracked buffer is bound to both input slots
    CodechalKernelIntraDist::SurfaceParams intraDistSurfaces;
    intraDistSurfaces.input4xDsVmeSurface        = m_trackedBuf->Get4xDsSurface(CODEC_CURR_TRACKED_BUFFER);
    intraDistSurfaces.input4xDsSurface           = intraDistSurfaces.input4xDsVmeSurface;
    intraDistSurfaces.intraDistSurface           = m_brcDistortion;
    intraDistSurfaces.intraDistBottomFieldOffset = m_brcBuffers.dwMeBrcDistortionBottomFieldOffset;

    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_intraDistKernel->Execute(intraDistCurbe, intraDistSurfaces));

    return MOS_STATUS_SUCCESS;
}

//!
//! \brief    Creates and initialises every kernel state object used by this encoder.
//! \details  In order: MbEnc and BRC kernel states, the weighted-prediction state, the
//!           intra-distortion kernel, the SW scoreboard state and the HME kernel.
//!           The first failing step aborts the sequence and its status is returned.
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if everything was created and initialised.
//!
MOS_STATUS CodechalEncHevcStateG12::InitKernelState()
{
    CODECHAL_ENCODE_FUNCTION_ENTER;

    // MbEnc and BRC kernel states
    CODECHAL_ENCODE_CHK_STATUS_RETURN(InitKernelStateMbEnc());
    CODECHAL_ENCODE_CHK_STATUS_RETURN(InitKernelStateBrc());

    // Weighted prediction kernel state
    m_wpState = MOS_New(CodechalEncodeWPG12, this);
    CODECHAL_ENCODE_CHK_NULL_RETURN(m_wpState);
    m_wpState->SetKernelBase(m_kernelBase);
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_wpState->InitKernelState());

    // Intra distortion kernel
    m_intraDistKernel = MOS_New(CodechalKernelIntraDist, this);
    CODECHAL_ENCODE_CHK_NULL_RETURN(m_intraDistKernel);
    CODECHAL_ENCODE_CHK_STATUS_RETURN(
        m_intraDistKernel->Initialize(GetCommonKernelHeaderAndSizeG12, m_kernelBase, m_kuidCommon));

    // SW scoreboard init kernel state
    m_swScoreboardState = MOS_New(CodechalEncodeSwScoreboardG12, this);
    CODECHAL_ENCODE_CHK_NULL_RETURN(m_swScoreboardState);
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_swScoreboardState->InitKernelState());

    // HME kernel
    m_hmeKernel = MOS_New(CodechalKernelHmeG12, this);
    CODECHAL_ENCODE_CHK_NULL_RETURN(m_hmeKernel);
    CODECHAL_ENCODE_CHK_STATUS_RETURN(
        m_hmeKernel->Initialize(GetCommonKernelHeaderAndSizeG12, m_kernelBase, m_kuidCommon));

    return MOS_STATUS_SUCCESS;
}

//!
//! \brief    Fills the HuC PAK-integration DMEM buffer for the BRC path and describes it
//!           in \p dmemParams.
//! \details  Locks m_resHucPakStitchDmemBuffer[m_currRecycledBufIdx][currentPass], writes a
//!           HucPakStitchDmemEncG12 structure built from the picture/sequence parameters,
//!           the tile layout and the per-pipe statistics offsets, then unlocks it.
//!           Everything that can fail is validated before the buffer is locked, so no
//!           early return leaves the resource locked.
//! \param    [out] dmemParams
//!           HuC DMEM state parameters; zeroed and then pointed at the DMEM buffer.
//! \return   MOS_STATUS
//!           MOS_STATUS_INVALID_PARAMETER if the current pass is out of range, BRC is not
//!           enabled, or the pipe/tile counts are zero;
//!           MOS_STATUS_NULL_POINTER (via the CHK macros) if \p dmemParams, the slice data
//!           or the locked buffer is null; MOS_STATUS_SUCCESS otherwise.
//!
MOS_STATUS CodechalEncHevcStateG12::SetDmemHuCPakIntegrate(
    PMHW_VDBOX_HUC_DMEM_STATE_PARAMS dmemParams)
{
    CODECHAL_ENCODE_FUNCTION_ENTER;

    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    int32_t currentPass = GetCurrentPass();
    if (currentPass < 0 || currentPass >= CODECHAL_HEVC_MAX_NUM_BRC_PASSES || !m_brcEnabled)
    {
        eStatus = MOS_STATUS_INVALID_PARAMETER;
        return eStatus;
    }

    // All fallible checks happen before the lock: returning between Lock and Unlock
    // would leave the DMEM buffer locked.
    CODECHAL_ENCODE_CHK_NULL_RETURN(dmemParams);

    PCODEC_ENCODER_SLCDATA slcData = m_slcData;
    CODECHAL_ENCODE_CHK_NULL_RETURN(slcData);

    uint16_t numTileColumns = m_hevcPicParams->num_tile_columns_minus1 + 1;
    CODECHAL_ENCODE_ASSERT(numTileColumns > 0 && numTileColumns % 2 == 0);                       //numTileColumns is nonzero and even number; 2 or 4
    CODECHAL_ENCODE_ASSERT(m_numPipe > 0 && m_numPipe % 2 == 0 && numTileColumns <= m_numPipe);  //ucNumPipe is nonzero and even number; 2 or 4

    // The asserts above may compile out; guard the division by m_numPipe and the
    // m_tileParams[m_numTiles - 1] access below at run time as well.
    if (m_numPipe == 0 || m_numTiles == 0)
    {
        eStatus = MOS_STATUS_INVALID_PARAMETER;
        return eStatus;
    }
    uint16_t numTilesPerPipe = m_numTiles / m_numPipe;

    // Total size of all slice headers; each header is rounded up to whole bytes
    uint32_t totalSliceHeaderSize = 0;
    for (uint32_t slcCount = 0; slcCount < m_numSlices; slcCount++)
    {
        totalSliceHeaderSize += (slcData[slcCount].BitSize + 7) >> 3;
    }

    MOS_LOCK_PARAMS lockFlagsWriteOnly;
    MOS_ZeroMemory(&lockFlagsWriteOnly, sizeof(MOS_LOCK_PARAMS));
    lockFlagsWriteOnly.WriteOnly = true;

    HucPakStitchDmemEncG12 *hucPakStitchDmem = static_cast<HucPakStitchDmemEncG12 *>(m_osInterface->pfnLockResource(
        m_osInterface, &(m_resHucPakStitchDmemBuffer[m_currRecycledBufIdx][currentPass]), &lockFlagsWriteOnly));
    CODECHAL_ENCODE_CHK_NULL_RETURN(hucPakStitchDmem);

    // ---- No early returns from here until pfnUnlockResource ----

    MOS_ZeroMemory(hucPakStitchDmem, sizeof(HucPakStitchDmemEncG12));

    // reset all the offsets to -1
    // NOTE(review): filling from the start of the structure assumes the six offset arrays
    // are laid out contiguously at its beginning - confirm against HucPakStitchDmemEncG12.
    uint32_t TotalOffsetSize = sizeof(hucPakStitchDmem->TileSizeRecord_offset) +
                               sizeof(hucPakStitchDmem->VDENCSTAT_offset) +
                               sizeof(hucPakStitchDmem->HEVC_PAKSTAT_offset) +
                               sizeof(hucPakStitchDmem->HEVC_Streamout_offset) +
                               sizeof(hucPakStitchDmem->VP9_PAK_STAT_offset) +
                               sizeof(hucPakStitchDmem->Vp9CounterBuffer_offset);
    MOS_FillMemory(hucPakStitchDmem, TotalOffsetSize, 0xFF);

    hucPakStitchDmem->PicWidthInPixel          = (uint16_t)m_frameWidth;
    hucPakStitchDmem->PicHeightInPixel         = (uint16_t)m_frameHeight;
    hucPakStitchDmem->TotalNumberOfPAKs        = m_numPipe;
    hucPakStitchDmem->Codec                    = 1;  // 1: HEVC DP; 2: HEVC VDEnc; 3: VP9 VDEnc
    hucPakStitchDmem->MAXPass                  = m_brcEnabled ? (m_numPassesInOnePipe + 1) : 1;
    hucPakStitchDmem->CurrentPass              = (uint8_t)currentPass + 1;  // Current BRC pass [1..MAXPass]
    hucPakStitchDmem->MinCUSize                = m_hevcSeqParams->log2_min_coding_block_size_minus3 + 3;
    hucPakStitchDmem->CabacZeroWordFlag        = true;
    hucPakStitchDmem->bitdepth_luma            = m_hevcSeqParams->bit_depth_luma_minus8 + 8;    // default: 8
    hucPakStitchDmem->bitdepth_chroma          = m_hevcSeqParams->bit_depth_chroma_minus8 + 8;  // default: 8
    hucPakStitchDmem->ChromaFormatIdc          = m_hevcSeqParams->chroma_format_idc;
    hucPakStitchDmem->TotalSizeInCommandBuffer = m_numTiles * CODECHAL_CACHELINE_SIZE;
    // Last tile length may get modified by HuC. Obtain last Tile Record, Add an offset of 8bytes to skip address field in Tile Record
    hucPakStitchDmem->OffsetInCommandBuffer   = m_tileParams[m_numTiles - 1].TileSizeStreamoutOffset * CODECHAL_CACHELINE_SIZE + 8;
    hucPakStitchDmem->LastTileBS_StartInBytes = m_tileParams[m_numTiles - 1].BitstreamByteOffset * CODECHAL_CACHELINE_SIZE;

    hucPakStitchDmem->StitchEnable        = false;
    hucPakStitchDmem->StitchCommandOffset = 0;
    hucPakStitchDmem->BBEndforStitch      = HUC_BATCH_BUFFER_END;
    hucPakStitchDmem->brcUnderFlowEnable  = false;  //temporally disable underflow bit rate control in HUC fw since it need more tuning.

    hucPakStitchDmem->SliceHeaderSizeinBits = totalSliceHeaderSize * 8;
    hucPakStitchDmem->currFrameBRClevel     = m_currFrameBrcLevel;

    //Set the kernel output offsets
    hucPakStitchDmem->TileSizeRecord_offset[0] = m_hevcFrameStatsOffset.uiTileSizeRecord;
    hucPakStitchDmem->HEVC_PAKSTAT_offset[0]   = m_hevcFrameStatsOffset.uiHevcPakStatistics;
    hucPakStitchDmem->HEVC_Streamout_offset[0] = 0xFFFFFFFF;
    hucPakStitchDmem->VDENCSTAT_offset[0]      = 0xFFFFFFFF;

    for (auto i = 0; i < m_numPipe; i++)
    {
        hucPakStitchDmem->NumTiles[i] = numTilesPerPipe;

        // Statistics are dumped out at a tile level. Driver shares with kernel starting offset of each pipe statistic.
        // Offset is calculated by adding size of statistics/pipe to the offset in combined statistics region.
        hucPakStitchDmem->TileSizeRecord_offset[i + 1] = (i * numTilesPerPipe * m_hevcStatsSize.uiTileSizeRecord) +
                                                         m_hevcTileStatsOffset.uiTileSizeRecord;
        hucPakStitchDmem->HEVC_PAKSTAT_offset[i + 1] = (i * numTilesPerPipe * m_hevcStatsSize.uiHevcPakStatistics) +
                                                       m_hevcTileStatsOffset.uiHevcPakStatistics;
    }

    m_osInterface->pfnUnlockResource(m_osInterface, &(m_resHucPakStitchDmemBuffer[m_currRecycledBufIdx][currentPass]));

    // Describe the freshly written DMEM buffer to the caller
    MOS_ZeroMemory(dmemParams, sizeof(MHW_VDBOX_HUC_DMEM_STATE_PARAMS));
    dmemParams->presHucDataSource = &(m_resHucPakStitchDmemBuffer[m_currRecycledBufIdx][currentPass]);
    dmemParams->dwDataLength      = MOS_ALIGN_CEIL(sizeof(HucPakStitchDmemEncG12), CODECHAL_CACHELINE_SIZE);
    dmemParams->dwDmemOffset      = HUC_DMEM_OFFSET_RTOS_GEMS;

    return eStatus;
}

//!
//! \brief    Sets up the HuC virtual-address regions for PAK integration (BRC path).
//! \details  Validates the current pass, prepares the stitch data buffer through
//!           ConfigStitchDataBuffer(), then zeroes \p virtualAddrParams and binds the
//!           statistics, bitstream, BRC and tile-record resources to their regions.
//! \param    [out] virtualAddrParams
//!           HuC virtual address parameters to fill.
//! \return   MOS_STATUS
//!           MOS_STATUS_INVALID_PARAMETER for an unsupported pass, the status of
//!           ConfigStitchDataBuffer() if it fails, MOS_STATUS_SUCCESS otherwise.
//!
MOS_STATUS CodechalEncHevcStateG12::SetRegionsHuCPakIntegrate(
    PMHW_VDBOX_HUC_VIRTUAL_ADDR_PARAMS virtualAddrParams)
{
    CODECHAL_ENCODE_FUNCTION_ENTER;

    const int32_t passIdx = GetCurrentPass();
    if (passIdx < 0)
    {
        return MOS_STATUS_INVALID_PARAMETER;
    }
    if (passIdx >= CODECHAL_HEVC_MAX_NUM_BRC_PASSES && m_brcEnabled)
    {
        return MOS_STATUS_INVALID_PARAMETER;
    }
    if (passIdx != 0 && m_cqpEnabled)
    {
        return MOS_STATUS_INVALID_PARAMETER;
    }

    CODECHAL_ENCODE_CHK_STATUS_RETURN(ConfigStitchDataBuffer());

    MOS_ZeroMemory(virtualAddrParams, sizeof(MHW_VDBOX_HUC_VIRTUAL_ADDR_PARAMS));
    auto &regions = virtualAddrParams->regionParams;

    // Region 0 - Tile based input statistics from PAK/ VDEnc
    regions[0].presRegion = &m_resTileBasedStatisticsBuffer[m_virtualEngineBbIndex].sResource;
    regions[0].dwOffset   = 0;

    // Region 1 - HuC Frame statistics output
    regions[1].presRegion = &m_resHuCPakAggregatedFrameStatsBuffer.sResource;
    regions[1].isWritable = true;

    // Region 4 - Last Tile bitstream
    regions[4].presRegion = &m_resBitstreamBuffer;

    // Region 5 - HuC modifies the last tile bitstream before stitch command
    regions[5].presRegion = &m_resBitstreamBuffer;
    regions[5].isWritable = true;

    // Region 6 - History Buffer (Input/Output)
    regions[6].presRegion = &m_brcBuffers.resBrcHistoryBuffer;
    regions[6].isWritable = true;

    // Region 7 - HCP PIC state command
    regions[7].presRegion = &m_brcBuffers.resBrcImageStatesWriteBuffer[m_currRecycledBufIdx];

    // Region 8 - data buffer read by HUC for stitching cmd generation
    regions[8].presRegion = &m_resHucStitchDataBuffer[m_currRecycledBufIdx][passIdx];

    // Region 9 - HuC outputs BRC data
    regions[9].presRegion = &m_resBrcDataBuffer;
    regions[9].isWritable = true;

    // Region 10 - SLB for stitching cmd output from Huc
    regions[10].presRegion = &m_HucStitchCmdBatchBuffer.OsResource;
    regions[10].isWritable = true;

    // Region 15 [In/Out] - Tile Record Buffer
    regions[15].presRegion = &m_tileRecordBuffer[m_virtualEngineBbIndex].sResource;
    regions[15].dwOffset   = 0;

    return MOS_STATUS_SUCCESS;
}

//!
//! \brief    Fills the HuC PAK-integration DMEM buffer for the CQP/ICQ path and describes
//!           it in \p dmemParams.
//! \details  Only pass 0 is accepted, and only when CQP is enabled or the rate control
//!           method is ICQ. Locks m_resHucPakStitchDmemBuffer[m_currRecycledBufIdx][0],
//!           writes a HucPakStitchDmemEncG12 structure and unlocks it. All validation is
//!           done before the buffer is locked.
//! \param    [out] dmemParams
//!           HuC DMEM state parameters; zeroed and then pointed at the DMEM buffer.
//! \return   MOS_STATUS
//!           MOS_STATUS_INVALID_PARAMETER if the pass / rate-control mode is not
//!           supported or the pipe/tile counts are zero;
//!           MOS_STATUS_NULL_POINTER (via the CHK macros) if \p dmemParams or the locked
//!           buffer is null; MOS_STATUS_SUCCESS otherwise.
//!
MOS_STATUS CodechalEncHevcStateG12::SetDmemHuCPakIntegrateCqp(
    PMHW_VDBOX_HUC_DMEM_STATE_PARAMS dmemParams)
{
    CODECHAL_ENCODE_FUNCTION_ENTER;

    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    int32_t currentPass = GetCurrentPass();
    if (currentPass != 0 || (!m_cqpEnabled && m_hevcSeqParams->RateControlMethod != RATECONTROL_ICQ))
    {
        eStatus = MOS_STATUS_INVALID_PARAMETER;
        return eStatus;
    }

    // Validate before locking so that no error path returns with the buffer locked.
    CODECHAL_ENCODE_CHK_NULL_RETURN(dmemParams);

    uint16_t numTileColumns = m_hevcPicParams->num_tile_columns_minus1 + 1;
    CODECHAL_ENCODE_ASSERT(numTileColumns > 0 && numTileColumns % 2 == 0);                       //numTileColumns is nonzero and even number; 2 or 4
    CODECHAL_ENCODE_ASSERT(m_numPipe > 0 && m_numPipe % 2 == 0 && numTileColumns <= m_numPipe);  //ucNumPipe is nonzero and even number; 2 or 4

    // The asserts above may compile out; guard the division by m_numPipe and the
    // m_tileParams[m_numTiles - 1] access below at run time as well.
    if (m_numPipe == 0 || m_numTiles == 0)
    {
        eStatus = MOS_STATUS_INVALID_PARAMETER;
        return eStatus;
    }
    uint16_t numTilesPerPipe = m_numTiles / m_numPipe;

    MOS_LOCK_PARAMS lockFlagsWriteOnly;
    MOS_ZeroMemory(&lockFlagsWriteOnly, sizeof(MOS_LOCK_PARAMS));
    lockFlagsWriteOnly.WriteOnly = true;

    HucPakStitchDmemEncG12 *hucPakStitchDmem = static_cast<HucPakStitchDmemEncG12 *>(m_osInterface->pfnLockResource(
        m_osInterface, &(m_resHucPakStitchDmemBuffer[m_currRecycledBufIdx][currentPass]), &lockFlagsWriteOnly));
    CODECHAL_ENCODE_CHK_NULL_RETURN(hucPakStitchDmem);

    // ---- No early returns from here until pfnUnlockResource ----

    MOS_ZeroMemory(hucPakStitchDmem, sizeof(HucPakStitchDmemEncG12));

    // reset all the offsets to -1
    // NOTE(review): filling from the start of the structure assumes the six offset arrays
    // are laid out contiguously at its beginning - confirm against HucPakStitchDmemEncG12.
    uint32_t TotalOffsetSize = sizeof(hucPakStitchDmem->TileSizeRecord_offset) +
                               sizeof(hucPakStitchDmem->VDENCSTAT_offset) +
                               sizeof(hucPakStitchDmem->HEVC_PAKSTAT_offset) +
                               sizeof(hucPakStitchDmem->HEVC_Streamout_offset) +
                               sizeof(hucPakStitchDmem->VP9_PAK_STAT_offset) +
                               sizeof(hucPakStitchDmem->Vp9CounterBuffer_offset);
    MOS_FillMemory(hucPakStitchDmem, TotalOffsetSize, 0xFF);

    hucPakStitchDmem->PicWidthInPixel          = (uint16_t)m_frameWidth;
    hucPakStitchDmem->PicHeightInPixel         = (uint16_t)m_frameHeight;
    hucPakStitchDmem->TotalNumberOfPAKs        = m_numPipe;
    hucPakStitchDmem->Codec                    = 2;  //HEVC DP CQP
    hucPakStitchDmem->MAXPass                  = 1;  // single pass only on this path
    hucPakStitchDmem->CurrentPass              = 1;
    hucPakStitchDmem->MinCUSize                = m_hevcSeqParams->log2_min_coding_block_size_minus3 + 3;
    hucPakStitchDmem->CabacZeroWordFlag        = true;
    hucPakStitchDmem->bitdepth_luma            = m_hevcSeqParams->bit_depth_luma_minus8 + 8;    // default: 8
    hucPakStitchDmem->bitdepth_chroma          = m_hevcSeqParams->bit_depth_chroma_minus8 + 8;  // default: 8
    hucPakStitchDmem->ChromaFormatIdc          = m_hevcSeqParams->chroma_format_idc;
    hucPakStitchDmem->TotalSizeInCommandBuffer = m_numTiles * CODECHAL_CACHELINE_SIZE;
    // Last tile length may get modified by HuC. Obtain last Tile Record, Add an offset of 8bytes to skip address field in Tile Record
    hucPakStitchDmem->OffsetInCommandBuffer   = m_tileParams[m_numTiles - 1].TileSizeStreamoutOffset * CODECHAL_CACHELINE_SIZE + 8;
    hucPakStitchDmem->LastTileBS_StartInBytes = m_tileParams[m_numTiles - 1].BitstreamByteOffset * CODECHAL_CACHELINE_SIZE;

    hucPakStitchDmem->StitchEnable        = false;
    hucPakStitchDmem->StitchCommandOffset = 0;
    hucPakStitchDmem->BBEndforStitch      = HUC_BATCH_BUFFER_END;

    //Set the kernel output offsets
    hucPakStitchDmem->TileSizeRecord_offset[0] = m_hevcFrameStatsOffset.uiTileSizeRecord;
    hucPakStitchDmem->HEVC_PAKSTAT_offset[0]   = 0xFFFFFFFF;
    hucPakStitchDmem->HEVC_Streamout_offset[0] = 0xFFFFFFFF;
    hucPakStitchDmem->VDENCSTAT_offset[0]      = 0xFFFFFFFF;

    for (auto i = 0; i < m_numPipe; i++)
    {
        hucPakStitchDmem->NumTiles[i] = numTilesPerPipe;

        // Statistics are dumped out at a tile level. Driver shares with kernel starting offset of each pipe statistic.
        // Offset is calculated by adding size of statistics/pipe to the offset in combined statistics region.
        hucPakStitchDmem->TileSizeRecord_offset[i + 1] = (i * numTilesPerPipe * m_hevcStatsSize.uiTileSizeRecord) +
                                                         m_hevcTileStatsOffset.uiTileSizeRecord;
    }

    m_osInterface->pfnUnlockResource(m_osInterface, &(m_resHucPakStitchDmemBuffer[m_currRecycledBufIdx][currentPass]));

    // Describe the freshly written DMEM buffer to the caller
    MOS_ZeroMemory(dmemParams, sizeof(MHW_VDBOX_HUC_DMEM_STATE_PARAMS));
    dmemParams->presHucDataSource = &(m_resHucPakStitchDmemBuffer[m_currRecycledBufIdx][currentPass]);
    dmemParams->dwDataLength      = MOS_ALIGN_CEIL(sizeof(HucPakStitchDmemEncG12), CODECHAL_CACHELINE_SIZE);
    dmemParams->dwDmemOffset      = HUC_DMEM_OFFSET_RTOS_GEMS;

    return eStatus;
}

//!
//! \brief    Writes the HuC stitch input command into the stitch data buffer of the
//!           current recycled-buffer index and pass.
//! \details  Builds a single HucInputCmdG12 (list mode) whose source is the tile record
//!           buffer and whose destination is the bitstream buffer, then copies it into
//!           InputCOM[0] of m_resHucStitchDataBuffer[m_currRecycledBufIdx][currentPass].
//!           Every step that can fail (CP interface check, resource registration) runs
//!           before the data buffer is locked, so no error path returns with it locked.
//! \return   MOS_STATUS
//!           MOS_STATUS_INVALID_PARAMETER for an unsupported pass;
//!           MOS_STATUS_NULL_POINTER (via the CHK macros) if the CP interface or the
//!           locked buffer is null; the failing status of resource registration or of
//!           the final copy; MOS_STATUS_SUCCESS otherwise.
//!
MOS_STATUS CodechalEncHevcStateG12::ConfigStitchDataBuffer()
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;
    CODECHAL_ENCODE_FUNCTION_ENTER;

    int32_t currentPass = GetCurrentPass();
    if (currentPass < 0 ||
        (currentPass >= CODECHAL_HEVC_MAX_NUM_BRC_PASSES && m_brcEnabled) ||
        (currentPass != 0 && m_cqpEnabled))
    {
        eStatus = MOS_STATUS_INVALID_PARAMETER;
        return eStatus;
    }

    CODECHAL_ENCODE_CHK_NULL_RETURN(m_osInterface->osCpInterface);

    // Register source (tile records) and destination (bitstream) so that their
    // graphics addresses can be queried below.
    PMOS_RESOURCE presSrc = &m_tileRecordBuffer[m_virtualEngineBbIndex].sResource;

    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_osInterface->pfnRegisterResource(
        m_osInterface,
        presSrc,
        false,
        false));
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_osInterface->pfnRegisterResource(
        m_osInterface,
        &m_resBitstreamBuffer,
        true,
        true));

    // Build the command on the stack first; nothing below this point can fail
    // until the buffer is locked.
    HucInputCmdG12 hucInputCmd;
    MOS_ZeroMemory(&hucInputCmd, sizeof(HucInputCmdG12));

    hucInputCmd.SelectionForIndData = m_osInterface->osCpInterface->IsCpEnabled() ? 4 : 0;
    hucInputCmd.CmdMode             = HUC_CMD_LIST_MODE;
    hucInputCmd.LengthOfTable       = (uint8_t)(m_numTiles);
    hucInputCmd.CopySize            = m_hwInterface->m_tileRecordSize;

    // 64-bit graphics addresses are passed to HuC as low/high 32-bit halves
    uint64_t srcAddr  = m_osInterface->pfnGetResourceGfxAddress(m_osInterface, presSrc);
    uint64_t destAddr = m_osInterface->pfnGetResourceGfxAddress(m_osInterface, &m_resBitstreamBuffer);

    hucInputCmd.SrcAddrBottom  = (uint32_t)(srcAddr & 0x00000000FFFFFFFF);
    hucInputCmd.SrcAddrTop     = (uint32_t)((srcAddr & 0xFFFFFFFF00000000) >> 32);
    hucInputCmd.DestAddrBottom = (uint32_t)(destAddr & 0x00000000FFFFFFFF);
    hucInputCmd.DestAddrTop    = (uint32_t)((destAddr & 0xFFFFFFFF00000000) >> 32);

    MOS_LOCK_PARAMS lockFlagsWriteOnly;
    MOS_ZeroMemory(&lockFlagsWriteOnly, sizeof(MOS_LOCK_PARAMS));
    lockFlagsWriteOnly.WriteOnly = 1;

    HucCommandData *hucStitchDataBuf = static_cast<HucCommandData *>(m_osInterface->pfnLockResource(
        m_osInterface, &m_resHucStitchDataBuffer[m_currRecycledBufIdx][currentPass], &lockFlagsWriteOnly));
    CODECHAL_ENCODE_CHK_NULL_RETURN(hucStitchDataBuf);

    // ---- No early returns from here until pfnUnlockResource ----

    MOS_ZeroMemory(hucStitchDataBuf, sizeof(HucCommandData));
    hucStitchDataBuf->TotalCommands          = 1;
    hucStitchDataBuf->InputCOM[0].SizeOfData = 0xF;

    // Keep the copy status instead of dropping it; it is reported after the unlock.
    eStatus = MOS_SecureMemcpy(hucStitchDataBuf->InputCOM[0].data, sizeof(HucInputCmdG12), &hucInputCmd, sizeof(HucInputCmdG12));

    m_osInterface->pfnUnlockResource(m_osInterface, &m_resHucStitchDataBuffer[m_currRecycledBufIdx][currentPass]);

    return eStatus;
}

//!
//! \brief    Sets up the HuC virtual-address regions for PAK integration (CQP/ICQ path).
//! \details  Validates the current pass and rate-control mode, zeroes
//!           \p virtualAddrParams, prepares the stitch data buffer through
//!           ConfigStitchDataBuffer(), then binds the statistics, bitstream, BRC and
//!           tile-record resources to their regions.
//! \param    [out] virtualAddrParams
//!           HuC virtual address parameters to fill.
//! \return   MOS_STATUS
//!           MOS_STATUS_INVALID_PARAMETER for an unsupported pass / mode, the status of
//!           ConfigStitchDataBuffer() if it fails, MOS_STATUS_SUCCESS otherwise.
//!
MOS_STATUS CodechalEncHevcStateG12::SetRegionsHuCPakIntegrateCqp(
    PMHW_VDBOX_HUC_VIRTUAL_ADDR_PARAMS virtualAddrParams)
{
    CODECHAL_ENCODE_FUNCTION_ENTER;

    const int32_t passIdx = GetCurrentPass();
    if (passIdx < 0)
    {
        return MOS_STATUS_INVALID_PARAMETER;
    }
    // With BRC enabled, only ICQ is accepted on this path
    if (m_hevcSeqParams->RateControlMethod != RATECONTROL_ICQ && m_brcEnabled)
    {
        return MOS_STATUS_INVALID_PARAMETER;
    }
    // CQP runs pass 0 only
    if (passIdx != 0 && m_cqpEnabled)
    {
        return MOS_STATUS_INVALID_PARAMETER;
    }

    MOS_ZeroMemory(virtualAddrParams, sizeof(MHW_VDBOX_HUC_VIRTUAL_ADDR_PARAMS));

    CODECHAL_ENCODE_CHK_STATUS_RETURN(ConfigStitchDataBuffer());

    auto &regions = virtualAddrParams->regionParams;

    // Region 0 - Tile based input statistics from PAK/ VDEnc
    regions[0].presRegion = &m_resTileBasedStatisticsBuffer[m_virtualEngineBbIndex].sResource;
    regions[0].dwOffset   = 0;

    // Region 1 - HuC Frame statistics output
    regions[1].presRegion = &m_resHuCPakAggregatedFrameStatsBuffer.sResource;
    regions[1].isWritable = true;

    // Region 4 - Last Tile bitstream
    regions[4].presRegion = &m_resBitstreamBuffer;

    // Region 5 - HuC modifies the last tile bitstream before stitch command
    regions[5].presRegion = &m_resBitstreamBuffer;
    regions[5].isWritable = true;

    // Region 6 - History Buffer (Input/Output)
    regions[6].presRegion = &m_brcBuffers.resBrcHistoryBuffer;
    regions[6].isWritable = true;

    // Region 7 - HCP PIC state command
    regions[7].presRegion = &m_brcBuffers.resBrcImageStatesWriteBuffer[m_currRecycledBufIdx];

    // Region 8 - data buffer read by HUC for stitching cmd generation
    regions[8].presRegion = &m_resHucStitchDataBuffer[m_currRecycledBufIdx][passIdx];

    // Region 9 - HuC outputs BRC data
    regions[9].presRegion = &m_resBrcDataBuffer;
    regions[9].isWritable = true;

    // Region 10 - SLB for stitching cmd output from Huc
    regions[10].presRegion = &m_HucStitchCmdBatchBuffer.OsResource;
    regions[10].isWritable = true;

    // Region 15 [In/Out] - Tile Record Buffer
    regions[15].presRegion = &m_tileRecordBuffer[m_virtualEngineBbIndex].sResource;
    regions[15].dwOffset   = 0;

    return MOS_STATUS_SUCCESS;
}

#if (_DEBUG || _RELEASE_INTERNAL)
//!
//! \brief    Clear the HCP image status control entry of the PAK statistics buffer
//! \details  Appends an MI store-data-immediate command that writes 0 to the
//!           HCP_IMAGE_STATUS_CONTROL field of the BRC PAK statistics buffer
//!           currently selected for write (uiCurrBrcPakStasIdxForWrite).
//! \param    [in] cmdBuffer
//!           Command buffer the store command is added to; must not be null
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, else fail reason
//!
MOS_STATUS CodechalEncHevcStateG12::ResetImgCtrlRegInPAKStatisticsBuffer(
    PMOS_COMMAND_BUFFER cmdBuffer)
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    CODECHAL_ENCODE_FUNCTION_ENTER;

    // Same up-front validation as ReadBrcPakStatisticsForScalability()
    CODECHAL_ENCODE_CHK_NULL_RETURN(cmdBuffer);

    MHW_MI_STORE_DATA_PARAMS storeDataParams;
    MOS_ZeroMemory(&storeDataParams, sizeof(storeDataParams));
    storeDataParams.pOsResource      = &m_brcBuffers.resBrcPakStatisticBuffer[m_brcBuffers.uiCurrBrcPakStasIdxForWrite];
    storeDataParams.dwResourceOffset = CODECHAL_OFFSETOF(CODECHAL_ENCODE_HEVC_PAK_STATS_BUFFER, HCP_IMAGE_STATUS_CONTROL);
    storeDataParams.dwValue          = 0;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_miInterface->AddMiStoreDataImmCmd(
        cmdBuffer,
        &storeDataParams));

    return eStatus;
}
#endif

//!
//! \brief    Propagate the PAK integration BRC data into the BRC PAK statistics buffer
//! \details  Adds three MI copy-mem-mem commands that copy FrameByteCount,
//!           FrameByteCountNoHeader and HCP_ImageStatusControl from m_resBrcDataBuffer
//!           into the matching fields of the PAK statistics buffer currently selected
//!           for write, then adds a store-data-immediate command that writes the
//!           current pass number into the encode status buffer.
//! \param    [in] cmdBuffer
//!           Command buffer the commands are added to; must not be null
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, else fail reason
//!
MOS_STATUS CodechalEncHevcStateG12::ReadBrcPakStatisticsForScalability(
    PMOS_COMMAND_BUFFER cmdBuffer)
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    CODECHAL_ENCODE_FUNCTION_ENTER;

    CODECHAL_ENCODE_CHK_NULL_RETURN(cmdBuffer);

    // Frame byte count
    MHW_MI_COPY_MEM_MEM_PARAMS miCpyMemMemParams;
    MOS_ZeroMemory(&miCpyMemMemParams, sizeof(miCpyMemMemParams));
    miCpyMemMemParams.presSrc     = &m_resBrcDataBuffer;
    miCpyMemMemParams.dwSrcOffset = CODECHAL_OFFSETOF(PakIntegrationBrcData, FrameByteCount);
    miCpyMemMemParams.presDst     = &m_brcBuffers.resBrcPakStatisticBuffer[m_brcBuffers.uiCurrBrcPakStasIdxForWrite];
    miCpyMemMemParams.dwDstOffset = CODECHAL_OFFSETOF(CODECHAL_ENCODE_HEVC_PAK_STATS_BUFFER, HCP_BITSTREAM_BYTECOUNT_FRAME);
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_miInterface->AddMiCopyMemMemCmd(cmdBuffer, &miCpyMemMemParams));

    // Frame byte count without headers
    MOS_ZeroMemory(&miCpyMemMemParams, sizeof(miCpyMemMemParams));
    miCpyMemMemParams.presSrc     = &m_resBrcDataBuffer;
    miCpyMemMemParams.dwSrcOffset = CODECHAL_OFFSETOF(PakIntegrationBrcData, FrameByteCountNoHeader);
    miCpyMemMemParams.presDst     = &m_brcBuffers.resBrcPakStatisticBuffer[m_brcBuffers.uiCurrBrcPakStasIdxForWrite];
    miCpyMemMemParams.dwDstOffset = CODECHAL_OFFSETOF(CODECHAL_ENCODE_HEVC_PAK_STATS_BUFFER, HCP_BITSTREAM_BYTECOUNT_FRAME_NOHEADER);
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_miInterface->AddMiCopyMemMemCmd(cmdBuffer, &miCpyMemMemParams));

    // Image status control
    MOS_ZeroMemory(&miCpyMemMemParams, sizeof(miCpyMemMemParams));
    miCpyMemMemParams.presSrc     = &m_resBrcDataBuffer;
    miCpyMemMemParams.dwSrcOffset = CODECHAL_OFFSETOF(PakIntegrationBrcData, HCP_ImageStatusControl);
    miCpyMemMemParams.presDst     = &m_brcBuffers.resBrcPakStatisticBuffer[m_brcBuffers.uiCurrBrcPakStasIdxForWrite];
    miCpyMemMemParams.dwDstOffset = CODECHAL_OFFSETOF(CODECHAL_ENCODE_HEVC_PAK_STATS_BUFFER, HCP_IMAGE_STATUS_CONTROL);
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_miInterface->AddMiCopyMemMemCmd(cmdBuffer, &miCpyMemMemParams));

    uint32_t dwOffset = (m_encodeStatusBuf.wCurrIndex * m_encodeStatusBuf.dwReportSize) +
                        m_encodeStatusBuf.dwNumPassesOffset +  // Num passes offset
                        sizeof(uint32_t) * 2;                  // encodeStatus is offset by 2 DWs in the resource

    // Zero the whole parameter block first: only three fields are assigned below and
    // the remaining ones must not be left holding indeterminate stack contents.
    MHW_MI_STORE_DATA_PARAMS storeDataParams;
    MOS_ZeroMemory(&storeDataParams, sizeof(storeDataParams));
    storeDataParams.pOsResource      = &m_encodeStatusBuf.resStatusBuffer;
    storeDataParams.dwResourceOffset = dwOffset;
    storeDataParams.dwValue          = (uint8_t)GetCurrentPass();
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_miInterface->AddMiStoreDataImmCmd(cmdBuffer, &storeDataParams));

    return eStatus;
}

//!
//! \brief    Dump the HuC PAK-integration DMEM and region buffers through the debug interface
//! \details  Dumps are only issued on the last pipe when at least two pipes are in use
//!           and BRC is enabled; when single task phase is supported they are further
//!           limited to the last pass. The dump calls themselves are wrapped in
//!           CODECHAL_DEBUG_TOOL.
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, else fail reason
//!
MOS_STATUS CodechalEncHevcStateG12::DumpHucDebugOutputBuffers()
{
    // Only dump HuC in/out buffers in the BRC scalability case
    if (!IsLastPipe() || (m_numPipe < 2) || !m_brcEnabled)
    {
        return MOS_STATUS_SUCCESS;
    }

    // With single task phase only the last pass is dumped
    if (m_singleTaskPhaseSupported && !IsLastPass())
    {
        return MOS_STATUS_SUCCESS;
    }

    CODECHAL_DEBUG_TOOL(
        int32_t passIdx = GetCurrentPass();

        // DMEM consumed by the PAK stitch/integration HuC kernel
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpHucDmem(
            &m_resHucPakStitchDmemBuffer[m_currRecycledBufIdx][passIdx],
            sizeof(HucPakStitchDmemEncG12),
            passIdx,
            hucRegionDumpPakIntegrate));

        // Region 7 - HEVC PIC State Command
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpHucRegion(
            &m_brcBuffers.resBrcImageStatesWriteBuffer[m_currRecycledBufIdx],
            0,
            m_hwInterface->m_vdenc2ndLevelBatchBufferSize,
            7,
            "_PicState",
            true,
            passIdx,
            hucRegionDumpPakIntegrate));

        // Region 5 - Last Tile PAK Bitstream Output
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpHucRegion(
            &m_resBitstreamBuffer,
            0,
            m_encodeParams.dwBitstreamSize,
            5,
            "_Bitstream",
            false,
            passIdx,
            hucRegionDumpPakIntegrate));

        // Region 6 - BRC History buffer
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpHucRegion(
            &m_brcBuffers.resBrcHistoryBuffer,
            0,
            m_brcHistoryBufferSize,
            6,
            "_HistoryBuffer",
            false,
            passIdx,
            hucRegionDumpPakIntegrate));

        // Region 9 - HCP BRC Data Output
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpHucRegion(
            &m_resBrcDataBuffer,
            0,
            CODECHAL_CACHELINE_SIZE,
            9,
            "_HcpBrcData",
            false,
            passIdx,
            hucRegionDumpPakIntegrate));

        // Region 1 - Output Aggregated Frame Level Statistics
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpHucRegion(
            &m_resHuCPakAggregatedFrameStatsBuffer.sResource,
            0,
            m_hwInterface->m_pakIntAggregatedFrameStatsSize,  // program exact out size
            1,
            "_AggregateFrameStats",
            false,
            passIdx,
            hucRegionDumpPakIntegrate));

        // Region 0 - Tile Statistics Constant Buffer
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpHucRegion(
            &m_resTileBasedStatisticsBuffer[m_virtualEngineBbIndex].sResource,
            0,
            m_hwInterface->m_pakIntTileStatsSize,
            0,
            "_TileBasedStats",
            true,
            passIdx,
            hucRegionDumpPakIntegrate));

        // Region 15 - Tile Record Buffer
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpHucRegion(
            &m_tileRecordBuffer[m_virtualEngineBbIndex].sResource,
            0,
            m_tileRecordBuffer[m_virtualEngineBbIndex].dwSize,
            15,
            "_TileRecord",
            false,
            passIdx,
            hucRegionDumpPakIntegrate));)

    return MOS_STATUS_SUCCESS;
}

//!
//! \brief    Constructor
//! \details  Sets the Gen12 feature flags and kernel identifiers, zero-initializes the
//!           resource/surface members owned by this class, then (when an OS interface
//!           is available) configures the state heap settings, looks up the combined
//!           HEVC kernel binary and enables virtual engine support.
//! \param    [in] hwInterface
//!           Hardware interface, forwarded to the base class
//! \param    [in] debugInterface
//!           Debug interface, forwarded to the base class
//! \param    [in] standardInfo
//!           Codec standard info, forwarded to the base class
//!
CodechalEncHevcStateG12::CodechalEncHevcStateG12(
    CodechalHwInterface *   hwInterface,
    CodechalDebugInterface *debugInterface,
    PCODECHAL_STANDARD_INFO standardInfo)
    : CodechalEncHevcState(hwInterface, debugInterface, standardInfo)
{
    // Feature flags
    m_useCommonKernel     = true;
    m_2xMeSupported       = true;
    m_useHwScoreboard     = false;
    m_hucPakStitchEnabled = true;

    // Kernel binary base; only present when kernels are built in
#if defined(ENABLE_KERNELS) && !defined(_FULL_OPEN_SOURCE)
    m_kernelBase = (uint8_t *)IGCODECKRN_G12;
#else
    m_kernelBase = nullptr;
#endif
    m_kuidCommon       = IDR_CODEC_HME_DS_SCOREBOARD_KERNEL;
    m_scalabilityState = nullptr;

    // ENC kernel surfaces and buffers
    MOS_ZeroMemory(&m_currPicWithReconBoundaryPix, sizeof(m_currPicWithReconBoundaryPix));
    MOS_ZeroMemory(&m_lcuLevelInputDataSurface, sizeof(m_lcuLevelInputDataSurface));
    MOS_ZeroMemory(&m_encoderHistoryInputBuffer, sizeof(m_encoderHistoryInputBuffer));
    MOS_ZeroMemory(&m_encoderHistoryOutputBuffer, sizeof(m_encoderHistoryOutputBuffer));
    MOS_ZeroMemory(&m_intermediateCuRecordSurfaceLcu32, sizeof(m_intermediateCuRecordSurfaceLcu32));
    MOS_ZeroMemory(&m_scratchSurface, sizeof(m_scratchSurface));
    MOS_ZeroMemory(&m_16x16QpInputData, sizeof(m_16x16QpInputData));
    MOS_ZeroMemory(m_debugSurface, sizeof(m_debugSurface));
    MOS_ZeroMemory(&m_encConstantTableForB, sizeof(m_encConstantTableForB));
    MOS_ZeroMemory(&m_mvAndDistortionSumSurface, sizeof(m_mvAndDistortionSumSurface));
    MOS_ZeroMemory(m_encBCombinedBuffer1, sizeof(m_encBCombinedBuffer1));
    MOS_ZeroMemory(m_encBCombinedBuffer2, sizeof(m_encBCombinedBuffer2));

    // PAK stream-out, tile statistics and scalability sync resources
    MOS_ZeroMemory(&m_resPakcuLevelStreamoutData, sizeof(m_resPakcuLevelStreamoutData));
    MOS_ZeroMemory(&m_resPakSliceLevelStreamoutData, sizeof(m_resPakSliceLevelStreamoutData));
    MOS_ZeroMemory(m_resTileBasedStatisticsBuffer, sizeof(m_resTileBasedStatisticsBuffer));
    MOS_ZeroMemory(&m_resHuCPakAggregatedFrameStatsBuffer, sizeof(m_resHuCPakAggregatedFrameStatsBuffer));
    MOS_ZeroMemory(m_tileRecordBuffer, sizeof(m_tileRecordBuffer));
    MOS_ZeroMemory(&m_kmdVeOveride, sizeof(m_kmdVeOveride));
    MOS_ZeroMemory(&m_resHcpScalabilitySyncBuffer, sizeof(m_resHcpScalabilitySyncBuffer));

    // Virtual engine batch buffers, semaphores and HuC PAK stitch resources
    MOS_ZeroMemory(m_veBatchBuffer, sizeof(m_veBatchBuffer));
    MOS_ZeroMemory(&m_realCmdBuffer, sizeof(m_realCmdBuffer));
    MOS_ZeroMemory(&m_resBrcSemaphoreMem, sizeof(m_resBrcSemaphoreMem));
    MOS_ZeroMemory(&m_resBrcPakSemaphoreMem, sizeof(m_resBrcPakSemaphoreMem));
    MOS_ZeroMemory(&m_resPipeStartSemaMem, sizeof(m_resPipeStartSemaMem));
    MOS_ZeroMemory(&m_resPipeCompleteSemaMem, sizeof(m_resPipeCompleteSemaMem));
    MOS_ZeroMemory(m_resHucPakStitchDmemBuffer, sizeof(m_resHucPakStitchDmemBuffer));
    MOS_ZeroMemory(&m_resBrcDataBuffer, sizeof(m_resBrcDataBuffer));
    MOS_ZeroMemory(&m_skipFrameInfo.m_resMbCodeSkipFrameSurface, sizeof(m_skipFrameInfo.m_resMbCodeSkipFrameSurface));

    // Everything below needs the OS interface; leave the constructor early without it
    CODECHAL_ENCODE_CHK_NULL_NO_STATUS_RETURN(m_osInterface);

    // State heap sizing for the HEVC encoder
    m_hwInterface->GetStateHeapSettings()->dwNumSyncTags = CODECHAL_ENCODE_HEVC_NUM_SYNC_TAGS;
    m_hwInterface->GetStateHeapSettings()->dwDshSize     = CODECHAL_INIT_DSH_SIZE_HEVC_ENC;

    // Locate the combined HEVC kernel and grow the ISH by its aligned size
    m_kuid = IDR_CODEC_HEVC_COMBINED_KENREL_INTEL;

    MOS_STATUS eStatus = CodecHalGetKernelBinaryAndSize(
        m_kernelBase,
        m_kuid,
        &m_kernelBinary,
        &m_combinedKernelSize);
    CODECHAL_ENCODE_ASSERT(eStatus == MOS_STATUS_SUCCESS);

    m_hwInterface->GetStateHeapSettings()->dwIshSize +=
        MOS_ALIGN_CEIL(m_combinedKernelSize, (1 << MHW_KERNEL_OFFSET_SHIFT));

    // Virtual engine support
    m_osInterface->pfnVirtualEngineSupported(m_osInterface, false, true);

    Mos_SetVirtualEngineSupported(m_osInterface, true);
}

//!
//! \brief    Destructor
//! \details  Releases the weighted-prediction state, the intra distortion kernel, the
//!           SW scoreboard state and the scalability state if they were created, and
//!           on debug/release-internal builds the status-report debug interface.
//!
CodechalEncHevcStateG12::~CodechalEncHevcStateG12()
{
    CODECHAL_ENCODE_FUNCTION_ENTER;

    // Weighted prediction state
    if (m_wpState != nullptr)
    {
        MOS_Delete(m_wpState);
        m_wpState = nullptr;
    }

    // Intra distortion kernel
    if (m_intraDistKernel != nullptr)
    {
        MOS_Delete(m_intraDistKernel);
        m_intraDistKernel = nullptr;
    }

    // SW scoreboard state
    if (m_swScoreboardState != nullptr)
    {
        MOS_Delete(m_swScoreboardState);
        m_swScoreboardState = nullptr;
    }

    // Scalability state is released with the free-and-null helper rather than MOS_Delete
    if (m_scalabilityState != nullptr)
    {
        MOS_FreeMemAndSetNull(m_scalabilityState);
    }

#if (_DEBUG || _RELEASE_INTERNAL)
    // Debug interface created in Allocate()
    if (m_statusReportDebugInterface != nullptr)
    {
        MOS_Delete(m_statusReportDebugInterface);
        m_statusReportDebugInterface = nullptr;
    }
#endif
}

//!
//! \brief    Allocate the encoder
//! \details  On debug/release-internal builds, creates and initializes the
//!           status-report debug interface the first time it is called, then
//!           delegates to CodechalEncoderState::Allocate().
//! \param    [in] codecHalSettings
//!           Codec settings; its codecFunction is read to initialize the debug interface
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, else fail reason
//!
MOS_STATUS CodechalEncHevcStateG12::Allocate(CodechalSetting *codecHalSettings)
{
#if (_DEBUG || _RELEASE_INTERNAL)
    if (!m_statusReportDebugInterface)
    {
        // codecHalSettings is dereferenced below, so reject null before allocating anything
        CODECHAL_ENCODE_CHK_NULL_RETURN(codecHalSettings);

        m_statusReportDebugInterface = MOS_New(CodechalDebugInterface);
        CODECHAL_ENCODE_CHK_NULL_RETURN(m_statusReportDebugInterface);
        CODECHAL_ENCODE_CHK_STATUS_RETURN(
            m_statusReportDebugInterface->Initialize(m_hwInterface, codecHalSettings->codecFunction));
    }
#endif

    return CodechalEncoderState::Allocate(codecHalSettings);
}

//!
//! \brief    Get the size of a file in bytes
//! \details  Opens the file in binary read mode, seeks to its end and reports the
//!           resulting position. The file is closed again before returning.
//! \param    [in] fileName
//!           Path of the file to measure
//! \return   uint32_t
//!           File size in bytes, or 0 if the file cannot be opened, the size cannot be
//!           determined, or the size does not fit in 32 bits
//!
uint32_t CodechalEncHevcStateG12::CodecHalHevc_GetFileSize(char *fileName)
{
    if (fileName == nullptr)
    {
        return 0;
    }

    FILE *fp = nullptr;
    MosUtilities::MosSecureFileOpen(&fp, fileName, "rb");
    if (fp == nullptr)
    {
        return 0;
    }

    // ftell() reports failure as -1; keep the result signed until it has been
    // validated so an error is not returned as a ~4GB size.
    long endPos = -1;
    if (fseek(fp, 0, SEEK_END) == 0)
    {
        endPos = ftell(fp);
    }
    fclose(fp);

    if (endPos < 0 || static_cast<unsigned long long>(endPos) > 0xFFFFFFFFull)
    {
        return 0;
    }

    return static_cast<uint32_t>(endPos);
}

//!
//! \brief    Load 2x down-scaled reference and/or source surface content from files
//! \details  Active only when m_loadKernelInput is set. The file names are built from
//!           m_loadKernelInputDataFolder, the reference list/index and m_frameNum. For
//!           each surface that is supplied and whose file is non-empty, the surface is
//!           locked for write and the first two thirds of the file are read into it.
//! \param    [in] pRef2xSurface
//!           2x down-scaled reference surface, may be null
//! \param    [in] pSrc2xSurface
//!           2x down-scaled source surface, may be null
//! \param    [in] reflist
//!           Reference list number used in the reference file name
//! \param    [in] refIdx
//!           Reference index used in the reference file name
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if nothing had to be loaded or loading succeeded,
//!           MOS_STATUS_INVALID_FILE_SIZE if a file is larger than pitch * height * 3 / 2
//!           of its surface or a read comes up short, else fail reason
//!
MOS_STATUS CodechalEncHevcStateG12::LoadSourceAndRef2xDSFromFile(
    PMOS_SURFACE pRef2xSurface,
    PMOS_SURFACE pSrc2xSurface,
    uint8_t      reflist,
    uint8_t      refIdx)
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    CODECHAL_ENCODE_FUNCTION_ENTER;

    // Feature switched off
    if (!m_loadKernelInput)
    {
        return eStatus;
    }

    // A surface that is passed in must be backed by a resource
    if (pSrc2xSurface && Mos_ResourceIsNull(&pSrc2xSurface->OsResource))
    {
        return eStatus;
    }
    if (pRef2xSurface && Mos_ResourceIsNull(&pRef2xSurface->OsResource))
    {
        return eStatus;
    }

    // No destination at all
    if (pSrc2xSurface == nullptr && pRef2xSurface == nullptr)
    {
        return eStatus;
    }

    // Build both input paths
    char refFilePath[MOS_USER_CONTROL_MAX_DATA_SIZE];
    MOS_SecureStringPrint(refFilePath,
        sizeof(refFilePath),
        sizeof(refFilePath),
        "%s\\Ref2xDSL%1d%1d.dat.%d",
        m_loadKernelInputDataFolder,
        reflist,
        refIdx,
        m_frameNum);

    char srcFilePath[MOS_USER_CONTROL_MAX_DATA_SIZE];
    MOS_SecureStringPrint(srcFilePath,
        sizeof(srcFilePath),
        sizeof(srcFilePath),
        "%s\\Src2xDS.dat.%d",
        m_loadKernelInputDataFolder,
        m_frameNum);

    const uint32_t refFileSize = CodecHalHevc_GetFileSize(refFilePath);
    const uint32_t srcFileSize = CodecHalHevc_GetFileSize(srcFilePath);
    if (refFileSize == 0 && srcFileSize == 0)
    {
        return MOS_STATUS_SUCCESS;
    }

    // Reference surface
    if (pRef2xSurface && refFileSize)
    {
        if (refFileSize > (pRef2xSurface->dwPitch * pRef2xSurface->dwHeight * 3 / 2))
        {
            return MOS_STATUS_INVALID_FILE_SIZE;
        }

        MOS_LOCK_PARAMS refLockFlags;
        MOS_ZeroMemory(&refLockFlags, sizeof(MOS_LOCK_PARAMS));
        refLockFlags.WriteOnly = 1;

        uint8_t *refData = (uint8_t *)m_osInterface->pfnLockResource(
            m_osInterface, &pRef2xSurface->OsResource, &refLockFlags);
        CODECHAL_ENCODE_CHK_NULL_RETURN(refData);

        FILE *refFile = nullptr;
        eStatus       = MosUtilities::MosSecureFileOpen(&refFile, refFilePath, "rb");
        if (refFile == nullptr)
        {
            m_osInterface->pfnUnlockResource(m_osInterface, &pRef2xSurface->OsResource);
            return eStatus;
        }

        // Only two thirds of the file are transferred into the surface
        const uint32_t refBytesWanted = refFileSize * 2 / 3;
        const size_t   refBytesRead   = fread(refData, 1, refBytesWanted, refFile);

        fclose(refFile);
        m_osInterface->pfnUnlockResource(m_osInterface, &pRef2xSurface->OsResource);

        if (refBytesRead != refBytesWanted)
        {
            return MOS_STATUS_INVALID_FILE_SIZE;
        }
    }

    // Source surface
    if (pSrc2xSurface && srcFileSize)
    {
        if (srcFileSize > (pSrc2xSurface->dwPitch * pSrc2xSurface->dwHeight * 3 / 2))
        {
            return MOS_STATUS_INVALID_FILE_SIZE;
        }

        MOS_LOCK_PARAMS srcLockFlags;
        MOS_ZeroMemory(&srcLockFlags, sizeof(MOS_LOCK_PARAMS));
        srcLockFlags.WriteOnly = 1;

        uint8_t *srcData = (uint8_t *)m_osInterface->pfnLockResource(
            m_osInterface, &pSrc2xSurface->OsResource, &srcLockFlags);
        CODECHAL_ENCODE_CHK_NULL_RETURN(srcData);

        FILE *srcFile = nullptr;
        eStatus       = MosUtilities::MosSecureFileOpen(&srcFile, srcFilePath, "rb");
        if (srcFile == nullptr)
        {
            m_osInterface->pfnUnlockResource(m_osInterface, &pSrc2xSurface->OsResource);
            return eStatus;
        }

        // Only two thirds of the file are transferred into the surface
        const uint32_t srcBytesWanted = srcFileSize * 2 / 3;
        const size_t   srcBytesRead   = fread(srcData, 1, srcBytesWanted, srcFile);

        fclose(srcFile);
        m_osInterface->pfnUnlockResource(m_osInterface, &pSrc2xSurface->OsResource);

        if (srcBytesRead != srcBytesWanted)
        {
            return MOS_STATUS_INVALID_FILE_SIZE;
        }
    }

    return eStatus;
}

//!
//! \brief    Load PAK object commands, CU records and (with BRC) the picture state from files
//! \details  File names are built from m_pakOnlyDataFolder and m_frameNum. The PAK
//!           object file is read to the start of m_resMbCodeSurface and the CU record
//!           file to offset m_mvOffset of the same surface. When BRC is enabled the
//!           image-state file is read into the BRC image-states write buffer of the
//!           current recycled buffer index. Every file size is validated against the
//!           space available at its destination before anything is read.
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, MOS_STATUS_INVALID_FILE_SIZE if a file is
//!           empty, too large for its destination or a read comes up short, else
//!           fail reason
//!
MOS_STATUS CodechalEncHevcStateG12::LoadPakCommandAndCuRecordFromFile()
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    CODECHAL_ENCODE_FUNCTION_ENTER;

    char pathOfPakCmd[MOS_USER_CONTROL_MAX_DATA_SIZE];
    MOS_SecureStringPrint(pathOfPakCmd,
        sizeof(pathOfPakCmd),
        sizeof(pathOfPakCmd),
        "%s\\PAKObj.dat.%d",
        m_pakOnlyDataFolder,
        m_frameNum);

    char pathOfCuRecord[MOS_USER_CONTROL_MAX_DATA_SIZE];
    MOS_SecureStringPrint(pathOfCuRecord,
        sizeof(pathOfCuRecord),
        sizeof(pathOfCuRecord),
        "%s\\CURecord.dat.%d",
        m_pakOnlyDataFolder,
        m_frameNum);

    // PAK objects occupy [0, m_mvOffset) of the MB code surface
    uint32_t sizePakObj = CodecHalHevc_GetFileSize(pathOfPakCmd);
    if (sizePakObj == 0 || sizePakObj > m_mvOffset)
    {
        return MOS_STATUS_INVALID_FILE_SIZE;
    }

    // CU records occupy [m_mvOffset, m_mbCodeSize); the first test keeps the
    // unsigned subtraction from wrapping if the offset is beyond the surface size.
    uint32_t sizeCuRecord = CodecHalHevc_GetFileSize(pathOfCuRecord);
    if (sizeCuRecord == 0 || m_mvOffset > m_mbCodeSize || sizeCuRecord > m_mbCodeSize - m_mvOffset)
    {
        return MOS_STATUS_INVALID_FILE_SIZE;
    }

    MOS_LOCK_PARAMS lockFlags;
    MOS_ZeroMemory(&lockFlags, sizeof(MOS_LOCK_PARAMS));
    lockFlags.WriteOnly = 1;
    uint8_t *data       = (uint8_t *)m_osInterface->pfnLockResource(
        m_osInterface, &m_resMbCodeSurface, &lockFlags);
    CODECHAL_ENCODE_CHK_NULL_RETURN(data);

    // PAK object commands
    FILE *pakObj = nullptr;
    eStatus      = MosUtilities::MosSecureFileOpen(&pakObj, pathOfPakCmd, "rb");
    if (pakObj == nullptr)
    {
        m_osInterface->pfnUnlockResource(m_osInterface, &m_resMbCodeSurface);
        return eStatus;
    }

    uint8_t *pakCmd = data;
    if (sizePakObj != fread((void *)pakCmd, 1, sizePakObj, pakObj))
    {
        fclose(pakObj);
        m_osInterface->pfnUnlockResource(m_osInterface, &m_resMbCodeSurface);
        return MOS_STATUS_INVALID_FILE_SIZE;
    }
    fclose(pakObj);

    // CU records
    uint8_t *record  = data + m_mvOffset;
    FILE *   fRecord = nullptr;
    eStatus          = MosUtilities::MosSecureFileOpen(&fRecord, pathOfCuRecord, "rb");
    if (fRecord == nullptr)
    {
        m_osInterface->pfnUnlockResource(m_osInterface, &m_resMbCodeSurface);
        return eStatus;
    }

    if (sizeCuRecord != fread((void *)record, 1, sizeCuRecord, fRecord))
    {
        fclose(fRecord);
        m_osInterface->pfnUnlockResource(m_osInterface, &m_resMbCodeSurface);
        return MOS_STATUS_INVALID_FILE_SIZE;
    }
    fclose(fRecord);

    m_osInterface->pfnUnlockResource(m_osInterface, &m_resMbCodeSurface);

    if (m_brcEnabled)
    {
        //Image State
        char pathOfPicState[MOS_USER_CONTROL_MAX_DATA_SIZE];
        MOS_SecureStringPrint(pathOfPicState,
            sizeof(pathOfPicState),
            sizeof(pathOfPicState),
            "%s\\BrcUpdate_ImgStateWrite.dat.%d",
            m_pakOnlyDataFolder,
            m_frameNum);

        // The file is read straight into the locked buffer, so its size has to be
        // bounded by the buffer size or fread() would write past the allocation.
        // NOTE(review): m_brcBuffers.dwBrcHcpPicStateSize is assumed to be the
        // allocation size of resBrcImageStatesWriteBuffer - confirm against the BRC
        // resource allocation code.
        const uint32_t sizePicState = CodecHalHevc_GetFileSize(pathOfPicState);
        if (sizePicState == 0 || sizePicState > m_brcBuffers.dwBrcHcpPicStateSize)
        {
            return MOS_STATUS_INVALID_FILE_SIZE;
        }

        data = (uint8_t *)m_osInterface->pfnLockResource(
            m_osInterface, &m_brcBuffers.resBrcImageStatesWriteBuffer[m_currRecycledBufIdx], &lockFlags);
        CODECHAL_ENCODE_CHK_NULL_RETURN(data);

        FILE *fPicState = nullptr;
        eStatus         = MosUtilities::MosSecureFileOpen(&fPicState, pathOfPicState, "rb");
        if (fPicState == nullptr)
        {
            m_osInterface->pfnUnlockResource(m_osInterface, &m_brcBuffers.resBrcImageStatesWriteBuffer[m_currRecycledBufIdx]);
            return eStatus;
        }

        if (sizePicState != fread((void *)data, 1, sizePicState, fPicState))
        {
            fclose(fPicState);
            m_osInterface->pfnUnlockResource(m_osInterface, &m_brcBuffers.resBrcImageStatesWriteBuffer[m_currRecycledBufIdx]);
            return MOS_STATUS_INVALID_FILE_SIZE;
        }
        fclose(fPicState);
        m_osInterface->pfnUnlockResource(m_osInterface, &m_brcBuffers.resBrcImageStatesWriteBuffer[m_currRecycledBufIdx]);
    }

    return eStatus;
}

//!
//! \brief    Map a picture coding type to the HEVC slice type constant
//! \param    [in] pictureCodingType
//!           Picture coding type (I_TYPE, P_TYPE, B_TYPE, B1_TYPE or B2_TYPE)
//! \return   uint8_t
//!           CODECHAL_ENCODE_HEVC_I_SLICE, CODECHAL_ENCODE_HEVC_P_SLICE or
//!           CODECHAL_ENCODE_HEVC_B_SLICE; 0 (after asserting) for any other input
//!
uint8_t CodechalEncHevcStateG12::PicCodingTypeToSliceType(uint16_t pictureCodingType)
{
    switch (pictureCodingType)
    {
    case I_TYPE:
        return CODECHAL_ENCODE_HEVC_I_SLICE;

    case P_TYPE:
        return CODECHAL_ENCODE_HEVC_P_SLICE;

    // Every B flavour maps to the same slice type
    case B_TYPE:
    case B1_TYPE:
    case B2_TYPE:
        return CODECHAL_ENCODE_HEVC_B_SLICE;

    default:
        break;
    }

    // Unknown coding type: flag it and fall back to 0
    CODECHAL_ENCODE_ASSERT(false);
    return 0;
}

// The following code is from the kernel ULT
MOS_STATUS CodechalEncHevcStateG12::InitMediaObjectWalker(
    uint32_t           threadSpaceWidth,
    uint32_t           threadSpaceHeight,
    uint32_t           colorCountMinusOne,
    DependencyPattern  dependencyPattern,
    uint32_t           childThreadNumber,
    uint32_t           localLoopExecCount,
    MHW_WALKER_PARAMS &walkerParams)
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    walkerParams.ColorCountMinusOne    = colorCountMinusOne;
    walkerParams.dwGlobalLoopExecCount = 0x3ff;
    walkerParams.dwLocalLoopExecCount  = 0x3ff;

    if (dependencyPattern == dependencyWavefrontHorizontal)
    {
        // Global
        walkerParams.GlobalResolution.x       = threadSpaceWidth;
        walkerParams.GlobalResolution.y       = threadSpaceHeight;
        walkerParams.GlobalStart.x            = 0;
        walkerParams.GlobalStart.y            = 0;
        walkerParams.GlobalOutlerLoopStride.x = threadSpaceWidth;
        walkerParams.GlobalOutlerLoopStride.y = 0;
        walkerParams.GlobalInnerLoopUnit.x    = 0;
        walkerParams.GlobalInnerLoopUnit.y    = threadSpaceHeight;

        // Local
        walkerParams.BlockResolution.x    = threadSpaceWidth;
        walkerParams.BlockResolution.y    = threadSpaceHeight;
        walkerParams.LocalStart.x         = 0;
        walkerParams.LocalStart.y         = 0;
        walkerParams.LocalOutLoopStride.x = 1;
        walkerParams.LocalOutLoopStride.y = 0;
        walkerParams.LocalInnerLoopUnit.x = 0;
        walkerParams.LocalInnerLoopUnit.y = 1;

        // Mid
        walkerParams.MiddleLoopExtraSteps = 0;
        walkerParams.MidLoopUnitX         = 0;
        walkerParams.MidLoopUnitY         = 0;
    }
    else if (dependencyPattern == dependencyWavefrontVertical)
    {
        // Global
        walkerParams.GlobalResolution.x       = threadSpaceWidth;
        walkerParams.GlobalResolution.y       = threadSpaceHeight;
        walkerParams.GlobalStart.x            = 0;
        walkerParams.GlobalStart.y            = 0;
        walkerParams.GlobalOutlerLoopStride.x = threadSpaceWidth;
        walkerParams.GlobalOutlerLoopStride.y = 0;
        walkerParams.GlobalInnerLoopUnit.x    = 0;
        walkerParams.GlobalInnerLoopUnit.y    = threadSpaceHeight;

        // Local
        walkerParams.BlockResolution.x    = threadSpaceWidth;
        walkerParams.BlockResolution.y    = threadSpaceHeight;
        walkerParams.LocalStart.x         = 0;
        walkerParams.LocalStart.y         = 0;
        walkerParams.LocalOutLoopStride.x = 0;
        walkerParams.LocalOutLoopStride.y = 1;
        walkerParams.LocalInnerLoopUnit.x = 1;
        walkerParams.LocalInnerLoopUnit.y = 0;

        // Mid
        walkerParams.MiddleLoopExtraSteps = 0;
        walkerParams.MidLoopUnitX         = 0;
        walkerParams.MidLoopUnitY         = 0;
    }
    else if (dependencyPattern == dependencyWavefront45Degree)
    {
        // Global
        walkerParams.GlobalResolution.x       = threadSpaceWidth;
        walkerParams.GlobalResolution.y       = threadSpaceHeight;
        walkerParams.GlobalStart.x            = 0;
        walkerParams.GlobalStart.y            = 0;
        walkerParams.GlobalOutlerLoopStride.x = threadSpaceWidth;
        walkerParams.GlobalOutlerLoopStride.y = 0;
        walkerParams.GlobalInnerLoopUnit.x    = 0;
        walkerParams.GlobalInnerLoopUnit.y    = threadSpaceHeight;

        // Local
        walkerParams.BlockResolution.x    = threadSpaceWidth;
        walkerParams.BlockResolution.y    = threadSpaceHeight;
        walkerParams.LocalStart.x         = 0;
        walkerParams.LocalStart.y         = 0;
        walkerParams.LocalOutLoopStride.x = 1;
        walkerParams.LocalOutLoopStride.y = 0;
        walkerParams.LocalInnerLoopUnit.x = -1;
        walkerParams.LocalInnerLoopUnit.y = 1;

        // Mid
        walkerParams.MiddleLoopExtraSteps = 0;
        walkerParams.MidLoopUnitX         = 0;
        walkerParams.MidLoopUnitY         = 0;
    }
    else if (dependencyPattern == dependencyWavefront26Degree)
    {
        // Global
        walkerParams.GlobalResolution.x       = threadSpaceWidth;
        walkerParams.GlobalResolution.y       = threadSpaceHeight;
        walkerParams.GlobalStart.x            = 0;
        walkerParams.GlobalStart.y            = 0;
        walkerParams.GlobalOutlerLoopStride.x = threadSpaceWidth;
        walkerParams.GlobalOutlerLoopStride.y = 0;
        walkerParams.GlobalInnerLoopUnit.x    = 0;
        walkerParams.GlobalInnerLoopUnit.y    = threadSpaceHeight;

        // Local
        walkerParams.BlockResolution.x    = threadSpaceWidth;
        walkerParams.BlockResolution.y    = threadSpaceHeight;
        walkerParams.LocalStart.x         = 0;
        walkerParams.LocalStart.y         = 0;
        walkerParams.LocalOutLoopStride.x = 1;
        walkerParams.LocalOutLoopStride.y = 0;
        walkerParams.LocalInnerLoopUnit.x = -2;
        walkerParams.LocalInnerLoopUnit.y = 1;

        // Mid
        walkerParams.MiddleLoopExtraSteps = 0;
        walkerParams.MidLoopUnitX         = 0;
        walkerParams.MidLoopUnitY         = 0;
    }
    else if ((dependencyPattern == dependencyWavefront45XDegree) ||
             (dependencyPattern == dependencyWavefront45XDegreeAlt))
    {
        // Global
        walkerParams.GlobalResolution.x       = threadSpaceWidth;
        walkerParams.GlobalResolution.y       = threadSpaceHeight;
        walkerParams.GlobalStart.x            = 0;
        walkerParams.GlobalStart.y            = 0;
        walkerParams.GlobalOutlerLoopStride.x = threadSpaceWidth;
        walkerParams.GlobalOutlerLoopStride.y = 0;
        walkerParams.GlobalInnerLoopUnit.x    = 0;
        walkerParams.GlobalInnerLoopUnit.y    = threadSpaceHeight;

        // Local
        walkerParams.BlockResolution.x    = threadSpaceWidth;
        walkerParams.BlockResolution.y    = threadSpaceHeight;
        walkerParams.LocalStart.x         = 0;
        walkerParams.LocalStart.y         = 0;
        walkerParams.LocalOutLoopStride.x = 1;
        walkerParams.LocalOutLoopStride.y = 0;
        walkerParams.LocalInnerLoopUnit.x = -1;
        walkerParams.LocalInnerLoopUnit.y = childThreadNumber + 1;

        // Mid
        walkerParams.MiddleLoopExtraSteps = childThreadNumber;
        walkerParams.MidLoopUnitX         = 0;
        walkerParams.MidLoopUnitY         = 1;
    }
    else if ((dependencyPattern == dependencyWavefront26XDegree) ||
             (dependencyPattern == dependencyWavefront26XDegreeAlt))
    {
        // Global
        walkerParams.GlobalResolution.x       = threadSpaceWidth;
        walkerParams.GlobalResolution.y       = threadSpaceHeight;
        walkerParams.GlobalStart.x            = 0;
        walkerParams.GlobalStart.y            = 0;
        walkerParams.GlobalOutlerLoopStride.x = threadSpaceWidth;
        walkerParams.GlobalOutlerLoopStride.y = 0;
        walkerParams.GlobalInnerLoopUnit.x    = 0;
        walkerParams.GlobalInnerLoopUnit.y    = threadSpaceHeight;

        // Local
        walkerParams.BlockResolution.x    = threadSpaceWidth;
        walkerParams.BlockResolution.y    = threadSpaceHeight;
        walkerParams.LocalStart.x         = 0;
        walkerParams.LocalStart.y         = 0;
        walkerParams.LocalOutLoopStride.x = 1;
        walkerParams.LocalOutLoopStride.y = 0;
        walkerParams.LocalInnerLoopUnit.x = -2;
        walkerParams.LocalInnerLoopUnit.y = childThreadNumber + 1;

        // Mid
        walkerParams.MiddleLoopExtraSteps = childThreadNumber;
        walkerParams.MidLoopUnitX         = 0;
        walkerParams.MidLoopUnitY         = 1;
    }
    else if (dependencyPattern == dependencyWavefront45XVp9Degree)
    {
        // Global
        walkerParams.GlobalResolution.x       = threadSpaceWidth;
        walkerParams.GlobalResolution.y       = threadSpaceHeight;
        walkerParams.GlobalStart.x            = 0;
        walkerParams.GlobalStart.y            = 0;
        walkerParams.GlobalOutlerLoopStride.x = threadSpaceWidth;
        walkerParams.GlobalOutlerLoopStride.y = 0;
        walkerParams.GlobalInnerLoopUnit.x    = 0;
        walkerParams.GlobalInnerLoopUnit.y    = threadSpaceHeight;

        // Local
        walkerParams.BlockResolution.x    = threadSpaceWidth;
        walkerParams.BlockResolution.y    = threadSpaceHeight;
        walkerParams.LocalStart.x         = 0;
        walkerParams.LocalStart.y         = 0;
        walkerParams.LocalOutLoopStride.x = 1;
        walkerParams.LocalOutLoopStride.y = 0;
        walkerParams.LocalInnerLoopUnit.x = -1;
        walkerParams.LocalInnerLoopUnit.y = 4;

        // Mid
        walkerParams.MiddleLoopExtraSteps = 3;
        walkerParams.MidLoopUnitX         = 0;
        walkerParams.MidLoopUnitY         = 1;
    }
    else if (dependencyPattern == dependencyWavefront26ZDegree)
    {
        // Global
        walkerParams.GlobalResolution.x       = threadSpaceWidth;
        walkerParams.GlobalResolution.y       = threadSpaceHeight;
        walkerParams.GlobalStart.x            = 0;
        walkerParams.GlobalStart.y            = 0;
        walkerParams.GlobalOutlerLoopStride.x = 2;
        walkerParams.GlobalOutlerLoopStride.y = 0;
        walkerParams.GlobalInnerLoopUnit.x    = -4;
        walkerParams.GlobalInnerLoopUnit.y    = 2;

        // Local
        walkerParams.BlockResolution.x    = 2;
        walkerParams.BlockResolution.y    = 2;
        walkerParams.LocalStart.x         = 0;
        walkerParams.LocalStart.y         = 0;
        walkerParams.LocalOutLoopStride.x = 0;
        walkerParams.LocalOutLoopStride.y = 1;
        walkerParams.LocalInnerLoopUnit.x = 1;
        walkerParams.LocalInnerLoopUnit.y = 0;

        // Mid
        walkerParams.MiddleLoopExtraSteps = 0;
        walkerParams.MidLoopUnitX         = 0;
        walkerParams.MidLoopUnitY         = 0;
    }
    else if (dependencyPattern == dependencyWavefront26ZigDegree)
    {
        int32_t size_x = threadSpaceWidth;   //(threadSpaceWidth + 1)>> 1;
        int32_t size_y = threadSpaceHeight;  //threadSpaceHeight << 1;

        // Global
        walkerParams.GlobalResolution.x       = size_x;
        walkerParams.GlobalResolution.y       = size_y;
        walkerParams.GlobalStart.x            = 0;
        walkerParams.GlobalStart.y            = 0;
        walkerParams.GlobalOutlerLoopStride.x = size_x;
        walkerParams.GlobalOutlerLoopStride.y = 0;
        walkerParams.GlobalInnerLoopUnit.x    = 0;
        walkerParams.GlobalInnerLoopUnit.y    = size_y;

        // Local
        walkerParams.BlockResolution.x    = size_x;
        walkerParams.BlockResolution.y    = size_y;
        walkerParams.LocalStart.x         = 0;
        walkerParams.LocalStart.y         = 0;
        walkerParams.LocalOutLoopStride.x = 1;
        walkerParams.LocalOutLoopStride.y = 0;
        walkerParams.LocalInnerLoopUnit.x = -2;
        walkerParams.LocalInnerLoopUnit.y = 4;

        // Mid
        walkerParams.MiddleLoopExtraSteps = 3;
        walkerParams.MidLoopUnitX         = 0;
        walkerParams.MidLoopUnitY         = 1;
    }
    else if (dependencyPattern == dependencyWavefront45DDegree)
    {
        // Global
        walkerParams.GlobalResolution.x       = threadSpaceWidth;
        walkerParams.GlobalResolution.y       = threadSpaceHeight;
        walkerParams.GlobalStart.x            = 0;
        walkerParams.GlobalStart.y            = 0;
        walkerParams.GlobalOutlerLoopStride.x = threadSpaceWidth;
        walkerParams.GlobalOutlerLoopStride.y = 0;
        walkerParams.GlobalInnerLoopUnit.x    = 0;
        walkerParams.GlobalInnerLoopUnit.y    = threadSpaceHeight;

        // Local
        walkerParams.BlockResolution.x    = threadSpaceWidth;
        walkerParams.BlockResolution.y    = threadSpaceHeight;
        walkerParams.LocalStart.x         = threadSpaceWidth;
        walkerParams.LocalStart.y         = 0;
        walkerParams.LocalOutLoopStride.x = 1;
        walkerParams.LocalOutLoopStride.y = 0;
        walkerParams.LocalInnerLoopUnit.x = -1;
        walkerParams.LocalInnerLoopUnit.y = 1;

        // Mid
        walkerParams.MiddleLoopExtraSteps = 0;
        walkerParams.MidLoopUnitX         = 0;
        walkerParams.MidLoopUnitY         = 0;
        if (colorCountMinusOne > 0)
        {
            walkerParams.dwLocalLoopExecCount = localLoopExecCount;
        }
    }
    else if (dependencyPattern == dependencyWavefront26DDegree)
    {
        // Global
        walkerParams.GlobalResolution.x       = threadSpaceWidth;
        walkerParams.GlobalResolution.y       = threadSpaceHeight;
        walkerParams.GlobalStart.x            = 0;
        walkerParams.GlobalStart.y            = 0;
        walkerParams.GlobalOutlerLoopStride.x = threadSpaceWidth;
        walkerParams.GlobalOutlerLoopStride.y = 0;
        walkerParams.GlobalInnerLoopUnit.x    = 0;
        walkerParams.GlobalInnerLoopUnit.y    = threadSpaceHeight;
        // Local
        walkerParams.BlockResolution.x    = threadSpaceWidth;
        walkerParams.BlockResolution.y    = threadSpaceHeight;
        walkerParams.LocalStart.x         = threadSpaceWidth;
        walkerParams.LocalStart.y         = 0;
        walkerParams.LocalOutLoopStride.x = 1;
        walkerParams.LocalOutLoopStride.y = 0;
        walkerParams.LocalInnerLoopUnit.x = -2;
        walkerParams.LocalInnerLoopUnit.y = 1;
        // Mid
        walkerParams.MiddleLoopExtraSteps = 0;
        walkerParams.MidLoopUnitX         = 0;
        walkerParams.MidLoopUnitY         = 0;

        if (colorCountMinusOne > 0)
        {
            walkerParams.dwLocalLoopExecCount = localLoopExecCount;
        }
    }
    else if (dependencyPattern == dependencyWavefront45XDDegree)
    {
        // Global
        walkerParams.GlobalResolution.x       = threadSpaceWidth;
        walkerParams.GlobalResolution.y       = threadSpaceHeight;
        walkerParams.GlobalStart.x            = 0;
        walkerParams.GlobalStart.y            = 0;
        walkerParams.GlobalOutlerLoopStride.x = threadSpaceWidth;
        walkerParams.GlobalOutlerLoopStride.y = 0;
        walkerParams.GlobalInnerLoopUnit.x    = 0;
        walkerParams.GlobalInnerLoopUnit.y    = threadSpaceHeight;

        // Local
        walkerParams.BlockResolution.x    = threadSpaceWidth;
        walkerParams.BlockResolution.y    = threadSpaceHeight;
        walkerParams.LocalStart.x         = threadSpaceWidth;
        walkerParams.LocalStart.y         = 0;
        walkerParams.LocalOutLoopStride.x = 1;
        walkerParams.LocalOutLoopStride.y = 0;
        walkerParams.LocalInnerLoopUnit.x = -1;
        walkerParams.LocalInnerLoopUnit.y = childThreadNumber + 1;

        // Mid
        walkerParams.MiddleLoopExtraSteps = childThreadNumber;
        walkerParams.MidLoopUnitX         = 0;
        walkerParams.MidLoopUnitY         = 1;
        if (colorCountMinusOne > 0)
        {
            walkerParams.dwLocalLoopExecCount = localLoopExecCount;
        }
    }
    else if (dependencyPattern == dependencyWavefront26XDDegree)
    {
        // Global
        walkerParams.GlobalResolution.x       = threadSpaceWidth;
        walkerParams.GlobalResolution.y       = threadSpaceHeight;
        walkerParams.GlobalStart.x            = 0;
        walkerParams.GlobalStart.y            = 0;
        walkerParams.GlobalOutlerLoopStride.x = threadSpaceWidth;
        walkerParams.GlobalOutlerLoopStride.y = 0;
        walkerParams.GlobalInnerLoopUnit.x    = 0;
        walkerParams.GlobalInnerLoopUnit.y    = threadSpaceHeight;
        // Local
        walkerParams.BlockResolution.x    = threadSpaceWidth;
        walkerParams.BlockResolution.y    = threadSpaceHeight;
        walkerParams.LocalStart.x         = threadSpaceWidth;
        walkerParams.LocalStart.y         = 0;
        walkerParams.LocalOutLoopStride.x = 1;
        walkerParams.LocalOutLoopStride.y = 0;
        walkerParams.LocalInnerLoopUnit.x = -2;
        walkerParams.LocalInnerLoopUnit.y = childThreadNumber + 1;
        // Mid
        walkerParams.MiddleLoopExtraSteps = childThreadNumber;
        walkerParams.MidLoopUnitX         = 0;
        walkerParams.MidLoopUnitY         = 1;

        if (colorCountMinusOne > 0)
        {
            walkerParams.dwLocalLoopExecCount = localLoopExecCount;
        }
    }
    else
    {
        CODECHAL_ENCODE_ASSERTMESSAGE("Unsupported walking pattern is observed\n");
        eStatus = MOS_STATUS_INVALID_PARAMETER;
    }
    return eStatus;
}

//!
//! \brief    Evaluate the current concurrent-group / sub-thread configuration
//! \return   false only when exactly one concurrent group and exactly one
//!           ENC kernel sub-thread are configured; true in every other case
//!
bool CodechalEncHevcStateG12::IsDegree45Needed()
{
    const bool singleGroupSingleThread =
        (m_numberConcurrentGroup == 1) && (m_numberEncKernelSubThread == 1);

    return !singleGroupSingleThread;
}

//!
//! \brief    Finalize the concurrent-group count, the number of wavefronts per
//!           region and the ENC kernel sub-thread count for the current frame
//! \details  - Forces a single ENC kernel sub-thread unless the max LCU size is
//!             64 and the target usage is 1.
//!           - Halves m_numberConcurrentGroup until it no longer exceeds the
//!             frame height in LCUs (falling back to 1).
//!           - When more than one group remains, derives
//!             m_numWavefrontInOneRegion from the frame size in LCUs; otherwise
//!             sets it to 0.
//!           - Caps m_numberEncKernelSubThread at m_hevcThreadTaskDataNum.
//!
void CodechalEncHevcStateG12::DecideConcurrentGroupAndWaveFrontNumber()
{
    // Frame size in LCUs, rounded up from the size expressed in minimum coding blocks
    const uint32_t lcuShift    = m_hevcSeqParams->log2_max_coding_block_size_minus3 - m_hevcSeqParams->log2_min_coding_block_size_minus3;
    const uint32_t widthInLcu  = MOS_ROUNDUP_SHIFT((m_hevcSeqParams->wFrameWidthInMinCbMinus1 + 1), lcuShift);
    const uint32_t heightInLcu = MOS_ROUNDUP_SHIFT((m_hevcSeqParams->wFrameHeightInMinCbMinus1 + 1), lcuShift);

    // As per kernel ULT, for all non TU1 cases m_numberEncKernelSubThread should be set to 1.
    // LCU32 has no multiple thread support.
    if (!m_isMaxLcu64 || m_hevcSeqParams->TargetUsage != 1)
    {
        m_numberEncKernelSubThread = 1;
    }

    // The group count is used as a divisor below; never let it be zero.
    if (m_numberConcurrentGroup == 0)
    {
        m_numberConcurrentGroup = 1;
    }

    // Reduce the group count until every group covers at least one LCU row.
    while (heightInLcu / m_numberConcurrentGroup == 0)
    {
        m_numberConcurrentGroup = m_numberConcurrentGroup >> 1;
        if (m_numberConcurrentGroup == 0)
        {
            // Tried out all values and now have to use the default one.
            // Concurrent group and wave-front split must be enabled together.
            m_numberConcurrentGroup = 1;
            break;
        }
    }

    if (m_numberConcurrentGroup > 1)
    {
        // Each LCU row contributes one wavefront step when m_degree45Needed is
        // set and two otherwise; the flag does not change inside the loop.
        const uint32_t wavefrontShift = m_degree45Needed ? 0 : 1;

        m_numWavefrontInOneRegion = 0;
        while (m_numWavefrontInOneRegion == 0)
        {
            m_numWavefrontInOneRegion =
                (widthInLcu + ((heightInLcu - 1) << wavefrontShift) + m_numberConcurrentGroup - 1) / m_numberConcurrentGroup;

            if (m_numWavefrontInOneRegion > 0)
            {
                // This is a valid setting: the number of regions is greater than or equal to 1.
                break;
            }

            m_numberConcurrentGroup = m_numberConcurrentGroup >> 1;
            if (m_numberConcurrentGroup == 0)
            {
                // Tried out all values and now have to use the default one.
                m_numberConcurrentGroup = 1;
                break;
            }
        }
    }
    else
    {
        m_numWavefrontInOneRegion = 0;
    }

    m_numberEncKernelSubThread = MOS_MIN(m_numberEncKernelSubThread, m_hevcThreadTaskDataNum);
}

//!
//! \brief    Fill in the kernel parameters for the SW scoreboard
//! \details  Besides populating \a swScoreboardKernelParames this routine also
//!           updates m_degree45Needed and m_numberConcurrentGroup, calls
//!           DecideConcurrentGroupAndWaveFrontNumber(), and programs the
//!           dependency pattern and current surface index on m_swScoreboardState.
//! \param    [out] swScoreboardKernelParames
//!           Parameter structure to populate
//!
void CodechalEncHevcStateG12::InitSwScoreBoardParams(CodechalEncodeSwScoreboard::KernelParams &swScoreboardKernelParames)
{
    // With MFE and colour-bit MFE both on, the scoreboard is sized from the MFE
    // maximum dimensions; otherwise from this encoder's own aligned dimensions.
    const bool sizeFromMfe = m_mfeEnabled && m_colorBitMfeEnabled;

    const uint32_t widthAlignedMaxLcu  = sizeFromMfe ? MOS_ALIGN_CEIL(m_mfeEncodeParams.maxWidth, MAX_LCU_SIZE) : m_widthAlignedMaxLcu;
    const uint32_t heightAlignedMaxLcu = sizeFromMfe ? MOS_ALIGN_CEIL(m_mfeEncodeParams.maxHeight, MAX_LCU_SIZE) : m_heightAlignedMaxLcu;
    const uint32_t widthAlignedLcu32   = sizeFromMfe ? MOS_ALIGN_CEIL(m_mfeEncodeParams.maxWidth, 32) : m_widthAlignedLcu32;
    const uint32_t heightAlignedLcu32  = sizeFromMfe ? MOS_ALIGN_CEIL(m_mfeEncodeParams.maxHeight, 32) : m_heightAlignedLcu32;

    // SW scoreboard Kernel Call -- to be continued - DS + HME kernel call.
    // isHevc only enables an optimization that is not needed for now.
    swScoreboardKernelParames.isHevc = false;

    const bool isTu1 = (m_hevcSeqParams->TargetUsage == 1);
    const bool isTu4 = (m_hevcSeqParams->TargetUsage == 4);

    // Cap the concurrent group count per target usage before it is finalized.
    m_degree45Needed = !isTu1;
    if (isTu1)
    {
        // m_numberConcurrentGroup should default to 2 here for TU1; the only
        // other value allowed from the reg key will be 1.
        m_numberConcurrentGroup = MOS_MIN(m_maxWavefrontsforTU1, m_numberConcurrentGroup);
    }
    else if (isTu4)
    {
        m_numberConcurrentGroup = MOS_MIN(m_maxWavefrontsforTU4, m_numberConcurrentGroup);
    }
    DecideConcurrentGroupAndWaveFrontNumber();

    // Pick the dependency pattern from the finalized group count.
    const bool singleGroup = (m_numberConcurrentGroup == 1);

    DependencyPattern walkPattern = dependencyWavefront45DDegree;
    if (isTu1)
    {
        if (m_isMaxLcu64)
        {
            walkPattern = singleGroup ? dependencyWavefront26XDegreeAlt : dependencyWavefront26XDDegree;
        }
        else
        {
            walkPattern = singleGroup ? dependencyWavefront26Degree : dependencyWavefront26DDegree;
        }
    }
    else if (isTu4 && singleGroup)
    {
        walkPattern = dependencyWavefront45Degree;
    }
    m_swScoreboardState->SetDependencyPattern(walkPattern);

    // Scoreboard dimensions and child-thread count.
    swScoreboardKernelParames.numberOfWaveFrontSplit = m_numberConcurrentGroup;
    if (!m_isMaxLcu64)
    {
        swScoreboardKernelParames.scoreboardWidth     = widthAlignedLcu32 >> 5;
        swScoreboardKernelParames.scoreboardHeight    = heightAlignedLcu32 >> 5;
        swScoreboardKernelParames.numberOfChildThread = 0;
    }
    else
    {
        if (isTu1)
        {
            swScoreboardKernelParames.scoreboardWidth  = (widthAlignedMaxLcu >> 6);
            swScoreboardKernelParames.scoreboardHeight = (heightAlignedMaxLcu >> 6) * m_numberEncKernelSubThread;
        }
        else
        {
            swScoreboardKernelParames.scoreboardWidth  = 2 * (widthAlignedMaxLcu >> 6);
            swScoreboardKernelParames.scoreboardHeight = 2 * (heightAlignedMaxLcu >> 6);
        }
        // The main thread takes one of the sub-threads, so children are total minus one.
        swScoreboardKernelParames.numberOfChildThread = m_numberEncKernelSubThread - 1;
    }

    // The surface has exactly the scoreboard dimensions.
    swScoreboardKernelParames.swScoreboardSurfaceWidth  = swScoreboardKernelParames.scoreboardWidth;
    swScoreboardKernelParames.swScoreboardSurfaceHeight = swScoreboardKernelParames.scoreboardHeight;

    m_swScoreboardState->SetCurSwScoreboardSurfaceIndex(m_currRecycledBufIdx);

    swScoreboardKernelParames.lcuInfoSurface = &m_lcuLevelInputDataSurface[m_currRecycledBufIdx];
}

//!
//! \brief    Report encoder configuration through user feature keys
//! \details  Delegates to CodechalEncHevcState::UserFeatureKeyReport() first.
//!           In _DEBUG / _RELEASE_INTERNAL builds it additionally writes the
//!           region number, sub-thread number, VE debug override, PAK-only
//!           folder (when m_pakOnlyTest is set), used VDBOX count and the
//!           VE context-based-scheduling support flag.
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, otherwise the base-class failure
//!
MOS_STATUS CodechalEncHevcStateG12::UserFeatureKeyReport()
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    CODECHAL_ENCODE_FUNCTION_ENTER;

    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodechalEncHevcState::UserFeatureKeyReport());

#if (_DEBUG || _RELEASE_INTERNAL)
    // Every key below is written against the same OS context.
    auto osContext = m_osInterface->pOsContext;

    CodecHalEncode_WriteKey(__MEDIA_USER_FEATURE_VALUE_HEVC_ENCODE_REGION_NUMBER_ID, m_numberConcurrentGroup, osContext);
    CodecHalEncode_WriteKey(__MEDIA_USER_FEATURE_VALUE_HEVC_ENCODE_SUBTHREAD_NUM_ID, m_numberEncKernelSubThread, osContext);
    CodecHalEncode_WriteKey64(__MEDIA_USER_FEATURE_VALUE_HEVC_ENCODE_ENABLE_VE_DEBUG_OVERRIDE, m_kmdVeOveride.Value, osContext);

    if (m_pakOnlyTest)
    {
        CodecHalEncode_WriteStringKey(__MEDIA_USER_FEATURE_VALUE_HEVC_ENCODE_PAK_ONLY_ID, m_pakOnlyDataFolder, strlen(m_pakOnlyDataFolder), osContext);
    }

    CodecHalEncode_WriteKey(__MEDIA_USER_FEATURE_VALUE_ENCODE_USED_VDBOX_NUM_ID, m_numPipe, osContext);
    CodecHalEncode_WriteKey(__MEDIA_USER_FEATURE_VALUE_ENABLE_ENCODE_VE_CTXSCHEDULING_ID, MOS_VE_CTXBASEDSCHEDULING_SUPPORTED(m_osInterface), osContext);
#endif

    return eStatus;
}

//!
//! \brief    Prepare the current SW scoreboard surface with its initial content
//! \details  Allocates the current scoreboard surface on first use, builds the
//!           CPU-side initial scoreboard (m_swScoreboard) once, and copies it
//!           row by row into the locked surface honouring the surface pitch.
//! \param    [in] params
//!           Scoreboard kernel parameters (dimensions, child thread count)
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, else fail reason
//!
MOS_STATUS CodechalEncHevcStateG12::SetupSwScoreBoard(CodechalEncodeSwScoreboard::KernelParams *params)
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    CODECHAL_ENCODE_CHK_NULL_RETURN(params);
    CODECHAL_ENCODE_CHK_NULL_RETURN(m_swScoreboardState);

    auto surface = m_swScoreboardState->GetCurSwScoreboardSurface();
    CODECHAL_ENCODE_CHK_NULL_RETURN(surface);

    MEDIA_WA_TABLE* waTable = m_osInterface->pfnGetWaTable(m_osInterface);
    uint32_t memType = (MEDIA_IS_WA(waTable, WaForceAllocateLML4)) ? MOS_MEMPOOL_DEVICEMEMORY : 0;

    if (Mos_ResourceIsNull(&surface->OsResource))
    {
        MOS_ZeroMemory(surface, sizeof(*surface));

        MOS_ALLOC_GFXRES_PARAMS allocParamsForBuffer2D;
        MOS_ZeroMemory(&allocParamsForBuffer2D, sizeof(MOS_ALLOC_GFXRES_PARAMS));
        allocParamsForBuffer2D.Type     = MOS_GFXRES_2D;
        allocParamsForBuffer2D.TileType = MOS_TILE_LINEAR;
        allocParamsForBuffer2D.Format   = Format_R32U;
        allocParamsForBuffer2D.dwWidth  = params->swScoreboardSurfaceWidth;
        allocParamsForBuffer2D.dwHeight = params->swScoreboardSurfaceHeight;
        allocParamsForBuffer2D.pBufName = "SW Scoreboard Init buffer";
        allocParamsForBuffer2D.dwMemType = memType;

        // Stop here on allocation failure instead of querying and locking a
        // resource that was never created.
        CODECHAL_ENCODE_CHK_STATUS_RETURN((MOS_STATUS)m_osInterface->pfnAllocateResource(
            m_osInterface,
            &allocParamsForBuffer2D,
            &surface->OsResource));

        CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalGetResourceInfo(
            m_osInterface,
            surface));
    }

    // The CPU-side image is built once and reused on later calls.
    // NOTE(review): it is not rebuilt if the scoreboard dimensions change
    // between calls - confirm callers never change them.
    if (m_swScoreboard == nullptr)
    {
        m_swScoreboard = (uint8_t *)MOS_AllocAndZeroMemory(params->scoreboardWidth * sizeof(uint32_t) * params->scoreboardHeight);
        CODECHAL_ENCODE_CHK_NULL_RETURN(m_swScoreboard);
        InitSWScoreboard(m_swScoreboard, params->scoreboardWidth, params->scoreboardHeight, m_swScoreboardState->GetDependencyPattern(), static_cast<char>(params->numberOfChildThread));
    }

    MOS_LOCK_PARAMS lockFlags;

    MOS_ZeroMemory(&lockFlags, sizeof(MOS_LOCK_PARAMS));
    lockFlags.WriteOnly = 1;
    uint8_t *data       = (uint8_t *)m_osInterface->pfnLockResource(
        m_osInterface,
        &surface->OsResource,
        &lockFlags);
    CODECHAL_ENCODE_CHK_NULL_RETURN(data);

    // Source rows are tightly packed; destination rows are dwPitch bytes apart.
    const uint32_t rowBytes = params->scoreboardWidth * sizeof(uint32_t);
    for (uint32_t h = 0; h < params->scoreboardHeight; h++)
    {
        MOS_SecureMemcpy(data, rowBytes, &m_swScoreboard[h * rowBytes], rowBytes);
        data += surface->dwPitch;
    }

    m_osInterface->pfnUnlockResource(
        m_osInterface,
        &surface->OsResource);

    return eStatus;
}

void CodechalEncHevcStateG12::SetDependency(
    uint8_t &numDependencies,
    char *   scoreboardDeltaX,
    char *   scoreboardDeltaY,
    uint32_t dependencyPattern,
    char     childThreadNumber)
{
    if (dependencyPattern == dependencyWavefrontHorizontal)
    {
        numDependencies = m_numDependencyHorizontal;
        MOS_SecureMemcpy(scoreboardDeltaX, m_numDependencyHorizontal, m_dxWavefrontHorizontal, m_numDependencyHorizontal);
        MOS_SecureMemcpy(scoreboardDeltaY, m_numDependencyHorizontal, m_dyWavefrontHorizontal, m_numDependencyHorizontal);
    }
    else if (dependencyPattern == dependencyWavefrontVertical)
    {
        numDependencies = m_numDependencyVertical;
        MOS_SecureMemcpy(scoreboardDeltaX, m_numDependencyVertical, m_dxWavefrontVertical, m_numDependencyVertical);
        MOS_SecureMemcpy(scoreboardDeltaY, m_numDependencyVertical, m_dyWavefrontVertical, m_numDependencyVertical);
    }
    else if (dependencyPattern == dependencyWavefront45Degree)
    {
        numDependencies = m_numDependency45Degree;
        MOS_SecureMemcpy(scoreboardDeltaX, m_numDependency45Degree, m_dxWavefront45Degree, m_numDependency45Degree);
        MOS_SecureMemcpy(scoreboardDeltaY, m_numDependency45Degree, m_dyWavefront45Degree, m_numDependency45Degree);
    }
    else if (dependencyPattern == dependencyWavefront26Degree ||
             dependencyPattern == dependencyWavefront26DDegree)
    {
        numDependencies = m_numDependency26Degree;
        MOS_SecureMemcpy(scoreboardDeltaX, m_numDependency26Degree, m_dxWavefront26Degree, m_numDependency26Degree);
        MOS_SecureMemcpy(scoreboardDeltaY, m_numDependency26Degree, m_dyWavefront26Degree, m_numDependency26Degree);
    }
    else if (dependencyPattern == dependencyWavefront45XDegree)
    {
        numDependencies = m_numDependency45xDegree;
        MOS_SecureMemcpy(scoreboardDeltaX, m_numDependency45xDegree, m_dxWavefront45xDegree, m_numDependency45xDegree);
        MOS_SecureMemcpy(scoreboardDeltaY, m_numDependency45xDegree, m_dyWavefront45xDegree, m_numDependency45xDegree);
        numDependencies     = childThreadNumber + 2;
        scoreboardDeltaY[0] = childThreadNumber;
    }
    else if (dependencyPattern == dependencyWavefront26XDegree)
    {
        numDependencies = m_numDependency26xDegree;
        MOS_SecureMemcpy(scoreboardDeltaX, m_numDependency26xDegree, m_dxWavefront26xDegree, m_numDependency26xDegree);
        MOS_SecureMemcpy(scoreboardDeltaY, m_numDependency26xDegree, m_dyWavefront26xDegree, m_numDependency26xDegree);
        numDependencies     = childThreadNumber + 3;
        scoreboardDeltaY[0] = childThreadNumber;
    }
    else if ((dependencyPattern == dependencyWavefront45XDegreeAlt) ||
             (dependencyPattern == dependencyWavefront45XDDegree))
    {
        numDependencies = m_numDependency45xDegreeAlt;
        MOS_SecureMemcpy(scoreboardDeltaX, m_numDependency45xDegreeAlt, m_dxWavefront45xDegreeAlt, m_numDependency45xDegreeAlt);
        MOS_SecureMemcpy(scoreboardDeltaY, m_numDependency45xDegreeAlt, m_dyWavefront45xDegreeAlt, m_numDependency45xDegreeAlt);
        scoreboardDeltaY[0] = childThreadNumber;
    }
    else if ((dependencyPattern == dependencyWavefront26XDegreeAlt) ||
             (dependencyPattern == dependencyWavefront26XDDegree))
    {
        numDependencies = m_numDependency26xDegreeAlt;
        MOS_SecureMemcpy(scoreboardDeltaX, m_numDependency26xDegreeAlt, m_dxWavefront26xDegreeAlt, m_numDependency26xDegreeAlt);
        MOS_SecureMemcpy(scoreboardDeltaY, m_numDependency26xDegreeAlt, m_dyWavefront26xDegreeAlt, m_numDependency26xDegreeAlt);
        scoreboardDeltaY[0] = childThreadNumber;
    }
    else if (dependencyPattern == dependencyWavefront45XVp9Degree)
    {
        numDependencies = m_numDependency45xVp9Degree;
        MOS_SecureMemcpy(scoreboardDeltaX, m_numDependency45xVp9Degree, m_dxWavefront45xVp9Degree, m_numDependency45xVp9Degree);
        MOS_SecureMemcpy(scoreboardDeltaY, m_numDependency45xVp9Degree, m_dyWavefront45xVp9Degree, m_numDependency45xVp9Degree);
    }
    else if (dependencyPattern == dependencyWavefront26ZDegree)
    {
        numDependencies = m_numDependency26zDegree;
        MOS_SecureMemcpy(scoreboardDeltaX, m_numDependency26zDegree, m_dxWavefront26zDegree, m_numDependency26zDegree);
        MOS_SecureMemcpy(scoreboardDeltaY, m_numDependency26zDegree, m_dyWavefront26zDegree, m_numDependency26zDegree);
    }
    else if (dependencyPattern == dependencyWavefront26ZigDegree)
    {
        numDependencies = m_numDependency26ZigDegree;
        MOS_SecureMemcpy(scoreboardDeltaX, m_numDependency26ZigDegree, m_dxWavefront26ZigDegree, m_numDependency26ZigDegree);
        MOS_SecureMemcpy(scoreboardDeltaY, m_numDependency26ZigDegree, m_dyWavefront26ZigDegree, m_numDependency26ZigDegree);
    }
    else if (dependencyPattern == dependencyWavefront45DDegree)
    {
        numDependencies = m_numDependency45Degree;
        MOS_SecureMemcpy(scoreboardDeltaX, m_numDependency45Degree, m_dxWavefront45Degree, m_numDependency45Degree);
        MOS_SecureMemcpy(scoreboardDeltaY, m_numDependency45Degree, m_dyWavefront45Degree, m_numDependency45Degree);
    }
    else
    {
        numDependencies = m_numDependencyNone;
        MOS_SecureMemcpy(scoreboardDeltaX, m_numDependencyNone, m_dxWavefrontNone, m_numDependencyNone);
        MOS_SecureMemcpy(scoreboardDeltaY, m_numDependencyNone, m_dyWavefrontNone, m_numDependencyNone);
    }
}

// ========================================================================================
// FUNCTION:        InitSWScoreboard
// DESCRIPTION:     Initialize the software scoreboard for a specific dependency pattern.
//                  Each 32-bit entry gets bit i set when the entry's i-th dependency
//                  (entry position plus the i-th delta) lies inside the scoreboard.
//                  Rows are processed in groups of (childThreadNumber + 1): the first
//                  row of a group is computed, the remaining rows are copies of it.
// INPUTS:          scoreboardWidth   - Width of scoreboard in entries
//                  scoreboardHeight  - Height of scoreboard in entries
//                  dependencyPattern - The enumeration of the dependency pattern
//                  childThreadNumber - Number of child threads per main thread
// OUTPUTS:         scoreboard        - Pointer to scoreboard in memory; must hold
//                                      scoreboardWidth * scoreboardHeight 32-bit
//                                      entries. Nothing is written when it is null.
// ========================================================================================
void CodechalEncHevcStateG12::InitSWScoreboard(uint8_t *scoreboard, uint32_t scoreboardWidth, uint32_t scoreboardHeight, uint32_t dependencyPattern, char childThreadNumber)
{
    if (scoreboard == nullptr)
    {
        return;
    }

    // 1. Select Dependency Pattern
    uint8_t numDependencies = 0;
    char    scoreboardDeltaX[m_maxNumDependency];
    char    scoreboardDeltaY[m_maxNumDependency];
    memset(scoreboardDeltaX, 0, sizeof(scoreboardDeltaX));
    memset(scoreboardDeltaY, 0, sizeof(scoreboardDeltaY));

    SetDependency(numDependencies, scoreboardDeltaX, scoreboardDeltaY, dependencyPattern, childThreadNumber);

    // Never read past the delta arrays, and never shift past the width of a
    // 32-bit scoreboard entry.
    int32_t dependencyCount = numDependencies;
    if (dependencyCount > static_cast<int32_t>(m_maxNumDependency))
    {
        dependencyCount = static_cast<int32_t>(m_maxNumDependency);
    }
    if (dependencyCount > 32)
    {
        dependencyCount = 32;
    }

    // 2. Initialize scoreboard (CPU Based)
    uint32_t *    scoreboardInDws = reinterpret_cast<uint32_t *>(scoreboard);
    const int32_t width           = static_cast<int32_t>(scoreboardWidth);
    const int32_t height          = static_cast<int32_t>(scoreboardHeight);

    // A non-positive step would keep the row loop from ever advancing.
    int32_t totalThreadNumber = childThreadNumber + 1;
    if (totalThreadNumber < 1)
    {
        totalThreadNumber = 1;
    }

    for (int32_t y = 0; y < height; y += totalThreadNumber)
    {
        for (int32_t x = 0; x < width; x++)
        {
            uint32_t dependencyMask = 0;

            // Add dependencies accordingly
            for (int32_t i = 0; i < dependencyCount; i++)
            {
                const int32_t dependentLocationX = x + scoreboardDeltaX[i];
                const int32_t dependentLocationY = y + scoreboardDeltaY[i];

                // Only dependencies on threads that exist are recorded.
                const bool insideScoreboard =
                    (dependentLocationX >= 0) && (dependentLocationY >= 0) &&
                    (dependentLocationX < width) && (dependentLocationY < height);
                if (insideScoreboard)
                {
                    dependencyMask |= (1u << i);
                }
            }  // End NumDep

            scoreboardInDws[y * scoreboardWidth + x] = dependencyMask;
        }  // End x

        // Child-thread rows replicate the main-thread row; stay inside the
        // scoreboard when the height is not a multiple of the group size.
        for (int32_t n = y + 1; (n < y + totalThreadNumber) && (n < height); n++)
        {
            for (int32_t k = 0; k < width; k++)
            {
                scoreboardInDws[n * scoreboardWidth + k] = scoreboardInDws[y * scoreboardWidth + k];
            }
        }
    }  // End y
}

//!
//! \brief    Create the G12-specific MHW parameter objects
//! \details  Allocates the slice state, pipe mode select and pipe buffer
//!           address parameter structures with MOS_New and stores them in the
//!           corresponding members. The allocation results are not checked
//!           here, so users of these members must cope with a null pointer.
//!
void CodechalEncHevcStateG12::CreateMhwParams()
{
    m_sliceStateParams     = MOS_New(MHW_VDBOX_HEVC_SLICE_STATE_G12);
    m_pipeModeSelectParams = MOS_New(MHW_VDBOX_PIPE_MODE_SELECT_PARAMS_G12);
    m_pipeBufAddrParams    = MOS_New(MHW_VDBOX_PIPE_BUF_ADDR_PARAMS_G12);
}

//!
//! \brief    Query the default picture-level command and patch list sizes
//! \details  Asks the HW interface for the HEVC encode state command size,
//!           passing a default-constructed G12 parameter block, and stores the
//!           results in m_defaultPictureStatesSize and
//!           m_defaultPicturePatchListSize.
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, else the status of the query
//!
MOS_STATUS CodechalEncHevcStateG12::CalculatePictureStateCommandSize()
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    CODECHAL_ENCODE_FUNCTION_ENTER;

    // Default-constructed: no field is overridden before the query.
    MHW_VDBOX_STATE_CMDSIZE_PARAMS_G12 stateCmdSizeParams;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(
        m_hwInterface->GetHxxStateCommandSize(
            CODECHAL_ENCODE_MODE_HEVC,
            &m_defaultPictureStatesSize,
            &m_defaultPicturePatchListSize,
            &stateCmdSizeParams));

    return eStatus;
}

//!
//! \brief    Add the HCP pipe buffer address command to a command buffer
//! \details  Resets m_pipeBufAddrParams to its default state, fills it through
//!           SetHcpPipeBufAddrParams(), lets the MMC state adjust it when
//!           _MMC_SUPPORTED is defined, then emits the command through the
//!           HCP interface.
//! \param    [in] cmdBuffer
//!           Command buffer the command is added to
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, else fail reason
//!
MOS_STATUS CodechalEncHevcStateG12::AddHcpPipeBufAddrCmd(
    PMOS_COMMAND_BUFFER cmdBuffer)
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    CODECHAL_ENCODE_FUNCTION_ENTER;

    // The parameter object is created by CreateMhwParams() without a failure
    // check, so make sure it exists before it is dereferenced.
    CODECHAL_ENCODE_CHK_NULL_RETURN(m_pipeBufAddrParams);

    *m_pipeBufAddrParams = {};
    SetHcpPipeBufAddrParams(*m_pipeBufAddrParams);
#ifdef _MMC_SUPPORTED
    m_mmcState->SetPipeBufAddr(m_pipeBufAddrParams);
#endif
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_hcpInterface->AddHcpPipeBufAddrCmd(cmdBuffer, m_pipeBufAddrParams));

    return eStatus;
}

//!
//! \brief    Fill the per-tile coding parameters for the current picture
//! \details  Does nothing when tiles are disabled. Otherwise derives the tile
//!           column/row boundaries in LCUs from the picture parameters and, for
//!           every tile in raster order, programs its start position, size in
//!           minimum coding blocks, last-in-row/column flags and the offsets of
//!           its CU record, statistics, stream-out, row-store and bitstream
//!           data. Offsets that are divided by CODECHAL_CACHELINE_SIZE below
//!           are expressed in cache lines. Also updates m_numTiles.
//! \param    [out] tileCodingParams
//!           Array with at least (tile rows x tile columns) entries
//! \param    [in] bitstreamBufSize
//!           Bitstream buffer size, shared among tiles in proportion to their
//!           LCU count
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, else fail reason
//!
MOS_STATUS CodechalEncHevcStateG12::SetTileData(
    MHW_VDBOX_HCP_TILE_CODING_PARAMS_G12 *tileCodingParams,
    uint32_t                              bitstreamBufSize)
{
    CODECHAL_ENCODE_FUNCTION_ENTER;

    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    if (!m_hevcPicParams->tiles_enabled_flag)
    {
        return eStatus;
    }

    CODECHAL_ENCODE_CHK_NULL_RETURN(tileCodingParams);

    // The boundary arrays hold one entry more than the tile count, so the
    // count itself has to stay below the array size.
    const uint32_t maxTileBoundaries = 100;
    uint32_t       num_tile_columns  = m_hevcPicParams->num_tile_columns_minus1 + 1;
    uint32_t       num_tile_rows     = m_hevcPicParams->num_tile_rows_minus1 + 1;
    if (num_tile_columns >= maxTileBoundaries || num_tile_rows >= maxTileBoundaries)
    {
        CODECHAL_ENCODE_ASSERTMESSAGE("Number of tile columns or rows is out of the supported range\n");
        return MOS_STATUS_INVALID_PARAMETER;
    }

    // Cumulative tile boundaries in LCUs: colBd[j] / rowBd[i] is where tile
    // column j / tile row i starts.
    uint32_t colBd[maxTileBoundaries] = {0};
    for (uint32_t i = 0; i < num_tile_columns; i++)
    {
        colBd[i + 1] = colBd[i] + m_hevcPicParams->tile_column_width[i];
    }

    uint32_t rowBd[maxTileBoundaries] = {0};
    for (uint32_t i = 0; i < num_tile_rows; i++)
    {
        rowBd[i + 1] = rowBd[i] + m_hevcPicParams->tile_row_height[i];
    }

    m_numTiles = num_tile_rows * num_tile_columns;

    uint32_t const uiNumCuRecordTab[]  = {1, 4, 16, 64};  //LCU: 8x8->1, 16x16->4, 32x32->16, 64x64->64
    uint32_t       numCuRecord         = uiNumCuRecordTab[MOS_MIN(3, m_hevcSeqParams->log2_max_coding_block_size_minus3)];
    uint32_t       bitstreamByteOffset = 0, saoRowstoreOffset = 0, cuLevelStreamoutOffset = 0, sseRowstoreOffset = 0;
    int32_t        frameWidthInMinCb  = m_hevcSeqParams->wFrameWidthInMinCbMinus1 + 1;
    int32_t        frameHeightInMinCb = m_hevcSeqParams->wFrameHeightInMinCbMinus1 + 1;
    int32_t        shift              = m_hevcSeqParams->log2_max_coding_block_size_minus3 - m_hevcSeqParams->log2_min_coding_block_size_minus3;

    // Summing (row height x column width) over all tiles equals the product of
    // the total height and total width, i.e. the last cumulative boundaries.
    uint32_t NumLCUInPic = rowBd[num_tile_rows] * colBd[num_tile_columns];
    if (NumLCUInPic == 0)
    {
        // Used as a divisor when the bitstream buffer is shared among tiles.
        CODECHAL_ENCODE_ASSERTMESSAGE("Tile dimensions add up to an empty picture\n");
        return MOS_STATUS_INVALID_PARAMETER;
    }

    uint32_t numSliceInTile = 0;
    for (uint32_t uiNumLCUsInTiles = 0, i = 0; i < num_tile_rows; i++)
    {
        for (uint32_t j = 0; j < num_tile_columns; j++)
        {
            uint32_t idx          = i * num_tile_columns + j;
            uint32_t numLCUInTile = m_hevcPicParams->tile_row_height[i] * m_hevcPicParams->tile_column_width[j];

            tileCodingParams[idx].TileStartLCUX = colBd[j];
            tileCodingParams[idx].TileStartLCUY = rowBd[i];

            tileCodingParams[idx].TileColumnStoreSelect = j % 2;
            tileCodingParams[idx].TileRowStoreSelect    = i % 2;

            // The last tile of a row/column takes whatever is left of the
            // frame, which need not be a whole number of LCUs.
            if (j != num_tile_columns - 1)
            {
                tileCodingParams[idx].TileWidthInMinCbMinus1 = (m_hevcPicParams->tile_column_width[j] << shift) - 1;
                tileCodingParams[idx].IsLastTileofRow        = false;
            }
            else
            {
                tileCodingParams[idx].TileWidthInMinCbMinus1 = (frameWidthInMinCb - (colBd[j] << shift)) - 1;
                tileCodingParams[idx].IsLastTileofRow        = true;
            }

            if (i != num_tile_rows - 1)
            {
                tileCodingParams[idx].IsLastTileofColumn      = false;
                tileCodingParams[idx].TileHeightInMinCbMinus1 = (m_hevcPicParams->tile_row_height[i] << shift) - 1;
            }
            else
            {
                tileCodingParams[idx].TileHeightInMinCbMinus1 = (frameHeightInMinCb - (rowBd[i] << shift)) - 1;
                tileCodingParams[idx].IsLastTileofColumn      = true;
            }

            tileCodingParams[idx].NumOfTilesInFrame       = m_numTiles;
            tileCodingParams[idx].NumOfTileColumnsInFrame = num_tile_columns;
            tileCodingParams[idx].CuRecordOffset          = MOS_ALIGN_CEIL(((numCuRecord * uiNumLCUsInTiles) * m_hcpInterface->GetHevcEncCuRecordSize()),
                                                       CODECHAL_CACHELINE_SIZE) /
                                                   CODECHAL_CACHELINE_SIZE;
            tileCodingParams[idx].NumberOfActiveBePipes = (m_numPipe > 1) ? m_numPipe : 1;

            tileCodingParams[idx].PakTileStatisticsOffset              = m_sizeOfHcpPakFrameStats * idx / CODECHAL_CACHELINE_SIZE;
            tileCodingParams[idx].TileSizeStreamoutOffset              = idx;
            tileCodingParams[idx].Vp9ProbabilityCounterStreamoutOffset = 0;
            tileCodingParams[idx].presHcpSyncBuffer                    = &m_resHcpScalabilitySyncBuffer.sResource;
            tileCodingParams[idx].CuLevelStreamoutOffset               = cuLevelStreamoutOffset;
            tileCodingParams[idx].SliceSizeStreamoutOffset             = numSliceInTile;
            tileCodingParams[idx].SseRowstoreOffset                    = sseRowstoreOffset;
            tileCodingParams[idx].BitstreamByteOffset                  = bitstreamByteOffset;
            tileCodingParams[idx].SaoRowstoreOffset                    = saoRowstoreOffset;

            // Advance the running offsets past this tile.
            cuLevelStreamoutOffset += MOS_ALIGN_CEIL((tileCodingParams[idx].TileWidthInMinCbMinus1 + 1) * (tileCodingParams[idx].TileHeightInMinCbMinus1 + 1) * 16, CODECHAL_CACHELINE_SIZE) / CODECHAL_CACHELINE_SIZE;
            sseRowstoreOffset += ((m_hevcPicParams->tile_column_width[j] + 3) * m_sizeOfSseSrcPixelRowStoreBufferPerLcu) / CODECHAL_CACHELINE_SIZE;
            saoRowstoreOffset += (MOS_ALIGN_CEIL(m_hevcPicParams->tile_column_width[j], 4) * CODECHAL_HEVC_SAO_STRMOUT_SIZE_PERLCU) / CODECHAL_CACHELINE_SIZE;

            // This tile's share of the bitstream buffer: proportional to its
            // LCU count, rounded up, computed in 64 bits to avoid overflow.
            uint64_t totalSizeTemp        = (uint64_t)bitstreamBufSize * (uint64_t)numLCUInTile;
            uint32_t bitStreamSizePerTile = (uint32_t)(totalSizeTemp / (uint64_t)NumLCUInPic) + ((totalSizeTemp % (uint64_t)NumLCUInPic) ? 1 : 0);
            bitstreamByteOffset += MOS_ALIGN_CEIL(bitStreamSizePerTile, CODECHAL_CACHELINE_SIZE) / CODECHAL_CACHELINE_SIZE;
            uiNumLCUsInTiles += numLCUInTile;

            // Count the slices of this tile so the next tile's slice-size
            // stream-out offset starts after them.
            for (uint32_t slcCount = 0; slcCount < m_numSlices; slcCount++)
            {
                bool lastSliceInTile = false, sliceInTile = false;
                CODECHAL_ENCODE_CHK_STATUS_RETURN(IsSliceInTile(slcCount,
                    &tileCodingParams[idx],
                    &sliceInTile,
                    &lastSliceInTile));
                numSliceInTile += (sliceInTile ? 1 : 0);
            }
        }
        // same row store buffer for different tile rows.
        saoRowstoreOffset = 0;
        sseRowstoreOffset = 0;
    }

    return eStatus;
}

//!
//! \brief    Check whether a slice lies inside a tile and whether it ends that tile
//! \details  All positions are expressed in LCU units. The slice is reported as inside the
//!           tile only when both its first LCU (slice_segment_address) and its last LCU fall
//!           within the tile rectangle. The last LCU is located by advancing
//!           NumLCUsInSlice - 1 LCUs from the first one, using the tile width as row length.
//! \param    [in] sliceNumber
//!           Index of the slice in m_hevcSliceParams
//! \param    [in] currentTile
//!           Tile coding parameters of the tile to test against
//! \param    [out] sliceInTile
//!           Set to true when the first and last LCU of the slice are inside the tile
//! \param    [out] lastSliceInTile
//!           Set to true when the slice is inside the tile and ends on the tile's last LCU
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS once both outputs are written, otherwise the status
//!           returned by a failed null check (outputs are left untouched in that case)
//!
MOS_STATUS CodechalEncHevcStateG12::IsSliceInTile(
    uint32_t                              sliceNumber,
    PMHW_VDBOX_HCP_TILE_CODING_PARAMS_G12 currentTile,
    bool *                                sliceInTile,
    bool *                                lastSliceInTile)
{
    CODECHAL_ENCODE_FUNCTION_ENTER;

    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    CODECHAL_ENCODE_CHK_NULL_RETURN(currentTile);
    CODECHAL_ENCODE_CHK_NULL_RETURN(sliceInTile);
    CODECHAL_ENCODE_CHK_NULL_RETURN(lastSliceInTile);
    // The sequence and slice parameters are dereferenced below, so validate them
    // the same way as the pointer arguments.
    CODECHAL_ENCODE_CHK_NULL_RETURN(m_hevcSeqParams);
    CODECHAL_ENCODE_CHK_NULL_RETURN(m_hevcSliceParams);

    // shift converts a size in min-CB units to LCU units; residual makes the conversion round up
    uint32_t shift           = m_hevcSeqParams->log2_max_coding_block_size_minus3 - m_hevcSeqParams->log2_min_coding_block_size_minus3;
    uint32_t residual        = (1 << shift) - 1;
    uint32_t frameWidthInLCU = (m_hevcSeqParams->wFrameWidthInMinCbMinus1 + 1 + residual) >> shift;

    // Position of the first LCU of the slice, derived from its address and the frame width
    PCODEC_HEVC_ENCODE_SLICE_PARAMS hevcSlcParams = &m_hevcSliceParams[sliceNumber];
    uint32_t                        sliceStartLCU = hevcSlcParams->slice_segment_address;
    uint32_t                        sliceLCUx     = sliceStartLCU % frameWidthInLCU;
    uint32_t                        sliceLCUy     = sliceStartLCU / frameWidthInLCU;

    // Tile rectangle in LCU units: [TileStartLCUX, tileEndLCUX) x [TileStartLCUY, tileEndLCUY)
    uint32_t       tile_column_width = (currentTile->TileWidthInMinCbMinus1 + 1 + residual) >> shift;
    uint32_t       tile_row_height   = (currentTile->TileHeightInMinCbMinus1 + 1 + residual) >> shift;
    const uint32_t tileEndLCUX       = currentTile->TileStartLCUX + tile_column_width;
    const uint32_t tileEndLCUY       = currentTile->TileStartLCUY + tile_row_height;

    auto isOutsideTile = [&](uint32_t lcuX, uint32_t lcuY) {
        return lcuX < currentTile->TileStartLCUX ||
               lcuY < currentTile->TileStartLCUY ||
               lcuX >= tileEndLCUX ||
               lcuY >= tileEndLCUY;
    };

    if (isOutsideTile(sliceLCUx, sliceLCUy))
    {
        // slice start is not in the tile boundary
        *lastSliceInTile = *sliceInTile = false;
        return eStatus;
    }

    // Advance to the last LCU of the slice, wrapping at the tile's right edge
    sliceLCUx += (hevcSlcParams->NumLCUsInSlice - 1) % tile_column_width;
    sliceLCUy += (hevcSlcParams->NumLCUsInSlice - 1) / tile_column_width;

    if (sliceLCUx >= tileEndLCUX)
    {
        sliceLCUx -= tile_column_width;
        sliceLCUy++;
    }

    if (isOutsideTile(sliceLCUx, sliceLCUy))
    {
        // last LCU of the slice is out of the tile boundary
        *lastSliceInTile = *sliceInTile = false;
        return eStatus;
    }

    *sliceInTile = true;

    // the end of slice is at the boundary of tile
    *lastSliceInTile = (sliceLCUx + 1 == tileEndLCUX &&
                        sliceLCUy + 1 == tileEndLCUY);

    return eStatus;
}

//!
//! \brief    Add the HCP reference index state command(s) for one slice
//! \details  Nothing is added for I slices. Otherwise one command is added for LIST_0 and,
//!           for B slices only, a second command for LIST_1.
//! \param    [in] cmdBuffer
//!           Command buffer to add to (may be null when batchBuffer is given)
//! \param    [in] batchBuffer
//!           Batch buffer to add to (may be null when cmdBuffer is given)
//! \param    [in] params
//!           Slice state carrying the picture and slice parameters
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, else fail reason
//!
MOS_STATUS CodechalEncHevcStateG12::AddHcpRefIdxCmd(
    PMOS_COMMAND_BUFFER         cmdBuffer,
    PMHW_BATCH_BUFFER           batchBuffer,
    PMHW_VDBOX_HEVC_SLICE_STATE params)
{
    CODECHAL_ENCODE_FUNCTION_ENTER;

    CODECHAL_ENCODE_CHK_NULL_RETURN(params);
    CODECHAL_ENCODE_CHK_NULL_RETURN(params->pEncodeHevcSliceParams);
    CODECHAL_ENCODE_CHK_NULL_RETURN(params->pEncodeHevcPicParams);

    // At least one destination buffer is required.
    if (!cmdBuffer && !batchBuffer)
    {
        CODECHAL_ENCODE_ASSERTMESSAGE("There was no valid buffer to add the HW command to.");
        return MOS_STATUS_NULL_POINTER;
    }

    PCODEC_HEVC_ENCODE_PICTURE_PARAMS picParams   = params->pEncodeHevcPicParams;
    PCODEC_HEVC_ENCODE_SLICE_PARAMS   sliceParams = params->pEncodeHevcSliceParams;

    // I slices carry no reference lists.
    if (sliceParams->slice_type == CODECHAL_ENCODE_HEVC_I_SLICE)
    {
        return MOS_STATUS_SUCCESS;
    }

    MHW_VDBOX_HEVC_REF_IDX_PARAMS_G12 listParams;

    listParams.CurrPic  = picParams->CurrReconstructedPic;
    listParams.isEncode = true;

    const MOS_STATUS copyStatus = MOS_SecureMemcpy(
        &listParams.RefPicList,
        sizeof(listParams.RefPicList),
        &sliceParams->RefPicList,
        sizeof(sliceParams->RefPicList));
    if (copyStatus != MOS_STATUS_SUCCESS)
    {
        CODECHAL_ENCODE_ASSERTMESSAGE("Failed to copy memory.");
        return copyStatus;
    }

    listParams.hevcRefList  = (void **)m_refList;
    listParams.poc_curr_pic = picParams->CurrPicOrderCnt;
    for (int refIdx = 0; refIdx < CODEC_MAX_NUM_REF_FRAME_HEVC; ++refIdx)
    {
        listParams.poc_list[refIdx] = picParams->RefFramePOCList[refIdx];
    }

    listParams.pRefIdxMapping     = params->pRefIdxMapping;
    listParams.RefFieldPicFlag    = 0;  // there is no interlaced support in encoder
    listParams.RefBottomFieldFlag = 0;  // there is no interlaced support in encoder

    // Reference list 0 is programmed for every non-I slice.
    listParams.ucList          = LIST_0;
    listParams.ucNumRefForList = sliceParams->num_ref_idx_l0_active_minus1 + 1;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_hcpInterface->AddHcpRefIdxStateCmd(cmdBuffer, batchBuffer, &listParams));

    // Reference list 1 only exists for B slices; all other fields are reused from list 0.
    if (sliceParams->slice_type == CODECHAL_ENCODE_HEVC_B_SLICE)
    {
        listParams.ucList          = LIST_1;
        listParams.ucNumRefForList = sliceParams->num_ref_idx_l1_active_minus1 + 1;
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_hcpInterface->AddHcpRefIdxStateCmd(cmdBuffer, batchBuffer, &listParams));
    }

    return MOS_STATUS_SUCCESS;
}

//!
//! \brief    Add the force-wakeup command and the prolog, optionally with frame tracking
//! \details  A force-wakeup command is always added first. On the render command buffer the
//!           rest is delegated to CodechalEncoderState. Otherwise the MMC prolog is sent
//!           (when built with _MMC_SUPPORTED) and, on the last pipe only, the command buffer
//!           attributes are filled in and the generic prolog is sent.
//! \param    [in] cmdBuffer
//!           Command buffer receiving the commands
//! \param    [in] frameTrackingRequested
//!           True when the caller wants frame tracking enabled for this buffer
//! \param    [in] mmioRegister
//!           Forwarded to the base class implementation on the render path
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, else fail reason
//!
MOS_STATUS CodechalEncHevcStateG12::SendPrologWithFrameTracking(
    PMOS_COMMAND_BUFFER   cmdBuffer,
    bool                  frameTrackingRequested,
    MHW_MI_MMIOREGISTERS *mmioRegister)
{
    MOS_GPU_CONTEXT gpuContext = m_osInterface->pfnGetGpuContext(m_osInterface);

    // Force wakeup: HEVC power well requested, MFX power well not requested (both masks set).
    MHW_MI_FORCE_WAKEUP_PARAMS wakeupParams;
    MOS_ZeroMemory(&wakeupParams, sizeof(MHW_MI_FORCE_WAKEUP_PARAMS));
    wakeupParams.bHEVCPowerWellControl     = true;
    wakeupParams.bHEVCPowerWellControlMask = true;
    wakeupParams.bMFXPowerWellControl      = false;
    wakeupParams.bMFXPowerWellControlMask  = true;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_miInterface->AddMiForceWakeupCmd(
        cmdBuffer,
        &wakeupParams));

    if (UseRenderCommandBuffer())
    {
        CODECHAL_ENCODE_CHK_STATUS_RETURN(CodechalEncoderState::SendPrologWithFrameTracking(cmdBuffer, frameTrackingRequested, mmioRegister));
        return MOS_STATUS_SUCCESS;
    }

#ifdef _MMC_SUPPORTED
    CODECHAL_ENCODE_CHK_NULL_RETURN(m_mmcState);
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_mmcState->SendPrologCmd(m_miInterface, cmdBuffer, gpuContext));
#endif

    // Attributes and the generic prolog are only programmed by the last pipe.
    if (!IsLastPipe())
    {
        return MOS_STATUS_SUCCESS;
    }

    // Prefer the real command buffer when it is mapped, else fall back to the caller's buffer.
    PMOS_COMMAND_BUFFER targetBuffer = nullptr;
    if (m_realCmdBuffer.pCmdBase)
    {
        targetBuffer = &m_realCmdBuffer;
    }
    else if (cmdBuffer && cmdBuffer->pCmdBase)
    {
        targetBuffer = cmdBuffer;
    }

    if (targetBuffer == nullptr)
    {
        return MOS_STATUS_INVALID_PARAMETER;
    }

    // initialize command buffer attributes
    auto &attributes                    = targetBuffer->Attributes;
    attributes.bTurboMode               = m_hwInterface->m_turboMode;
    attributes.dwNumRequestedEUSlices   = m_hwInterface->m_numRequestedEuSlices;
    attributes.dwNumRequestedSubSlices  = m_hwInterface->m_numRequestedSubSlices;
    attributes.dwNumRequestedEUs        = m_hwInterface->m_numRequestedEus;
    attributes.bValidPowerGatingRequest = true;

    if (frameTrackingRequested && m_frameTrackingEnabled)
    {
        attributes.bEnableMediaFrameTracking    = true;
        attributes.resMediaFrameTrackingSurface = &m_encodeStatusBuf.resStatusBuffer;
        attributes.dwMediaFrameTrackingTag      = m_storeData;
        // Set media frame tracking address offset(the offset from the encoder status buffer page)
        attributes.dwMediaFrameTrackingAddrOffset = 0;
    }

    MHW_GENERIC_PROLOG_PARAMS prologParams;
    MOS_ZeroMemory(&prologParams, sizeof(prologParams));
    prologParams.pOsInterface     = m_hwInterface->GetOsInterface();
    prologParams.pvMiInterface    = m_hwInterface->GetMiInterface();
    prologParams.bMmcEnabled      = m_mmcState ? m_mmcState->IsMmcEnabled() : false;
    prologParams.dwStoreDataValue = m_storeData - 1;

    CODECHAL_ENCODE_CHK_STATUS_RETURN(Mhw_SendGenericPrologCmd(targetBuffer, &prologParams));

    return MOS_STATUS_SUCCESS;
}

//!
//! \brief    Handle a resolution change
//! \details  Runs the base class handling, then releases the SW scoreboard resources so they
//!           can be re-allocated for the new resolution.
//!
void CodechalEncHevcStateG12::ResizeOnResChange()
{
    CODECHAL_ENCODE_FUNCTION_ENTER;

    CodechalEncoderState::ResizeOnResChange();

    // need to re-allocate surfaces according to resolution
    // This function cannot report an error, so skip the release rather than
    // dereference a scoreboard state that was never created.
    if (m_swScoreboardState != nullptr)
    {
        m_swScoreboardState->ReleaseResources();
    }
}

//!
//! \brief    Create the MMC state object for this encoder
//! \details  Only does work when built with _MMC_SUPPORTED; otherwise it just reports success.
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, else the status from the failed allocation check
//!
MOS_STATUS CodechalEncHevcStateG12::InitMmcState()
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    CODECHAL_ENCODE_FUNCTION_ENTER;

#ifdef _MMC_SUPPORTED
    // Gen12 HEVC encode specific MMC state
    m_mmcState = MOS_New(CodechalMmcEncodeHevcG12, m_hwInterface, this);
    CODECHAL_ENCODE_CHK_NULL_RETURN(m_mmcState);
#endif

    return eStatus;
}

#if USE_CODECHAL_DEBUG_TOOL

//MOS_STATUS CodechalEncHevcStateG12::CodecHal_DbgDumpHEVCMbEncCurbeG12(
//    CodechalDebugInterface         *pDebugInterface,
//    CODECHAL_MEDIA_STATE_TYPE       Function,
//    PMOS_RESOURCE                   presDBuffer)
//{
//#define WRITE_CURBE_FIELD_TO_FILE(field) {\
//    MOS_SecureStringPrint(sOutBuf, sizeof(sOutBuf), sizeof(sOutBuf), "field = %d\n", pCurbeData->field);\
//    CodecHal_DbgAddStringToBufferNewLine(&FileParams, sOutBuf);}
//
//    PMOS_INTERFACE              m_osInterface = nullptr;
//    PCCHAR                      pcFunction = nullptr;
//    char                        sAttrib[125];
//    char                        sOutBuf[MAX_FIELD_LENGTH];
//    CODECHAL_DBG_FILE_PARAMS    FileParams;
//    MOS_STATUS                  eStatus = MOS_STATUS_SUCCESS;
//    MOS_LOCK_PARAMS             LockFlags;
//    CodechalEncHevcStateG12::MBENC_COMBINED_BUFFER1 *pEncComBuf1 = nullptr;
//
//    CODECHAL_DEBUG_FUNCTION_ENTER;
//
//    CODECHAL_DEBUG_CHK_NULL(pDebugInterface);
//    CODECHAL_DEBUG_CHK_NULL(pDebugInterface->pOsInterface);
//    CODECHAL_DEBUG_CHK_NULL(pDebugInterface->pHwInterface);
//    m_osInterface = pDebugInterface->pOsInterface;
//
//    pcFunction = CodecHal_DbgGetFunctionType(
//        pDebugInterface, Function, DBG_CMD_BUFFER_DUMP_DEFAULT);
//    CODECHAL_DEBUG_CHK_NULL(pcFunction);
//
//    MOS_SecureStringPrint(sAttrib, sizeof(sAttrib), sizeof(sAttrib), "%s%s", pcFunction, CODECHAL_DBG_STRING_CURBE);
//
//    MOS_ZeroMemory(&LockFlags, sizeof(MOS_LOCK_PARAMS));
//    LockFlags.ReadOnly = 1;
//
//    pEncComBuf1 = (CodechalEncHevcStateG12::MBENC_COMBINED_BUFFER1*)m_osInterface->pfnLockResource(
//        m_osInterface,
//        presDBuffer,
//        &LockFlags);
//
//    FileParams = g_cInitDbgFileParams;
//
//    if (!CodecHal_DbgAttribIsEnabled(pDebugInterface, sAttrib))
//    {
//        return eStatus;
//    }
//
//    MOS_ZeroMemory(pDebugInterface->sPath, sizeof(pDebugInterface->sPath));
//
//    CODECHAL_DEBUG_CHK_STATUS(CodecHal_DbgConstructFilenameString(
//        pDebugInterface,
//        pcFunction,
//        CODECHAL_DBG_STRING_CURBE,
//        CODECHAL_DBG_STRING_TXT));
//
//    if (CodecHal_DbgAttribIsEnabled(pDebugInterface, CODECHAL_DBG_STRING_DUMPDATAINBINARY))
//    {
//        CODECHAL_DEBUG_CHK_STATUS(CodecHal_DbgDumpBufferInHexDwords(
//            pDebugInterface,
//            (uint8_t*)&pEncComBuf1->Curbe,
//            sizeof(pEncComBuf1->Curbe)));
//    }
//    else
//    {
//        CodechalEncHevcStateG12::MBENC_CURBE* pCurbeData = &pEncComBuf1->Curbe;
//
//        FileParams.lRemaining = sizeof(char)* MAX_FIELD_LENGTH * MAX_NUM_ATTRIBUTES;
//        FileParams.psWriteToFile = (char*)MOS_AllocAndZeroMemory(FileParams.lRemaining);
//        CODECHAL_DEBUG_CHK_NULL(FileParams.psWriteToFile);
//        FileParams.dwOffset = 0;
//
//        memset(sOutBuf, 0, sizeof(sOutBuf));
//
//        MOS_SecureStringPrint(sOutBuf, sizeof(sOutBuf), sizeof(sOutBuf), "# CURBE Parameters:");
//        CodecHal_DbgAddStringToBufferNewLine(&FileParams, sOutBuf);
//
//        WRITE_CURBE_FIELD_TO_FILE(FrameWidthInSamples);
//        WRITE_CURBE_FIELD_TO_FILE(FrameHeightInSamples);
//
//        WRITE_CURBE_FIELD_TO_FILE(Log2MaxCUSize);
//        WRITE_CURBE_FIELD_TO_FILE(Log2MinCUSize);
//        WRITE_CURBE_FIELD_TO_FILE(Log2MaxTUSize);
//        WRITE_CURBE_FIELD_TO_FILE(Log2MinTUSize);
//        WRITE_CURBE_FIELD_TO_FILE(MaxIntraRdeIter);
//        WRITE_CURBE_FIELD_TO_FILE(QPType);
//        WRITE_CURBE_FIELD_TO_FILE(MaxTransformDepthInter);
//        WRITE_CURBE_FIELD_TO_FILE(MaxTransformDepthIntra);
//        WRITE_CURBE_FIELD_TO_FILE(Log2ParallelMergeLevel);
//
//        WRITE_CURBE_FIELD_TO_FILE(CornerNeighborPixel);
//        WRITE_CURBE_FIELD_TO_FILE(IntraNeighborAvailFlags);
//        WRITE_CURBE_FIELD_TO_FILE(ChromaFormatType);
//        WRITE_CURBE_FIELD_TO_FILE(SubPelMode);
//        WRITE_CURBE_FIELD_TO_FILE(InterSADMeasure);
//        WRITE_CURBE_FIELD_TO_FILE(IntraSADMeasure);
//        WRITE_CURBE_FIELD_TO_FILE(IntraPrediction);
//        WRITE_CURBE_FIELD_TO_FILE(RefIDCostMode);
//        WRITE_CURBE_FIELD_TO_FILE(TUBasedCostSetting);
//
//        WRITE_CURBE_FIELD_TO_FILE(ExplictModeEn);
//        WRITE_CURBE_FIELD_TO_FILE(AdaptiveEn);
//        WRITE_CURBE_FIELD_TO_FILE(EarlyImeSuccessEn);
//        WRITE_CURBE_FIELD_TO_FILE(IntraSpeedMode);
//        WRITE_CURBE_FIELD_TO_FILE(IMECostCentersSel);
//        WRITE_CURBE_FIELD_TO_FILE(RDEQuantRoundValue);
//        WRITE_CURBE_FIELD_TO_FILE(IMERefWindowSize);
//        WRITE_CURBE_FIELD_TO_FILE(IntraComputeType);
//        WRITE_CURBE_FIELD_TO_FILE(Depth0IntraPredition);
//        WRITE_CURBE_FIELD_TO_FILE(TUDepthControl);
//        WRITE_CURBE_FIELD_TO_FILE(IntraTuRecFeedbackDisable);
//        WRITE_CURBE_FIELD_TO_FILE(MergeListBiDisable);
//        WRITE_CURBE_FIELD_TO_FILE(EarlyImeStop);
//
//        WRITE_CURBE_FIELD_TO_FILE(FrameQP);
//        WRITE_CURBE_FIELD_TO_FILE(FrameQPSign);
//        WRITE_CURBE_FIELD_TO_FILE(ConcurrentGroupNum);
//        WRITE_CURBE_FIELD_TO_FILE(NumofUnitInWaveFront);
//
//        WRITE_CURBE_FIELD_TO_FILE(LoadBalenceEnable);
//        WRITE_CURBE_FIELD_TO_FILE(NumberofMultiFrame);
//        WRITE_CURBE_FIELD_TO_FILE(Degree45);
//        WRITE_CURBE_FIELD_TO_FILE(Break12Dependency);
//        WRITE_CURBE_FIELD_TO_FILE(ThreadNumber);
//
//        WRITE_CURBE_FIELD_TO_FILE(Pic_init_qp_B);
//        WRITE_CURBE_FIELD_TO_FILE(Pic_init_qp_P);
//        WRITE_CURBE_FIELD_TO_FILE(Pic_init_qp_I);
//
//        WRITE_CURBE_FIELD_TO_FILE(NumofRowTile);
//        WRITE_CURBE_FIELD_TO_FILE(NumofColumnTile);
//
//        WRITE_CURBE_FIELD_TO_FILE(TransquantBypassEnableFlag);
//        WRITE_CURBE_FIELD_TO_FILE(PCMEnabledFlag);
//        WRITE_CURBE_FIELD_TO_FILE(CuQpDeltaEnabledFlag);
//        WRITE_CURBE_FIELD_TO_FILE(Stepping);
//        WRITE_CURBE_FIELD_TO_FILE(WaveFrontSplitsEnable);
//        WRITE_CURBE_FIELD_TO_FILE(HMEFlag);
//        WRITE_CURBE_FIELD_TO_FILE(SuperHME);
//        WRITE_CURBE_FIELD_TO_FILE(UltraHME);
//        WRITE_CURBE_FIELD_TO_FILE(Cu64SkipCheckOnly);
//        WRITE_CURBE_FIELD_TO_FILE(EnableCu64Check);
//        WRITE_CURBE_FIELD_TO_FILE(Cu642Nx2NCheckOnly);
//        WRITE_CURBE_FIELD_TO_FILE(EnableCu64AmpCheck);
//        WRITE_CURBE_FIELD_TO_FILE(DisablePIntra);
//        WRITE_CURBE_FIELD_TO_FILE(DisableIntraTURec);
//        WRITE_CURBE_FIELD_TO_FILE(InheritIntraModeFromTU0);
//        WRITE_CURBE_FIELD_TO_FILE(CostScalingForRA);
//        WRITE_CURBE_FIELD_TO_FILE(DisableIntraNxN);
//
//        WRITE_CURBE_FIELD_TO_FILE(MaxRefIdxL0);
//        WRITE_CURBE_FIELD_TO_FILE(MaxRefIdxL1);
//        WRITE_CURBE_FIELD_TO_FILE(MaxBRefIdxL0);
//
//        WRITE_CURBE_FIELD_TO_FILE(SkipEarlyTermination);
//        WRITE_CURBE_FIELD_TO_FILE(SkipEarlyTermSize);
//        WRITE_CURBE_FIELD_TO_FILE(Dynamic64Enable);
//        WRITE_CURBE_FIELD_TO_FILE(Dynamic64Order);
//        WRITE_CURBE_FIELD_TO_FILE(Dynamic64Th);
//        WRITE_CURBE_FIELD_TO_FILE(DynamicOrderTh);
//        WRITE_CURBE_FIELD_TO_FILE(PerBFrameQPOffset);
//        WRITE_CURBE_FIELD_TO_FILE(IncreaseExitThresh);
//        WRITE_CURBE_FIELD_TO_FILE(Dynamic64Min32);
//        WRITE_CURBE_FIELD_TO_FILE(LastFrameIsIntra);
//
//        WRITE_CURBE_FIELD_TO_FILE(LenSP);
//        WRITE_CURBE_FIELD_TO_FILE(MaxNumSU);
//
//        WRITE_CURBE_FIELD_TO_FILE(CostTableIndex);
//
//        WRITE_CURBE_FIELD_TO_FILE(SliceType);
//        WRITE_CURBE_FIELD_TO_FILE(TemporalMvpEnableFlag);
//        WRITE_CURBE_FIELD_TO_FILE(CollocatedFromL0Flag);
//        WRITE_CURBE_FIELD_TO_FILE(theSameRefList);
//        WRITE_CURBE_FIELD_TO_FILE(IsLowDelay);
//        WRITE_CURBE_FIELD_TO_FILE(MaxNumMergeCand);
//        WRITE_CURBE_FIELD_TO_FILE(NumRefIdxL0);
//        WRITE_CURBE_FIELD_TO_FILE(NumRefIdxL1);
//
//        WRITE_CURBE_FIELD_TO_FILE(FwdPocNumber_L0_mTb_0);
//        WRITE_CURBE_FIELD_TO_FILE(BwdPocNumber_L1_mTb_0);
//        WRITE_CURBE_FIELD_TO_FILE(FwdPocNumber_L0_mTb_1);
//        WRITE_CURBE_FIELD_TO_FILE(BwdPocNumber_L1_mTb_1);
//
//        WRITE_CURBE_FIELD_TO_FILE(FwdPocNumber_L0_mTb_2);
//        WRITE_CURBE_FIELD_TO_FILE(BwdPocNumber_L1_mTb_2);
//        WRITE_CURBE_FIELD_TO_FILE(FwdPocNumber_L0_mTb_3);
//        WRITE_CURBE_FIELD_TO_FILE(BwdPocNumber_L1_mTb_3);
//
//        WRITE_CURBE_FIELD_TO_FILE(FwdPocNumber_L0_mTb_4);
//        WRITE_CURBE_FIELD_TO_FILE(BwdPocNumber_L1_mTb_4);
//        WRITE_CURBE_FIELD_TO_FILE(FwdPocNumber_L0_mTb_5);
//        WRITE_CURBE_FIELD_TO_FILE(BwdPocNumber_L1_mTb_5);
//
//        WRITE_CURBE_FIELD_TO_FILE(FwdPocNumber_L0_mTb_6);
//        WRITE_CURBE_FIELD_TO_FILE(BwdPocNumber_L1_mTb_6);
//        WRITE_CURBE_FIELD_TO_FILE(FwdPocNumber_L0_mTb_7);
//        WRITE_CURBE_FIELD_TO_FILE(BwdPocNumber_L1_mTb_7);
//
//        WRITE_CURBE_FIELD_TO_FILE(LongTermReferenceFlags_L0);
//        WRITE_CURBE_FIELD_TO_FILE(LongTermReferenceFlags_L1);
//
//        WRITE_CURBE_FIELD_TO_FILE(RefFrameWinWidth);
//        WRITE_CURBE_FIELD_TO_FILE(RefFrameWinHeight);
//
//        WRITE_CURBE_FIELD_TO_FILE(RoundingInter);
//        WRITE_CURBE_FIELD_TO_FILE(RoundingIntra);
//        WRITE_CURBE_FIELD_TO_FILE(MaxThreadWidth);
//        WRITE_CURBE_FIELD_TO_FILE(MaxThreadHeight);
//
//        CODECHAL_DEBUG_CHK_STATUS(MosUtilities::MosWriteFileFromPtr(
//            pDebugInterface->sPath,
//            FileParams.psWriteToFile,
//            FileParams.dwOffset));
//    }
//
//finish:
//    if (m_osInterface && pEncComBuf1)
//    {
//        m_osInterface->pfnUnlockResource(
//            m_osInterface,
//            presDBuffer);
//    }
//
//    if (FileParams.psWriteToFile)
//    {
//        MOS_FreeMemory(FileParams.psWriteToFile);
//    }
//    return eStatus;
//}

#endif
//!
//! \brief    Make sure the command buffer for the current pipe/pass is large enough and mapped
//! \details  In legacy mode (render command buffer or a single pipe) this only verifies the
//!           available space when single task phase is not supported. In virtual engine mode
//!           it (re)allocates the per-pipe batch buffer when it is missing or
//!           m_sizeOfVeBatchBuffer is smaller than the requested size, and locks it for CPU
//!           writes when it is not currently mapped.
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, MOS_STATUS_INVALID_PARAMETER for an out of range
//!           pipe or pass, MOS_STATUS_NULL_POINTER when the buffer cannot be locked,
//!           else fail reason
//!
MOS_STATUS CodechalEncHevcStateG12::VerifyCommandBufferSize()
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    CODECHAL_ENCODE_FUNCTION_ENTER;

    if (UseRenderCommandBuffer() || m_numPipe == 1)
    {
        // legacy mode & resize CommandBuffer Size for every BRC pass
        if (!m_singleTaskPhaseSupported)
        {
            CODECHAL_ENCODE_CHK_STATUS_RETURN(VerifySpaceAvailable());
        }
        return eStatus;
    }

    // virtual engine
    // Size of one pass: picture level states plus the slice level states of every slice
    uint32_t requestedSize =
        m_pictureStatesSize +
        m_extraPictureStatesSize +
        (m_sliceStatesSize * m_numSlices);

    // Add room for m_numPassesInOnePipe further passes and for the HuC commands
    requestedSize += (requestedSize * m_numPassesInOnePipe + m_hucCommandsSize);

    // Running in the multiple VDBOX mode
    int currentPipe = GetCurrentPipe();
    if (currentPipe < 0 || currentPipe >= m_numPipe)
    {
        eStatus = MOS_STATUS_INVALID_PARAMETER;
        return eStatus;
    }
    int currentPass = GetCurrentPass();
    if (currentPass < 0 || currentPass >= CODECHAL_HEVC_MAX_NUM_BRC_PASSES)
    {
        eStatus = MOS_STATUS_INVALID_PARAMETER;
        return eStatus;
    }

    if (IsFirstPipe() && m_osInterface->bUsesPatchList)
    {
        CODECHAL_ENCODE_CHK_STATUS_RETURN(VerifySpaceAvailable());
    }

    // With single task phase all passes share the buffer at pass index 0
    PMOS_COMMAND_BUFFER pCmdBuffer = m_singleTaskPhaseSupported ? &m_veBatchBuffer[m_virtualEngineBbIndex][currentPipe][0] : &m_veBatchBuffer[m_virtualEngineBbIndex][currentPipe][currentPass];

    // NOTE(review): m_sizeOfVeBatchBuffer is a single value used for every pipe/pass buffer,
    // so an existing buffer is assumed to be at least that large - confirm.
    if (Mos_ResourceIsNull(&pCmdBuffer->OsResource) ||
        m_sizeOfVeBatchBuffer < requestedSize)
    {
        MOS_ALLOC_GFXRES_PARAMS allocParamsForBufferLinear;

        MOS_ZeroMemory(&allocParamsForBufferLinear, sizeof(MOS_ALLOC_GFXRES_PARAMS));
        allocParamsForBufferLinear.Type     = MOS_GFXRES_BUFFER;
        allocParamsForBufferLinear.TileType = MOS_TILE_LINEAR;
        allocParamsForBufferLinear.Format   = Format_Buffer;
        allocParamsForBufferLinear.dwBytes  = requestedSize;
        allocParamsForBufferLinear.pBufName = "Batch buffer for each VDBOX";

        if (!Mos_ResourceIsNull(&pCmdBuffer->OsResource))
        {
            if (pCmdBuffer->pCmdBase)
            {
                m_osInterface->pfnUnlockResource(m_osInterface, &pCmdBuffer->OsResource);
                // The mapping is gone: clear the CPU pointers so that the lock step below
                // maps the newly allocated resource instead of keeping stale addresses.
                pCmdBuffer->pCmdBase = nullptr;
                pCmdBuffer->pCmdPtr  = nullptr;
            }
            m_osInterface->pfnFreeResource(m_osInterface, &pCmdBuffer->OsResource);
        }

        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_osInterface->pfnAllocateResource(
            m_osInterface,
            &allocParamsForBufferLinear,
            &pCmdBuffer->OsResource));

        m_sizeOfVeBatchBuffer = requestedSize;
    }

    // Map the buffer for CPU writes and rewind it when it is not currently mapped
    if (pCmdBuffer->pCmdBase == nullptr)
    {
        MOS_LOCK_PARAMS lockParams;
        MOS_ZeroMemory(&lockParams, sizeof(lockParams));
        lockParams.WriteOnly = true;
        pCmdBuffer->pCmdPtr = pCmdBuffer->pCmdBase = (uint32_t *)m_osInterface->pfnLockResource(m_osInterface, &pCmdBuffer->OsResource, &lockParams);
        pCmdBuffer->iRemaining                     = m_sizeOfVeBatchBuffer;
        pCmdBuffer->iOffset                        = 0;

        if (pCmdBuffer->pCmdBase == nullptr)
        {
            eStatus = MOS_STATUS_NULL_POINTER;
            return eStatus;
        }
    }

    return eStatus;
}

//!
//! \brief    Retrieve the command buffer to record into
//! \details  In legacy mode (render command buffer or a single pipe) the OS command buffer is
//!           returned directly. Otherwise the OS command buffer is fetched into
//!           m_realCmdBuffer and the caller receives a copy of the batch buffer of the
//!           current pipe/pass; a CP prolog is added to it when CP is enabled and the
//!           buffer is still empty.
//! \param    [out] cmdBuffer
//!           Receives the command buffer to use
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, MOS_STATUS_INVALID_PARAMETER for an out of range
//!           pipe or pass, else fail reason
//!
MOS_STATUS CodechalEncHevcStateG12::GetCommandBuffer(PMOS_COMMAND_BUFFER cmdBuffer)
{
    CODECHAL_ENCODE_FUNCTION_ENTER;

    CODECHAL_ENCODE_CHK_NULL_RETURN(cmdBuffer);
    CODECHAL_ENCODE_CHK_NULL_RETURN(m_osInterface->osCpInterface);

    const bool legacyMode = UseRenderCommandBuffer() || m_numPipe == 1;
    if (legacyMode)
    {
        // legacy mode: the real command buffer is not used, mark it as unmapped
        m_realCmdBuffer.pCmdPtr  = nullptr;
        m_realCmdBuffer.pCmdBase = nullptr;
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_osInterface->pfnGetCommandBuffer(m_osInterface, cmdBuffer, 0));
        return MOS_STATUS_SUCCESS;
    }

    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_osInterface->pfnGetCommandBuffer(m_osInterface, &m_realCmdBuffer, 0));

    const int pipeIdx = GetCurrentPipe();
    if (pipeIdx < 0 || pipeIdx >= m_numPipe)
    {
        return MOS_STATUS_INVALID_PARAMETER;
    }

    const int passIdx = GetCurrentPass();
    if (passIdx < 0 || passIdx >= CODECHAL_HEVC_MAX_NUM_BRC_PASSES)
    {
        return MOS_STATUS_INVALID_PARAMETER;
    }

    // With single task phase all passes share the buffer at pass index 0
    if (m_singleTaskPhaseSupported)
    {
        *cmdBuffer = m_veBatchBuffer[m_virtualEngineBbIndex][pipeIdx][0];
    }
    else
    {
        *cmdBuffer = m_veBatchBuffer[m_virtualEngineBbIndex][pipeIdx][passIdx];
    }

    if (m_osInterface->osCpInterface->IsCpEnabled() && cmdBuffer->iOffset == 0)
    {
        // Insert CP Prolog
        CODECHAL_ENCODE_NORMALMESSAGE("Adding cp prolog for secure scalable encode");
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_hwInterface->GetCpInterface()->AddProlog(m_osInterface, cmdBuffer));
    }

    return MOS_STATUS_SUCCESS;
}

//!
//! \brief    Hand a command buffer back after recording
//! \details  In legacy mode (render command buffer or a single pipe) the buffer is returned
//!           to the OS interface. Otherwise its state is copied back into the batch buffer
//!           slot of the current pipe/pass and m_realCmdBuffer is returned to the OS
//!           interface.
//! \param    [in] cmdBuffer
//!           Command buffer previously obtained for recording
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, MOS_STATUS_INVALID_PARAMETER for an out of range
//!           pipe or pass, else fail reason
//!
MOS_STATUS CodechalEncHevcStateG12::ReturnCommandBuffer(PMOS_COMMAND_BUFFER cmdBuffer)
{
    CODECHAL_ENCODE_FUNCTION_ENTER;

    CODECHAL_ENCODE_CHK_NULL_RETURN(cmdBuffer);

    const bool legacyMode = UseRenderCommandBuffer() || m_numPipe == 1;
    if (legacyMode)
    {
        // legacy mode
        m_osInterface->pfnReturnCommandBuffer(m_osInterface, cmdBuffer, 0);
        return MOS_STATUS_SUCCESS;
    }

    const int pipeIdx = GetCurrentPipe();
    if (pipeIdx < 0 || pipeIdx >= m_numPipe)
    {
        return MOS_STATUS_INVALID_PARAMETER;
    }

    const int passIdx = GetCurrentPass();
    if (passIdx < 0 || passIdx >= CODECHAL_HEVC_MAX_NUM_BRC_PASSES)
    {
        return MOS_STATUS_INVALID_PARAMETER;
    }

    // With single task phase all passes share the buffer at pass index 0
    uint8_t slot = 0;
    if (!m_singleTaskPhaseSupported)
    {
        slot = static_cast<uint8_t>(passIdx);
    }

    // Save the recording state (offset, remaining size, ...) for the next user of this slot
    m_veBatchBuffer[m_virtualEngineBbIndex][pipeIdx][slot] = *cmdBuffer;
    m_osInterface->pfnReturnCommandBuffer(m_osInterface, &m_realCmdBuffer, 0);

    return MOS_STATUS_SUCCESS;
}

//!
//! \brief    Submit the recorded commands
//! \details  In legacy mode (render command buffer or a single pipe) the given buffer is
//!           submitted directly, with VE hints populated for video contexts only. In virtual
//!           engine mode nothing is submitted until the last pipe (and, with single task
//!           phase, the last pass); then the batch buffers of all pipes are unmapped and
//!           reset, and m_realCmdBuffer is submitted.
//! \param    [in] cmdBuffer
//!           Command buffer to submit in legacy mode
//! \param    [in] bNullRendering
//!           Forwarded to the OS interface submit call
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, MOS_STATUS_INVALID_PARAMETER for an out of range
//!           pass, else fail reason
//!
MOS_STATUS CodechalEncHevcStateG12::SubmitCommandBuffer(
    PMOS_COMMAND_BUFFER cmdBuffer,
    bool                bNullRendering)
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    CODECHAL_ENCODE_FUNCTION_ENTER;

    CODECHAL_ENCODE_CHK_NULL_RETURN(cmdBuffer);

    if (UseRenderCommandBuffer() || m_numPipe == 1)
    {
        // legacy mode
        if (!UseRenderCommandBuffer())  // Set VE Hints for video contexts only
        {
            CODECHAL_ENCODE_CHK_STATUS_RETURN(SetAndPopulateVEHintParams(cmdBuffer));
        }

        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_osInterface->pfnSubmitCommandBuffer(m_osInterface, cmdBuffer, bNullRendering));
        return eStatus;
    }

    bool cmdBufferReadyForSubmit = IsLastPipe();

    // In STF, Hold the command buffer submission till last pass
    if (m_singleTaskPhaseSupported)
    {
        cmdBufferReadyForSubmit = cmdBufferReadyForSubmit && IsLastPass();
    }

    if (!cmdBufferReadyForSubmit)
    {
        return eStatus;
    }

    int currentPass = GetCurrentPass();
    if (currentPass < 0 || currentPass >= CODECHAL_HEVC_MAX_NUM_BRC_PASSES)
    {
        eStatus = MOS_STATUS_INVALID_PARAMETER;
        return eStatus;
    }
    // With single task phase all passes share the buffer at pass index 0
    uint8_t passIndex = m_singleTaskPhaseSupported ? 0 : currentPass;

    // Unmap and rewind the batch buffer of every pipe for this pass
    for (uint32_t pipe = 0; pipe < m_numPipe; pipe++)
    {
        // Named differently from the cmdBuffer parameter to avoid shadowing it
        PMOS_COMMAND_BUFFER veBatchBuffer = &m_veBatchBuffer[m_virtualEngineBbIndex][pipe][passIndex];

        if (veBatchBuffer->pCmdBase)
        {
            m_osInterface->pfnUnlockResource(m_osInterface, &veBatchBuffer->OsResource);
        }

        // Clear both CPU pointers so neither refers to the unmapped memory
        veBatchBuffer->pCmdBase = nullptr;
        veBatchBuffer->pCmdPtr  = nullptr;
        veBatchBuffer->iOffset = veBatchBuffer->iRemaining = 0;
    }
    // Reset the recorded size; VerifyCommandBufferSize compares it against the requested size
    m_sizeOfVeBatchBuffer = 0;

    CODECHAL_ENCODE_CHK_STATUS_RETURN(SetAndPopulateVEHintParams(&m_realCmdBuffer));
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_osInterface->pfnSubmitCommandBuffer(m_osInterface, &m_realCmdBuffer, bNullRendering));

    return eStatus;
}

//!
//! \brief    Set up the slice structures and expand the pass count for multi-pipe encode
//! \details  After the base class setup, the current pass count is saved as the per-pipe pass
//!           count and m_numPasses becomes (passes + 1) * m_numPipe - 1. The counters are
//!           updated regardless of the base class status.
//! \return   MOS_STATUS
//!           Status returned by CodechalEncodeHevcBase::SetSliceStructs
//!
MOS_STATUS CodechalEncHevcStateG12::SetSliceStructs()
{
    const MOS_STATUS baseStatus = CodechalEncodeHevcBase::SetSliceStructs();

    // Keep the per-pipe value before m_numPasses is scaled by the pipe count
    m_numPassesInOnePipe = m_numPasses;
    m_numPasses          = (m_numPasses + 1) * m_numPipe - 1;

    return baseStatus;
}

//!
//! \brief    Compute tile statistics sizes/offsets and allocate the related buffers
//! \details  Does nothing when tiles are disabled. Otherwise it fills m_hevcStatsSize,
//!           m_hevcFrameStatsOffset and m_hevcTileStatsOffset, publishes the combined sizes
//!           through m_hwInterface, and allocates (zero-filled) the aggregated frame
//!           statistics buffer if missing, plus the tile statistics buffer and the tile
//!           record buffer of the current m_virtualEngineBbIndex if missing or too small.
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, else fail reason
//!
MOS_STATUS CodechalEncHevcStateG12::AllocateTileStatistics()
{
    CODECHAL_ENCODE_FUNCTION_ENTER;

    // Nothing to prepare when the picture is not tiled
    if (!m_hevcPicParams->tiles_enabled_flag)
    {
        return MOS_STATUS_SUCCESS;
    }

    auto tileRowCount    = m_hevcPicParams->num_tile_rows_minus1 + 1;
    auto tileColumnCount = m_hevcPicParams->num_tile_columns_minus1 + 1;
    auto tileCount       = tileRowCount * tileColumnCount;

    MOS_ZeroMemory(&m_hevcFrameStatsOffset, sizeof(HEVC_TILE_STATS_INFO));
    MOS_ZeroMemory(&m_hevcTileStatsOffset, sizeof(HEVC_TILE_STATS_INFO));
    MOS_ZeroMemory(&m_hevcStatsSize, sizeof(HEVC_TILE_STATS_INFO));

    // Lock flags shared by all three zero-fill passes below
    MOS_LOCK_PARAMS writeOnlyLock;
    MOS_ZeroMemory(&writeOnlyLock, sizeof(MOS_LOCK_PARAMS));
    writeOnlyLock.WriteOnly = true;

    // Set the maximum size based on frame level statistics.
    m_hevcStatsSize.uiTileSizeRecord     = CODECHAL_CACHELINE_SIZE;
    m_hevcStatsSize.uiHevcPakStatistics  = m_sizeOfHcpPakFrameStats;
    m_hevcStatsSize.uiVdencStatistics    = 0;
    m_hevcStatsSize.uiHevcSliceStreamout = CODECHAL_CACHELINE_SIZE;

    // Offsets used for patching addresses into the HuC Pak Integration kernel Aggregated Frame Statistics Output Buffer.
    // Each offset is page aligned because the combined region is fed into different page aligned HuC regions.
    m_hevcFrameStatsOffset.uiTileSizeRecord     = 0;  // Tile Size Record is not present in resHuCPakAggregatedFrameStatsBuffer
    m_hevcFrameStatsOffset.uiHevcPakStatistics  = 0;
    m_hevcFrameStatsOffset.uiVdencStatistics    = MOS_ALIGN_CEIL(m_hevcFrameStatsOffset.uiHevcPakStatistics + m_hevcStatsSize.uiHevcPakStatistics, CODECHAL_PAGE_SIZE);
    m_hevcFrameStatsOffset.uiHevcSliceStreamout = MOS_ALIGN_CEIL(m_hevcFrameStatsOffset.uiVdencStatistics + m_hevcStatsSize.uiVdencStatistics, CODECHAL_PAGE_SIZE);

    // Frame level statistics
    m_hwInterface->m_pakIntAggregatedFrameStatsSize = MOS_ALIGN_CEIL(m_hevcFrameStatsOffset.uiHevcSliceStreamout + (m_hevcStatsSize.uiHevcSliceStreamout * CODECHAL_HEVC_MAX_NUM_SLICES_LVL_6), CODECHAL_PAGE_SIZE);

    // HEVC Frame Statistics Buffer - Output from HuC PAK Integration kernel (allocated once)
    auto &aggregatedStats = m_resHuCPakAggregatedFrameStatsBuffer;
    if (Mos_ResourceIsNull(&aggregatedStats.sResource))
    {
        MOS_ALLOC_GFXRES_PARAMS allocParams;
        MOS_ZeroMemory(&allocParams, sizeof(MOS_ALLOC_GFXRES_PARAMS));
        allocParams.Type     = MOS_GFXRES_BUFFER;
        allocParams.TileType = MOS_TILE_LINEAR;
        allocParams.Format   = Format_Buffer;
        allocParams.dwBytes  = m_hwInterface->m_pakIntAggregatedFrameStatsSize;
        allocParams.pBufName = "GEN11 HCP Aggregated Frame Statistics Streamout Buffer";

        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_osInterface->pfnAllocateResource(
            m_osInterface,
            &allocParams,
            &aggregatedStats.sResource));
        aggregatedStats.dwSize = m_hwInterface->m_pakIntAggregatedFrameStatsSize;

        // Start from an all-zero buffer
        uint8_t *mapped = (uint8_t *)m_osInterface->pfnLockResource(
            m_osInterface,
            &aggregatedStats.sResource,
            &writeOnlyLock);
        CODECHAL_ENCODE_CHK_NULL_RETURN(mapped);

        MOS_ZeroMemory(mapped, allocParams.dwBytes);
        m_osInterface->pfnUnlockResource(m_osInterface, &aggregatedStats.sResource);
    }

    // Offsets used for patching addresses into the Tile Based Statistics Buffer.
    // Each offset is page aligned because the combined region is fed into different page aligned HuC regions.
    m_hevcTileStatsOffset.uiTileSizeRecord     = 0;  // the tile record lives in a separate resource
    m_hevcTileStatsOffset.uiHevcPakStatistics  = 0;  // PAK statistics are at the head of m_resTileBasedStatisticsBuffer
    m_hevcTileStatsOffset.uiVdencStatistics    = MOS_ALIGN_CEIL(m_hevcTileStatsOffset.uiHevcPakStatistics + (m_hevcStatsSize.uiHevcPakStatistics * tileCount), CODECHAL_PAGE_SIZE);
    m_hevcTileStatsOffset.uiHevcSliceStreamout = MOS_ALIGN_CEIL(m_hevcTileStatsOffset.uiVdencStatistics + (m_hevcStatsSize.uiVdencStatistics * tileCount), CODECHAL_PAGE_SIZE);
    // Combined statistics size for all tiles
    m_hwInterface->m_pakIntTileStatsSize = MOS_ALIGN_CEIL(m_hevcTileStatsOffset.uiHevcSliceStreamout + m_hevcStatsSize.uiHevcSliceStreamout * CODECHAL_HEVC_MAX_NUM_SLICES_LVL_6, CODECHAL_PAGE_SIZE);

    // Tile size record size for all tiles
    m_hwInterface->m_tileRecordSize = m_hevcStatsSize.uiTileSizeRecord * tileCount;

    // Tile level statistics buffer: (re)allocate when missing or smaller than required
    auto &     tileStats        = m_resTileBasedStatisticsBuffer[m_virtualEngineBbIndex];
    const bool tileStatsMissing = Mos_ResourceIsNull(&tileStats.sResource);
    if (tileStatsMissing || tileStats.dwSize < m_hwInterface->m_pakIntTileStatsSize)
    {
        if (!tileStatsMissing)
        {
            m_osInterface->pfnFreeResource(m_osInterface, &tileStats.sResource);
        }

        MOS_ALLOC_GFXRES_PARAMS allocParams;
        MOS_ZeroMemory(&allocParams, sizeof(MOS_ALLOC_GFXRES_PARAMS));
        allocParams.Type     = MOS_GFXRES_BUFFER;
        allocParams.TileType = MOS_TILE_LINEAR;
        allocParams.Format   = Format_Buffer;
        allocParams.dwBytes  = m_hwInterface->m_pakIntTileStatsSize;
        allocParams.pBufName = "GEN11 HCP Tile Level Statistics Streamout Buffer";

        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_osInterface->pfnAllocateResource(
            m_osInterface,
            &allocParams,
            &tileStats.sResource));
        tileStats.dwSize = m_hwInterface->m_pakIntTileStatsSize;

        uint8_t *mapped = (uint8_t *)m_osInterface->pfnLockResource(
            m_osInterface,
            &tileStats.sResource,
            &writeOnlyLock);
        CODECHAL_ENCODE_CHK_NULL_RETURN(mapped);

        MOS_ZeroMemory(mapped, allocParams.dwBytes);
        m_osInterface->pfnUnlockResource(m_osInterface, &tileStats.sResource);
    }

    // Tile record buffer: (re)allocate when missing or smaller than required
    auto &     tileRecord        = m_tileRecordBuffer[m_virtualEngineBbIndex];
    const bool tileRecordMissing = Mos_ResourceIsNull(&tileRecord.sResource);
    if (tileRecordMissing || tileRecord.dwSize < m_hwInterface->m_tileRecordSize)
    {
        if (!tileRecordMissing)
        {
            m_osInterface->pfnFreeResource(m_osInterface, &tileRecord.sResource);
        }

        MOS_ALLOC_GFXRES_PARAMS allocParams;
        MOS_ZeroMemory(&allocParams, sizeof(MOS_ALLOC_GFXRES_PARAMS));
        allocParams.Type     = MOS_GFXRES_BUFFER;
        allocParams.TileType = MOS_TILE_LINEAR;
        allocParams.Format   = Format_Buffer;
        allocParams.dwBytes  = m_hwInterface->m_tileRecordSize;
        allocParams.pBufName = "Tile Record Buffer";

        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_osInterface->pfnAllocateResource(
            m_osInterface,
            &allocParams,
            &tileRecord.sResource));
        tileRecord.dwSize = m_hwInterface->m_tileRecordSize;

        uint8_t *mapped = (uint8_t *)m_osInterface->pfnLockResource(
            m_osInterface,
            &tileRecord.sResource,
            &writeOnlyLock);
        CODECHAL_ENCODE_CHK_NULL_RETURN(mapped);

        MOS_ZeroMemory(mapped, allocParams.dwBytes);
        m_osInterface->pfnUnlockResource(m_osInterface, &tileRecord.sResource);
    }

    return MOS_STATUS_SUCCESS;
}

//!
//! \brief    Fill the HCP pipe buffer address parameters, adding the GEN12 specific buffers
//! \details  Runs the CodechalEncodeHevcBase setup first, then attaches the SAO row store
//!           buffer. When more than one pipe is in use and the tile based statistics buffer
//!           for the current virtual engine index is allocated, the LCU and frame statistics
//!           stream-out targets are pointed into that buffer at the offsets kept in
//!           m_hevcTileStatsOffset.
//! \param    [in,out] pipeBufAddrParams
//!           Pipe buffer address parameters to update
//!
void CodechalEncHevcStateG12::SetHcpPipeBufAddrParams(MHW_VDBOX_PIPE_BUF_ADDR_PARAMS &pipeBufAddrParams)
{
    CODECHAL_ENCODE_FUNCTION_ENTER;

    CodechalEncodeHevcBase::SetHcpPipeBufAddrParams(pipeBufAddrParams);

    // SAO Row Store is GEN12 specific
    pipeBufAddrParams.presSaoRowStoreBuffer = &m_SAORowStoreBuffer;

    PMOS_RESOURCE tileStatsResource = &m_resTileBasedStatisticsBuffer[m_virtualEngineBbIndex].sResource;
    if (Mos_ResourceIsNull(tileStatsResource) || (m_numPipe <= 1))
    {
        // No tile statistics buffer, or single pipe: keep what the base class programmed.
        return;
    }

    // Both stream-outs share the tile statistics resource and differ only by offset.
    pipeBufAddrParams.presLcuBaseAddressBuffer     = tileStatsResource;
    pipeBufAddrParams.presFrameStatStreamOutBuffer = tileStatsResource;
    pipeBufAddrParams.dwLcuStreamOutOffset         = m_hevcTileStatsOffset.uiHevcSliceStreamout;
    pipeBufAddrParams.dwFrameStatStreamOutOffset   = m_hevcTileStatsOffset.uiHevcPakStatistics;
}

//!
//! \brief    Add MI copy commands that move the SSE values into the encode status buffer
//! \details  Does nothing when SSE is disabled. Otherwise six consecutive DWORDs starting at
//!           DW HEVC_PAK_STATISTICS_SSE_OFFSET of the frame statistics are copied, one
//!           MI_COPY_MEM_MEM command per DWORD, into the current encode status report.
//! \param    [in] cmdBuffer
//!           Command buffer the copy commands are added to
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, else the failure reported while adding a command
//!
MOS_STATUS CodechalEncHevcStateG12::ReadSseStatistics(PMOS_COMMAND_BUFFER cmdBuffer)
{
    CODECHAL_ENCODE_FUNCTION_ENTER;

    if (!m_sseEnabled)
    {
        return MOS_STATUS_SUCCESS;
    }

    // encodeStatus is offset by 2 DWs in the resource
    const uint32_t sseDstBaseInBytes = (m_encodeStatusBuf.wCurrIndex * m_encodeStatusBuf.dwReportSize) + sizeof(uint32_t) * 2 + m_encodeStatusBuf.dwSumSquareErrorOffset;

    // With tiles on several pipes the values are read from the HuC PAK aggregated frame
    // statistics buffer; otherwise from the frame statistics stream-out buffer.
    const bool    readAggregatedStats = m_hevcPicParams->tiles_enabled_flag && (m_numPipe > 1);
    PMOS_RESOURCE sseSource           = readAggregatedStats ? &m_resHuCPakAggregatedFrameStatsBuffer.sResource : &m_resFrameStatStreamOutBuffer;

    // 64 bit SSE values for luma/ chroma channels need to be copied
    for (uint32_t dw = 0; dw < 6; dw++)
    {
        MHW_MI_COPY_MEM_MEM_PARAMS copyParams;
        MOS_ZeroMemory(&copyParams, sizeof(copyParams));

        // SSE luma offset is located at DW32 in Frame statistics, followed by chroma
        copyParams.presSrc     = sseSource;
        copyParams.dwSrcOffset = (HEVC_PAK_STATISTICS_SSE_OFFSET + dw) * sizeof(uint32_t);
        copyParams.presDst     = &m_encodeStatusBuf.resStatusBuffer;
        copyParams.dwDstOffset = sseDstBaseInBytes + dw * sizeof(uint32_t);

        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_miInterface->AddMiCopyMemMemCmd(cmdBuffer, &copyParams));
    }

    return MOS_STATUS_SUCCESS;
}

//!
//! \brief    Fill the HCP indirect object base address parameters
//! \details  The structure is zeroed and then programmed with the MV object buffer (the skip
//!           frame surface on a panic mode pass, the MB code surface otherwise), the PAK
//!           bitstream buffer and, when the tile record buffer for the current virtual engine
//!           index is allocated, the PAK tile size record buffer.
//! \param    [out] indObjBaseAddrParams
//!           Indirect object base address parameters to fill
//!
void CodechalEncHevcStateG12::SetHcpIndObjBaseAddrParams(MHW_VDBOX_IND_OBJ_BASE_ADDR_PARAMS &indObjBaseAddrParams)
{
    PMOS_RESOURCE tileRecordResource = &m_tileRecordBuffer[m_virtualEngineBbIndex].sResource;
    const bool    hasTileRecord      = !Mos_ResourceIsNull(tileRecordResource);

    MOS_ZeroMemory(&indObjBaseAddrParams, sizeof(indObjBaseAddrParams));

    indObjBaseAddrParams.Mode = CODECHAL_ENCODE_MODE_HEVC;

    // MV object
    if (IsPanicModePass())
    {
        indObjBaseAddrParams.presMvObjectBuffer = &m_skipFrameInfo.m_resMbCodeSkipFrameSurface;
    }
    else
    {
        indObjBaseAddrParams.presMvObjectBuffer = &m_resMbCodeSurface;
    }
    indObjBaseAddrParams.dwMvObjectOffset = m_mvOffset;
    indObjBaseAddrParams.dwMvObjectSize   = m_mbCodeSize - m_mvOffset;

    // PAK bitstream
    indObjBaseAddrParams.presPakBaseObjectBuffer = &m_resBitstreamBuffer;
    indObjBaseAddrParams.dwPakBaseObjectSize     = m_bitstreamUpperBound;

    // PAK tile size record
    if (hasTileRecord)
    {
        indObjBaseAddrParams.presPakTileSizeStasBuffer   = tileRecordResource;
        indObjBaseAddrParams.dwPakTileSizeStasBufferSize = m_hwInterface->m_tileRecordSize;
        indObjBaseAddrParams.dwPakTileSizeRecordOffset   = m_hevcTileStatsOffset.uiTileSizeRecord;
    }
    else
    {
        indObjBaseAddrParams.presPakTileSizeStasBuffer   = nullptr;
        indObjBaseAddrParams.dwPakTileSizeStasBufferSize = 0;
        indObjBaseAddrParams.dwPakTileSizeRecordOffset   = 0;
    }
}

//!
//! \brief    Reset the virtual engine attributes attached to a command buffer
//! \details  Only acts when virtual engine is supported and the command buffer carries a VE
//!           attribute block. That block is cleared, then bUseVirtualEngineHint and
//!           VEngineHintParams.NeedSyncWithPrevious are both set to !renderEngineInUse.
//! \param    [in,out] cmdBuffer
//!           Command buffer whose VE attributes are updated; must not be null when virtual
//!           engine is supported
//! \param    [in] renderEngineInUse
//!           True when the command buffer targets the render engine
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, else the null pointer status produced by
//!           CODECHAL_ENCODE_CHK_NULL_RETURN
//!
MOS_STATUS CodechalEncHevcStateG12::UpdateCmdBufAttribute(
    PMOS_COMMAND_BUFFER cmdBuffer,
    bool                renderEngineInUse)
{
    // NOTE (kept from the original author): should not be there. Will remove it in the next change
    CODECHAL_ENCODE_FUNCTION_ENTER;

    if (!MOS_VE_SUPPORTED(m_osInterface))
    {
        // Without virtual engine nothing is touched, so cmdBuffer is not validated here.
        return MOS_STATUS_SUCCESS;
    }

    // cmdBuffer is dereferenced below; fail cleanly instead of crashing on a null pointer.
    CODECHAL_ENCODE_CHK_NULL_RETURN(cmdBuffer);

    if (cmdBuffer->Attributes.pAttriVe)
    {
        PMOS_CMD_BUF_ATTRI_VE attriExt =
            (PMOS_CMD_BUF_ATTRI_VE)(cmdBuffer->Attributes.pAttriVe);

        memset(attriExt, 0, sizeof(MOS_CMD_BUF_ATTRI_VE));

        const bool useVeHint                             = !renderEngineInUse;
        attriExt->VEngineHintParams.NeedSyncWithPrevious = useVeHint;
        attriExt->bUseVirtualEngineHint                  = useVeHint;
    }

    return MOS_STATUS_SUCCESS;
}

//!
//! \brief    Set the scalability hint parameters and populate them into a command buffer
//! \details  Returns immediately with success when virtual engine is not supported. With two
//!           or more pipes, the per-pipe VE batch buffers of the current virtual engine index
//!           are handed over; pass slot 0 is used when single task phase is supported, the
//!           current pass otherwise.
//! \param    [in] cmdBuffer
//!           Command buffer to populate; checked for null only after the hint parameters
//!           have been set
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, MOS_STATUS_INVALID_PARAMETER when the current pass
//!           is outside [0, CODECHAL_HEVC_MAX_NUM_BRC_PASSES), else the failure of a callee
//!
MOS_STATUS CodechalEncHevcStateG12::SetAndPopulateVEHintParams(
    PMOS_COMMAND_BUFFER cmdBuffer)
{
    CODECHAL_ENCODE_FUNCTION_ENTER;

    if (!MOS_VE_SUPPORTED(m_osInterface))
    {
        return MOS_STATUS_SUCCESS;
    }

    CODECHAL_ENCODE_SCALABILITY_SETHINT_PARMS hintParams;
    MOS_ZeroMemory(&hintParams, sizeof(hintParams));

    // Explicit sync with the previous submission is requested only when context based
    // scheduling is not supported.
    if (!MOS_VE_CTXBASEDSCHEDULING_SUPPORTED(m_osInterface))
    {
        hintParams.bNeedSyncWithPrevious = true;
    }

    if (m_numPipe >= 2)
    {
        const int32_t pass = GetCurrentPass();
        if (pass < 0 || pass >= CODECHAL_HEVC_MAX_NUM_BRC_PASSES)
        {
            return MOS_STATUS_INVALID_PARAMETER;
        }

        const uint8_t batchBufferPass = static_cast<uint8_t>(m_singleTaskPhaseSupported ? 0 : pass);
        for (int32_t pipe = 0; pipe < m_numPipe; ++pipe)
        {
            hintParams.veBatchBuffer[pipe] = m_veBatchBuffer[m_virtualEngineBbIndex][pipe][batchBufferPass].OsResource;
        }
    }

    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalEncodeScalability_SetHintParams(this, m_scalabilityState, &hintParams));
    CODECHAL_ENCODE_CHK_NULL_RETURN(cmdBuffer);
    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalEncodeScalability_PopulateHintParams(m_scalabilityState, cmdBuffer));

    return MOS_STATUS_SUCCESS;
}

//!
//! \brief    Add the MEDIA_VFE_STATE command for a kernel to the command buffer
//! \param    [in] cmdBuffer
//!           Command buffer the command is added to
//! \param    [in] params
//!           Kernel command parameters; only pKernelState is read here
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, else fail reason
//!
MOS_STATUS CodechalEncHevcStateG12::AddMediaVfeCmd(
    PMOS_COMMAND_BUFFER   cmdBuffer,
    SendKernelCmdsParams *params)
{
    CODECHAL_ENCODE_CHK_NULL_RETURN(params);

    MHW_VFE_PARAMS_G12 vfe = {};

    vfe.pKernelState             = params->pKernelState;
    vfe.dwMaximumNumberofThreads = m_encodeVfeMaxThreads;
    vfe.eVfeSliceDisable         = MHW_VFE_SLICE_ALL;
    // legacy mode
    vfe.bFusedEuDispatch = false;

    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_renderEngineInterface->AddMediaVfeCmd(cmdBuffer, &vfe));

    return MOS_STATUS_SUCCESS;
}

#if USE_CODECHAL_DEBUG_TOOL
//!
//! \brief    Dump the HCP PAK frame statistics through the debug interface
//! \details  With a single pipe the frame statistics stream-out buffer is dumped from offset
//!           0. With several pipes the tile based statistics buffer of the current virtual
//!           engine index is dumped instead, starting at the PAK statistics offset and
//!           covering one statistics record per tile.
//! \param    [in] debugInterface
//!           Debug interface used for the dump; must not be null
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, else fail reason
//!
MOS_STATUS CodechalEncHevcStateG12::DumpFrameStatsBuffer(CodechalDebugInterface *debugInterface)
{
    CODECHAL_ENCODE_CHK_NULL_RETURN(debugInterface);

    PMOS_RESOURCE statsResource = nullptr;
    uint32_t      dumpOffset    = 0;
    uint32_t      tileCount     = 1;

    if (m_numPipe > 1)
    {
        //In scalable mode, HEVC PAK Frame Statistics gets dumped out for each tile
        statsResource = &m_resTileBasedStatisticsBuffer[m_virtualEngineBbIndex].sResource;
        dumpOffset    = m_hevcTileStatsOffset.uiHevcPakStatistics;
        tileCount     = (m_hevcPicParams->num_tile_rows_minus1 + 1) * (m_hevcPicParams->num_tile_columns_minus1 + 1);
    }
    else
    {
        statsResource = &m_resFrameStatStreamOutBuffer;
    }

    const uint32_t dumpSize = MOS_ALIGN_CEIL(m_sizeOfHcpPakFrameStats * tileCount, CODECHAL_CACHELINE_SIZE);

    CODECHAL_ENCODE_CHK_STATUS_RETURN(debugInterface->DumpBuffer(
        statsResource,
        CodechalDbgAttr::attrFrameState,
        "FrameStatus",
        dumpSize,
        dumpOffset,
        CODECHAL_NUM_MEDIA_STATES));

    return MOS_STATUS_SUCCESS;
}

//!
//! \brief    Dump the PAK output buffers of the current pass through m_debugInterface
//! \details  Every dump is named "PAK_PASS<m_currPass>". In order, the following are dumped:
//!           the PAK CU level stream-out data, the tile based statistics buffer of the current
//!           virtual engine index, the BRC PAK statistics buffer currently selected for
//!           write, the HuC stitch command batch buffer, the HuC stitch data buffer and the
//!           HuC PAK stitch DMEM of the current recycled buffer index and pass.
//!           NOTE(review): the value returned by GetCurrentPass() indexes the HuC stitch
//!           arrays without a range check in this function - confirm it is always in range.
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success; a failing dump is presumably propagated by
//!           CODECHAL_ENCODE_CHK_STATUS_RETURN
//!
MOS_STATUS CodechalEncHevcStateG12::DumpPakOutput()
{
    // Common name attached to every buffer dumped for this pass.
    std::string currPassName = "PAK_PASS" + std::to_string((int)m_currPass);

    // All dump calls are passed as a single argument to CODECHAL_DEBUG_TOOL.
    CODECHAL_DEBUG_TOOL(
        int32_t currentPass = GetCurrentPass();
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpBuffer(
            &m_resPakcuLevelStreamoutData.sResource,
            CodechalDbgAttr::attrCUStreamout,
            currPassName.data(),
            m_resPakcuLevelStreamoutData.dwSize,
            0,
            CODECHAL_NUM_MEDIA_STATES));
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpBuffer(
            &m_resTileBasedStatisticsBuffer[m_virtualEngineBbIndex].sResource,
            CodechalDbgAttr::attrTileBasedStats,
            currPassName.data(),
            m_resTileBasedStatisticsBuffer[m_virtualEngineBbIndex].dwSize,
            0,
            CODECHAL_NUM_MEDIA_STATES));
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpBuffer(
            &m_brcBuffers.resBrcPakStatisticBuffer[m_brcBuffers.uiCurrBrcPakStasIdxForWrite],
            CodechalDbgAttr::attrBrcPakStats,
            currPassName.data(),
            m_hevcBrcPakStatisticsSize,
            0,
            CODECHAL_NUM_MEDIA_STATES));
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpBuffer(
            &m_HucStitchCmdBatchBuffer.OsResource,
            CodechalDbgAttr::attr2ndLvlBatchMfx,
            currPassName.data(),
            m_hwInterface->m_HucStitchCmdBatchBufferSize,
            0,
            CODECHAL_NUM_MEDIA_STATES));
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpBuffer(
            &m_resHucStitchDataBuffer[m_currRecycledBufIdx][currentPass],
            CodechalDbgAttr::attrHuCStitchDataBuf,
            currPassName.data(),
            sizeof(HucCommandData),
            0,
            CODECHAL_NUM_MEDIA_STATES));
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_debugInterface->DumpHucDmem(
            &m_resHucPakStitchDmemBuffer[m_currRecycledBufIdx][currentPass],
            sizeof(HucPakStitchDmemEncG12),
            currentPass,
            hucRegionDumpPakIntegrate));)

    return MOS_STATUS_SUCCESS;
}
#endif

//!
//! \brief    Run the HME kernel for every enabled down-scaling level
//! \details  Nothing is executed unless an HME kernel object exists and 4x ME is enabled.
//!           Levels are executed from the coarsest to the finest: 32x (only when 16x is also
//!           enabled), 16x, then 4x. The same curbe/surface parameter objects are reused and
//!           re-programmed with the per-level dimensions before each Execute call.
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, else fail reason
//!
MOS_STATUS CodechalEncHevcStateG12::EncodeMeKernel()
{
    CODECHAL_ENCODE_FUNCTION_ENTER;

    if (!m_hmeKernel || !m_hmeKernel->Is4xMeEnabled())
    {
        return MOS_STATUS_SUCCESS;
    }

    CodechalKernelHme::CurbeParam meCurbe;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(SetMeCurbeParams(meCurbe));

    CodechalKernelHme::SurfaceParams meSurfaces;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(SetMeSurfaceParams(meSurfaces));

    m_hmeKernel->setnoMEKernelForPFrame(m_lowDelay);

    if (m_hmeKernel->Is16xMeEnabled())
    {
        // 32x level, only reachable when 16x is enabled as well
        if (m_hmeKernel->Is32xMeEnabled())
        {
            meSurfaces.downScaledWidthInMb         = m_downscaledWidthInMb32x;
            meSurfaces.downScaledHeightInMb        = m_downscaledFrameFieldHeightInMb32x;
            meSurfaces.downScaledBottomFieldOffset = m_scaled32xBottomFieldOffset;
            CODECHAL_ENCODE_CHK_STATUS_RETURN(
                m_hmeKernel->Execute(meCurbe, meSurfaces, CodechalKernelHme::HmeLevel::hmeLevel32x));
        }

        // 16x level
        meSurfaces.downScaledWidthInMb         = m_downscaledWidthInMb16x;
        meSurfaces.downScaledHeightInMb        = m_downscaledFrameFieldHeightInMb16x;
        meSurfaces.downScaledBottomFieldOffset = m_scaled16xBottomFieldOffset;
        CODECHAL_ENCODE_CHK_STATUS_RETURN(
            m_hmeKernel->Execute(meCurbe, meSurfaces, CodechalKernelHme::HmeLevel::hmeLevel16x));
    }

    // 4x level: additionally gets the ME BRC distortion surface and the MV sum threshold
    meSurfaces.downScaledWidthInMb         = m_downscaledWidthInMb4x;
    meSurfaces.downScaledHeightInMb        = m_downscaledFrameFieldHeightInMb4x;
    meSurfaces.downScaledBottomFieldOffset = m_scaledBottomFieldOffset;
    meSurfaces.meBrcDistortionSurface      = m_brcBuffers.meBrcDistortionSurface;
    meCurbe.sumMVThreshold                 = m_sumMVThreshold;

    // The 4x execution is flagged as the last task in the phase before it is run.
    m_lastTaskInPhase = true;

    CODECHAL_ENCODE_CHK_STATUS_RETURN(
        m_hmeKernel->Execute(meCurbe, meSurfaces, CodechalKernelHme::HmeLevel::hmeLevel4x));

    return MOS_STATUS_SUCCESS;
}

//!
//! \brief    Re-calculate the m_encBCombinedBuffer2 sub-buffer sizes and offsets
//! \details  The number of 64x64 LCUs is derived from the max-LCU aligned frame dimensions.
//!           The history-out buffer takes 32 bytes and the thread-task buffer 96 bytes per
//!           64x64 LCU, each rounded up to a cache line. The history-out buffer starts right
//!           after the cache-line aligned MBENC_COMBINED_BUFFER2 structure and the
//!           thread-task buffer follows the history-out buffer.
//!
void CodechalEncHevcStateG12::ResizeBufferOffset()
{
    CODECHAL_ENCODE_FUNCTION_ENTER;

    const uint32_t numLcu64 = m_widthAlignedMaxLcu * m_heightAlignedMaxLcu / 64 / 64;

    // Only the size of the fixed part is needed, so no MBENC_COMBINED_BUFFER2 object is created.
    const uint32_t fixedBufAlignedSize =
        static_cast<uint32_t>(MOS_ALIGN_CEIL(sizeof(MBENC_COMBINED_BUFFER2), CODECHAL_CACHELINE_SIZE));

    //Re-Calculate m_encBCombinedBuffer2 Size and Offsets
    m_historyOutBufferSize = MOS_ALIGN_CEIL(32 * numLcu64, CODECHAL_CACHELINE_SIZE);
    m_threadTaskBufferSize = MOS_ALIGN_CEIL(96 * numLcu64, CODECHAL_CACHELINE_SIZE);

    m_historyOutBufferOffset = fixedBufAlignedSize;
    m_threadTaskBufferOffset = m_historyOutBufferOffset + m_historyOutBufferSize;
}
