/*
* Copyright (c) 2017-2020, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
//!
//! \file     codechal_encode_csc_ds_g12.cpp
//! \brief    This file implements the Csc+Ds feature for all codecs on Gen12 platform
//!

#include "codechal_encoder_base.h"
#include "codechal_encode_csc_ds_g12.h"
#include "codechal_encode_sfc_g12.h"
#include "codechal_kernel_header_g12.h"
#include "codeckrnheader.h"
#if defined(ENABLE_KERNELS) && !defined(_FULL_OPEN_SOURCE)
#include "igcodeckrn_g12.h"
#endif
#if USE_CODECHAL_DEBUG_TOOL
#include "codechal_debug_encode_par_g12.h"
#endif

//!
//! \brief    Reports the number of binding table entries used by the CSC kernel
//! \return   uint8_t
//!           cscNumSurfaces narrowed to uint8_t
//!
uint8_t CodechalEncodeCscDsG12::GetBTCount() const
{
    const uint8_t btEntries = static_cast<uint8_t>(cscNumSurfaces);
    return btEntries;
}

//!
//! \brief    Allocates the CSC surfaces through the base class, then the MB statistics buffer
//! \details  The MB statistics buffer is only allocated while m_resMbStatsBuffer is still null.
//!           Its byte size is also stored in m_hwInterface->m_avcMbStatBufferSize.
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, else the status of the failing step
//!
MOS_STATUS CodechalEncodeCscDsG12::AllocateSurfaceCsc()
{
    CODECHAL_ENCODE_FUNCTION_ENTER;

    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodechalEncodeCscDs::AllocateSurfaceCsc());

    // request the device memory pool only when WaForceAllocateLML4 is set
    MEDIA_WA_TABLE* waTable = m_osInterface->pfnGetWaTable(m_osInterface);
    uint32_t memType = 0;
    if (MEDIA_IS_WA(waTable, WaForceAllocateLML4))
    {
        memType = MOS_MEMPOOL_DEVICEMEMORY;
    }

    // the MbStats surface already exists, nothing left to allocate
    if (!Mos_ResourceIsNull(&m_resMbStatsBuffer))
    {
        return eStatus;
    }

    // size = (MB columns, rounded up to 64) * (MB rows, rounded up to 64) << 6, rounded up to 1024 bytes
    // NOTE(review): the height is also converted with CODECHAL_GET_WIDTH_IN_MACROBLOCKS - confirm this is intended
    const uint32_t mbColumns = MOS_ALIGN_CEIL(CODECHAL_GET_WIDTH_IN_MACROBLOCKS(m_encoder->m_oriFrameWidth), 64);
    const uint32_t mbRows = MOS_ALIGN_CEIL(CODECHAL_GET_WIDTH_IN_MACROBLOCKS(m_encoder->m_oriFrameHeight), 64);
    m_hwInterface->m_avcMbStatBufferSize = MOS_ALIGN_CEIL((mbColumns * mbRows << 6) , 1024);

    // describe the MbStats surface as a linear buffer
    MOS_ALLOC_GFXRES_PARAMS mbStatsAllocParams;
    MOS_ZeroMemory(&mbStatsAllocParams, sizeof(MOS_ALLOC_GFXRES_PARAMS));
    mbStatsAllocParams.Type = MOS_GFXRES_BUFFER;
    mbStatsAllocParams.TileType = MOS_TILE_LINEAR;
    mbStatsAllocParams.Format = Format_Buffer;
    mbStatsAllocParams.dwBytes = m_hwInterface->m_avcMbStatBufferSize;
    mbStatsAllocParams.pBufName = "MB Statistics Buffer";
    mbStatsAllocParams.dwMemType = memType;

    CODECHAL_ENCODE_CHK_STATUS_MESSAGE_RETURN(m_osInterface->pfnAllocateResource(
        m_osInterface,
        &mbStatsAllocParams,
        &m_resMbStatsBuffer), "Failed to allocate  MB Statistics Buffer.");

    return eStatus;
}

//!
//! \brief    Classifies the raw input format and records which CSC work it needs
//! \details  Sets m_colorRawSurface and, depending on the format, m_cscRequireColor,
//!           m_cscRequireConvTo8bPlanar and m_cscUsingSfc.
//!           The cases from Format_AYUV down to Format_YVYU only break out when VDEnc is
//!           enabled; otherwise control falls through every following case into
//!           Format_P210 and then default, which reports the format as unsupported.
//! \param    [in] format
//!           Format of the raw surface
//! \param    [in] tileType
//!           Tile type of the raw surface; only consulted for Format_Y210
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS for a handled format,
//!           MOS_STATUS_PLATFORM_NOT_SUPPORTED for non-TileY Y210 when VDEnc is enabled and
//!           WaHEVCVDEncY210LinearInputNotSupported is set,
//!           MOS_STATUS_INVALID_PARAMETER for any format that reaches default
//!
MOS_STATUS CodechalEncodeCscDsG12::CheckRawColorFormat(MOS_FORMAT format, MOS_TILE_TYPE tileType)
{
    CODECHAL_ENCODE_FUNCTION_ENTER;

    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    // check input color format, and set target traverse thread space size
    switch (format)
    {
    case Format_NV12:
        m_colorRawSurface = cscColorNv12Linear;
        m_cscRequireColor = 1;
        break;
    case Format_YUY2:
    case Format_YUYV:
        m_colorRawSurface = cscColorYUY2;
        // color conversion when the output is 4:2:0, 8-bit planar conversion when it is 4:2:2
        m_cscRequireColor = (uint8_t)HCP_CHROMA_FORMAT_YUV420 == m_outputChromaFormat;
        m_cscRequireConvTo8bPlanar = (uint8_t)HCP_CHROMA_FORMAT_YUV422 == m_outputChromaFormat;
        break;
    case Format_A8R8G8B8:
    case Format_X8R8G8B8:
        m_colorRawSurface = cscColorARGB;
        m_cscUsingSfc = IsSfcEnabled() ? 1 : 0;
        m_cscRequireColor = 1;
        //Use EU for better performance in big resolution cases
        // (SFC stays selected above 1920x1088 only when Wa_1409932735 is set)
        if (m_cscRawSurfWidth * m_cscRawSurfHeight > 1920 * 1088
            && !MEDIA_IS_WA(m_hwInterface->GetWaTable(), Wa_1409932735))
        {
            m_cscUsingSfc = 0;
        }
        break;
    case Format_A8B8G8R8:
        m_colorRawSurface = cscColorABGR;
        m_cscRequireColor = 1;
        break;
    case Format_P010:
    case Format_P016:
        m_colorRawSurface = cscColorP010;
        m_cscRequireConvTo8bPlanar = 1;
        break;
    case Format_Y210:
        // with VDEnc and WaHEVCVDEncY210LinearInputNotSupported, only TileY Y210 is accepted
        if (m_encoder->m_vdencEnabled && MEDIA_IS_WA(m_encoder->m_waTable, WaHEVCVDEncY210LinearInputNotSupported))
        {
            if (tileType == MOS_TILE_Y)
            {
                m_colorRawSurface = cscColorY210;
                m_cscRequireConvTo8bPlanar = 1;
            }
            else
            {
                CODECHAL_ENCODE_ASSERTMESSAGE("Input color format Y210 Linear not yet supported!");
                eStatus = MOS_STATUS_PLATFORM_NOT_SUPPORTED;
            }
        }
        else
        {
            m_colorRawSurface = cscColorY210;
            m_cscRequireConvTo8bPlanar = 1;
        }
        break;
    case Format_Y216:
        m_colorRawSurface = cscColorY210;
        m_cscRequireConvTo8bPlanar = 1;
        break;
    case Format_AYUV:
        if (m_encoder->m_vdencEnabled)
        {
            m_colorRawSurface = cscColorAYUV;
            m_cscRequireColor = 1;
            break;
        }
        // no break: without VDEnc, falls through to the next case
    case Format_R10G10B10A2:
        if (m_encoder->m_vdencEnabled)
        {
            m_colorRawSurface = cscColorARGB10;
            break;
        }
        // no break: without VDEnc, falls through to the next case
    case Format_B10G10R10A2:
        if (m_encoder->m_vdencEnabled)
        {
            m_colorRawSurface = cscColorABGR10;
            break;
        }
        // no break: without VDEnc, falls through to the next case
    case Format_Y410:
        if (m_encoder->m_vdencEnabled)
        {
            m_colorRawSurface = cscColorY410;
            break;
        }
        // no break: without VDEnc, falls through to the next case
    case Format_YVYU:
        if (m_encoder->m_vdencEnabled)
        {
            m_colorRawSurface = cscColorYUY2;
            m_cscRequireColor = (uint8_t)HCP_CHROMA_FORMAT_YUV420 == m_outputChromaFormat;
            m_cscRequireConvTo8bPlanar = (uint8_t)HCP_CHROMA_FORMAT_YUV422 == m_outputChromaFormat;
            break;
        }
        // no break: without VDEnc, falls through to the next case
    case Format_P210:
        // not supported yet so fall-thru to default
        // (the two members below are still written before the error is reported,
        //  also for every non-VDEnc format that fell through to here)
        m_colorRawSurface = cscColorP210;
        m_cscRequireConvTo8bPlanar = 1;
    default:
        CODECHAL_ENCODE_ASSERTMESSAGE("Input color format = %d not yet supported!", format);
        eStatus = MOS_STATUS_INVALID_PARAMETER;
        break;
    }

    return eStatus;
}

//!
//! \brief    Fills in m_cscKernelState for the ENC_SCALING_CONVERSION kernel
//! \details  Looks up the kernel header and size in m_kernelBase, populates the kernel
//!           parameters, computes the SSH / binding table sizes, passes the state to
//!           MhwInitISH and adds the aligned binding table count to m_maxBtCount.
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, else the status of the failing step
//!
MOS_STATUS CodechalEncodeCscDsG12::InitKernelStateCsc()
{
    CODECHAL_ENCODE_FUNCTION_ENTER;

    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    // cscKrnSize is seeded with the combined binary size and is read back below as this kernel's size
    auto cscKrnSize = m_combinedKernelSize;
    CODECHAL_KERNEL_HEADER cscKrnHeader;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(GetCommonKernelHeaderAndSizeG12(
        m_kernelBase,
        ENC_SCALING_CONVERSION,
        0,
        &cscKrnHeader,
        &cscKrnSize));

    auto &krnParams = m_cscKernelState->KernelParams;

    // binding table / threading / block geometry
    krnParams.iBTCount = cscNumSurfaces;
    krnParams.iThreadCount = m_hwInterface->GetRenderInterface()->GetHwCaps()->dwMaxThreads;
    krnParams.iBlockWidth = CODECHAL_MACROBLOCK_WIDTH;
    krnParams.iBlockHeight = CODECHAL_MACROBLOCK_HEIGHT;
    krnParams.iIdCount = 1;

    // curbe and inline data share the same length
    krnParams.iCurbeLength = m_cscCurbeLength;
    krnParams.iInlineDataLength = m_cscCurbeLength;
    m_cscKernelState->dwCurbeOffset = m_stateHeapInterface->GetSizeofCmdInterfaceDescriptorData();

    // kernel binary location and size inside the combined kernel
    krnParams.pBinary =
        m_kernelBase + (cscKrnHeader.KernelStartPointer << MHW_KERNEL_OFFSET_SHIFT);
    krnParams.iSize = cscKrnSize;

    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_stateHeapInterface->CalculateSshAndBtSizesRequested(
        krnParams.iBTCount,
        &m_cscKernelState->dwSshSize,
        &m_cscKernelState->dwBindingTableSize));

    CODECHAL_ENCODE_CHK_NULL_RETURN(m_renderInterface->m_stateHeapInterface);
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_hwInterface->MhwInitISH(m_renderInterface->m_stateHeapInterface, m_cscKernelState));

    // account for this kernel's binding table entries, rounded up to the BT index alignment
    m_maxBtCount += MOS_ALIGN_CEIL(cscNumSurfaces,m_renderInterface->m_stateHeapInterface->pStateHeapInterface->GetBtIdxAlignment());

    return eStatus;
}

//!
//! \brief    Derives curbe, surface and walker parameters for one CSC/DS kernel pass
//! \details  When params->stageDsConversion is dsDisabled the pass runs on the raw surface
//!           and the 2x/4x outputs are picked from m_2xScalingEnabled / m_scalingEnabled.
//!           Otherwise the pass is a 16x or 32x downscale that reads the 4x or 16x
//!           downscaled surface of the current tracked buffer.
//! \param    [in] params
//!           Kernel parameters supplied by the caller; must not be null
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, else fail reason
//!
MOS_STATUS CodechalEncodeCscDsG12::SetKernelParamsCsc(KernelParams* params)
{
    CODECHAL_ENCODE_FUNCTION_ENTER;

    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    CODECHAL_ENCODE_CHK_NULL_RETURN(params);

    m_lastTaskInPhase = params->bLastTaskInPhaseCSC;

    // defaults for the pass on the raw surface; the 16x / 32x passes override them
    auto srcWidth = m_encoder->m_frameWidth;
    auto srcHeight = m_encoder->m_frameHeight;
    auto srcSurface = m_rawSurfaceToEnc;
    auto ds4xTarget = m_encoder->m_trackedBuf->Get4xDsSurface(CODEC_CURR_TRACKED_BUFFER);
    auto ds2xTarget = m_encoder->m_trackedBuf->Get2xDsSurface(CODEC_CURR_TRACKED_BUFFER);
    auto mbStatsTarget = &m_resMbStatsBuffer;

    m_curbeParams.bHevcEncHistorySum = false;
    m_surfaceParamsCsc.hevcExtParams = nullptr;

    // stores the 16/32 unorm input surface format flags as a pair
    auto selectInputFormat = [this](bool use16Unorm, bool use32Unorm)
    {
        m_surfaceParamsCsc.bScalingInUses16UnormSurfFmt = use16Unorm;
        m_surfaceParamsCsc.bScalingInUses32UnormSurfFmt = use32Unorm;
    };

    const auto stage = params->stageDsConversion;

    if (dsDisabled != stage)
    {
        // 16x/32x downscaling: no conversion and no MB statistics output
        m_curbeParams.bConvertFlag = false;
        mbStatsTarget = nullptr;

        if (dsStage16x == stage)
        {
            m_currRefList->b16xScalingUsed = true;
            m_lastTaskInPhase = params->bLastTaskInPhase16xDS;
            m_curbeParams.downscaleStage = dsStage16x;

            srcWidth = m_encoder->m_downscaledWidth4x << 2;
            srcHeight = m_encoder->m_downscaledHeight4x << 2;

            // 4x downscaled surface in, 16x downscaled surface out on the 4x binding
            srcSurface = m_encoder->m_trackedBuf->Get4xDsSurface(CODEC_CURR_TRACKED_BUFFER);
            ds4xTarget = m_encoder->m_trackedBuf->Get16xDsSurface(CODEC_CURR_TRACKED_BUFFER);
            ds2xTarget = nullptr;
            selectInputFormat(false, true);
        }
        else if (dsStage32x == stage)
        {
            m_currRefList->b32xScalingUsed = true;
            m_lastTaskInPhase = params->bLastTaskInPhase32xDS;
            // the 32x pass is programmed as a 2x stage on the 16x surface
            m_curbeParams.downscaleStage = dsStage2x;

            srcWidth = m_encoder->m_downscaledWidth16x;
            srcHeight = m_encoder->m_downscaledHeight16x;

            // 16x downscaled surface in, 32x downscaled surface out on the 2x binding
            srcSurface = m_encoder->m_trackedBuf->Get16xDsSurface(CODEC_CURR_TRACKED_BUFFER);
            ds4xTarget = nullptr;
            ds2xTarget = m_encoder->m_trackedBuf->Get32xDsSurface(CODEC_CURR_TRACKED_BUFFER);
            selectInputFormat(true, false);
        }
    }
    else
    {
        m_curbeParams.bConvertFlag = m_cscFlag != 0;

        if (m_2xScalingEnabled && m_scalingEnabled)
        {
            // 2x and 4x outputs in the same pass
            m_curbeParams.downscaleStage = dsStage2x4x;
            m_currRefList->b2xScalingUsed = true;
            m_currRefList->b4xScalingUsed = true;
            selectInputFormat(false, false);
        }
        else if (m_2xScalingEnabled)
        {
            // 2x output only
            m_curbeParams.downscaleStage = dsStage2x;
            m_currRefList->b2xScalingUsed = true;
            ds4xTarget = nullptr;
            mbStatsTarget = nullptr;
            selectInputFormat(true, false);
        }
        else if (m_scalingEnabled)
        {
            // 4x output only
            m_curbeParams.downscaleStage = dsStage4x;
            m_currRefList->b4xScalingUsed = true;
            ds2xTarget = nullptr;
            selectInputFormat(false, true);
        }
        else
        {
            // do CSC only
            m_curbeParams.downscaleStage = dsDisabled;
            ds4xTarget = nullptr;
            ds2xTarget = nullptr;
            mbStatsTarget = nullptr;
            selectInputFormat(false, false);
        }

        // history sum to be enabled only for the 4x stage
        if (params->hevcExtParams)
        {
            auto hevcExt = (HevcExtKernelParams*)params->hevcExtParams;
            m_curbeParams.bUseLCU32 = hevcExt->bUseLCU32;
            m_curbeParams.bHevcEncHistorySum = hevcExt->bHevcEncHistorySum;
            m_surfaceParamsCsc.hevcExtParams = params->hevcExtParams;
        }
    }

    // setup Curbe
    m_curbeParams.dwInputPictureWidth = srcWidth;
    m_curbeParams.dwInputPictureHeight = srcHeight;
    m_curbeParams.bFlatnessCheckEnabled = m_flatnessCheckEnabled;
    m_curbeParams.bMBVarianceOutputEnabled = m_mbStatsEnabled;
    m_curbeParams.bMBPixelAverageOutputEnabled = m_mbStatsEnabled;
    m_curbeParams.bCscOrCopyOnly = !m_scalingEnabled || params->cscOrCopyOnly;
    m_curbeParams.inputColorSpace = params->inputColorSpace;

    // setup surface states
    m_surfaceParamsCsc.psInputSurface = srcSurface;
    if (m_curbeParams.bConvertFlag)
    {
        // the converted copy is only bound when conversion is requested
        m_surfaceParamsCsc.psOutputCopiedSurface = m_encoder->m_trackedBuf->GetCscSurface(CODEC_CURR_TRACKED_BUFFER);
    }
    else
    {
        m_surfaceParamsCsc.psOutputCopiedSurface = nullptr;
    }
    m_surfaceParamsCsc.psOutput4xDsSurface = ds4xTarget;
    m_surfaceParamsCsc.psOutput2xDsSurface = ds2xTarget;
    m_surfaceParamsCsc.presMBVProcStatsBuffer = mbStatsTarget;
    // assigned for every stage; this overrides the values stored earlier in this function
    m_surfaceParamsCsc.hevcExtParams = params->hevcExtParams;

    if (dsStage16x == stage)
    {
        // the walker resolution is computed from the input surface resolution, which in the
        // 16x stage is width/height divided by 4, because the kernel does this internally.
        srcWidth >>= 2;
        srcHeight >>= 2;
    }

    // setup walker param
    m_walkerResolutionX = CODECHAL_GET_4xDS_SIZE_32ALIGNED(srcWidth) >> 3;
    m_walkerResolutionY = CODECHAL_GET_4xDS_SIZE_32ALIGNED(srcHeight) >> 3;

    return eStatus;
}

//!
//! \brief    Builds the CSC kernel curbe and writes it into the kernel state's DSH region
//! \details  Populates bit depths, picture format, stage flags, picture size, the RGB->YUV
//!           coefficients for the P709 or P601 input color space and the binding table
//!           indices, then adds the curbe at m_cscKernelState->dwCurbeOffset.
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success,
//!           MOS_STATUS_INVALID_PARAMETER if the input color space is neither P709 nor P601,
//!           else the status returned by AddData
//!
MOS_STATUS CodechalEncodeCscDsG12::SetCurbeCsc()
{
    CODECHAL_ENCODE_FUNCTION_ENTER;

    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    CscKernelCurbeData curbe;

    // DW0: output bit depths and rounding
    curbe.DW0_OutputBitDepthForChroma = m_curbeParams.ucEncBitDepthChroma;
    curbe.DW0_OutputBitDepthForLuma = m_curbeParams.ucEncBitDepthLuma;
    curbe.DW0_RoundingEnable = 1;

    // DW1: ABGR inputs are reported as their ARGB counterpart; the B and R coefficients
    // are switched further down instead
    switch (m_colorRawSurface)
    {
    case cscColorABGR:
        curbe.DW1_PictureFormat = (uint8_t)cscColorARGB;
        break;
    case cscColorABGR10:
        curbe.DW1_PictureFormat = (uint8_t)cscColorARGB10;
        break;
    default:
        curbe.DW1_PictureFormat = (uint8_t)m_colorRawSurface;
        break;
    }

    const bool statsStage = (dsStage4x == m_curbeParams.downscaleStage || dsStage2x4x == m_curbeParams.downscaleStage);

    curbe.DW1_ConvertFlag = m_curbeParams.bConvertFlag;
    curbe.DW1_DownscaleStage = (uint8_t)m_curbeParams.downscaleStage;
    curbe.DW1_MbStatisticsDumpFlag = statsStage;
    curbe.DW1_YUY2ConversionFlag = (m_colorRawSurface == cscColorYUY2) && m_cscRequireColor;
    curbe.DW1_HevcEncHistorySum = m_curbeParams.bHevcEncHistorySum;
    curbe.DW1_LCUSize = m_curbeParams.bUseLCU32;

    // DW2: picture size
    curbe.DW2_OriginalPicWidthInSamples = m_curbeParams.dwInputPictureWidth;
    curbe.DW2_OriginalPicHeightInSamples = m_curbeParams.dwInputPictureHeight;

    // RGB->YUV CSC coefficients
    // C1/C2, C5/C6 and C9/C10 hold the same two values in either order: the first value of
    // each ternary applies to ARGB / ARGB10 surfaces, the second to everything else
    // (cscColorABGR / cscColorABGR10 per the original comment)
    const bool argbOrder = (cscColorARGB == m_colorRawSurface || cscColorARGB10 == m_colorRawSurface);

    if (ECOLORSPACE_P709 == m_curbeParams.inputColorSpace)
    {
        curbe.DW4_CSC_Coefficient_C0 = 0xFFCD;
        curbe.DW4_CSC_Coefficient_C1 = argbOrder ? 0xFFFB : 0x0038;
        curbe.DW5_CSC_Coefficient_C2 = argbOrder ? 0x0038 : 0xFFFB;
        curbe.DW5_CSC_Coefficient_C3 = 0x0080;
        curbe.DW6_CSC_Coefficient_C4 = 0x004F;
        curbe.DW6_CSC_Coefficient_C5 = argbOrder ? 0x0008 : 0x0017;
        curbe.DW7_CSC_Coefficient_C6 = argbOrder ? 0x0017 : 0x0008;
        curbe.DW7_CSC_Coefficient_C7 = 0x0010;
        curbe.DW8_CSC_Coefficient_C8 = 0xFFD5;
        curbe.DW8_CSC_Coefficient_C9 = argbOrder ? 0x0038 : 0xFFF3;
        curbe.DW9_CSC_Coefficient_C10 = argbOrder ? 0xFFF3 : 0x0038;
        curbe.DW9_CSC_Coefficient_C11 = 0x0080;
    }
    else if (ECOLORSPACE_P601 == m_curbeParams.inputColorSpace)
    {
        curbe.DW4_CSC_Coefficient_C0 = 0xFFD1;
        curbe.DW4_CSC_Coefficient_C1 = argbOrder ? 0xFFF7 : 0x0038;
        curbe.DW5_CSC_Coefficient_C2 = argbOrder ? 0x0038 : 0xFFF7;
        curbe.DW5_CSC_Coefficient_C3 = 0x0080;
        curbe.DW6_CSC_Coefficient_C4 = 0x0041;
        curbe.DW6_CSC_Coefficient_C5 = argbOrder ? 0x000D : 0x0021;
        curbe.DW7_CSC_Coefficient_C6 = argbOrder ? 0x0021 : 0x000D;
        curbe.DW7_CSC_Coefficient_C7 = 0x0010;
        curbe.DW8_CSC_Coefficient_C8 = 0xFFDB;
        curbe.DW8_CSC_Coefficient_C9 = argbOrder ? 0x0038 : 0xFFED;
        curbe.DW9_CSC_Coefficient_C10 = argbOrder ? 0xFFED : 0x0038;
        curbe.DW9_CSC_Coefficient_C11 = 0x0080;
    }
    else
    {
        CODECHAL_ENCODE_ASSERTMESSAGE("Unsupported ARGB input color space = %d!", m_curbeParams.inputColorSpace);
        return MOS_STATUS_INVALID_PARAMETER;
    }

    // DW10..DW17: binding table indices
    curbe.DW10_BTI_InputSurface = cscSrcYPlane;
    curbe.DW11_BTI_Enc8BitSurface = cscDstConvYPlane;
    curbe.DW12_BTI_4xDsSurface = cscDst4xDs;
    curbe.DW13_BTI_MbStatsSurface = cscDstMbStats;
    curbe.DW14_BTI_2xDsSurface = cscDst2xDs;
    curbe.DW15_BTI_HistoryBuffer = cscDstHistBuffer;
    curbe.DW16_BTI_HistorySumBuffer = cscDstHistSum;
    curbe.DW17_BTI_MultiTaskBuffer = cscDstMultiTask;

    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_cscKernelState->m_dshRegion.AddData(
        &curbe,
        m_cscKernelState->dwCurbeOffset,
        sizeof(curbe)));

    return eStatus;
}

//!
//! \brief    Programs the surface states of the CSC/DS kernel
//! \details  Always binds the input surface. The converted output, 4x and 2x downscaled
//!           outputs, MB statistics buffer and the HEVC history / history-sum /
//!           multi-thread task buffers are each bound only when the corresponding pointer
//!           in m_surfaceParamsCsc (or in its hevcExtParams) is non-null.
//! \param    [in] cmdBuffer
//!           Command buffer handed to CodecHalSetRcsSurfaceState for every surface
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, else fail reason (a null input surface is
//!           reported through CODECHAL_ENCODE_CHK_NULL_RETURN)
//!
MOS_STATUS CodechalEncodeCscDsG12::SendSurfaceCsc(PMOS_COMMAND_BUFFER cmdBuffer)
{
    CODECHAL_ENCODE_FUNCTION_ENTER;

    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    // the input surface is dereferenced below (height for linear NV12, MMC state),
    // so reject a null pointer up front instead of crashing
    CODECHAL_ENCODE_CHK_NULL_RETURN(m_surfaceParamsCsc.psInputSurface);

    // PAK input surface (could be 10-bit)
    CODECHAL_SURFACE_CODEC_PARAMS surfaceParams;
    MOS_ZeroMemory(&surfaceParams, sizeof(surfaceParams));
    surfaceParams.bIs2DSurface = true;
    // a separate UV plane is bound for the NV12, P010 and P210 raw formats
    surfaceParams.bUseUVPlane = (cscColorNv12TileY == m_colorRawSurface ||
        cscColorP010 == m_colorRawSurface ||
        cscColorP210 == m_colorRawSurface ||
        cscColorNv12Linear == m_colorRawSurface);
    surfaceParams.bMediaBlockRW = true;

    // Configure to R16/32 for input surface
    if (m_surfaceParamsCsc.bScalingInUses16UnormSurfFmt)
    {
        // 32x scaling requires R16_UNORM
        surfaceParams.bUse16UnormSurfaceFormat = true;
    }
    else if (m_surfaceParamsCsc.bScalingInUses32UnormSurfFmt)
    {
        surfaceParams.bUse32UnormSurfaceFormat = true;
    }
    else
    {
        /*
        * Unify surface format to avoid mismatches introduced by DS kernel between MMC on and off cases.
        * bUseCommonKernel        | FormatIsNV12 | MmcdOn | SurfaceFormatToUse
        *            1            |       1      |  0/1   |        R8
        *            1            |       0      |  0/1   |        R16
        *            0            |       1      |  0/1   |        R8
        *            0            |       0      |   1    |        R8
        *            0            |       0      |   0    |        R32
        */
        surfaceParams.bUse16UnormSurfaceFormat = !(cscColorNv12TileY == m_colorRawSurface ||
                                                   cscColorNv12Linear == m_colorRawSurface);
    }

    if (m_encoder->m_vdencEnabled && (CODECHAL_HEVC == m_standard || CODECHAL_AVC == m_standard))
    {
        surfaceParams.bCheckCSC8Format= true;
    }

    surfaceParams.psSurface = m_surfaceParamsCsc.psInputSurface;
    if (cscColorNv12Linear == m_colorRawSurface)
    {
        // linear NV12 is bound with a height of 1.5x the surface height
        surfaceParams.dwHeightInUse = (surfaceParams.psSurface->dwHeight * 3) / 2;
    }
    surfaceParams.dwCacheabilityControl = m_hwInterface->ComposeSurfaceCacheabilityControl(
        MOS_CODEC_RESOURCE_USAGE_ORIGINAL_UNCOMPRESSED_PICTURE_ENCODE,
        (codechalL3 | codechalLLC));

#ifdef _MMC_SUPPORTED
    CODECHAL_ENCODE_CHK_NULL_RETURN(m_encoder->m_mmcState);
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_encoder->m_mmcState->SetSurfaceParams(&surfaceParams));

    // disable compression for render RC TA resources
    // (RC-compressed input whose GMM array size is above 1 is decompressed in place first)
    if (surfaceParams.psSurface->MmcState == MOS_MEMCOMP_RC &&
        surfaceParams.psSurface->OsResource.pGmmResInfo->GetArraySize() > 1)
    {
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_osInterface->pfnDecompResource(m_osInterface, &surfaceParams.psSurface->OsResource));
        surfaceParams.psSurface->MmcState = MOS_MEMCOMP_DISABLED;
    }
#endif

    surfaceParams.dwBindingTableOffset = cscSrcYPlane;
    surfaceParams.dwUVBindingTableOffset = cscSrcUVPlane;
    CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
        m_hwInterface,
        cmdBuffer,
        &surfaceParams,
        m_cscKernelState));

    // Converted NV12 output surface, or ENC 8-bit output surface
    if (m_surfaceParamsCsc.psOutputCopiedSurface)
    {
        MOS_ZeroMemory(&surfaceParams, sizeof(surfaceParams));
        surfaceParams.bIs2DSurface =
            surfaceParams.bUseUVPlane =
            surfaceParams.bMediaBlockRW =
            surfaceParams.bIsWritable = true;
        surfaceParams.psSurface = m_surfaceParamsCsc.psOutputCopiedSurface;
        surfaceParams.dwCacheabilityControl = m_hwInterface->ComposeSurfaceCacheabilityControl(
            MOS_CODEC_RESOURCE_USAGE_SURFACE_HME_DOWNSAMPLED_ENCODE,
            codechalLLC);

        surfaceParams.dwBindingTableOffset = cscDstConvYPlane;
        surfaceParams.dwUVBindingTableOffset = cscDstConvUVlane;
        CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
            m_hwInterface,
            cmdBuffer,
            &surfaceParams,
            m_cscKernelState));
    }

    // 4x downscaled surface
    if (m_surfaceParamsCsc.psOutput4xDsSurface)
    {
        MOS_ZeroMemory(&surfaceParams, sizeof(surfaceParams));
        surfaceParams.bIs2DSurface =
            surfaceParams.bMediaBlockRW =
            surfaceParams.bIsWritable = true;
        surfaceParams.psSurface = m_surfaceParamsCsc.psOutput4xDsSurface;
        surfaceParams.dwCacheabilityControl = m_hwInterface->ComposeSurfaceCacheabilityControl(
            MOS_CODEC_RESOURCE_USAGE_SURFACE_HME_DOWNSAMPLED_ENCODE,
            codechalLLC);
        surfaceParams.dwBindingTableOffset = cscDst4xDs;
        CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
            m_hwInterface,
            cmdBuffer,
            &surfaceParams,
            m_cscKernelState));
    }

    // MB Stats surface
    if (m_surfaceParamsCsc.presMBVProcStatsBuffer)
    {
        MOS_ZeroMemory(&surfaceParams, sizeof(surfaceParams));
        surfaceParams.dwSize = m_hwInterface->m_avcMbStatBufferSize;
        surfaceParams.bIsWritable = true;
        surfaceParams.presBuffer = m_surfaceParamsCsc.presMBVProcStatsBuffer;
        surfaceParams.dwCacheabilityControl = m_hwInterface->ComposeSurfaceCacheabilityControl(
            MOS_CODEC_RESOURCE_USAGE_MB_STATS_ENCODE,
            codechalLLC);
        surfaceParams.dwBindingTableOffset = cscDstMbStats;
        CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
            m_hwInterface,
            cmdBuffer,
            &surfaceParams,
            m_cscKernelState));
    }

    // 2x downscaled surface
    if (m_surfaceParamsCsc.psOutput2xDsSurface)
    {
        MOS_ZeroMemory(&surfaceParams, sizeof(surfaceParams));
        surfaceParams.bIs2DSurface =
            surfaceParams.bMediaBlockRW =
            surfaceParams.bIsWritable = true;
        surfaceParams.psSurface = m_surfaceParamsCsc.psOutput2xDsSurface;
        surfaceParams.dwCacheabilityControl = m_hwInterface->ComposeSurfaceCacheabilityControl(
            MOS_CODEC_RESOURCE_USAGE_SURFACE_HME_DOWNSAMPLED_ENCODE,
            codechalLLC);
        surfaceParams.dwBindingTableOffset = cscDst2xDs;
        CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
            m_hwInterface,
            cmdBuffer,
            &surfaceParams,
            m_cscKernelState));
    }

    // HEVC extension buffers, each bound only when its resource pointer is set
    if (m_surfaceParamsCsc.hevcExtParams)
    {
        auto hevcExtParams = (HevcExtKernelParams*)m_surfaceParamsCsc.hevcExtParams;

        // History buffer
        if (hevcExtParams->presHistoryBuffer)
        {
            MOS_ZeroMemory(&surfaceParams, sizeof(surfaceParams));
            surfaceParams.dwSize = hevcExtParams->dwSizeHistoryBuffer;
            surfaceParams.dwOffset = hevcExtParams->dwOffsetHistoryBuffer;
            surfaceParams.bIsWritable = true;
            surfaceParams.presBuffer = hevcExtParams->presHistoryBuffer;
            surfaceParams.dwCacheabilityControl = m_hwInterface->ComposeSurfaceCacheabilityControl(
                MOS_CODEC_RESOURCE_USAGE_SURFACE_HME_DOWNSAMPLED_ENCODE,
                codechalLLC);
            surfaceParams.dwBindingTableOffset = cscDstHistBuffer;
            CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
                m_hwInterface,
                cmdBuffer,
                &surfaceParams,
                m_cscKernelState));
        }

        // History sum output buffer
        if (hevcExtParams->presHistorySumBuffer)
        {
            MOS_ZeroMemory(&surfaceParams, sizeof(surfaceParams));
            surfaceParams.dwSize = hevcExtParams->dwSizeHistorySumBuffer;
            surfaceParams.dwOffset = hevcExtParams->dwOffsetHistorySumBuffer;
            surfaceParams.bIsWritable = true;
            surfaceParams.presBuffer = hevcExtParams->presHistorySumBuffer;
            surfaceParams.dwCacheabilityControl = m_hwInterface->ComposeSurfaceCacheabilityControl(
                MOS_CODEC_RESOURCE_USAGE_SURFACE_HME_DOWNSAMPLED_ENCODE,
                codechalLLC);
            surfaceParams.dwBindingTableOffset = cscDstHistSum;
            CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
                m_hwInterface,
                cmdBuffer,
                &surfaceParams,
                m_cscKernelState));
        }

        // multi-thread task buffer
        if (hevcExtParams->presMultiThreadTaskBuffer)
        {
            MOS_ZeroMemory(&surfaceParams, sizeof(surfaceParams));
            surfaceParams.dwSize = hevcExtParams->dwSizeMultiThreadTaskBuffer;
            surfaceParams.dwOffset = hevcExtParams->dwOffsetMultiThreadTaskBuffer;
            surfaceParams.bIsWritable = true;
            surfaceParams.presBuffer = hevcExtParams->presMultiThreadTaskBuffer;
            surfaceParams.dwCacheabilityControl = m_hwInterface->ComposeSurfaceCacheabilityControl(
                MOS_CODEC_RESOURCE_USAGE_SURFACE_HME_DOWNSAMPLED_ENCODE,
                codechalLLC);
            surfaceParams.dwBindingTableOffset = cscDstMultiTask;
            CODECHAL_ENCODE_CHK_STATUS_RETURN(CodecHalSetRcsSurfaceState(
                m_hwInterface,
                cmdBuffer,
                &surfaceParams,
                m_cscKernelState));
        }
    }

    return eStatus;
}

//!
//! \brief    Initializes the 4x (and, when 32x ME is supported, 2x) scaling kernel states
//! \details  Records the 4x DS binding table indices and curbe length, then for each kernel
//!           state to load (one when interlaced field is disabled, otherwise
//!           CODEC_NUM_FIELDS_PER_FRAME) looks up the kernel in m_kernelBase, fills the
//!           kernel parameters and passes the state to MhwInitISH. With interlaced field
//!           disabled, kernel state [0] is copied to [1].
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, else the status of the failing step
//!
MOS_STATUS CodechalEncodeCscDsG12::InitKernelStateDS()
{
    CODECHAL_ENCODE_FUNCTION_ENTER;

    // binding table layout and curbe size of the 4x DS kernel
    m_dsBTCount[0] = ds4xNumSurfaces;
    m_dsCurbeLength[0] =
    m_dsInlineDataLength = sizeof(Ds4xKernelCurbeData);
    m_dsBTISrcY = ds4xSrcYPlane;
    m_dsBTIDstY = ds4xDstYPlane;
    m_dsBTISrcYTopField = ds4xSrcYPlaneTopField;
    m_dsBTIDstYTopField = ds4xDstYPlaneTopField;
    m_dsBTISrcYBtmField = ds4xSrcYPlaneBtmField;
    m_dsBTIDstYBtmField = ds4xDstYPlaneBtmField;
    m_dsBTIDstMbVProc = ds4xDstMbVProc;
    m_dsBTIDstMbVProcTopField = ds4xDstMbVProcTopField;
    m_dsBTIDstMbVProcBtmField = ds4xDstMbVProcBtmField;

    uint32_t kernelSize, numKernelsToLoad = m_encoder->m_interlacedFieldDisabled ? 1 : CODEC_NUM_FIELDS_PER_FRAME;
    m_dsKernelBase = m_kernelBase;
    CODECHAL_KERNEL_HEADER currKrnHeader;
    for (uint32_t krnStateIdx = 0; krnStateIdx < numKernelsToLoad; krnStateIdx++)
    {
        // kernelSize is passed by pointer: seeded with the combined binary size,
        // read back afterwards as the size of the kernel that was looked up
        kernelSize = m_combinedKernelSize;
        m_dsKernelState = &m_encoder->m_scaling4xKernelStates[krnStateIdx];

        CODECHAL_ENCODE_CHK_STATUS_RETURN(GetCommonKernelHeaderAndSizeG12(
            m_dsKernelBase,
            ENC_SCALING4X,
            krnStateIdx,
            &currKrnHeader,
            &kernelSize));

        m_dsKernelState->KernelParams.iBTCount = m_dsBTCount[0];
        m_dsKernelState->KernelParams.iThreadCount = m_renderInterface->GetHwCaps()->dwMaxThreads;
        m_dsKernelState->KernelParams.iCurbeLength = m_dsCurbeLength[0];
        m_dsKernelState->KernelParams.iBlockWidth = CODECHAL_MACROBLOCK_WIDTH;
        m_dsKernelState->KernelParams.iBlockHeight = CODECHAL_MACROBLOCK_HEIGHT;
        m_dsKernelState->KernelParams.iIdCount = 1;
        m_dsKernelState->KernelParams.iInlineDataLength = m_dsInlineDataLength;

        m_dsKernelState->dwCurbeOffset = m_stateHeapInterface->GetSizeofCmdInterfaceDescriptorData();
        m_dsKernelState->KernelParams.pBinary = m_dsKernelBase + (currKrnHeader.KernelStartPointer << MHW_KERNEL_OFFSET_SHIFT);
        m_dsKernelState->KernelParams.iSize = kernelSize;
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_stateHeapInterface->CalculateSshAndBtSizesRequested(
            m_dsKernelState->KernelParams.iBTCount,
            &m_dsKernelState->dwSshSize,
            &m_dsKernelState->dwBindingTableSize));

        CODECHAL_ENCODE_CHK_NULL_RETURN(m_renderInterface->m_stateHeapInterface);
        CODECHAL_ENCODE_CHK_STATUS_RETURN(m_hwInterface->MhwInitISH(m_renderInterface->m_stateHeapInterface, m_dsKernelState));

        if (m_32xMeSupported)
        {
            m_dsKernelState = &m_encoder->m_scaling2xKernelStates[krnStateIdx];

            // the 4x lookup above overwrote kernelSize with the 4x kernel's size;
            // re-seed it with the combined binary size, as done before the 4x lookup
            kernelSize = m_combinedKernelSize;

            CODECHAL_ENCODE_CHK_STATUS_RETURN(GetCommonKernelHeaderAndSizeG12(
                m_dsKernelBase,
                ENC_SCALING2X,
                krnStateIdx,
                &currKrnHeader,
                &kernelSize));

            // m_dsBTCount[1] / m_dsCurbeLength[1] are not assigned in this function
            // (presumably set up elsewhere - TODO confirm)
            m_dsKernelState->KernelParams.iBTCount = m_dsBTCount[1];
            m_dsKernelState->KernelParams.iThreadCount = m_renderInterface->GetHwCaps()->dwMaxThreads;
            m_dsKernelState->KernelParams.iCurbeLength = m_dsCurbeLength[1];
            m_dsKernelState->KernelParams.iBlockWidth = CODECHAL_MACROBLOCK_WIDTH;
            m_dsKernelState->KernelParams.iBlockHeight = CODECHAL_MACROBLOCK_HEIGHT;

            m_dsKernelState->dwCurbeOffset = m_stateHeapInterface->GetSizeofCmdInterfaceDescriptorData();
            m_dsKernelState->KernelParams.pBinary = m_dsKernelBase + (currKrnHeader.KernelStartPointer << MHW_KERNEL_OFFSET_SHIFT);
            m_dsKernelState->KernelParams.iSize = kernelSize;
            CODECHAL_ENCODE_CHK_STATUS_RETURN(m_stateHeapInterface->CalculateSshAndBtSizesRequested(
                m_dsKernelState->KernelParams.iBTCount,
                &m_dsKernelState->dwSshSize,
                &m_dsKernelState->dwBindingTableSize));

            CODECHAL_ENCODE_CHK_NULL_RETURN(m_renderInterface->m_stateHeapInterface);
            CODECHAL_ENCODE_CHK_STATUS_RETURN(m_hwInterface->MhwInitISH(m_renderInterface->m_stateHeapInterface, m_dsKernelState));
        }

        // only one kernel state was loaded; mirror it into slot [1]
        if (m_encoder->m_interlacedFieldDisabled)
        {
            m_encoder->m_scaling4xKernelStates[1] = m_encoder->m_scaling4xKernelStates[0];

            if (m_32xMeSupported)
            {
                m_encoder->m_scaling2xKernelStates[1] = m_encoder->m_scaling2xKernelStates[0];
            }
        }
    }

    return MOS_STATUS_SUCCESS;
}

//!
//! \brief    Builds the 4x downscaling curbe for AVC and writes it to the DS kernel state
//! \details  Any standard other than CODECHAL_AVC is delegated to the base class.
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, else fail reason
//!
MOS_STATUS CodechalEncodeCscDsG12::SetCurbeDS4x()
{
    CODECHAL_ENCODE_FUNCTION_ENTER;

    // non-AVC standards use the base class curbe
    if (CODECHAL_AVC != m_standard)
    {
        return CodechalEncodeCscDs::SetCurbeDS4x();
    }

    Ds4xKernelCurbeData curbe;

    // picture size
    curbe.DW0_InputPictureWidth = m_curbeParams.dwInputPictureWidth;
    curbe.DW0_InputPictureHeight = m_curbeParams.dwInputPictureHeight;

    // frame (and, for field pictures, bottom field) luma in/out binding table indices
    curbe.DW1_InputYBTIFrame = ds4xSrcYPlane;
    curbe.DW2_OutputYBTIFrame = ds4xDstYPlane;
    if (m_curbeParams.bFieldPicture)
    {
        curbe.DW3_InputYBTIBottomField = ds4xSrcYPlaneBtmField;
        curbe.DW4_OutputYBTIBottomField = ds4xDstYPlaneBtmField;
    }

    // the flatness threshold is only programmed when the flatness check is on
    curbe.DW6_EnableMBFlatnessCheck = m_curbeParams.bFlatnessCheckEnabled;
    if (curbe.DW6_EnableMBFlatnessCheck)
    {
        curbe.DW5_FlatnessThreshold = 128;
    }

    // Per the original note (written for the gen10 DS kernel): with Flatness Check enabled,
    // MBVariance must be enabled as well, otherwise MbIsFlat is not output.
    curbe.DW6_EnableMBVarianceOutput = curbe.DW6_EnableMBFlatnessCheck || m_curbeParams.bMBVarianceOutputEnabled;
    curbe.DW6_EnableMBPixelAverageOutput = m_curbeParams.bMBPixelAverageOutputEnabled;
    curbe.DW6_EnableBlock8x8StatisticsOutput = m_curbeParams.bBlock8x8StatisticsEnabled;

    // the statistics surface indices are only needed when some statistics output is enabled
    const bool statsOutput = curbe.DW6_EnableMBVarianceOutput || curbe.DW6_EnableMBPixelAverageOutput;
    if (statsOutput)
    {
        curbe.DW8_MBVProcStatsBTIFrame = ds4xDstMbVProc;
    }
    if (statsOutput && m_curbeParams.bFieldPicture)
    {
        curbe.DW9_MBVProcStatsBTIBottomField = ds4xDstMbVProcBtmField;
    }

    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_dsKernelState->m_dshRegion.AddData(
        &curbe,
        m_dsKernelState->dwCurbeOffset,
        sizeof(curbe)));

    CODECHAL_DEBUG_TOOL(
        if (m_encoder->m_encodeParState)
        {
            CODECHAL_ENCODE_CHK_STATUS_RETURN(m_encoder->m_encodeParState->PopulateDsParam(&curbe));
        }
    )

    return MOS_STATUS_SUCCESS;
}

//!
//! \brief    Creates and initializes the Gen12 SFC state object on first use
//! \details  Does nothing when m_sfcState is already set. Otherwise allocates a
//!           CodecHalEncodeSfcG12, initializes it and sets its input color space to sRGB.
//! \return   MOS_STATUS
//!           MOS_STATUS_SUCCESS if success, else fail reason
//!
MOS_STATUS CodechalEncodeCscDsG12::InitSfcState()
{
    CODECHAL_ENCODE_FUNCTION_ENTER;

    // already created by an earlier call
    if (m_sfcState)
    {
        return MOS_STATUS_SUCCESS;
    }

    m_sfcState = (CodecHalEncodeSfc*)MOS_New(CodecHalEncodeSfcG12);
    CODECHAL_ENCODE_CHK_NULL_RETURN(m_sfcState);

    // NOTE(review): if Initialize fails, m_sfcState stays assigned, so a later call takes the
    // early return above without re-initializing - confirm callers treat this failure as fatal
    CODECHAL_ENCODE_CHK_STATUS_RETURN(m_sfcState->Initialize(m_hwInterface, m_osInterface));

    m_sfcState->SetInputColorSpace(MHW_CSpace_sRGB);

    return MOS_STATUS_SUCCESS;
}

//!
//! \brief    Flags that the raw surface needs an extra copy
//! \details  Sets m_needsExtraCopy, which CheckRawSurfaceAlignment reads.
//! \return   MOS_STATUS
//!           Always MOS_STATUS_SUCCESS
//!
MOS_STATUS CodechalEncodeCscDsG12::SurfaceNeedsExtraCopy()
{
    MOS_STATUS eStatus = MOS_STATUS_SUCCESS;

    m_needsExtraCopy = true;

    return eStatus;
}

//!
//! \brief    Decides whether the raw surface has to go through the copy kernel
//! \details  Has no effect unless m_cscEnableCopy is set. A copy is required for an
//!           A8R8G8B8 surface when m_needsExtraCopy is set, and for any surface other than
//!           NV12 TileY whose width or height is not a multiple of m_rawSurfAlignment.
//! \param    [in] surface
//!           Raw surface to check (Format, dwWidth and dwHeight are read)
//! \return   MOS_STATUS
//!           Always MOS_STATUS_SUCCESS
//!
MOS_STATUS CodechalEncodeCscDsG12::CheckRawSurfaceAlignment(MOS_SURFACE surface)
{
    // every rule below is gated on the copy feature
    if (!m_cscEnableCopy)
    {
        return MOS_STATUS_SUCCESS;
    }

    // extra-copy request only applies to ARGB input, not touch NV12 logic.
    if (m_needsExtraCopy && Format_A8R8G8B8 == surface.Format)
    {
        m_colorRawSurface = cscColorARGB;
        m_cscRequireCopy = 1;
    }

    // evaluated after the block above, which may have changed m_colorRawSurface
    const bool misaligned = (surface.dwWidth % m_rawSurfAlignment) != 0 ||
                            (surface.dwHeight % m_rawSurfAlignment) != 0;
    if (misaligned && cscColorNv12TileY != m_colorRawSurface)
    {
        m_cscRequireCopy = 1;
    }

    return MOS_STATUS_SUCCESS;
}

//!
//! \brief    Constructor
//! \details  Sets the CSC curbe length and kernel UID, and points m_kernelBase at the
//!           IGCODECKRN_G12 binary when kernels are enabled in a non-full-open-source build.
//! \param    [in] encoder
//!           Encoder state forwarded to the CodechalEncodeCscDs base class
//!
CodechalEncodeCscDsG12::CodechalEncodeCscDsG12(CodechalEncoderState* encoder)
    : CodechalEncodeCscDs(encoder)
{
    m_cscCurbeLength = sizeof(CscKernelCurbeData);
    m_cscKernelUID = IDR_CODEC_HME_DS_SCOREBOARD_KERNEL;
#if defined(ENABLE_KERNELS) && !defined(_FULL_OPEN_SOURCE)
    // the kernel binary header is only included under the same condition
    m_kernelBase = (uint8_t*)IGCODECKRN_G12;
#endif
}

//!
//! \brief    Destructor
//! \details  Releases the MB statistics buffer allocated by AllocateSurfaceCsc.
//!
CodechalEncodeCscDsG12::~CodechalEncodeCscDsG12()
{
    // free the MbStats surface
    auto osItf = m_osInterface;
    osItf->pfnFreeResource(osItf, &m_resMbStatsBuffer);
}
