/******************************************************************************
 *
 * Copyright (C) 2022 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */
/**
 *******************************************************************************
 * @file
 *  isvc_mem_fns_sse42.c
 *
 * @brief
 *  SSE4.2 variants of functions used for memory operations
 *
 *******************************************************************************
 */
#include <string.h>
#include <immintrin.h>

#include "ih264_typedefs.h"
#include "isvc_mem_fns.h"

/**
*******************************************************************************
*
* @brief
*  Fills a 2D block of bytes with a constant value
*
* @par Description:
*  Writes u1_val into i4_blk_wd bytes of each of i4_blk_ht rows, where
*  consecutive rows start i4_dst_stride bytes apart. 4x4 blocks, 8x8 blocks and
*  blocks whose width and height are both multiples of 16 are written with SSE
*  stores; every other shape is written with one memset per row. Nothing is
*  written when either dimension is zero or negative.
*
* @param[out] pu1_dst
*  Pointer to the first byte of the first row of the block
*
* @param[in] i4_dst_stride
*  Distance in bytes between the starts of consecutive rows
*
* @param[in] u1_val
*  Byte value written to every position of the block
*
* @param[in] i4_blk_wd
*  Block width in bytes
*
* @param[in] i4_blk_ht
*  Block height in rows
*
* @returns
*  None
*
*******************************************************************************
*/
void isvc_memset_2d_sse42(UWORD8 *pu1_dst, WORD32 i4_dst_stride, UWORD8 u1_val, WORD32 i4_blk_wd,
                          WORD32 i4_blk_ht)
{
    WORD32 i, j;

    /* u1_val replicated into all 16 byte lanes; shared by every SIMD path */
    const __m128i v_val = _mm_set1_epi8(u1_val);

    /* An empty block has nothing to fill. This also keeps a negative width */
    /* from being converted to a huge size_t in the memset fallback below.  */
    if((i4_blk_wd <= 0) || (i4_blk_ht <= 0))
    {
        return;
    }

    if((i4_blk_wd == 4) && (i4_blk_ht == 4))
    {
        /* Low four lanes of the broadcast, i.e. u1_val repeated four times */
        const WORD32 i4_val = _mm_cvtsi128_si32(v_val);

        for(i = 0; i < 4; i++)
        {
            /* memcpy instead of a store through a WORD32 pointer: pu1_dst */
            /* is a byte pointer with no 4-byte alignment guarantee        */
            memcpy(pu1_dst + i * i4_dst_stride, &i4_val, sizeof(i4_val));
        }
    }
    else if((i4_blk_wd == 8) && (i4_blk_ht == 8))
    {
        for(i = 0; i < 8; i++)
        {
            /* Stores only the low 8 bytes of the vector */
            _mm_storel_epi64((__m128i *) (pu1_dst + i * i4_dst_stride), v_val);
        }
    }
    else if((i4_blk_wd % 16 == 0) && (i4_blk_ht % 16 == 0))
    {
        for(i = 0; i < i4_blk_ht; i++)
        {
            UWORD8 *pu1_row = pu1_dst + i * i4_dst_stride;

            /* Width is a multiple of 16, so whole vectors cover the row exactly */
            for(j = 0; j < i4_blk_wd; j += 16)
            {
                _mm_storeu_si128((__m128i *) (pu1_row + j), v_val);
            }
        }
    }
    else
    {
        /* Generic shapes: one memset per row */
        for(i = 0; i < i4_blk_ht; i++)
        {
            memset(pu1_dst, u1_val, i4_blk_wd);

            pu1_dst += i4_dst_stride;
        }
    }
}
