/* chunkset_sse41.c -- SSE4.1 inline functions to copy small data chunks.
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"

/* This requires SSE2 support. While it's implicit with SSE4.1, we can minimize
 * code size by sharing the chunkcopy functions with the SSE2 build, which will
 * certainly compile to identical machine code */
#if defined(X86_SSE41) && defined(X86_SSE2)
#include <immintrin.h>
#include "chunk_permute_table.h"

typedef __m128i chunk_t;

#define CHUNK_SIZE 16

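/* Tell chunkset_tpl.h which chunk primitives this file provides natively, so
 * the template can use them instead of its generic fallbacks. */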
#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
#define HAVE_CHUNK_MAG
#define HAVE_CHUNKCOPY
#define HAVE_CHUNKUNROLL

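/* Lookup table for GET_CHUNK_MAG, indexed by dist - 3 and covering distances
 * 3..15. idx is the byte offset of a 32 byte row in permute_table; remval is
 * the number of bytes left over after the pattern fills a 16 byte chunk a
 * whole number of times, i.e. 16 % dist. The "don't care" entries (dist 4
 * and 8) are never consulted: those power of two distances are handled by
 * chunkmemset_4 and chunkmemset_8 instead. */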
static const lut_rem_pair perm_idx_lut[13] = {
    {0, 1},      /* 3 */
    {0, 0},      /* don't care */
    {1 * 32, 1}, /* 5 */
    {2 * 32, 4}, /* 6 */
    {3 * 32, 2}, /* 7 */
    {0 * 32, 0}, /* don't care */
    {4 * 32, 7}, /* 9 */
    {5 * 32, 6}, /* 10 */
    {6 * 32, 5}, /* 11 */
    {7 * 32, 4}, /* 12 */
    {8 * 32, 3}, /* 13 */
    {9 * 32, 2}, /* 14 */
    {10 * 32, 1},/* 15 */
};


static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
    int16_t tmp;
    zmemcpy_2(&tmp, from);
    *chunk = _mm_set1_epi16(tmp);
}

static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
    int32_t tmp;
    zmemcpy_4(&tmp, from);
    *chunk = _mm_set1_epi32(tmp);
}

static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
    int64_t tmp;
    zmemcpy_8(&tmp, from);
    *chunk = _mm_set1_epi64x(tmp);
}
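
/* Illustrative example (hypothetical values, not part of the build): after
 *
 *     uint8_t from[2] = {0xAB, 0xCD};
 *     chunk_t chunk;
 *     chunkmemset_2(from, &chunk);
 *
 * chunk holds the 2 byte pattern repeated across the whole 16 byte register:
 *
 *     AB CD AB CD AB CD AB CD AB CD AB CD AB CD AB CD
 *
 * chunkmemset_4 and chunkmemset_8 do the same with 4 and 8 byte periods. */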

static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
    *chunk = _mm_loadu_si128((__m128i *)s);
}

static inline void storechunk(uint8_t *out, chunk_t *chunk) {
    _mm_storeu_si128((__m128i *)out, *chunk);
}
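
/* Both helpers use the unaligned load/store intrinsics on purpose: the
 * buffers passed in carry no 16 byte alignment guarantee. */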

static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
    lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
    __m128i perm_vec, ret_vec;
#ifdef Z_MEMORY_SANITIZER
    /* Important to note:
     * This is _not_ to subvert the memory sanitizer but to instead unpoison some
     * bytes we willingly and purposefully load uninitialized, since we swizzle
     * over them in a vector register anyway. If our assumption about which bytes
     * are actually used is wrong, the memory sanitizer will still usefully flag it */
    __msan_unpoison(buf + dist, 16 - dist);
#endif
    ret_vec = _mm_loadu_si128((__m128i*)buf);
    *chunk_rem = lut_rem.remval;

    perm_vec = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
    ret_vec = _mm_shuffle_epi8(ret_vec, perm_vec);

    return ret_vec;
}
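
/* Worked example, illustrative only: for dist == 3 and buf = {a, b, c, ...},
 * lut_rem = perm_idx_lut[0] = {0, 1}. Assuming the first permute_table row
 * repeats the indices 0, 1, 2 (as its use here implies), the shuffle yields
 *
 *     a b c a b c a b c a b c a b c a
 *
 * and *chunk_rem becomes 1: five whole 3 byte periods fit in 16 bytes with
 * one byte left over. */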

/* Provided by the SSE2 chunkset translation unit; reused here since SSE4.1
 * versions would compile to identical machine code */
extern uint8_t* chunkcopy_sse2(uint8_t *out, uint8_t const *from, unsigned len);
extern uint8_t* chunkunroll_sse2(uint8_t *out, unsigned *dist, unsigned *len);

#define CHUNKSIZE            chunksize_sse41
#define CHUNKMEMSET          chunkmemset_sse41
#define CHUNKMEMSET_SAFE     chunkmemset_safe_sse41
#define CHUNKCOPY(a, b, c)   chunkcopy_sse2(a, b, c)
#define CHUNKUNROLL(a, b, c) chunkunroll_sse2(a, b, c)
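
/* chunksize, chunkmemset and chunkmemset_safe get SSE4.1 instantiations from
 * the template, while chunkcopy and chunkunroll alias the SSE2 versions per
 * the note at the top of this file. */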

#include "chunkset_tpl.h"

#endif
