//
// Copyright (c) 2017 The Khronos Group Inc.
// 
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

#include <iomanip>

#include "testBase.h"
#include "harness/conversions.h"
#include "harness/typeWrappers.h"
#include "harness/testHarness.h"

// #define USE_NEW_SYNTAX    1
// The number of shuffles to test per test
#define NUM_TESTS 32
// The number of times to run each combination of shuffles
#define NUM_ITERATIONS_PER_TEST 2
#define MAX_PROGRAM_SIZE NUM_TESTS*1024
#define PRINT_SHUFFLE_KERNEL_SOURCE 0
#define SPEW_ORDER_DETAILS 0

enum ShuffleMode
{
    kNormalMode = 0,
    kFunctionCallMode,
    kArrayAccessMode,
    kBuiltInFnMode,
    kBuiltInDualInputFnMode
};

static const char *shuffleKernelPattern[3] =  {
    "__kernel void sample_test( __global %s%s *source, __global %s%s *dest )\n"
    "{\n"
    "    if (get_global_id(0) != 0) return;\n"
    "     //%s%s src1 %s, src2%s;\n",// Here's a comma...
                                    // Above code is commented out for now, but keeping around for testing local storage options
    "}\n" };

static const char *shuffleTempPattern = "  %s%s tmp;\n";

static const char *clearTempPattern = "        tmp = (%s%s)((%s)0);\n";

static const char *shuffleSinglePattern =
"        tmp%s%s = source[%d]%s%s;\n"
"        dest[%d] = tmp;\n"
;

static const char * shuffleSinglePatternV3src =
"           tmp%s%s = vload3(%d, source)%s%s;\n"
"        dest[%d] = tmp;\n";

static const char * shuffleSinglePatternV3dst =
"        tmp%s%s = source[%d]%s%s;\n"
"           vstore3(tmp, %d, dest);\n";


static const char * shuffleSinglePatternV3srcV3dst =
"tmp%s%s = vload3(%d, source)%s%s;\n"
"vstore3(tmp, %d, dest);\n";

static const char *shuffleFnLinePattern = "%s%s shuffle_fn( %s%s source );\n%s%s shuffle_fn( %s%s source ) { return source; }\n\n";

static const char *shuffleFnPattern =
"        tmp%s%s = shuffle_fn( source[%d] )%s%s;\n"
"        dest[%d] = tmp;\n"
;


static const char *shuffleFnPatternV3src =
"        tmp%s%s = shuffle_fn( vload3(%d, source) )%s%s;\n"
"        dest[%d] = tmp;\n"
;


static const char *shuffleFnPatternV3dst =
"        tmp%s%s = shuffle_fn( source[%d] )%s%s;\n"
"               vstore3(tmp, %d, dest);\n"
;


static const char *shuffleFnPatternV3srcV3dst =
"        tmp%s%s = shuffle_fn(vload3(%d, source) )%s%s;\n"
"               vstore3(tmp, %d, dest);\n"
;

// shuffle() built-in function patterns
static const char *shuffleBuiltInPattern =
"        {\n"
"            %s%s src1 = %s;\n"
"            %s%s%s mask = (%s%s%s)( %s );\n"
"            tmp = shuffle( src1, mask );\n"
"            %s;\n"
"        }\n"
;

// shuffle() built-in dual-input function patterns
static const char *shuffleBuiltInDualPattern =
"        {\n"
"            %s%s src1 = %s;\n"
"            %s%s src2 = %s;\n"
"            %s%s%s mask = (%s%s%s)( %s );\n"
"            tmp = shuffle2( src1, src2, mask );\n"
"            %s;\n"
"        }\n"
;


typedef unsigned char ShuffleOrder[ 16 ];

void incrementShuffleOrder( ShuffleOrder &order, size_t orderSize, size_t orderRange )
{
    for( size_t i = 0; i < orderSize; i++ )
    {
        order[ i ]++;
        if( order[ i ] < orderRange )
            return;
        order[ i ] = 0;
    }
}

bool shuffleOrderContainsDuplicates( ShuffleOrder &order, size_t orderSize )
{
    bool flags[ 16 ] = { false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false };
    for( size_t i = 0; i < orderSize; i++ )
    {
        if( flags[ order[ i ] ] )
            return true;
        flags[ order[ i ] ] = true;
    }
    return false;
}

static void shuffleVector( unsigned char *inVector, unsigned char *outVector, ShuffleOrder order, size_t vecSize, size_t typeSize, cl_uint lengthToUse )
{
    for(size_t i = 0; i < lengthToUse; i++ )
    {
        unsigned char *inPtr = inVector + typeSize *order[ i ];
        memcpy( outVector, inPtr, typeSize );
        outVector += typeSize;
    }
}

static void shuffleVector2( unsigned char *inVector, unsigned char *outVector, ShuffleOrder order, size_t vecSize, size_t typeSize, cl_uint lengthToUse )
{
    for(size_t i = 0; i < lengthToUse; i++ )
    {
        unsigned char *outPtr = outVector + typeSize *order[ i ];
        memcpy( outPtr, inVector, typeSize );
        inVector += typeSize;
    }
}

static void shuffleVectorDual( unsigned char *inVector, unsigned char *inSecondVector, unsigned char *outVector, ShuffleOrder order, size_t vecSize, size_t typeSize, cl_uint lengthToUse )
{
    // This is tricky: the indices of each shuffle are in a range (0-srcVecSize * 2-1),
    // where (srcVecSize-srcVecSize*2-1) refers to the second input.
    size_t uphalfMask = (size_t)vecSize;
    size_t lowerBits = (size_t)( vecSize - 1 );

    for(size_t i = 0; i < lengthToUse; i++ )
    {
        unsigned char *inPtr;
#if SPEW_ORDER_DETAILS
        log_info("order[%d] is %d, or %d of %s\n", (int)i,
                 (int)(order[i]),
                 (int)(order[i] & lowerBits),
                 ((order[i]&uphalfMask) == 0)?"lower num":"upper num");
#endif
        if( order[ i ] & uphalfMask )
            inPtr = inSecondVector + typeSize * ( order[ i ] & lowerBits );
        else
            inPtr = inVector + typeSize * ( order[ i ] & lowerBits );
        memcpy( outVector, inPtr, typeSize );
        outVector += typeSize;
    }
}


static ShuffleOrder sNaturalOrder = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };

static int useNumbersFlip = 0;
const char *get_order_string( ShuffleOrder &order, size_t vecSize, cl_uint lengthToUse, bool byNumber, MTdata d )
{
    // NOTE: names are only valid for hex characters (up to F) but for debugging, we use
    // this to print out orders for dual inputs, which actually can be valid up to position 31 (two 16-element vectors)
    // so we go ahead and fake the rest of the alphabet for those other 16 positions, so we have
    // some (indirectly) meaningful output
    char names[] = "0123456789abcdefghijklmnopqrstuv";
    char namesUpperCase[] = "0123456789ABCDEFGHIJKLMNOPQRSTUV";
    char names2[] = "xyzw!!!!!!!!!!!!";

    static char orderString[ 18 ];

    size_t j, idx;

    // Assume we don't have to use numbered indices (.s0123...).
    byNumber = false;
    // Check if any index is beyond xyzw, which requires to use numbers.
    for( j = 0; j < lengthToUse; j++ )
    {
        if (order[j] > 3) {
            byNumber = true;
            break;
        }
    }
    // If we can use numbers, do so half the time.
    if (!byNumber) {
        byNumber = (useNumbersFlip++)%2;
    }

    if (byNumber)
    {
        idx = 0;
        // Randomly chose upper and lower case S.
        orderString[ idx++ ] = random_in_range(0, 1, d) ? 's' : 'S';
        for( j = 0; j < vecSize && j < lengthToUse; j++ ) {
            // Randomly choose upper and lower case.
            orderString[ idx++ ] = random_in_range(0, 1, d) ? names[ (int)order[ j ] ] : namesUpperCase[ (int)order[ j ] ];
        }
        orderString[ idx++ ] = 0;
    }
    else
    {
        // Use xyzw.
        for( j = 0; j < vecSize && j < lengthToUse; j++ ) {
            orderString[ j ] = names2[ (int)order[ j ] ];
        }
        orderString[ j ] = 0;
    }

    return orderString;
}

char * get_order_name( ExplicitType vecType, size_t inVecSize, size_t outVecSize, ShuffleOrder &inOrder, ShuffleOrder &outOrder, cl_uint lengthToUse, MTdata d, bool inUseNumerics, bool outUseNumerics )
{
    static char orderName[ 512 ] = "";
    char inOrderStr[ 512 ], outOrderStr[ 512 ];

    if( inVecSize == 1 )
        inOrderStr[ 0 ] = 0;
    else
        sprintf(inOrderStr, "%d.%s", (int)inVecSize,
                get_order_string(inOrder, inVecSize, lengthToUse, inUseNumerics,
                                 d));
    if( outVecSize == 1 )
        outOrderStr[ 0 ] = 0;
    else
        sprintf( outOrderStr, "%d.%s", (int)outVecSize, get_order_string( outOrder, outVecSize, lengthToUse, outUseNumerics, d ) );

    sprintf( orderName, "order %s%s -> %s%s",
            get_explicit_type_name( vecType ), inOrderStr, get_explicit_type_name( vecType ), outOrderStr );
    return orderName;
}

void print_hex_mem_dump(const unsigned char *inDataPtr,
                        const unsigned char *inDataPtr2,
                        const unsigned char *expected,
                        const unsigned char *outDataPtr, size_t inVecSize,
                        size_t outVecSize, size_t typeSize)
{
    auto byte_to_hex_str = [](unsigned char v) {
        // Use a new stream to avoid manipulating state of outer stream.
        std::ostringstream ss;
        ss << std::setfill('0') << std::setw(2) << std::right << std::hex << +v;
        return ss.str();
    };

    std::ostringstream error;
    error << "      Source: ";
    for (size_t j = 0; j < inVecSize * typeSize; j++)
    {
        error << (j % typeSize ? "" : " ") << byte_to_hex_str(inDataPtr[j])
              << " ";
    }
    if (inDataPtr2 != NULL)
    {
        error << "\n    Source 2: ";
        for (size_t j = 0; j < inVecSize * typeSize; j++)
        {
            error << (j % typeSize ? "" : " ") << byte_to_hex_str(inDataPtr2[j])
                  << " ";
        }
    }
    error << "\n    Expected: ";
    for (size_t j = 0; j < outVecSize * typeSize; j++)
    {
        error << (j % typeSize ? "" : " ") << byte_to_hex_str(expected[j])
              << " ";
    }
    error << "\n      Actual: ";
    for (size_t j = 0; j < outVecSize * typeSize; j++)
    {
        error << (j % typeSize ? "" : " ") << byte_to_hex_str(outDataPtr[j])
              << " ";
    }
    log_info("%s\n", error.str().c_str());
}

void generate_shuffle_mask( char *outMaskString, size_t maskSize, const ShuffleOrder *order )
{
    outMaskString[ 0 ] = 0;
    if( order != NULL )
    {
        for( size_t jj = 0; jj < maskSize; jj++ )
        {
            char thisMask[ 16 ];
            sprintf( thisMask, "%s%d", ( jj == 0 ) ? "" : ", ", (*order)[ jj ] );
            strcat( outMaskString, thisMask );
        }
    }
    else
    {
        for( size_t jj = 0; jj < maskSize; jj++ )
        {
            char thisMask[ 16 ];
            sprintf( thisMask, "%s%ld", ( jj == 0 ) ? "" : ", ", jj );
            strcat( outMaskString, thisMask );
        }
    }
}

static int create_shuffle_kernel( cl_context context, cl_program *outProgram, cl_kernel *outKernel,
                                 size_t *outRealVecSize,
                                 ExplicitType vecType, size_t inVecSize, size_t outVecSize, cl_uint *lengthToUse, bool inUseNumerics, bool outUseNumerics,
                                 size_t numOrders, ShuffleOrder *inOrders, ShuffleOrder *outOrders,
                                 MTdata d, ShuffleMode shuffleMode = kNormalMode )
{
    char inOrder[18], shuffledOrder[18];
    char kernelSource[MAX_PROGRAM_SIZE], progLine[ 10240 ];
    char *programPtr;
    char inSizeName[4], outSizeName[4], outRealSizeName[4], inSizeArgName[4];
    char outSizeNameTmpVar[4];


    /* Create the source; note vec size is the vector length we are testing */
    if( inVecSize == 1 ) //|| (inVecSize == 3)) // just have arrays if we go with size 3
        inSizeName[ 0 ] = 0;
    else
        sprintf( inSizeName, "%ld", inVecSize );
    if( inVecSize == 3 )
        inSizeArgName[ 0 ] = 0;
    else
        strcpy( inSizeArgName, inSizeName );

    *outRealVecSize = outVecSize;

    if( outVecSize == 1 ||  (outVecSize == 3))
        outSizeName[ 0 ] = 0;
    else
        sprintf( outSizeName, "%d", (int)outVecSize );

    if(outVecSize == 1) {
        outSizeNameTmpVar[0] = 0;
    } else {
        sprintf(outSizeNameTmpVar, "%d", (int)outVecSize);
    }

    if( *outRealVecSize == 1 || ( *outRealVecSize == 3))
        outRealSizeName[ 0 ] = 0;
    else
        sprintf( outRealSizeName, "%d", (int)*outRealVecSize );


    // Loop through and create the source for all order strings
    kernelSource[ 0 ] = 0;
    if (vecType == kDouble) {
        strcat(kernelSource, "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n");
    }

    if( shuffleMode == kFunctionCallMode )
    {
        sprintf( progLine, shuffleFnLinePattern, get_explicit_type_name( vecType ), inSizeName, get_explicit_type_name( vecType ), inSizeName,
                get_explicit_type_name( vecType ), inSizeName, get_explicit_type_name( vecType ), inSizeName );
        strcat(kernelSource, progLine);
    }

    // We're going to play a REALLY NASTY trick here. We're going to use the inSize insert point
    // to put in an entire third parameter if we need it
    char inParamSizeString[ 1024 ];
    if( shuffleMode == kBuiltInDualInputFnMode )
        sprintf( inParamSizeString, "%s *secondSource, __global %s%s", inSizeArgName, get_explicit_type_name( vecType ), inSizeArgName );
    else
        strcpy( inParamSizeString, inSizeArgName );

    // These two take care of unused variable warnings
    const char * src2EnableA = ( shuffleMode == kBuiltInDualInputFnMode ) ? "" : "/*";
    const char * src2EnableB = ( shuffleMode == kBuiltInDualInputFnMode ) ? "" : "*/";

    sprintf( progLine, shuffleKernelPattern[ 0 ], get_explicit_type_name( vecType ), inParamSizeString,
            get_explicit_type_name( vecType ), outRealSizeName, get_explicit_type_name( vecType ), inSizeName,
            src2EnableA, src2EnableB );
    strcat(kernelSource, progLine);
    if( inOrders == NULL )
        strcpy( inOrder, get_order_string( sNaturalOrder, outVecSize, (cl_uint)outVecSize, inUseNumerics, d ) );

    sprintf( progLine, shuffleTempPattern, get_explicit_type_name( vecType ), outSizeNameTmpVar);
    strcat(kernelSource, progLine);

    for( unsigned int i = 0; i < numOrders; i++ )
    {
        if( inOrders != NULL )
            strcpy(inOrder,
                   get_order_string(inOrders[i], inVecSize, lengthToUse[i],
                                    inUseNumerics, d));
        strcpy( shuffledOrder, get_order_string( outOrders[ i ], outVecSize, lengthToUse[i], outUseNumerics, d ) );


        sprintf( progLine, clearTempPattern, get_explicit_type_name( vecType ), outSizeName,get_explicit_type_name( vecType ));
        strcat(kernelSource, progLine);


        if( shuffleMode == kNormalMode )
        {
            if(outVecSize == 3 && inVecSize == 3) {
                // shuffleSinglePatternV3srcV3dst
                sprintf( progLine, shuffleSinglePatternV3srcV3dst,
                        outVecSize > 1 ? "." : "", outVecSize > 1 ? shuffledOrder : "", (int)i,
                        inVecSize > 1 ? "." : "", inVecSize > 1 ? inOrder : "", (int)i );
            } else if(inVecSize == 3) {
                // shuffleSinglePatternV3src
                sprintf( progLine, shuffleSinglePatternV3src,
                        outVecSize > 1 ? "." : "", outVecSize > 1 ? shuffledOrder : "", (int)i,
                        inVecSize > 1 ? "." : "", inVecSize > 1 ? inOrder : "", (int)i );
            } else if(outVecSize == 3) {
                // shuffleSinglePatternV3dst
                sprintf( progLine, shuffleSinglePatternV3dst,
                        outVecSize > 1 ? "." : "", outVecSize > 1 ? shuffledOrder : "", (int)i,
                        inVecSize > 1 ? "." : "", inVecSize > 1 ? inOrder : "",
                        (int)i );
            } else {
                sprintf( progLine, shuffleSinglePattern,
                        outVecSize > 1 ? "." : "", outVecSize > 1 ? shuffledOrder : "", (int)i,
                        inVecSize > 1 ? "." : "", inVecSize > 1 ? inOrder : "", (int)i );
            }
        }
        else if( shuffleMode == kFunctionCallMode )
        {
            // log_info("About to make a shuffle line\n");
            // fflush(stdout);
            if(inVecSize == 3 && outVecSize == 3) { // swap last two
                sprintf( progLine, shuffleFnPatternV3srcV3dst,
                        outVecSize > 1 ? "." : "", outVecSize > 1 ? shuffledOrder : "", (int)i,
                        inVecSize > 1 ? "." : "", inVecSize > 1 ? inOrder : "",
                        (int)i );
            } else if(outVecSize == 3)  { // swap last two
                                          // log_info("Here\n\n");
                                          // fflush(stdout);
                sprintf( progLine, shuffleFnPatternV3dst,
                        outVecSize > 1 ? "." : "",
                        outVecSize > 1 ? shuffledOrder : "",
                        (int)i,
                        inVecSize > 1 ? "." : "",
                        inVecSize > 1 ? inOrder : "",
                        (int)i );
                // log_info("\n%s\n", progLine);
                // fflush(stdout);
            } else if(inVecSize == 3) {
                sprintf( progLine, shuffleFnPatternV3src,
                        outVecSize > 1 ? "." : "", outVecSize > 1 ? shuffledOrder : "", (int)i,
                        inVecSize > 1 ? "." : "", inVecSize > 1 ? inOrder : "", (int)i );
            } else  {
                sprintf( progLine, shuffleFnPattern,
                        outVecSize > 1 ? "." : "", outVecSize > 1 ? shuffledOrder : "", (int)i,
                        inVecSize > 1 ? "." : "", inVecSize > 1 ? inOrder : "", (int)i );
            }
        }
        else if( shuffleMode == kArrayAccessMode )
        { // now we want to replace inSizeName with inSizeNameShuffleFn
            int vectorSizeToCastTo = 16;
            cl_uint item;
            for (item =0; item<lengthToUse[i]; item++) {
                int absoluteIndex = i*(int)inVecSize+(int)inOrders[i][item];
                int castVectorIndex = absoluteIndex/vectorSizeToCastTo;
                size_t castElementIndex = absoluteIndex % vectorSizeToCastTo;
                ShuffleOrder myOutOrders, myInOrders;
                myOutOrders[0]  = outOrders[i][item];
                myInOrders[0] = castElementIndex;

                strcpy( inOrder, get_order_string( myInOrders, 1, 1, 0, d ) );
                strcpy( shuffledOrder, get_order_string( myOutOrders, 1, 1, 0, d ) );

                sprintf(progLine, "     tmp%s%s = ((__global %s%d *)source)[%d]%s%s;\n",
                        outVecSize > 1 ? "." : "", outVecSize > 1 ? shuffledOrder : "",
                        get_explicit_type_name( vecType ), vectorSizeToCastTo,
                        castVectorIndex,
                        vectorSizeToCastTo > 1 ? "." : "", vectorSizeToCastTo > 1 ? inOrder : "");
                strcat(kernelSource, progLine);
            }
            if(outVecSize == 3) {
                sprintf(progLine,"     vstore3(tmp, %d, (__global %s *)dest);\n",
                        i, get_explicit_type_name( vecType ));
                // probably don't need that last
                // cast to (__global %s *) where %s is get_explicit_type_name( vecType)
            } else {
                sprintf(progLine,"     dest[%d] = tmp;\n", i );
            }
        }
        else // shuffleMode == kBuiltInFnMode or kBuiltInDualInputFnMode
        {
            if(inVecSize == 3 || outVecSize == 3 ||
               inVecSize == 1 || outVecSize == 1) {
                // log_info("Skipping test for size 3\n");
                continue;
            }
            ExplicitType maskType = vecType;
            if( maskType == kFloat )
                maskType = kUInt;
            if( maskType == kDouble) {
                maskType = kULong;
            }

            char maskString[ 1024 ] = "";
            size_t maskSize = outVecSize;// ( shuffleMode == kBuiltInDualInputFnMode ) ? ( outVecSize << 1 ) : outVecSize;
            generate_shuffle_mask( maskString, maskSize, ( outOrders != NULL ) ? &outOrders[ i ] : NULL );

            // Set up a quick prefix, so mask gets unsigned type regardless of the input/output type
            char maskPrefix[ 2 ] = "u";
            if( get_explicit_type_name( maskType )[ 0 ] == 'u' )
                maskPrefix[ 0 ] = 0;

            char progLine2[ 10240 ];
            if( shuffleMode == kBuiltInDualInputFnMode )
            {
                sprintf( progLine2, shuffleBuiltInDualPattern, get_explicit_type_name( vecType ), inSizeName,
                        ( inVecSize == 3 ) ? "vload3( %ld, (__global %s *)source )" : "source[ %ld ]",
                        get_explicit_type_name( vecType ), inSizeName,
                        ( inVecSize == 3 ) ? "vload3( %ld, (__global %s *)secondSource )" : "secondSource[ %ld ]",
                        maskPrefix, get_explicit_type_name( maskType ), outSizeName, maskPrefix, get_explicit_type_name( maskType ), outSizeName,
                        maskString,
                        ( outVecSize == 3 ) ? "vstore3( tmp, %ld, (__global %s *)dest )" : "dest[ %ld ] = tmp" );

                if( outVecSize == 3 )
                {
                    if( inVecSize == 3 )
                        sprintf( progLine, progLine2, i, get_explicit_type_name( vecType ), i, get_explicit_type_name( vecType ), i, get_explicit_type_name( vecType ) );
                    else
                        sprintf( progLine, progLine2, i, i, i, get_explicit_type_name( vecType ) );
                }
                else
                {
                    if( inVecSize == 3 )
                        sprintf( progLine, progLine2, i, get_explicit_type_name( vecType ), i, get_explicit_type_name( vecType ), i );
                    else
                        sprintf( progLine, progLine2, i, i, i );
                }
            }
            else
            {
                sprintf( progLine2, shuffleBuiltInPattern, get_explicit_type_name( vecType ), inSizeName,
                        ( inVecSize == 3 ) ? "vload3( %ld, (__global %s *)source )" : "source[ %ld ]",
                        maskPrefix, get_explicit_type_name( maskType ), outSizeName, maskPrefix, get_explicit_type_name( maskType ), outSizeName,
                        maskString,
                        ( outVecSize == 3 ) ? "vstore3( tmp, %ld, (__global %s *)dest )" : "dest[ %ld ] = tmp" );

                if( outVecSize == 3 )
                {
                    if( inVecSize == 3 )
                        sprintf( progLine, progLine2, i, get_explicit_type_name( vecType ), i, get_explicit_type_name( vecType ) );
                    else
                        sprintf( progLine, progLine2, i, i, get_explicit_type_name( vecType ) );
                }
                else
                {
                    if( inVecSize == 3 )
                        sprintf( progLine, progLine2, i, get_explicit_type_name( vecType ), i );
                    else
                        sprintf( progLine, progLine2, i, i );
                }
            }
        }

        strcat( kernelSource, progLine );
        if (strlen(kernelSource) > 0.9*MAX_PROGRAM_SIZE)
            log_info("WARNING: Program has grown to 90%% (%d) of the defined max program size of %d\n", (int)strlen(kernelSource), (int)MAX_PROGRAM_SIZE);
    }
    strcat( kernelSource, shuffleKernelPattern[ 1 ] );

    // Print the kernel source
    if (PRINT_SHUFFLE_KERNEL_SOURCE)
        log_info( "Kernel:%s\n", kernelSource );

    /* Create kernel */
    programPtr = kernelSource;
    if( create_single_kernel_helper( context, outProgram, outKernel, 1, (const char **)&programPtr, "sample_test" ) )
    {
        return -1;
    }
    return 0;
}

int test_shuffle_dual_kernel(cl_context context, cl_command_queue queue,
                             ExplicitType vecType, size_t inVecSize, size_t outVecSize, cl_uint *lengthToUse, size_t numOrders,
                             ShuffleOrder *inOrderIdx, ShuffleOrder *outOrderIdx, bool inUseNumerics, bool outUseNumerics, MTdata d,
                             ShuffleMode shuffleMode = kNormalMode )
{
    clProgramWrapper program;
    clKernelWrapper kernel;
    int error;
    size_t threads[1], localThreads[1];
    size_t typeSize, outRealVecSize;
    clMemWrapper streams[ 3 ];

    /* Create the source */
    error = create_shuffle_kernel( context, &program, &kernel, &outRealVecSize, vecType,
                                  inVecSize, outVecSize, lengthToUse, inUseNumerics, outUseNumerics, numOrders, inOrderIdx, outOrderIdx,
                                  d, shuffleMode );
    if( error != 0 )
        return error;

    typeSize = get_explicit_type_size( vecType );

#if !(defined(_WIN32) && defined (_MSC_VER))
    cl_long inData[ inVecSize * numOrders ];
    cl_long inSecondData[ inVecSize * numOrders ];
    cl_long outData[ outRealVecSize * numOrders ];
#else
    cl_long* inData  = (cl_long*)_malloca(inVecSize * numOrders * sizeof(cl_long));
    cl_long* inSecondData  = (cl_long*)_malloca(inVecSize * numOrders * sizeof(cl_long));
    cl_long* outData = (cl_long*)_malloca(outRealVecSize * numOrders * sizeof(cl_long));
#endif
    memset(outData, 0, outRealVecSize * numOrders * sizeof(cl_long) );

    generate_random_data( vecType, (unsigned int)( numOrders * inVecSize ), d, inData );
    if( shuffleMode == kBuiltInDualInputFnMode )
        generate_random_data( vecType, (unsigned int)( numOrders * inVecSize ), d, inSecondData );

    streams[0] =
        clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
                       typeSize * inVecSize * numOrders, inData, &error);
    test_error( error, "Unable to create input stream" );

    streams[1] =
        clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
                       typeSize * outRealVecSize * numOrders, outData, &error);
    test_error( error, "Unable to create output stream" );

    int argIndex = 0;
    if( shuffleMode == kBuiltInDualInputFnMode )
    {
        streams[2] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
                                    typeSize * inVecSize * numOrders,
                                    inSecondData, &error);
        test_error( error, "Unable to create second input stream" );

        error = clSetKernelArg( kernel, argIndex++, sizeof( streams[ 2 ] ), &streams[ 2 ] );
        test_error( error, "Unable to set kernel argument" );
    }

    // Set kernel arguments
    error = clSetKernelArg( kernel, argIndex++, sizeof( streams[ 0 ] ), &streams[ 0 ] );
    test_error( error, "Unable to set kernel argument" );
    error = clSetKernelArg( kernel, argIndex++, sizeof( streams[ 1 ] ), &streams[ 1 ] );
    test_error( error, "Unable to set kernel argument" );


    /* Run the kernel */
    threads[0] = numOrders;

    error = get_max_common_work_group_size( context, kernel, threads[0], &localThreads[0] );
    test_error( error, "Unable to get work group size to use" );

    error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL );
    test_error( error, "Unable to execute test kernel" );


    // Read the results back
    error = clEnqueueReadBuffer( queue, streams[ 1 ], CL_TRUE, 0, typeSize * numOrders * outRealVecSize, outData, 0, NULL, NULL );
    test_error( error, "Unable to read results" );

    unsigned char *inDataPtr = (unsigned char *)inData;
    unsigned char *inSecondDataPtr = (unsigned char *)inSecondData;
    unsigned char *outDataPtr = (unsigned char *)outData;
    int ret = 0;
    int errors_printed = 0;
    for( size_t i = 0; i < numOrders; i++ )
    {
        unsigned char expected[ 1024 ];
        unsigned char temp[ 1024 ];
        memset(expected, 0, sizeof(expected));
        memset(temp, 0, sizeof(temp));
        if( shuffleMode == kBuiltInFnMode )
            shuffleVector( inDataPtr, expected, outOrderIdx[ i ], outVecSize, typeSize, lengthToUse[i] );
        else if( shuffleMode == kBuiltInDualInputFnMode )
            shuffleVectorDual( inDataPtr, inSecondDataPtr, expected, outOrderIdx[ i ], inVecSize, typeSize, lengthToUse[i] );
        else
        {
            shuffleVector( inDataPtr, temp, inOrderIdx[ i ], inVecSize, typeSize, lengthToUse[i] );
            shuffleVector2( temp, expected, outOrderIdx[ i ], outVecSize, typeSize, lengthToUse[i] );
        }

        if( memcmp( expected, outDataPtr, outVecSize * typeSize ) != 0 )
        {
            log_error( " ERROR: Shuffle test %d FAILED for %s (memory hex dump follows)\n", (int)i,
                      get_order_name( vecType, inVecSize, outVecSize, inOrderIdx[ i ], outOrderIdx[ i ], lengthToUse[i], d, inUseNumerics, outUseNumerics ) );

            print_hex_mem_dump( inDataPtr, ( shuffleMode == kBuiltInDualInputFnMode ) ? inSecondDataPtr : NULL, expected, outDataPtr, inVecSize, outVecSize, typeSize );

            if( ( shuffleMode == kBuiltInFnMode ) || ( shuffleMode == kBuiltInDualInputFnMode ) )
            {
                // Mask would've been different for every shuffle done, so we have to regen it to print it
                char maskString[ 1024 ];
                generate_shuffle_mask( maskString, outVecSize, ( outOrderIdx != NULL ) ? &outOrderIdx[ i ] : NULL );
                log_error( "        Mask:  %s\n", maskString );
            }

            ret++;
            errors_printed++;
            if (errors_printed > MAX_ERRORS_TO_PRINT)
            {
                log_info("Further errors suppressed.\n");
                return ret;
            }
        }
        inDataPtr += inVecSize * typeSize;
        inSecondDataPtr += inVecSize * typeSize;
        outDataPtr += outRealVecSize * typeSize;
    }

    return ret;
}

void    build_random_shuffle_order( ShuffleOrder &outIndices, unsigned int length, unsigned int selectLength, bool allowRepeats, MTdata d )
{
    char flags[ 16 ];

    memset( flags, 0, sizeof( flags ) );

    for( unsigned int i = 0; i < length; i++ )
    {
        char selector = (char)random_in_range( 0, selectLength - 1, d );
        if( !allowRepeats )
        {
            while( flags[ (int)selector ] )
                selector = (char)random_in_range( 0, selectLength - 1, d );
            flags[ (int)selector ] = true;
        }
        outIndices[ i ] = selector;
    }
}

class shuffleBuffer
{
public:

    shuffleBuffer( cl_context ctx, cl_command_queue queue, ExplicitType type, size_t inSize, size_t outSize, ShuffleMode mode )
    {
        mContext = ctx;
        mQueue = queue;
        mVecType = type;
        mInVecSize = inSize;
        mOutVecSize = outSize;
        mShuffleMode = mode;

        mCount = 0;

        // Here's the deal with mLengthToUse[i].
        // if you have, for instance
        // uchar4 dst;
        // uchar8 src;
        // you can do
        // src.s0213 = dst.s1045;
        // but you can also do
        // src.s02 = dst.s10;
        // which has a different effect
        // The intent with these "sub lengths" is to test all such
        // possibilities
        // Calculate a range of sub-lengths within the vector to copy.
        int i;
        size_t maxSize = (mInVecSize < mOutVecSize) ? mInVecSize : mOutVecSize;
        for(i=0; i<NUM_TESTS; i++)
        {
            // Built-in fns can't select sub-lengths (the mask must be the length of the dest vector).
            // Well, at least for these tests...
            if( ( mode == kBuiltInFnMode ) || ( mode == kBuiltInDualInputFnMode ) )
                mLengthToUse[i]    = (cl_int)mOutVecSize;
            else
            {
                mLengthToUse[i] = (cl_uint)(((double)i/NUM_TESTS)*maxSize) + 1;
                // Force the length to be a valid vector length.
                if( ( mLengthToUse[i] == 1 ) && ( mode != kBuiltInFnMode ) )
                    mLengthToUse[i] = 1;
                else if (mLengthToUse[i] < 4)
                    mLengthToUse[i] = 2;
                else if (mLengthToUse[i] < 8)
                    mLengthToUse[i] = 4;
                else if (mLengthToUse[i] < 16)
                    mLengthToUse[i] = 8;
                else
                    mLengthToUse[i] = 16;
            }
        }
    }

    int    AddRun( ShuffleOrder &inOrder, ShuffleOrder &outOrder, MTdata d )
    {
        memcpy( &mInOrders[ mCount ], &inOrder, sizeof( inOrder ) );
        memcpy( &mOutOrders[ mCount ], &outOrder, sizeof( outOrder ) );
        mCount++;

        if( mCount == NUM_TESTS )
            return Flush(d);

        return CL_SUCCESS;
    }

    int Flush( MTdata d )
    {
        int err = CL_SUCCESS;
        if( mCount > 0 )
        {
            err = test_shuffle_dual_kernel( mContext, mQueue, mVecType, mInVecSize, mOutVecSize, mLengthToUse,
                                           mCount, mInOrders, mOutOrders, true, true, d, mShuffleMode );
            mCount = 0;
        }
        return err;
    }

protected:
    cl_context            mContext;
    cl_command_queue    mQueue;
    ExplicitType        mVecType;
    size_t                mInVecSize, mOutVecSize, mCount;
    ShuffleMode            mShuffleMode;
    cl_uint             mLengthToUse[ NUM_TESTS ];

    ShuffleOrder        mInOrders[ NUM_TESTS ], mOutOrders[ NUM_TESTS ];
};


int test_shuffle_random(cl_device_id device, cl_context context, cl_command_queue queue, ShuffleMode shuffleMode, MTdata d )
{
    ExplicitType vecType[] = { kChar, kUChar, kShort, kUShort, kInt, kUInt, kLong, kULong, kFloat, kDouble };
    unsigned int vecSizes[] = { 1, 2, 3, 4, 8, 16, 0 };
    unsigned int srcIdx, dstIdx, typeIndex;
    int error = 0, totalError = 0, prevTotalError = 0;
    RandomSeed seed(gRandomSeed);

    for( typeIndex = 0; typeIndex < 10; typeIndex++ )
    {
        //log_info( "\n\t%s... ", get_explicit_type_name( vecType[ typeIndex ] ) );
        //fflush( stdout );
        if (vecType[typeIndex] == kDouble) {
            if (!is_extension_available(device, "cl_khr_fp64")) {
                log_info("Extension cl_khr_fp64 not supported; skipping double tests.\n");
                continue;
            }
            log_info("Testing doubles.\n");
        }

        if ((vecType[typeIndex] == kLong || vecType[typeIndex] == kULong) && !gHasLong )
        {
            log_info("Long types are unsupported, skipping.");
            continue;
        }

        error = 0;
        for( srcIdx = 0; vecSizes[ srcIdx ] != 0 /*&& error == 0*/; srcIdx++ )
        {
            for( dstIdx = 0; vecSizes[ dstIdx ] != 0 /*&& error == 0*/; dstIdx++ )
            {
                if( ( ( shuffleMode == kBuiltInDualInputFnMode ) || ( shuffleMode == kBuiltInFnMode ) ) &&
                   ( ( vecSizes[ dstIdx ] & 1 ) || ( vecSizes[ srcIdx ] & 1 ) ) )
                {
                    // Built-in shuffle functions don't work on size 1 (scalars) or size 3 (vec3s)
                    continue;
                }

                log_info("Testing [%s%d to %s%d]... ", get_explicit_type_name( vecType[ typeIndex ] ) , vecSizes[srcIdx], get_explicit_type_name( vecType[ typeIndex ] ) , vecSizes[dstIdx]);
                shuffleBuffer buffer( context, queue, vecType[ typeIndex ], vecSizes[ srcIdx ], vecSizes[ dstIdx ], shuffleMode );

                int numTests = NUM_TESTS*NUM_ITERATIONS_PER_TEST;
                for( int i = 0; i < numTests /*&& error == 0*/; i++ )
                {
                    ShuffleOrder src, dst;
                    if( shuffleMode == kBuiltInFnMode )
                    {
                        build_random_shuffle_order( dst, vecSizes[ dstIdx ], vecSizes[ srcIdx ], true, d );
                    }
                    else if(shuffleMode == kBuiltInDualInputFnMode)
                    {
                        build_random_shuffle_order(dst, vecSizes[dstIdx], 2*vecSizes[srcIdx], true, d);
                    }
                    else
                    {
                        build_random_shuffle_order( src, vecSizes[ dstIdx ], vecSizes[ srcIdx ], true, d );
                        build_random_shuffle_order( dst, vecSizes[ dstIdx ], vecSizes[ dstIdx ], false, d );
                    }

                    error = buffer.AddRun( src, dst, seed );
                    if (error)
                        totalError++;
                }
                int test_error = buffer.Flush(seed);
                if (test_error)
                    totalError++;

                if (totalError == prevTotalError)
                    log_info("\tPassed.\n");
                else
                {
                    log_error("\tFAILED.\n");
                    prevTotalError = totalError;
                }
            }
        }
    }
    return totalError;
}

int test_shuffle_copy(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
{
    RandomSeed seed(gRandomSeed);
    return test_shuffle_random( device, context, queue, kNormalMode, seed );
}

int test_shuffle_function_call(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
{
    RandomSeed seed(gRandomSeed);
    return test_shuffle_random( device, context, queue, kFunctionCallMode, seed );
}

int test_shuffle_array_cast(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
{
    RandomSeed seed(gRandomSeed);
    return test_shuffle_random( device, context, queue, kArrayAccessMode, seed );
}

int test_shuffle_built_in(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
{
    RandomSeed seed(gRandomSeed);
    return test_shuffle_random( device, context, queue, kBuiltInFnMode, seed );
}

int test_shuffle_built_in_dual_input(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
{
    RandomSeed seed(gRandomSeed);
    return test_shuffle_random( device, context, queue, kBuiltInDualInputFnMode, seed );
}

