/*
 * Copyright (c) 2018-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/runtime/CL/CLTensorAllocator.h"

#include "arm_compute/core/utils/misc/MMappedFile.h"
#include "arm_compute/runtime/BlobLifetimeManager.h"
#include "arm_compute/runtime/CL/CLBufferAllocator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
#include "arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/MemoryManagerOnDemand.h"
#include "arm_compute/runtime/PoolManager.h"
#include "tests/CL/CLAccessor.h"
#include "tests/Globals.h"
#include "tests/framework/Asserts.h"
#include "tests/framework/Macros.h"
#include "tests/validation/Validation.h"
#include "tests/validation/reference/ActivationLayer.h"

#include <memory>
#include <random>

namespace arm_compute
{
namespace test
{
namespace validation
{
namespace
{
cl_mem import_malloc_memory_helper(void *ptr, size_t size)
{
    const cl_import_properties_arm import_properties[] =
    {
        CL_IMPORT_TYPE_ARM,
        CL_IMPORT_TYPE_HOST_ARM,
        0
    };

    cl_int err = CL_SUCCESS;
    cl_mem buf = clImportMemoryARM(CLKernelLibrary::get().context().get(), CL_MEM_READ_WRITE, import_properties, ptr, size, &err);
    ARM_COMPUTE_ASSERT(err == CL_SUCCESS);

    return buf;
}

class DummyAllocator final : public IAllocator
{
public:
    DummyAllocator() = default;

    void *allocate(size_t size, size_t alignment) override
    {
        ++_n_calls;
        return _backend_allocator.allocate(size, alignment);
    }
    void free(void *ptr) override
    {
        return _backend_allocator.free(ptr);
    }
    std::unique_ptr<IMemoryRegion> make_region(size_t size, size_t alignment) override
    {
        // Needs to be implemented as is the one that is used internally by the CLTensorAllocator
        ++_n_calls;
        return _backend_allocator.make_region(size, alignment);
    }
    int get_n_calls() const
    {
        return _n_calls;
    }

private:
    int               _n_calls{};
    CLBufferAllocator _backend_allocator{};
};

void run_conv2d(std::shared_ptr<IMemoryManager> mm, IAllocator &mm_allocator)
{
    // Create tensors
    CLTensor src, weights, bias, dst;
    src.allocator()->init(TensorInfo(TensorShape(16U, 32U, 32U, 2U), 1, DataType::F32, DataLayout::NHWC));
    weights.allocator()->init(TensorInfo(TensorShape(16U, 3U, 3U, 32U), 1, DataType::F32, DataLayout::NHWC));
    bias.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::F32, DataLayout::NHWC));
    dst.allocator()->init(TensorInfo(TensorShape(32U, 32U, 32U, 2U), 1, DataType::F32, DataLayout::NHWC));

    // Create and configure function
    CLGEMMConvolutionLayer conv(mm);
    conv.configure(&src, &weights, &bias, &dst, PadStrideInfo(1U, 1U, 1U, 1U));

    // Allocate tensors
    src.allocator()->allocate();
    weights.allocator()->allocate();
    bias.allocator()->allocate();
    dst.allocator()->allocate();

    // Finalize memory manager
    if(mm != nullptr)
    {
        mm->populate(mm_allocator, 1 /* num_pools */);
        ARM_COMPUTE_EXPECT(mm->lifetime_manager()->are_all_finalized(), framework::LogLevel::ERRORS);
        ARM_COMPUTE_EXPECT(mm->pool_manager()->num_pools() == 1, framework::LogLevel::ERRORS);
    }

    conv.run();
}
} // namespace

TEST_SUITE(CL)
TEST_SUITE(UNIT)
TEST_SUITE(TensorAllocator)

/* Validate that an external global allocator can be used for all internal allocations */
TEST_CASE(ExternalGlobalAllocator, framework::DatasetMode::ALL)
{
    DummyAllocator global_tensor_alloc;
    CLTensorAllocator::set_global_allocator(&global_tensor_alloc);

    // Run a convolution
    run_conv2d(nullptr /* mm */, global_tensor_alloc);

    // Check that allocator has been called multiple times > 4
    ARM_COMPUTE_EXPECT(global_tensor_alloc.get_n_calls() > 4, framework::LogLevel::ERRORS);

    // Nullify global allocator
    CLTensorAllocator::set_global_allocator(nullptr);
}

/* Validate that an external global allocator can be used for the pool manager */
TEST_CASE(ExternalGlobalAllocatorMemoryPool, framework::DatasetMode::ALL)
{
    auto lifetime_mgr = std::make_shared<BlobLifetimeManager>();
    auto pool_mgr     = std::make_shared<PoolManager>();
    auto mm           = std::make_shared<MemoryManagerOnDemand>(lifetime_mgr, pool_mgr);

    DummyAllocator global_tensor_alloc;
    CLTensorAllocator::set_global_allocator(&global_tensor_alloc);

    // Run a convolution
    run_conv2d(mm, global_tensor_alloc);

    // Check that allocator has been called multiple times > 4
    ARM_COMPUTE_EXPECT(global_tensor_alloc.get_n_calls() > 4, framework::LogLevel::ERRORS);

    // Nullify global allocator
    CLTensorAllocator::set_global_allocator(nullptr);
}

/** Validates import memory interface when importing cl buffer objects */
TEST_CASE(ImportMemoryBuffer, framework::DatasetMode::ALL)
{
    // Init tensor info
    const TensorInfo info(TensorShape(24U, 16U, 3U), 1, DataType::F32);

    // Allocate memory buffer
    const size_t total_size = info.total_size();
    auto         buf        = cl::Buffer(CLScheduler::get().context(), CL_MEM_READ_WRITE, total_size);

    // Negative case : Import nullptr
    CLTensor t1;
    t1.allocator()->init(info);
    ARM_COMPUTE_ASSERT(!bool(t1.allocator()->import_memory(cl::Buffer())));
    ARM_COMPUTE_ASSERT(t1.info()->is_resizable());

    // Negative case : Import memory to a tensor that is memory managed
    CLTensor    t2;
    MemoryGroup mg;
    t2.allocator()->set_associated_memory_group(&mg);
    ARM_COMPUTE_ASSERT(!bool(t2.allocator()->import_memory(buf)));
    ARM_COMPUTE_ASSERT(t2.info()->is_resizable());

    // Negative case : Invalid buffer size
    CLTensor         t3;
    const TensorInfo info_neg(TensorShape(32U, 16U, 3U), 1, DataType::F32);
    t3.allocator()->init(info_neg);
    ARM_COMPUTE_ASSERT(!bool(t3.allocator()->import_memory(buf)));
    ARM_COMPUTE_ASSERT(t3.info()->is_resizable());

    // Positive case : Set raw pointer
    CLTensor t4;
    t4.allocator()->init(info);
    ARM_COMPUTE_ASSERT(bool(t4.allocator()->import_memory(buf)));
    ARM_COMPUTE_ASSERT(!t4.info()->is_resizable());
    ARM_COMPUTE_EXPECT(t4.cl_buffer().get() == buf.get(), framework::LogLevel::ERRORS);
    t4.allocator()->free();
    ARM_COMPUTE_ASSERT(t4.info()->is_resizable());
    ARM_COMPUTE_EXPECT(t4.cl_buffer().get() != buf.get(), framework::LogLevel::ERRORS);
}

/** Validates import memory interface when importing malloced memory */
TEST_CASE(ImportMemoryMalloc, framework::DatasetMode::ALL)
{
    // Check if import extension is supported
    if(!device_supports_extension(CLKernelLibrary::get().get_device(), "cl_arm_import_memory_host"))
    {
        return;
    }
    else
    {
        const ActivationLayerInfo act_info(ActivationLayerInfo::ActivationFunction::RELU);
        const TensorShape         shape     = TensorShape(24U, 16U, 3U);
        const DataType            data_type = DataType::F32;

        // Create tensor
        const TensorInfo info(shape, 1, data_type);
        CLTensor         tensor;
        tensor.allocator()->init(info);

        // Create and configure activation function
        CLActivationLayer act_func;
        act_func.configure(&tensor, nullptr, act_info);

        // Allocate and import tensor
        const size_t total_size_in_elems = tensor.info()->tensor_shape().total_size();
        const size_t total_size_in_bytes = tensor.info()->total_size();
        const size_t alignment           = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
        size_t       space               = total_size_in_bytes + alignment;
        auto         raw_data            = std::make_unique<uint8_t[]>(space);

        void *aligned_ptr = raw_data.get();
        std::align(alignment, total_size_in_bytes, aligned_ptr, space);

        cl::Buffer wrapped_buffer(import_malloc_memory_helper(aligned_ptr, total_size_in_bytes));
        ARM_COMPUTE_ASSERT(bool(tensor.allocator()->import_memory(wrapped_buffer)));
        ARM_COMPUTE_ASSERT(!tensor.info()->is_resizable());

        // Fill tensor
        std::uniform_real_distribution<float> distribution(-5.f, 5.f);
        std::mt19937                          gen(library->seed());
        auto                                 *typed_ptr = reinterpret_cast<float *>(aligned_ptr);
        for(unsigned int i = 0; i < total_size_in_elems; ++i)
        {
            typed_ptr[i] = distribution(gen);
        }

        // Execute function and sync
        act_func.run();
        CLScheduler::get().sync();

        // Validate result by checking that the input has no negative values
        for(unsigned int i = 0; i < total_size_in_elems; ++i)
        {
            ARM_COMPUTE_EXPECT(typed_ptr[i] >= 0, framework::LogLevel::ERRORS);
        }

        // Release resources
        tensor.allocator()->free();
        ARM_COMPUTE_EXPECT(tensor.info()->is_resizable(), framework::LogLevel::ERRORS);
    }
}

#if !defined(BARE_METAL)
/** Validates import memory interface when importing memory mapped objects */
TEST_CASE(ImportMemoryMappedFile, framework::DatasetMode::ALL)
{
    // Check if import extension is supported
    if(!device_supports_extension(CLKernelLibrary::get().get_device(), "cl_arm_import_memory_host"))
    {
        return;
    }
    else
    {
        const ActivationLayerInfo act_info(ActivationLayerInfo::ActivationFunction::RELU);
        const TensorShape         shape     = TensorShape(24U, 16U, 3U);
        const DataType            data_type = DataType::F32;

        // Create tensor
        const TensorInfo info(shape, 1, data_type);
        CLTensor         tensor;
        tensor.allocator()->init(info);

        // Create and configure activation function
        CLActivationLayer act_func;
        act_func.configure(&tensor, nullptr, act_info);

        // Get number of elements
        const size_t total_size_in_elems = tensor.info()->tensor_shape().total_size();
        const size_t total_size_in_bytes = tensor.info()->total_size();

        // Create file
        std::ofstream output_file("test_mmap_import.bin", std::ios::binary | std::ios::out);
        output_file.seekp(total_size_in_bytes - 1);
        output_file.write("", 1);
        output_file.close();

        // Map file
        utils::mmap_io::MMappedFile mmapped_file("test_mmap_import.bin", 0 /** Whole file */, 0);
        ARM_COMPUTE_ASSERT(mmapped_file.is_mapped());
        unsigned char *data = mmapped_file.data();

        cl::Buffer wrapped_buffer(import_malloc_memory_helper(data, total_size_in_bytes));
        ARM_COMPUTE_ASSERT(bool(tensor.allocator()->import_memory(wrapped_buffer)));
        ARM_COMPUTE_ASSERT(!tensor.info()->is_resizable());

        // Fill tensor
        std::uniform_real_distribution<float> distribution(-5.f, 5.f);
        std::mt19937                          gen(library->seed());
        auto                                 *typed_ptr = reinterpret_cast<float *>(data);
        for(unsigned int i = 0; i < total_size_in_elems; ++i)
        {
            typed_ptr[i] = distribution(gen);
        }

        // Execute function and sync
        act_func.run();
        CLScheduler::get().sync();

        // Validate result by checking that the input has no negative values
        for(unsigned int i = 0; i < total_size_in_elems; ++i)
        {
            ARM_COMPUTE_EXPECT(typed_ptr[i] >= 0, framework::LogLevel::ERRORS);
        }

        // Release resources
        tensor.allocator()->free();
        ARM_COMPUTE_ASSERT(tensor.info()->is_resizable());
    }
}
#endif // !defined(BARE_METAL)

/** Validates symmetric per channel quantization */
TEST_CASE(Symm8PerChannelQuantizationInfo, framework::DatasetMode::ALL)
{
    // Create tensor
    CLTensor                 tensor;
    const std::vector<float> scale = { 0.25f, 1.4f, 3.2f, 2.3f, 4.7f };
    const TensorInfo         info(TensorShape(32U, 16U), 1, DataType::QSYMM8_PER_CHANNEL, QuantizationInfo(scale));
    tensor.allocator()->init(info);

    // Check quantization information
    ARM_COMPUTE_EXPECT(!tensor.info()->quantization_info().empty(), framework::LogLevel::ERRORS);
    ARM_COMPUTE_EXPECT(!tensor.info()->quantization_info().scale().empty(), framework::LogLevel::ERRORS);
    ARM_COMPUTE_EXPECT(tensor.info()->quantization_info().scale().size() == scale.size(), framework::LogLevel::ERRORS);
    ARM_COMPUTE_EXPECT(tensor.info()->quantization_info().offset().empty(), framework::LogLevel::ERRORS);

    CLQuantization quantization = tensor.quantization();
    ARM_COMPUTE_ASSERT(quantization.scale != nullptr);
    ARM_COMPUTE_ASSERT(quantization.offset != nullptr);

    // Check OpenCL quantization arrays before allocating
    ARM_COMPUTE_EXPECT(quantization.scale->max_num_values() == 0, framework::LogLevel::ERRORS);
    ARM_COMPUTE_EXPECT(quantization.offset->max_num_values() == 0, framework::LogLevel::ERRORS);

    // Check OpenCL quantization arrays after allocating
    tensor.allocator()->allocate();
    ARM_COMPUTE_EXPECT(quantization.scale->max_num_values() == scale.size(), framework::LogLevel::ERRORS);
    ARM_COMPUTE_EXPECT(quantization.offset->max_num_values() == 0, framework::LogLevel::ERRORS);

    // Validate that the scale values are the same
    auto  cl_scale_buffer = quantization.scale->cl_buffer();
    void *mapped_ptr      = CLScheduler::get().queue().enqueueMapBuffer(cl_scale_buffer, CL_TRUE, CL_MAP_READ, 0, scale.size());
    auto  cl_scale_ptr    = static_cast<float *>(mapped_ptr);
    for(unsigned int i = 0; i < scale.size(); ++i)
    {
        ARM_COMPUTE_EXPECT(cl_scale_ptr[i] == scale[i], framework::LogLevel::ERRORS);
    }
    CLScheduler::get().queue().enqueueUnmapMemObject(cl_scale_buffer, mapped_ptr);
}

TEST_SUITE_END() // TensorAllocator
TEST_SUITE_END() // UNIT
TEST_SUITE_END() // CL
} // namespace validation
} // namespace test
} // namespace arm_compute