/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define LOG_TAG "Operations"

#include "Pooling.h"

#include <vector>

#include "OperationResolver.h"
#include "Tracing.h"
#include "nnapi/Validation.h"

#ifdef NN_INCLUDE_CPU_IMPLEMENTATION
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wunused-parameter"
#pragma clang diagnostic ignored "-Wsign-compare"
#pragma clang diagnostic ignored "-Winvalid-partial-specialization"
#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
#include <tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h>
#pragma clang diagnostic pop

#include "CpuOperationUtils.h"
#endif  // NN_INCLUDE_CPU_IMPLEMENTATION

namespace android {
namespace nn {

namespace pooling {

#ifdef NN_INCLUDE_CPU_IMPLEMENTATION
namespace {

// Parameters of a 2-D pooling operation, decoded from the operation's scalar inputs.
//
// All numeric members carry a default of 0 so that no member is ever read with an
// indeterminate value. Without the defaults, an input count of 9 takes the implicit-padding
// branch of initialize() but skips the explicit padding calculation, which leaves the padding
// members unset when they are range-checked.
// NOTE(review): presumably operation validation only admits 7, 8, 10 or 11 inputs - confirm.
struct PoolingParam {
    int32_t padding_left = 0, padding_right = 0;
    int32_t padding_top = 0, padding_bottom = 0;
    int32_t stride_width = 0, stride_height = 0;
    int32_t filter_width = 0, filter_height = 0;
    int32_t activation = 0;
    bool useNchw = false;

    // Reads the pooling parameters from `context` and range-checks them.
    //
    // With 10 or more inputs the explicit scheme is used: inputs 1-4 are the left, right, top
    // and bottom paddings, 5-6 the strides (width, height), 7-8 the filter size (width,
    // height), 9 the activation, and input 10 (only when there are exactly 11 inputs) the
    // layout flag.
    //
    // Otherwise the implicit scheme is used: input 1 is the implicit padding scheme, 2-3 the
    // strides, 4-5 the filter size, 6 the activation, and input 7 (only when there are exactly
    // 8 inputs) the layout flag. The explicit paddings are then derived from the input size.
    //
    // Returns false (through NN_RET_CHECK_*) when any parameter is out of range.
    bool initialize(const IOperationExecutionContext* context) {
        uint32_t inCount = context->getNumInputs();
        int32_t padding_implicit = 0;
        if (inCount >= 10) {
            padding_left = context->getInputValue<int32_t>(1);
            padding_right = context->getInputValue<int32_t>(2);
            padding_top = context->getInputValue<int32_t>(3);
            padding_bottom = context->getInputValue<int32_t>(4);
            stride_width = context->getInputValue<int32_t>(5);
            stride_height = context->getInputValue<int32_t>(6);
            filter_width = context->getInputValue<int32_t>(7);
            filter_height = context->getInputValue<int32_t>(8);
            activation = context->getInputValue<int32_t>(9);
            if (inCount == 11) {
                useNchw = context->getInputValue<bool>(10);
            }
        } else {
            padding_implicit = context->getInputValue<int32_t>(1);
            stride_width = context->getInputValue<int32_t>(2);
            stride_height = context->getInputValue<int32_t>(3);
            filter_width = context->getInputValue<int32_t>(4);
            filter_height = context->getInputValue<int32_t>(5);
            activation = context->getInputValue<int32_t>(6);
            if (inCount == 8) {
                useNchw = context->getInputValue<bool>(7);
            }
        }
        if (inCount <= 8) {
            // Implicit scheme: turn the padding scheme into explicit per-side paddings. The
            // spatial dimension indices depend on the layout flag read above.
            Shape inputShape = context->getInputShape(kInputTensor);
            int32_t input_height = getSizeOfDimension(inputShape, useNchw ? 2 : 1);
            int32_t input_width = getSizeOfDimension(inputShape, useNchw ? 3 : 2);
            calculateExplicitPadding(input_width, stride_width, filter_width, padding_implicit,
                                     &padding_left, &padding_right);
            calculateExplicitPadding(input_height, stride_height, filter_height, padding_implicit,
                                     &padding_top, &padding_bottom);
        }
        NN_RET_CHECK_GE(padding_left, 0);
        NN_RET_CHECK_GE(padding_right, 0);
        NN_RET_CHECK_GE(padding_top, 0);
        NN_RET_CHECK_GE(padding_bottom, 0);
        NN_RET_CHECK_GT(stride_width, 0);
        NN_RET_CHECK_GT(stride_height, 0);
        NN_RET_CHECK_GT(filter_width, 0);
        NN_RET_CHECK_GT(filter_height, 0);
        NN_RET_CHECK_GE(activation, 0);
        // Each padding must be strictly smaller than the filter extent along the same axis.
        NN_RET_CHECK_GT(filter_width, padding_left);
        NN_RET_CHECK_GT(filter_width, padding_right);
        NN_RET_CHECK_GT(filter_height, padding_top);
        NN_RET_CHECK_GT(filter_height, padding_bottom);
        return true;
    }

    // Builds the tflite::PoolParams equivalent of these parameters.
    //
    // Only the left and top paddings are forwarded (narrowed to int16_t). The activation range
    // is filled in according to the operand type of `output`: the quantized bounds for the two
    // quant8 types, the float bounds for every other type.
    tflite::PoolParams toTfliteParam(const Shape& output) const {
        tflite::PoolParams params = {
                .padding_values = {.width = static_cast<int16_t>(padding_left),
                                   .height = static_cast<int16_t>(padding_top),
                                   .width_offset = 0,
                                   .height_offset = 0},
                .stride_height = stride_height,
                .stride_width = stride_width,
                .filter_height = filter_height,
                .filter_width = filter_width,
        };
        if (output.type == OperandType::TENSOR_QUANT8_ASYMM) {
            int32_t output_activation_min = 0;
            int32_t output_activation_max = 0;
            CalculateActivationRangeUint8(activation, output, &output_activation_min,
                                          &output_activation_max);
            params.quantized_activation_min = output_activation_min;
            params.quantized_activation_max = output_activation_max;
        } else if (output.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
            int32_t output_activation_min = 0;
            int32_t output_activation_max = 0;
            CalculateActivationRangeInt8(activation, output, &output_activation_min,
                                         &output_activation_max);
            params.quantized_activation_min = output_activation_min;
            params.quantized_activation_max = output_activation_max;
        } else {
            // Initialized like the quantized branches so the bounds are never indeterminate,
            // whatever the helper does with an activation value it does not handle.
            float output_activation_min = 0.0f;
            float output_activation_max = 0.0f;
            CalculateActivationRangeFloat(activation, &output_activation_min,
                                          &output_activation_max);
            params.float_activation_min = output_activation_min;
            params.float_activation_max = output_activation_max;
        }
        return params;
    }
};

bool averagePoolNhwc(const float* inputData, const Shape& inputShape, const PoolingParam& param,
                     float* outputData, const Shape& outputShape) {
    NNTRACE_TRANS("averagePoolFloat32");
    auto op_params = param.toTfliteParam(outputShape);
    NNTRACE_COMP_SWITCH("optimized_ops::AveragePool");
    tflite::optimized_ops::AveragePool(op_params, convertShapeToTflshape(inputShape), inputData,
                                       convertShapeToTflshape(outputShape), outputData);
    return true;
}

bool averagePoolNhwc(const _Float16* inputData, const Shape& inputShape, const PoolingParam& param,
                     _Float16* outputData, const Shape& outputShape) {
    NNTRACE_TRANS("averagePoolFloat16");
    std::vector<float> inputDataFloat32(getNumberOfElements(inputShape));
    std::vector<float> outputDataFloat32(getNumberOfElements(outputShape));

    convertFloat16ToFloat32(inputData, &inputDataFloat32);
    averagePoolNhwc(inputDataFloat32.data(), inputShape, param, outputDataFloat32.data(),
                    outputShape);
    convertFloat32ToFloat16(outputDataFloat32, outputData);
    return true;
}

bool averagePoolNhwc(const uint8_t* inputData, const Shape& inputShape, const PoolingParam& param,
                     uint8_t* outputData, const Shape& outputShape) {
    NNTRACE_TRANS("averagePoolQuant8");
    auto op_params = param.toTfliteParam(outputShape);
    NNTRACE_COMP_SWITCH("optimized_ops::AveragePool");
    tflite::optimized_ops::AveragePool(op_params, convertShapeToTflshape(inputShape), inputData,
                                       convertShapeToTflshape(outputShape), outputData);
    return true;
}

bool averagePoolNhwc(const int8_t* inputData, const Shape& inputShape, const PoolingParam& param,
                     int8_t* outputData, const Shape& outputShape) {
    NNTRACE_TRANS("averagePoolQuant8Signed");
    auto op_params = param.toTfliteParam(outputShape);
    NNTRACE_COMP_SWITCH("optimized_integer_ops::AveragePool");
    // We are using reference implementation of the AveragePool op because the
    // optimized version fails to pass some of the quantization coupling tests.
    tflite::reference_integer_ops::AveragePool(op_params, convertShapeToTflshape(inputShape),
                                               inputData, convertShapeToTflshape(outputShape),
                                               outputData);
    return true;
}

bool l2PoolNhwc(const float* inputData, const Shape& inputShape, const PoolingParam& param,
                float* outputData, const Shape& outputShape) {
    NNTRACE_TRANS("l2PoolFloat32");
    auto op_params = param.toTfliteParam(outputShape);
    NNTRACE_COMP_SWITCH("optimized_ops::L2Pool");
    tflite::optimized_ops::L2Pool(op_params, convertShapeToTflshape(inputShape), inputData,
                                  convertShapeToTflshape(outputShape), outputData);
    return true;
}

bool l2PoolNhwc(const _Float16* inputData, const Shape& inputShape, const PoolingParam& param,
                _Float16* outputData, const Shape& outputShape) {
    NNTRACE_TRANS("l2PoolFloat16");
    std::vector<float> inputDataFloat32(getNumberOfElements(inputShape));
    std::vector<float> outputDataFloat32(getNumberOfElements(outputShape));

    convertFloat16ToFloat32(inputData, &inputDataFloat32);
    l2PoolNhwc(inputDataFloat32.data(), inputShape, param, outputDataFloat32.data(), outputShape);
    convertFloat32ToFloat16(outputDataFloat32, outputData);
    return true;
}

bool maxPoolNhwc(const float* inputData, const Shape& inputShape, const PoolingParam& param,
                 float* outputData, const Shape& outputShape) {
    NNTRACE_TRANS("maxPoolFloat32");
    auto op_params = param.toTfliteParam(outputShape);
    NNTRACE_COMP_SWITCH("optimized_ops::MaxPool");
    tflite::optimized_ops::MaxPool(op_params, convertShapeToTflshape(inputShape), inputData,
                                   convertShapeToTflshape(outputShape), outputData);
    return true;
}

bool maxPoolNhwc(const uint8_t* inputData, const Shape& inputShape, const PoolingParam& param,
                 uint8_t* outputData, const Shape& outputShape) {
    NNTRACE_TRANS("maxPoolQuant8");
    auto op_params = param.toTfliteParam(outputShape);
    NNTRACE_COMP_SWITCH("optimized_ops::MaxPool");
    tflite::optimized_ops::MaxPool(op_params, convertShapeToTflshape(inputShape), inputData,
                                   convertShapeToTflshape(outputShape), outputData);
    return true;
}

bool maxPoolNhwc(const int8_t* inputData, const Shape& inputShape, const PoolingParam& param,
                 int8_t* outputData, const Shape& outputShape) {
    NNTRACE_TRANS("maxPoolQuant8Signed");
    auto op_params = param.toTfliteParam(outputShape);
    NNTRACE_COMP_SWITCH("optimized_integer_ops::MaxPool");
    // We are using reference implementation of the MaxPool op because the
    // optimized version fails to pass some of the quantization coupling tests.
    tflite::reference_integer_ops::MaxPool(op_params, convertShapeToTflshape(inputShape), inputData,
                                           convertShapeToTflshape(outputShape), outputData);
    return true;
}

bool maxPoolNhwc(const _Float16* inputData, const Shape& inputShape, const PoolingParam& param,
                 _Float16* outputData, const Shape& outputShape) {
    NNTRACE_TRANS("maxPoolFloat16");
    std::vector<float> inputData_float32(getNumberOfElements(inputShape));
    std::vector<float> outputData_float32(getNumberOfElements(outputShape));

    convertFloat16ToFloat32(inputData, &inputData_float32);
    maxPoolNhwc(inputData_float32.data(), inputShape, param, outputData_float32.data(),
                outputShape);
    convertFloat32ToFloat16(outputData_float32, outputData);
    return true;
}

template <typename T>
bool averagePool(const T* inputData, const Shape& inputShape, const PoolingParam& param,
                 T* outputData, const Shape& outputShape) {
    InputWithLayout<T> input(param.useNchw);
    OutputWithLayout<T> output(param.useNchw);
    NN_RET_CHECK(input.initialize(inputData, inputShape));
    NN_RET_CHECK(output.initialize(outputData, outputShape));
    NN_RET_CHECK(averagePoolNhwc(input.getNhwcBuffer(), input.getNhwcShape(), param,
                                 output.getNhwcBuffer(), output.getNhwcShape()));
    NN_RET_CHECK(output.commit());
    return true;
}

template <typename T>
bool l2Pool(const T* inputData, const Shape& inputShape, const PoolingParam& param, T* outputData,
            const Shape& outputShape) {
    InputWithLayout<T> input(param.useNchw);
    OutputWithLayout<T> output(param.useNchw);
    NN_RET_CHECK(input.initialize(inputData, inputShape));
    NN_RET_CHECK(output.initialize(outputData, outputShape));
    NN_RET_CHECK(l2PoolNhwc(input.getNhwcBuffer(), input.getNhwcShape(), param,
                            output.getNhwcBuffer(), output.getNhwcShape()));
    NN_RET_CHECK(output.commit());
    return true;
}

template <typename T>
bool maxPool(const T* inputData, const Shape& inputShape, const PoolingParam& param, T* outputData,
             const Shape& outputShape) {
    InputWithLayout<T> input(param.useNchw);
    OutputWithLayout<T> output(param.useNchw);
    NN_RET_CHECK(input.initialize(inputData, inputShape));
    NN_RET_CHECK(output.initialize(outputData, outputShape));
    NN_RET_CHECK(maxPoolNhwc(input.getNhwcBuffer(), input.getNhwcShape(), param,
                             output.getNhwcBuffer(), output.getNhwcShape()));
    NN_RET_CHECK(output.commit());
    return true;
}

}  // namespace

// Checks the input tensor and pooling parameters, then sets the output shape.
//
// The input must be 4-D with non-zero height, width and channel extents; the batch extent is
// the only one allowed to be zero. The output keeps the input's batch and channel extents and
// takes its spatial extents from computeOutSize().
bool prepare(IOperationExecutionContext* context) {
    const Shape input = context->getInputShape(kInputTensor);
    NN_RET_CHECK_EQ(getNumberOfDimensions(input), 4u);

    PoolingParam param;
    NN_RET_CHECK(param.initialize(context));

    // Where each extent lives depends on the layout flag.
    const bool nchw = param.useNchw;
    const int heightDim = nchw ? 2 : 1;
    const int widthDim = nchw ? 3 : 2;
    const int channelDim = nchw ? 1 : 3;

    const uint32_t batches = getSizeOfDimension(input, 0);
    const uint32_t height = getSizeOfDimension(input, heightDim);
    const uint32_t width = getSizeOfDimension(input, widthDim);
    const uint32_t channels = getSizeOfDimension(input, channelDim);
    NN_RET_CHECK_GT(height, 0u);
    NN_RET_CHECK_GT(width, 0u);
    NN_RET_CHECK_GT(channels, 0u);

    const uint32_t outWidth = computeOutSize(width, param.filter_width, param.stride_width,
                                             param.padding_left, param.padding_right);
    const uint32_t outHeight = computeOutSize(height, param.filter_height, param.stride_height,
                                              param.padding_top, param.padding_bottom);

    // Start from a copy of the input shape so every field other than the dimensions carries over.
    Shape output = input;
    if (!nchw) {
        output.dimensions = {batches, outHeight, outWidth, channels};
    } else {
        output.dimensions = {batches, channels, outHeight, outWidth};
    }
    return context->setOutputShape(kOutputTensor, output);
}

// Expands to one `case` label of a switch over the input operand type. The case returns the
// result of calling `name` with the input buffer and shape, the pooling parameters, and the
// output buffer and shape, with both buffers typed as `cppType`.
// The expansion site must have `context` and `param` in scope. No trailing semicolon is
// included; the user supplies it.
#define POOLING_DISPATCH_INPUT_TYPE(name, type, cppType)              \
    case OperandType::type:                                           \
        return name(context->getInputBuffer<cppType>(kInputTensor),   \
                    context->getInputShape(kInputTensor), param,      \
                    context->getOutputBuffer<cppType>(kOutputTensor), \
                    context->getOutputShape(kOutputTensor))

// Runs AVERAGE_POOL_2D on the CPU for float32, float16, quant8 and signed quant8 inputs.
// Returns true without doing any work when the output tensor has no elements, and false for
// invalid pooling parameters or an unsupported input type.
bool executeAveragePool(IOperationExecutionContext* context) {
    // An output with zero elements leaves nothing to compute.
    const Shape outputShape = context->getOutputShape(kOutputTensor);
    if (getNumberOfElements(outputShape) == 0) {
        return true;
    }

    PoolingParam param;
    NN_RET_CHECK(param.initialize(context));

    const OperandType inputType = context->getInputType(kInputTensor);
    switch (inputType) {
        POOLING_DISPATCH_INPUT_TYPE(averagePool, TENSOR_FLOAT32, float);
        POOLING_DISPATCH_INPUT_TYPE(averagePool, TENSOR_FLOAT16, _Float16);
        POOLING_DISPATCH_INPUT_TYPE(averagePool, TENSOR_QUANT8_ASYMM, uint8_t);
        POOLING_DISPATCH_INPUT_TYPE(averagePool, TENSOR_QUANT8_ASYMM_SIGNED, int8_t);
        default:
            NN_RET_CHECK_FAIL() << "Unsupported tensor type for operation AVERAGE_POOL_2D";
    }
}

// Runs L2_POOL_2D on the CPU for float32 and float16 inputs.
// Returns true without doing any work when the output tensor has no elements, and false for
// invalid pooling parameters or an unsupported input type.
bool executeL2Pool(IOperationExecutionContext* context) {
    // An output with zero elements leaves nothing to compute.
    const Shape outputShape = context->getOutputShape(kOutputTensor);
    if (getNumberOfElements(outputShape) == 0) {
        return true;
    }

    PoolingParam param;
    NN_RET_CHECK(param.initialize(context));

    const OperandType inputType = context->getInputType(kInputTensor);
    switch (inputType) {
        POOLING_DISPATCH_INPUT_TYPE(l2Pool, TENSOR_FLOAT32, float);
        POOLING_DISPATCH_INPUT_TYPE(l2Pool, TENSOR_FLOAT16, _Float16);
        default:
            NN_RET_CHECK_FAIL() << "Unsupported tensor type for operation L2_POOL_2D";
    }
}

// Runs MAX_POOL_2D on the CPU for float32, float16, quant8 and signed quant8 inputs.
// Returns true without doing any work when the output tensor has no elements, and false for
// invalid pooling parameters or an unsupported input type.
bool executeMaxPool(IOperationExecutionContext* context) {
    // An output with zero elements leaves nothing to compute.
    const Shape outputShape = context->getOutputShape(kOutputTensor);
    if (getNumberOfElements(outputShape) == 0) {
        return true;
    }

    PoolingParam param;
    NN_RET_CHECK(param.initialize(context));

    const OperandType inputType = context->getInputType(kInputTensor);
    switch (inputType) {
        POOLING_DISPATCH_INPUT_TYPE(maxPool, TENSOR_FLOAT32, float);
        POOLING_DISPATCH_INPUT_TYPE(maxPool, TENSOR_FLOAT16, _Float16);
        POOLING_DISPATCH_INPUT_TYPE(maxPool, TENSOR_QUANT8_ASYMM, uint8_t);
        POOLING_DISPATCH_INPUT_TYPE(maxPool, TENSOR_QUANT8_ASYMM_SIGNED, int8_t);
        default:
            NN_RET_CHECK_FAIL() << "Unsupported tensor type for operation MAX_POOL_2D";
    }
}
#endif  // NN_INCLUDE_CPU_IMPLEMENTATION

#undef POOLING_DISPATCH_INPUT_TYPE

}  // namespace pooling

// Register the three pooling operations. They share pooling::prepare and differ only in the
// execute function. Each is registered with allowZeroSizedInput = true; the execute functions
// return early when the output tensor has no elements.
NN_REGISTER_OPERATION_DEFAULT_VALIDATION(AVERAGE_POOL_2D, pooling::prepare,
                                         pooling::executeAveragePool, .allowZeroSizedInput = true);
NN_REGISTER_OPERATION_DEFAULT_VALIDATION(L2_POOL_2D, pooling::prepare, pooling::executeL2Pool,
                                         .allowZeroSizedInput = true);
NN_REGISTER_OPERATION_DEFAULT_VALIDATION(MAX_POOL_2D, pooling::prepare, pooling::executeMaxPool,
                                         .allowZeroSizedInput = true);

}  // namespace nn
}  // namespace android
