/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef ANDROID_PACKAGES_MODULES_NEURALNETWORKS_COMMON_CPU_EXECUTOR_H
#define ANDROID_PACKAGES_MODULES_NEURALNETWORKS_COMMON_CPU_EXECUTOR_H

#include <android-base/macros.h>
#include <nnapi/Types.h>

#include <algorithm>
#include <memory>
#include <optional>
#include <vector>

#include "ControlFlow.h"
#include "LegacyUtils.h"
#include "OperationResolver.h"
#include "OperationsExecutionUtils.h"

namespace android {
namespace nn {

// Information we maintain about each operand during execution;
// some of these fields may change as execution proceeds.
struct RunTimeOperandInfo {
    // TODO Storing the type here is redundant, as it won't change during execution.
    OperandType type;
    // The dimensions of the operand, which can change at runtime.  We keep the
    // type in this struct as well because it's useful to pass together with
    // the dimensions to the functions implementing the operators.
    //
    // A dimension being zero has different meanings for different operands at different stages:
    // - Model inputs:
    //   * Specified in model: implies "dynamic", and must be fully-specified in request.
    //   * Specified in request: illegal.
    // - Constant operands: illegal.
    // - Model outputs and internal operands:
    //   * Before evaluation: implies unknown and to be deduced from execution.
    //   * After evaluation:
    //     - If isSufficient reports true: the tensor is zero-sized.
    //     - Otherwise: implies unknown.
    std::vector<uint32_t> dimensions;

    float scale;
    int32_t zeroPoint;
    // Where the operand's data is stored.  Check the corresponding
    // location information in the model to figure out if this points
    // to memory we have allocated for a temporary operand.
    uint8_t* buffer;  // TODO(b/148273353): Change the type to void*.
    // The length of the buffer.
    uint32_t length;
    // Whether this is a temporary variable, a model input, a constant, etc.
    Operand::LifeTime lifetime;
    // Keeps track of how many operations have yet to make use
    // of this temporary variable.  When the count is decremented to 0,
    // we free the buffer.  For non-temporary variables, this count is
    // always 0.
    uint32_t numberOfUsesLeft;

    Operand::ExtraParams extraParams;

    Shape shape() const {
        return {
                .type = type,
                .dimensions = dimensions,
                .scale = scale,
                .offset = zeroPoint,
                .extraParams = extraParams,
        };
    }

    bool isSufficient() const {
        if (isExtension(type)) {
            // We don't know sizes of extension types.
            return true;
        }
        return length >= nonExtensionOperandSizeOfData(type, dimensions);
    }
};
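
// A minimal sketch (illustrative only, not part of this header's API) of a
// helper an operation implementation might write on top of RunTimeOperandInfo.
// Any dimension of 0 yields a count of 0, matching the zero-sized-tensor
// convention described above.
//
//   uint32_t numElements(const RunTimeOperandInfo& info) {
//       uint32_t count = 1;
//       for (uint32_t d : info.dimensions) count *= d;
//       return count;
//   }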

// Used to keep a pointer to one of the memory pools.
//
// RunTimePoolInfo references a region of memory. Other RunTimePoolInfo objects
// may reference the same region of memory by either:
// (1) copying an existing RunTimePoolInfo object, or
// (2) creating multiple RunTimePoolInfo objects from the same memory resource
//     (e.g., "createFromMemory" or "createFromExistingBuffer")
//
// If the underlying region of memory is mapped by "createFromMemory", the
// mapping will be sustained until it is no longer referenced by any
// RunTimePoolInfo objects.
class RunTimePoolInfo {
   public:
    static std::optional<RunTimePoolInfo> createFromMemory(const SharedMemory& memory);
    static RunTimePoolInfo createFromExistingBuffer(uint8_t* buffer, uint32_t size = 0);

    uint8_t* getBuffer() const;
    bool flush() const;
    const SharedMemory& getMemory() const;
    uint32_t getSize() const;

   private:
    class RunTimePoolInfoImpl;
    RunTimePoolInfo(const std::shared_ptr<const RunTimePoolInfoImpl>& impl);

    std::shared_ptr<const RunTimePoolInfoImpl> mImpl;
};
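
// A minimal sketch of typical usage ("memory" is a hypothetical SharedMemory):
// the mapping created below remains valid while any copy of "pool" is alive.
//
//   std::optional<RunTimePoolInfo> pool = RunTimePoolInfo::createFromMemory(memory);
//   if (pool.has_value()) {
//       uint8_t* base = pool->getBuffer();  // start of the mapped region
//       uint32_t size = pool->getSize();    // size of the region in bytes
//       // ... read or write within [base, base + size), then call
//       // pool->flush() to publish writes back to the underlying memory ...
//   }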

bool setRunTimePoolInfosFromCanonicalMemories(std::vector<RunTimePoolInfo>* poolInfos,
                                              const std::vector<SharedMemory>& pools);

bool setRunTimePoolInfosFromMemoryPools(std::vector<RunTimePoolInfo>* poolInfos,
                                        const std::vector<Request::MemoryPool>& pools);
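
// A minimal sketch ("request" is a hypothetical canonical Request): building
// the pool infos that CpuExecutor::run() consumes.
//
//   std::vector<RunTimePoolInfo> requestPoolInfos;
//   if (!setRunTimePoolInfosFromMemoryPools(&requestPoolInfos, request.pools)) {
//       // At least one memory pool could not be mapped; abort the execution.
//   }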

// This class is used to execute a model on the CPU.
class CpuExecutor {
   public:
    // This constructor allows clients of CpuExecutor to provide custom CPU
    // operation implementations. It is used by a sample driver to test
    // extension support.
    //
    // Note that it is not possible to provide custom CPU implementations for
    // non-OperationResolver operations (b/124041202).
    //
    // The operation resolver must outlive the executor.
    explicit CpuExecutor(const IOperationResolver* operationResolver)
        : mOperationResolver(operationResolver) {}

    CpuExecutor() : CpuExecutor(BuiltinOperationResolver::get()) {}

    // Executes the model. The results will be stored at the locations
    // specified by the request.
    // The model must outlive the executor and must not be modified while
    // run() is executing.
    int run(const Model& model, const Request& request,
            const std::vector<RunTimePoolInfo>& modelPoolInfos,
            const std::vector<RunTimePoolInfo>& requestPoolInfos);

    const std::vector<OutputShape>& getOutputShapes() const {
        CHECK(mFinished) << "getOutputShapes() called on an unfinished CpuExecutor.";
        return mOutputShapes;
    }

    // Sets the deadline hint; see mDeadline below.
    void setDeadline(const TimePoint& deadline) { mDeadline = deadline; }
    // Sets the WHILE loop timeout in nanoseconds; see mLoopTimeoutDuration below.
    void setLoopTimeout(uint64_t duration) { mLoopTimeoutDuration = duration; }

   private:
    // Creates runtime info from what's in the model.
    std::vector<RunTimeOperandInfo> initializeRunTimeInfo(const Model::Subgraph& subgraph);
    // Adjusts the runtime info for the arguments passed to the model,
    // modifying the buffer location and possibly the dimensions.
    void updateForArguments(const std::vector<uint32_t>& indexes,
                            const std::vector<Request::Argument>& arguments,
                            const std::vector<RunTimePoolInfo>& requestPoolInfos,
                            RunTimeOperandInfo* operands);
    // Runs one subgraph.
    int executeSubgraph(const Model::Subgraph& subgraph, RunTimeOperandInfo* operands);
    // Runs one operation of the graph.
    int executeOperation(const Operation& operation, RunTimeOperandInfo* operands);
    int executeIfOperation(const Operation& operation, RunTimeOperandInfo* operands);
    int executeWhileOperation(const Operation& operation, RunTimeOperandInfo* operands);

    void setOutputShapes(const std::vector<uint32_t>& outputIndexes,
                         const std::vector<RunTimeOperandInfo>& operands);

    // Compile-time operand value information used by initializeRunTimeInfo.
    // The fields are only valid while run() is being executed.
    const uint8_t* mModelOperandValues = nullptr;
    const std::vector<RunTimePoolInfo>* mModelPoolInfos = nullptr;
    const std::vector<Model::Subgraph>* mReferencedSubgraphs = nullptr;

    // The output operand shapes returned to the runtime.
    std::vector<OutputShape> mOutputShapes;

    // Whether execution has finished and mOutputShapes is ready.
    bool mFinished = false;

    // The deadline hint for the maximum amount of time the client expects the
    // execution to take. If this deadline is exceeded and there are remaining
    // ops to execute, the CpuExecutor will abort the execution.
    OptionalTimePoint mDeadline;

    // The maximum amount of time in nanoseconds that can be spent executing a
    // WHILE loop.
    uint64_t mLoopTimeoutDuration = operation_while::kTimeoutNsDefault;

    [[maybe_unused]] const IOperationResolver* mOperationResolver;
};
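
// A minimal sketch of a full execution ("model" and "request" are hypothetical
// canonical Model/Request values; error handling elided):
//
//   CpuExecutor executor;
//   std::vector<RunTimePoolInfo> modelPoolInfos, requestPoolInfos;
//   if (setRunTimePoolInfosFromCanonicalMemories(&modelPoolInfos, model.pools) &&
//       setRunTimePoolInfosFromMemoryPools(&requestPoolInfos, request.pools)) {
//       int n = executor.run(model, request, modelPoolInfos, requestPoolInfos);
//       if (n == ANEURALNETWORKS_NO_ERROR) {
//           const std::vector<OutputShape>& shapes = executor.getOutputShapes();
//       }
//   }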

// Class for setting reasonable OpenMP threading settings. (OpenMP is used by
// the Eigen matrix library.)
//
// Currently sets a low blocktime: the time OpenMP threads busy-wait for more
// work before going to sleep. See b/79159165, https://reviews.llvm.org/D18577.
// The default is 200ms; we set it to 20ms here (see b/109645291). This keeps
// the cores enabled throughout inference computation without too much extra
// power consumption afterwards.
//
// The OpenMP settings are thread-local (applying only to worker threads formed
// from that thread); see https://software.intel.com/en-us/node/522688 and
// http://lists.llvm.org/pipermail/openmp-dev/2016-July/001432.html. This class
// ensures that within the scope in which an object is instantiated we use the
// right settings (scopes may be nested), as long as no other library changes
// them.  (Note that in current NNAPI usage, only one instance is used, in the
// CpuExecutor thread.)
//
// TODO(mikie): consider also setting the number of threads used. Using as many
// threads as there are cores results in more variable performance: if we don't
// get all cores for our threads, the latency is doubled as we wait for one core
// to do twice the amount of work. Reality is more complicated, though, as not
// all cores are the same. The decision should be based on benchmarking against
// a representative set of workloads and devices. I'm keeping the code here for
// reference.
// b/109953668, disable OpenMP
#ifdef NNAPI_OPENMP
class ScopedOpenmpSettings {
   public:
    ScopedOpenmpSettings();
    ~ScopedOpenmpSettings();
    DISALLOW_COPY_AND_ASSIGN(ScopedOpenmpSettings);

   private:
    int mBlocktimeInitial;
#if NNAPI_LIMIT_CPU_THREADS
    int mMaxThreadsInitial;
#endif
};
#endif  // NNAPI_OPENMP
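
// A minimal sketch: the settings take effect for the lifetime of the object
// and are restored in its destructor, so a caller wraps the computation in a
// scope.
//
//   {
//   #ifdef NNAPI_OPENMP
//       ScopedOpenmpSettings openMpSettings;
//   #endif  // NNAPI_OPENMP
//       // ... run the Eigen/OpenMP-backed computation here ...
//   }  // previous OpenMP settings are restored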

namespace {

template <typename T>
T getScalarData(const RunTimeOperandInfo& info) {
    CHECK_GE(info.length, sizeof(T)) << "Cannot get scalar data: buffer too short";
    T* data = reinterpret_cast<T*>(info.buffer);
    return data[0];
}

template <typename T>
T getScalarDataWithDefault(const RunTimeOperandInfo& info, T defaultValue) {
    if (info.length < sizeof(T)) {
        return defaultValue;
    }
    return getScalarData<T>(info);
}
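
// A minimal sketch (assuming a hypothetical operation whose input 1 is an
// int32 scalar):
//
//   // CHECK-fails if the operand's buffer is shorter than sizeof(int32_t).
//   const int32_t axis = getScalarData<int32_t>(operands[operation.inputs[1]]);
//   // Falls back to 0 when the operand's buffer is absent or too short.
//   const int32_t axisOrDefault =
//           getScalarDataWithDefault<int32_t>(operands[operation.inputs[1]], 0);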

inline bool IsNullInput(const RunTimeOperandInfo* input) {
    return input->lifetime == Operand::LifeTime::NO_VALUE;
}

inline int NumInputsWithValues(const Operation& operation, const RunTimeOperandInfo* operands) {
    const std::vector<uint32_t>& inputs = operation.inputs;
    return std::count_if(inputs.begin(), inputs.end(),
                         [&operands](uint32_t i) { return !IsNullInput(&operands[i]); });
}

inline int NumOutputs(const Operation& operation) {
    return operation.outputs.size();
}

inline size_t NumDimensions(const RunTimeOperandInfo* operand) {
    return operand->shape().dimensions.size();
}

inline uint32_t SizeOfDimension(const RunTimeOperandInfo* operand, int i) {
    return operand->shape().dimensions[i];
}

inline RunTimeOperandInfo* GetInput(const Operation& operation, RunTimeOperandInfo* operands,
                                    int index) {
    return &operands[operation.inputs[index]];
}

inline RunTimeOperandInfo* GetOutput(const Operation& operation, RunTimeOperandInfo* operands,
                                     int index) {
    return &operands[operation.outputs[index]];
}
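
// A minimal sketch (assuming a hypothetical operation whose input 2 is
// optional): omitted optional inputs have lifetime NO_VALUE and must be
// checked before use.
//
//   const RunTimeOperandInfo* bias = GetInput(operation, operands, 2);
//   if (!IsNullInput(bias)) {
//       const uint32_t channels = SizeOfDimension(bias, 0);
//       // ... use the provided bias ...
//   }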

}  // anonymous namespace

}  // namespace nn
}  // namespace android

#endif  // ANDROID_PACKAGES_MODULES_NEURALNETWORKS_COMMON_CPU_EXECUTOR_H
