/*
 * Copyright (c) 2017-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#ifndef ARM_COMPUTE_ISCHEDULER_H
#define ARM_COMPUTE_ISCHEDULER_H

#include "arm_compute/core/CPP/CPPTypes.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/experimental/Types.h"

#include <functional>
#include <limits>

namespace arm_compute
{
class ICPPKernel;
class ITensor;
class Window;

/** Scheduler interface to run kernels */
class IScheduler
{
public:
    /** Strategies available to split a workload */
    enum class StrategyHint
    {
        STATIC,  /**< Split the workload evenly among the threads */
        DYNAMIC, /**< Split the workload dynamically using a bucket system */
    };

    /** Function used to map a given thread id to a logical core id
     *
     * The mapping function takes the thread index and the total number of cores as input,
     * and returns the logical core index to bind against.
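     *
     * A minimal sketch of a binding function (the wrap-around mapping here is an illustrative assumption, not a library default):
     * @code
     * IScheduler::BindFunc bind_func = [](int thread_index, int num_cores)
     * {
     *     // Bind thread i to logical core i, wrapping around if there are more threads than cores.
     *     return thread_index % num_cores;
     * };
     * @endcode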
     */
    using BindFunc = std::function<int(int, int)>;

    /** When arm_compute::IScheduler::Hints::_split_dimension is initialized with this value
     * then the scheduler is free to break down the problem space over as many dimensions
     * as it wishes
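     *
     * For example (illustrative only):
     * @code
     * // Let the scheduler partition the execution window over any dimensions it prefers.
     * IScheduler::Hints hints(IScheduler::split_dimensions_all);
     * @endcode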
     */
    static constexpr unsigned int split_dimensions_all = std::numeric_limits<unsigned>::max();

    /** Scheduler hints
     *
     * Collection of preferences set by the function regarding how to split a given workload
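     *
     * A possible usage sketch (the split dimension and strategy chosen here are illustrative assumptions):
     * @code
     * // Ask the scheduler to split along the Y dimension and distribute work dynamically.
     * IScheduler::Hints hints(Window::DimY, IScheduler::StrategyHint::DYNAMIC);
     * hints.set_split_dimension(Window::DimZ); // hints can also be adjusted after construction
     * @endcode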
     */
    class Hints
    {
    public:
        /** Constructor
         *
         * @param[in] split_dimension Dimension along which to split the kernel's execution window.
         * @param[in] strategy        (Optional) Split strategy.
         * @param[in] threshold       (Optional) Dynamic scheduling capping threshold.
         */
        Hints(unsigned int split_dimension, StrategyHint strategy = StrategyHint::STATIC, int threshold = 0)
            : _split_dimension(split_dimension), _strategy(strategy), _threshold(threshold)
        {
        }
        /** Set the split_dimension hint
         *
         * @param[in] split_dimension Dimension along which to split the kernel's execution window.
         *
         * @return the Hints object
         */
        Hints &set_split_dimension(unsigned int split_dimension)
        {
            _split_dimension = split_dimension;
            return *this;
        }
        /** Return the preferred split dimension
         *
         * @return The split dimension
         */
        unsigned int split_dimension() const
        {
            return _split_dimension;
        }

        /** Set the strategy hint
         *
         * @param[in] strategy Preferred strategy to use to split the workload
         *
         * @return the Hints object
         */
        Hints &set_strategy(StrategyHint strategy)
        {
            _strategy = strategy;
            return *this;
        }
        /** Return the preferred strategy to use to split the workload.
         *
         * @return The strategy
         */
        StrategyHint strategy() const
        {
            return _strategy;
        }
        /** Return the granule capping threshold to be used by dynamic scheduling.
         *
         * @return The capping threshold
         */
        int threshold() const
        {
            return _threshold;
        }

    private:
        unsigned int _split_dimension{};
        StrategyHint _strategy{};
        int          _threshold{};
    };
    /** Signature for the workloads to execute */
    using Workload = std::function<void(const ThreadInfo &)>;
    /** Default constructor. */
    IScheduler();

    /** Destructor. */
    virtual ~IScheduler() = default;

    /** Sets the number of threads the scheduler will use to run the kernels.
     *
     * @param[in] num_threads If set to 0, one thread per CPU core available on the system will be used; otherwise, the specified number of threads will be used.
     */
    virtual void set_num_threads(unsigned int num_threads) = 0;

    /** Sets the number of threads the scheduler will use to run the kernels, using a binding function to pin the threads to the given logical cores
     *
     * @param[in] num_threads If set to 0, one thread per CPU core available on the system will be used; otherwise, the specified number of threads will be used.
     * @param[in] func        Binding function to use.
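     *
     * An illustrative call (the thread count and modulo mapping are assumptions for the example, not library defaults):
     * @code
     * Scheduler::get().set_num_threads_with_affinity(4, [](int thread_index, int num_cores)
     * {
     *     return thread_index % num_cores; // pin thread i to logical core i modulo the core count
     * });
     * @endcode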
     */
    virtual void set_num_threads_with_affinity(unsigned int num_threads, BindFunc func);

    /** Returns the number of threads that the scheduler has in its pool.
     *
     * @return Number of threads available in the scheduler's pool.
     */
    virtual unsigned int num_threads() const = 0;

    /** Runs the kernel in the same thread as the caller synchronously.
     *
     * @param[in] kernel Kernel to execute.
     * @param[in] hints  Hints for the scheduler.
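     *
     * A hedged usage sketch (`kernel` is assumed to be an already configured ICPPKernel; Scheduler::get() returns the active scheduler):
     * @code
     * Scheduler::get().schedule(&kernel, IScheduler::Hints(Window::DimY));
     * @endcode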
     */
    virtual void schedule(ICPPKernel *kernel, const Hints &hints) = 0;

    /** Runs the kernel in the same thread as the caller synchronously.
     *
     * @param[in] kernel  Kernel to execute.
     * @param[in] hints   Hints for the scheduler.
     * @param[in] window  Window to use for kernel execution.
     * @param[in] tensors Vector containing the tensors to operate on.
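     *
     * A hedged usage sketch (`kernel`, `src` and `dst` are assumed to be configured elsewhere):
     * @code
     * ITensorPack pack;
     * pack.add_tensor(TensorType::ACL_SRC, &src);
     * pack.add_tensor(TensorType::ACL_DST, &dst);
     * Scheduler::get().schedule_op(&kernel, IScheduler::Hints(Window::DimY), kernel.window(), pack);
     * @endcode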
     */
    virtual void schedule_op(ICPPKernel *kernel, const Hints &hints, const Window &window, ITensorPack &tensors) = 0;

    /** Execute all the passed workloads
     *
     * @note there is no guarantee regarding the order in which the workloads will be executed or whether or not they will be executed in parallel.
     *
     * @param[in] workloads Array of workloads to run
     * @param[in] tag       String that can be used by profiling tools to identify the workloads run by the scheduler (Can be null).
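     *
     * An illustrative sketch (the workload body and do_partial_work are hypothetical):
     * @code
     * std::vector<IScheduler::Workload> workloads;
     * workloads.emplace_back([](const ThreadInfo &info)
     * {
     *     // Each workload is handed the ThreadInfo of the thread that executes it.
     *     do_partial_work(info.thread_id, info.num_threads); // do_partial_work is a hypothetical helper
     * });
     * Scheduler::get().run_tagged_workloads(workloads, "MyOperator");
     * @endcode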
     */
    virtual void run_tagged_workloads(std::vector<Workload> &workloads, const char *tag);

    /** Get CPU info.
     *
     * @return CPU info.
     */
    CPUInfo &cpu_info();
    /** Get a hint for the best possible number of execution threads
     *
     * @warning In case the best number of threads cannot be determined,
     *          std::thread::hardware_concurrency() is returned, or 1 for bare metal builds.
     *
     * @return Best possible number of execution threads to use
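     *
     * For example (illustrative only):
     * @code
     * IScheduler &sched = Scheduler::get();
     * sched.set_num_threads(sched.num_threads_hint());
     * @endcode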
     */
    unsigned int num_threads_hint() const;

protected:
    /** Execute all the passed workloads
     *
     * @note there is no guarantee regarding the order in which the workloads will be executed or whether or not they will be executed in parallel.
     *
     * @param[in] workloads Array of workloads to run
     */
    virtual void run_workloads(std::vector<Workload> &workloads) = 0;

    /** Common scheduler logic to execute the given kernel
     *
     * @param[in] kernel  Kernel to execute.
     * @param[in] hints   Hints for the scheduler.
     * @param[in] window  Window to use for kernel execution.
     * @param[in] tensors Vector containing the tensors to operate on.
     */
    void schedule_common(ICPPKernel *kernel, const Hints &hints, const Window &window, ITensorPack &tensors);

    /** Adjust the number of windows to optimize performance
     * (used for small workloads where a smaller number of threads might improve performance)
     *
     * @param[in] window           Window to use for kernel execution
     * @param[in] split_dimension  Axis along which to split the window
     * @param[in] init_num_windows Initial number of sub-windows to split into
     * @param[in] kernel           Kernel to execute
     * @param[in] cpu_info         The CPU platform used to create the context.
     *
     * @return Adjusted number of windows
     */
    std::size_t adjust_num_of_windows(const Window &window, std::size_t split_dimension, std::size_t init_num_windows, const ICPPKernel &kernel, const CPUInfo &cpu_info);

private:
    unsigned int _num_threads_hint = {};
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_ISCHEDULER_H */
