/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName

#include <executorch/backends/vulkan/runtime/utils/MacroUtils.h>

#include <executorch/backends/vulkan/runtime/vk_api/Adapter.h>
#include <executorch/backends/vulkan/runtime/vk_api/Command.h>
#include <executorch/backends/vulkan/runtime/vk_api/Descriptor.h>
#include <executorch/backends/vulkan/runtime/vk_api/Fence.h>
#include <executorch/backends/vulkan/runtime/vk_api/QueryPool.h>
#include <executorch/backends/vulkan/runtime/vk_api/Runtime.h>
#include <executorch/backends/vulkan/runtime/vk_api/VkUtils.h>

namespace vkcompute {
namespace api {

// Construction-time settings for a Context. This is a plain aggregate, so the
// order of the fields is part of its initialization interface.
struct ContextConfig final {
  // Threshold compared against the Context's running dispatch count in
  // Context::submit_compute_job(): once the count reaches this value the
  // current command buffer is submitted to the GPU.
  uint32_t cmd_submit_frequency;
  // NOTE(review): presumably used to construct the Context's command pool;
  // the constructor is not visible in this header - confirm.
  vkapi::CommandPoolConfig cmd_pool_config;
  // NOTE(review): presumably used to construct the Context's descriptor pool;
  // confirm against the constructor.
  vkapi::DescriptorPoolConfig descriptor_pool_config;
  // NOTE(review): presumably used to construct the Context's query pool;
  // confirm against the constructor.
  vkapi::QueryPoolConfig query_pool_config;
};

//
// Vulkan Context holds onto all relevant Vulkan state as it pertains to our
// use of Vulkan in PyTorch. A Context is associated with one, and only one,
// Adapter as a precursor to multi-GPU support. All Vulkan tensors in PyTorch
// are associated with a Context to make tensor <-> device affinity explicit.
// The context is currently a global object, but technically it does not need
// to be if we were to make it explicit to the user.
//

class Context final {
 public:
  // Builds a Context from `adapter_i` and a ContextConfig. Defined out of line.
  // NOTE(review): `adapter_i` is presumably the index of the adapter to bind
  // to - confirm against the definition.
  explicit Context(size_t adapter_i, const ContextConfig&);

  // A Context can be neither copied nor moved.
  Context(const Context&) = delete;
  Context& operator=(const Context&) = delete;

  Context(Context&&) = delete;
  Context& operator=(Context&&) = delete;

  ~Context();

 private:
  // NOTE(review): data members are initialized in declaration order and
  // destroyed in reverse order; the constructor and destructor are defined
  // out of line, so check them before reordering anything below.

  // Config
  ContextConfig config_;
  // Important handles
  vkapi::Adapter* adapter_p_;
  VkDevice device_;
  vkapi::Adapter::Queue queue_;
  // Resource Pools
  vkapi::CommandPool command_pool_;
  vkapi::DescriptorPool descriptor_pool_;
  vkapi::FencePool fences_;
  // Diagnostics
  vkapi::QueryPool querypool_;
  // Command buffers submission
  // cmd_mutex_ is acquired by dispatch_lock() and by submit_compute_job() when
  // no fence is passed to it.
  std::mutex cmd_mutex_;
  // Command buffer currently being recorded; (re)created by set_cmd().
  vkapi::CommandBuffer cmd_;
  // Incremented by submit_compute_job() for every dispatch it records and
  // compared against config_.cmd_submit_frequency.
  uint32_t submit_count_;
  // Memory Management
  // Each mutex guards the list declared right after it (see
  // register_buffer_cleanup() / register_image_cleanup()).
  std::mutex buffer_clearlist_mutex_;
  std::vector<vkapi::VulkanBuffer> buffers_to_clear_;
  std::mutex image_clearlist_mutex_;
  std::vector<vkapi::VulkanImage> images_to_clear_;
  // Misc
  VkImageTiling preferred_image_tiling_;

 public:
  // Adapter access

  // Returns the stored adapter pointer.
  inline vkapi::Adapter* adapter_ptr() {
    return adapter_p_;
  }

  // Returns the stored VkDevice handle.
  inline VkDevice device() {
    return device_;
  }

  // Returns the VkQueue handle of the stored queue.
  inline VkQueue queue() {
    return queue_.handle;
  }

  // Device Caches
  // The accessors below forward to the caches exposed by the adapter.

  inline vkapi::ShaderLayoutCache& shader_layout_cache() {
    return adapter_ptr()->shader_layout_cache();
  }

  inline vkapi::ShaderCache& shader_cache() {
    return adapter_ptr()->shader_cache();
  }

  inline vkapi::PipelineLayoutCache& pipeline_layout_cache() {
    return adapter_ptr()->pipeline_layout_cache();
  }

  inline vkapi::ComputePipelineCache& pipeline_cache() {
    return adapter_ptr()->compute_pipeline_cache();
  }

  // Resource Pools

  inline vkapi::DescriptorPool& descriptor_pool() {
    return descriptor_pool_;
  }

  inline vkapi::FencePool& fences() {
    return fences_;
  }

  // Diagnostics

  inline vkapi::QueryPool& querypool() {
    return querypool_;
  }

  // Returns the stored preferred image tiling value.
  inline VkImageTiling preferred_image_tiling() {
    return preferred_image_tiling_;
  }

  /*
   * By default, the querypool attached to a Context instance is uninitialized.
   * This function triggers the querypool to be created via vkCreateQueryPool.
   */
  void initialize_querypool();

  /*
   * Encodes a vkResetQueryPool command to the current command buffer, and reset
   * the internal state of the querypool. If the querypool is not initialized
   * this function is a no-op.
   */
  void cmd_reset_querypool();

  /*
   * Encodes a vkCmdWriteTimestamp command to the current command buffer and
   * record some metadata about the shader that will be dispatched. If the
   * querypool is not initialized this function is a no-op.
   */
  void report_shader_dispatch_start(
      const std::string& shader_name,
      const utils::uvec3& global_wg_size,
      const utils::uvec3& local_wg_size,
      const uint32_t dispatch_id = UINT32_MAX);

  /*
   * Encodes a vkCmdWriteTimestamp command to the current command buffer to
   * record when the last shader that was dispatched has completed execution.
   * If the querypool is not initialized this function is a no-op.
   */
  void report_shader_dispatch_end();

  // Memory Management

  // Moves `buffer` into buffers_to_clear_ while holding
  // buffer_clearlist_mutex_. The caller's object is left moved-from.
  // NOTE(review): where the list is drained is not visible in this header
  // (presumably flush()) - confirm.
  void register_buffer_cleanup(vkapi::VulkanBuffer& buffer) {
    std::lock_guard<std::mutex> bufferlist_lock(buffer_clearlist_mutex_);
    buffers_to_clear_.emplace_back(std::move(buffer));
  }

  // Moves `image` into images_to_clear_ while holding image_clearlist_mutex_.
  // The caller's object is left moved-from.
  void register_image_cleanup(vkapi::VulkanImage& image) {
    std::lock_guard<std::mutex> imagelist_lock(image_clearlist_mutex_);
    images_to_clear_.emplace_back(std::move(image));
  }

  // GPU RPC

  // Acquires cmd_mutex_ and returns the owning lock; the mutex is released
  // when the returned lock is destroyed or unlocked.
  inline std::unique_lock<std::mutex> dispatch_lock() {
    return std::unique_lock<std::mutex>(cmd_mutex_);
  }

  // If cmd_ evaluates to false, fetches a new command buffer from
  // command_pool_ (forwarding `reusable`) and begins recording on it.
  // Otherwise this is a no-op.
  inline void set_cmd(bool reusable = false) {
    if (!cmd_) {
      cmd_ = command_pool_.get_new_cmd(reusable);
      cmd_.begin();
    }
  }

  // Defined out of line. Takes the shader, the local work group size and the
  // specialization constants, in that order (see the call in
  // submit_compute_job()).
  vkapi::DescriptorSet get_descriptor_set(
      const vkapi::ShaderInfo&,
      const utils::uvec3&,
      const vkapi::SpecVarList&);

  // Convenience overload: same as above with an empty SpecVarList.
  inline vkapi::DescriptorSet get_descriptor_set(
      const vkapi::ShaderInfo& shader_descriptor,
      const utils::uvec3& local_work_group_size) {
    return get_descriptor_set(shader_descriptor, local_work_group_size, {});
  }

  // Defined out of line. submit_compute_job() calls it with the bound
  // descriptor set, the pipeline barrier, the shader and the global work
  // group size, in that order.
  void register_shader_dispatch(
      const vkapi::DescriptorSet&,
      vkapi::PipelineBarrier&,
      const vkapi::ShaderInfo&,
      const utils::uvec3&);

  // Defined out of line.
  // NOTE(review): presumably records an image blit from `src` to `dst` -
  // confirm against the definition.
  void register_blit(
      vkapi::PipelineBarrier&,
      vkapi::VulkanImage& src,
      vkapi::VulkanImage& dst);

  // Records a compute dispatch and possibly submits the command buffer; the
  // return value tells whether a submission happened. The definition follows
  // the class in this header.
  template <typename... Arguments>
  bool submit_compute_job(
      const vkapi::ShaderInfo&,
      vkapi::PipelineBarrier&,
      const utils::uvec3&,
      const utils::uvec3&,
      const vkapi::SpecVarList&,
      VkFence fence_handle,
      const uint32_t dispatch_id,
      Arguments&&...);

  // Defined out of line. submit_compute_job() calls it with the caller's
  // fence whenever it decides to submit.
  // NOTE(review): the meaning of `final_use` is not visible here - confirm.
  void submit_cmd_to_gpu(
      VkFence fence_handle = VK_NULL_HANDLE,
      const bool final_use = false);

  // Defined out of line. Per the comments in submit_compute_job(), callers
  // that pass a fence are expected to call flush() after waiting on it.
  void flush();
};

// Declared here and defined out of line.
// NOTE(review): presumably reports whether a usable global Context can be
// obtained on this system - confirm against the definition.
bool available();

// The global Context is retrieved using this function, where it is declared as
// a static local variable (the definition is not part of this header).
Context* context();

namespace detail {

// Sets `any_is_empty` when `buffer` has no memory behind it. A flag that is
// already set is left untouched and the buffer is not inspected.
inline void arg_is_empty(
    bool& any_is_empty,
    const vkapi::VulkanBuffer& buffer) {
  // bool(buffer) will evaluate to false if no memory has been allocated
  if (!any_is_empty && !buffer) {
    any_is_empty = true;
  }
}

// Sets `any_is_empty` when `image` has no memory behind it. A flag that is
// already set is left untouched and the image is not inspected.
inline void arg_is_empty(bool& any_is_empty, const vkapi::VulkanImage& image) {
  // bool(image) will evaluate to false if no memory has been allocated
  if (!any_is_empty && !image) {
    any_is_empty = true;
  }
}

// Sets `any_is_empty` when the bind info carries a null handle. A flag that
// is already set is left untouched and the bind info is not inspected.
inline void arg_is_empty(
    bool& any_is_empty,
    const vkapi::BufferBindInfo& bind_info) {
  if (any_is_empty) {
    return;
  }
  any_is_empty = (bind_info.handle == VK_NULL_HANDLE);
}

/*
  Returns true if at least one argument in the variadic list is "empty" as
  judged by the matching arg_is_empty() overload: a VulkanBuffer or VulkanImage
  without memory associated with it, or a BufferBindInfo with a null handle.
  Returns false for an empty argument list.
 */
template <typename... Arguments>
inline bool any_arg_is_empty(Arguments&&... arguments) {
  bool found_empty = false;
  // Expanding the pack inside a braced array initializer runs one
  // arg_is_empty() call per argument, left to right. The leading 0 keeps the
  // array non-empty when the pack is empty; the array itself is never read.
  VK_UNUSED const int expand_checks[]{
      0,
      (arg_is_empty(found_empty, std::forward<Arguments>(arguments)), 0)...,
  };

  return found_empty;
}

/*
  Binds every argument of the pack to `descriptor_set`, pairing the i-th
  argument with the i-th value of `Indices` as its binding index. The
  index_sequence parameter exists only so that `Indices` can be deduced.
 */
template <size_t... Indices, typename... Arguments>
inline void bind(
    vkapi::DescriptorSet& descriptor_set,
    const std::index_sequence<Indices...>&,
    Arguments&&... arguments) {
  // Both packs are expanded in lockstep inside a braced array initializer, so
  // one descriptor_set.bind(index, argument) call is issued per argument, left
  // to right. The leading 0 keeps the array non-empty for an empty pack; the
  // array itself is never read.
  VK_UNUSED const int expand_bindings[]{
      0,
      (descriptor_set.bind(Indices, std::forward<Arguments>(arguments)), 0)...,
  };
}

} // namespace detail

/*
  Records a compute shader dispatch into the current command buffer. If a fence
  is provided, or if the number of recorded dispatches reaches the configured
  cmd_submit_frequency, the command buffer is then submitted to the GPU for
  execution.
  Returns a bool indicating whether or not the function call resulted in a GPU
  queue submission.
 */
template <typename... Arguments>
inline bool Context::submit_compute_job(
    const vkapi::ShaderInfo& shader,
    vkapi::PipelineBarrier& pipeline_barrier,
    const utils::uvec3& global_work_group,
    const utils::uvec3& local_work_group_size,
    const vkapi::SpecVarList& specialization_constants,
    VkFence fence_handle,
    const uint32_t dispatch_id,
    Arguments&&... arguments) {
  const bool has_fence = fence_handle != VK_NULL_HANDLE;

  // Nothing is recorded when any argument has no memory associated with it.
  // However, if a fence has been passed and the command buffer is not empty,
  // the current command buffer must still be submitted so that the fence can
  // be signaled.
  if (detail::any_arg_is_empty(arguments...)) {
    const bool must_signal_fence = has_fence && submit_count_ > 0;
    if (must_signal_fence) {
      submit_cmd_to_gpu(fence_handle);
    }
    return must_signal_fence;
  }

  // Recording into the shared command buffer is serialized through
  // cmd_mutex_. The lock object starts out empty because the mutex is only
  // taken here when no fence was passed.
  //
  // A fence implies that the host intends to sync with the GPU, i.e. calls to
  // fence.wait() and flush() are imminent. In that case the mutex is assumed
  // to be externally managed: the calling thread locked it before this call
  // and will release it manually after flush(), which keeps further dispatches
  // from being recorded until the Context has been flushed.
  std::unique_lock<std::mutex> recording_guard;
  if (!has_fence) {
    recording_guard = dispatch_lock();
  }

  set_cmd();

  report_shader_dispatch_start(
      shader.kernel_name,
      global_work_group,
      local_work_group_size,
      dispatch_id);

  // Descriptor set creation does not depend on the template parameters and is
  // kept out of line to minimize code bloat.
  vkapi::DescriptorSet bound_set = get_descriptor_set(
      shader, local_work_group_size, specialization_constants);

  detail::bind(
      bound_set,
      std::index_sequence_for<Arguments...>{},
      std::forward<Arguments>(arguments)...);

  // Likewise template-independent, hence out of line.
  register_shader_dispatch(
      bound_set, pipeline_barrier, shader, global_work_group);

  report_shader_dispatch_end();

  ++submit_count_;
  const bool should_submit =
      has_fence || submit_count_ >= config_.cmd_submit_frequency;
  if (should_submit) {
    submit_cmd_to_gpu(fence_handle);
  }

  return should_submit;
}

} // namespace api
} // namespace vkcompute