/*
 * Copyright 2022 Google LLC
 * SPDX-License-Identifier: MIT
 */

#include "vn_feedback.h"

#include "vn_command_buffer.h"
#include "vn_device.h"
#include "vn_physical_device.h"
#include "vn_query_pool.h"
#include "vn_queue.h"

/* Select a memory type index out of mem_type_bits whose property flags
 * contain every bit of required_mem_flags.
 *
 * Returns the first matching index visited by u_foreach_bit, or UINT32_MAX
 * when none of the candidate types satisfies the requirement.
 */
static uint32_t
vn_get_memory_type_index(const VkPhysicalDeviceMemoryProperties *mem_props,
                         uint32_t mem_type_bits,
                         VkMemoryPropertyFlags required_mem_flags)
{
   u_foreach_bit(candidate, mem_type_bits)
   {
      assert(candidate < mem_props->memoryTypeCount);

      const VkMemoryPropertyFlags available =
         mem_props->memoryTypes[candidate].propertyFlags;
      /* required bits that this memory type does not provide */
      const VkMemoryPropertyFlags missing = required_mem_flags & ~available;
      if (!missing)
         return candidate;
   }

   return UINT32_MAX;
}

VkResult
vn_feedback_buffer_create(struct vn_device *dev,
                          uint32_t size,
                          const VkAllocationCallbacks *alloc,
                          struct vn_feedback_buffer **out_fb_buf)
{
   const bool exclusive = dev->queue_family_count == 1;
   const VkPhysicalDeviceMemoryProperties *mem_props =
      &dev->physical_device->memory_properties;
   VkDevice dev_handle = vn_device_to_handle(dev);
   VkResult result;

   struct vn_feedback_buffer *fb_buf =
      vk_zalloc(alloc, sizeof(*fb_buf), VN_DEFAULT_ALIGN,
                VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (!fb_buf)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   /* use concurrent to avoid explicit queue family ownership transfer for
    * device created with queues from multiple queue families
    */
   const VkBufferCreateInfo buf_create_info = {
      .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
      .size = size,
      /* Feedback for fences and timeline semaphores will write to this buffer
       * as a DST when signalling. Timeline semaphore feedback will also read
       * from this buffer as a SRC to retrieve the counter value to signal.
       */
      .usage =
         VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT,
      .sharingMode =
         exclusive ? VK_SHARING_MODE_EXCLUSIVE : VK_SHARING_MODE_CONCURRENT,
      /* below favors the current venus protocol */
      .queueFamilyIndexCount = exclusive ? 0 : dev->queue_family_count,
      .pQueueFamilyIndices = exclusive ? NULL : dev->queue_families,
   };
   result = vn_CreateBuffer(dev_handle, &buf_create_info, alloc,
                            &fb_buf->buf_handle);
   if (result != VK_SUCCESS)
      goto out_free_feedback_buffer;

   struct vn_buffer *buf = vn_buffer_from_handle(fb_buf->buf_handle);
   const VkMemoryRequirements *mem_req =
      &buf->requirements.memory.memoryRequirements;
   const uint32_t mem_type_index =
      vn_get_memory_type_index(mem_props, mem_req->memoryTypeBits,
                               VK_MEMORY_PROPERTY_HOST_COHERENT_BIT);
   if (mem_type_index >= mem_props->memoryTypeCount) {
      result = VK_ERROR_INITIALIZATION_FAILED;
      goto out_destroy_buffer;
   }

   const VkMemoryAllocateInfo mem_alloc_info = {
      .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
      .allocationSize = mem_req->size,
      .memoryTypeIndex = mem_type_index,
   };
   result = vn_AllocateMemory(dev_handle, &mem_alloc_info, alloc,
                              &fb_buf->mem_handle);
   if (result != VK_SUCCESS)
      goto out_destroy_buffer;

   const VkBindBufferMemoryInfo bind_info = {
      .sType = VK_STRUCTURE_TYPE_BIND_BUFFER_MEMORY_INFO,
      .buffer = fb_buf->buf_handle,
      .memory = fb_buf->mem_handle,
      .memoryOffset = 0,
   };
   result = vn_BindBufferMemory2(dev_handle, 1, &bind_info);
   if (result != VK_SUCCESS)
      goto out_free_memory;

   result = vn_MapMemory(dev_handle, fb_buf->mem_handle, 0, VK_WHOLE_SIZE, 0,
                         &fb_buf->data);
   if (result != VK_SUCCESS)
      goto out_free_memory;

   *out_fb_buf = fb_buf;

   return VK_SUCCESS;

out_free_memory:
   vn_FreeMemory(dev_handle, fb_buf->mem_handle, alloc);

out_destroy_buffer:
   vn_DestroyBuffer(dev_handle, fb_buf->buf_handle, alloc);

out_free_feedback_buffer:
   vk_free(alloc, fb_buf);

   return result;
}

void
vn_feedback_buffer_destroy(struct vn_device *dev,
                           struct vn_feedback_buffer *fb_buf,
                           const VkAllocationCallbacks *alloc)
{
   VkDevice dev_handle = vn_device_to_handle(dev);

   vn_UnmapMemory(dev_handle, fb_buf->mem_handle);
   vn_FreeMemory(dev_handle, fb_buf->mem_handle, alloc);
   vn_DestroyBuffer(dev_handle, fb_buf->buf_handle, alloc);
   vk_free(alloc, fb_buf);
}

static inline uint32_t
vn_get_feedback_buffer_alignment(struct vn_device *dev,
                                 struct vn_feedback_buffer *fb_buf)
{
   struct vn_buffer *buf = vn_buffer_from_handle(fb_buf->buf_handle);
   return align(buf->requirements.memory.memoryRequirements.alignment,
                dev->physical_device->wa_min_fb_align);
}

/* Add a fresh feedback buffer of pool->size bytes to the pool and reset the
 * allocation cursor so that subsequent allocations come from it.
 *
 * The "_locked" suffix suggests the caller holds pool->mutex; this function
 * takes no lock itself. Returns the result of vn_feedback_buffer_create on
 * failure, in which case the pool is left unmodified.
 */
static VkResult
vn_feedback_pool_grow_locked(struct vn_feedback_pool *pool)
{
   VN_TRACE_FUNC();

   struct vn_feedback_buffer *new_buf = NULL;
   const VkResult result =
      vn_feedback_buffer_create(pool->dev, pool->size, pool->alloc, &new_buf);
   if (result != VK_SUCCESS)
      return result;

   /* vn_feedback_pool_alloc_locked reads the active buffer back with
    * list_first_entry on pool->fb_bufs
    */
   list_add(&new_buf->head, &pool->fb_bufs);

   pool->alignment = vn_get_feedback_buffer_alignment(pool->dev, new_buf);
   pool->used = 0;

   return VK_SUCCESS;
}

/* Initialize an empty feedback pool.
 *
 * No feedback buffer is created here. `size` is stored as the size passed to
 * vn_feedback_buffer_create whenever the pool grows. Always returns
 * VK_SUCCESS.
 */
VkResult
vn_feedback_pool_init(struct vn_device *dev,
                      struct vn_feedback_pool *pool,
                      uint32_t size,
                      const VkAllocationCallbacks *alloc)
{
   simple_mtx_init(&pool->mutex, mtx_plain);

   list_inithead(&pool->free_slots);
   list_inithead(&pool->fb_bufs);

   pool->alloc = alloc;
   pool->dev = dev;

   /* used == size leaves no free space and alignment == 1 keeps the size
    * check meaningful before any buffer exists, so the first allocation
    * takes the grow path in vn_feedback_pool_alloc_locked
    */
   pool->size = size;
   pool->used = size;
   pool->alignment = 1;

   return VK_SUCCESS;
}

void
vn_feedback_pool_fini(struct vn_feedback_pool *pool)
{
   list_for_each_entry_safe(struct vn_feedback_slot, slot, &pool->free_slots,
                            head)
      vk_free(pool->alloc, slot);

   list_for_each_entry_safe(struct vn_feedback_buffer, fb_buf, &pool->fb_bufs,
                            head)
      vn_feedback_buffer_destroy(pool->dev, fb_buf, pool->alloc);

   simple_mtx_destroy(&pool->mutex);
}

/* Carve `size` bytes (rounded up to pool->alignment) out of the pool's
 * current feedback buffer, growing the pool first when the remaining space
 * is insufficient.
 *
 * On success returns the buffer the range lives in and stores the byte
 * offset of the range in *out_offset. Returns NULL when growing fails;
 * *out_offset is not written in that case.
 */
static struct vn_feedback_buffer *
vn_feedback_pool_alloc_locked(struct vn_feedback_pool *pool,
                              uint32_t size,
                              uint32_t *out_offset)
{
   uint32_t aligned_size = align(size, pool->alignment);

   /* The initial values of pool->used and pool->alignment are chosen so
    * that the very first allocation lands here and creates the first buffer.
    */
   if (unlikely(aligned_size > pool->size - pool->used)) {
      if (vn_feedback_pool_grow_locked(pool) != VK_SUCCESS)
         return NULL;

      /* growing updates pool->alignment, so the rounding must be redone */
      aligned_size = align(size, pool->alignment);
      assert(aligned_size <= pool->size - pool->used);
   }

   *out_offset = pool->used;
   pool->used += aligned_size;

   return list_first_entry(&pool->fb_bufs, struct vn_feedback_buffer, head);
}

/* Hand out a feedback slot of the given type.
 *
 * A slot from the pool's free list is reused when one exists; only its type
 * is updated. Otherwise a new slot struct is allocated and bound to a fresh
 * 8-byte range of the pool's current feedback buffer.
 *
 * Returns NULL when the slot struct cannot be allocated or the pool cannot
 * provide buffer space.
 */
struct vn_feedback_slot *
vn_feedback_pool_alloc(struct vn_feedback_pool *pool,
                       enum vn_feedback_type type)
{
   static const uint32_t slot_size = 8;
   struct vn_feedback_slot *slot;
   struct vn_feedback_buffer *fb_buf = NULL;
   uint32_t offset = 0;

   simple_mtx_lock(&pool->mutex);
   const bool recycled = !list_is_empty(&pool->free_slots);
   if (recycled) {
      slot =
         list_first_entry(&pool->free_slots, struct vn_feedback_slot, head);
      list_del(&slot->head);
   } else {
      slot = vk_alloc(pool->alloc, sizeof(*slot), VN_DEFAULT_ALIGN,
                      VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
      if (slot)
         fb_buf = vn_feedback_pool_alloc_locked(pool, slot_size, &offset);
   }
   simple_mtx_unlock(&pool->mutex);

   if (recycled) {
      /* offset, buffer and mapping of a recycled slot stay as they were */
      slot->type = type;
      return slot;
   }

   if (!slot)
      return NULL;

   if (!fb_buf) {
      vk_free(pool->alloc, slot);
      return NULL;
   }

   slot->type = type;
   slot->offset = offset;
   slot->buf_handle = fb_buf->buf_handle;
   /* CPU pointer into the mapping set up by vn_feedback_buffer_create */
   slot->data = fb_buf->data + offset;

   return slot;
}

/* Return a slot to the pool.
 *
 * The slot struct is not freed; it is put on the pool's free list under the
 * pool mutex so that vn_feedback_pool_alloc can hand it out again.
 */
void
vn_feedback_pool_free(struct vn_feedback_pool *pool,
                      struct vn_feedback_slot *slot)
{
   simple_mtx_lock(&pool->mutex);

   list_add(&slot->head, &pool->free_slots);

   simple_mtx_unlock(&pool->mutex);
}

/* Return true when none of the upper 32 bits of x are set, i.e. the value
 * can be stored in a 32-bit mask without losing bits.
 */
static inline bool
mask_is_32bit(uint64_t x)
{
   return (x >> 32) == 0;
}

/* Translate a VkDependencyInfo holding exactly one VkBufferMemoryBarrier2
 * into a VkBufferMemoryBarrier plus separate source/destination stage masks,
 * as consumed by vn_CmdPipelineBarrier.
 *
 * Asserts that dep_info carries no pNext chain and no other barriers, and
 * that the barrier's stage and access masks fit in 32 bits.
 */
static void
vn_build_buffer_memory_barrier(const VkDependencyInfo *dep_info,
                               VkBufferMemoryBarrier *barrier1,
                               VkPipelineStageFlags *src_stage_mask,
                               VkPipelineStageFlags *dst_stage_mask)
{
   assert(dep_info->pNext == NULL);
   assert(dep_info->memoryBarrierCount == 0);
   assert(dep_info->bufferMemoryBarrierCount == 1);
   assert(dep_info->imageMemoryBarrierCount == 0);

   const VkBufferMemoryBarrier2 *src = dep_info->pBufferMemoryBarriers;

   assert(src->pNext == NULL);
   /* the masks are narrowed to 32-bit fields below */
   assert(mask_is_32bit(src->srcStageMask));
   assert(mask_is_32bit(src->srcAccessMask));
   assert(mask_is_32bit(src->dstStageMask));
   assert(mask_is_32bit(src->dstAccessMask));

   const VkBufferMemoryBarrier converted = {
      .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
      .pNext = NULL,
      .srcAccessMask = src->srcAccessMask,
      .dstAccessMask = src->dstAccessMask,
      .srcQueueFamilyIndex = src->srcQueueFamilyIndex,
      .dstQueueFamilyIndex = src->dstQueueFamilyIndex,
      .buffer = src->buffer,
      .offset = src->offset,
      .size = src->size,
   };
   *barrier1 = converted;

   /* the stage masks are returned separately from the barrier struct */
   *src_stage_mask = src->srcStageMask;
   *dst_stage_mask = src->dstStageMask;
}

/* Record the single buffer memory barrier described by dep_info.
 *
 * With sync2 set the dependency is recorded as-is via
 * vn_CmdPipelineBarrier2; otherwise it is converted and recorded via
 * vn_CmdPipelineBarrier.
 */
static void
vn_cmd_buffer_memory_barrier(VkCommandBuffer cmd_handle,
                             const VkDependencyInfo *dep_info,
                             bool sync2)
{
   if (sync2) {
      vn_CmdPipelineBarrier2(cmd_handle, dep_info);
      return;
   }

   VkBufferMemoryBarrier legacy_barrier;
   VkPipelineStageFlags src_stages;
   VkPipelineStageFlags dst_stages;
   vn_build_buffer_memory_barrier(dep_info, &legacy_barrier, &src_stages,
                                  &dst_stages);

   vn_CmdPipelineBarrier(cmd_handle, src_stages, dst_stages,
                         dep_info->dependencyFlags, 0, NULL, 1,
                         &legacy_barrier, 0, NULL);
}

/* Record the feedback update for an intercepted vkCmdSetEvent or
 * vkCmdResetEvent: a barrier, a 4-byte fill of the event's feedback slot
 * with `status`, and a barrier making the fill visible to the host.
 *
 * Does nothing when the event has no feedback slot. `sync2` selects between
 * vn_CmdPipelineBarrier2 and vn_CmdPipelineBarrier for the barriers.
 */
void
vn_event_feedback_cmd_record(VkCommandBuffer cmd_handle,
                             VkEvent ev_handle,
                             VkPipelineStageFlags2 src_stage_mask,
                             VkResult status,
                             bool sync2)
{
   /* The feedback is injected after the event call rather than before it.
    * This avoids introducing an unexpected src stage wait on
    * VK_PIPELINE_STAGE_HOST_BIT and VK_PIPELINE_STAGE_TRANSFER_BIT when
    * vkCmdSetEvent or vkCmdResetEvent are not already waiting on them. The
    * resulting delay of the feedback signal is acceptable given the nature
    * of VkEvent, and the lifecycle of the event feedback cmds is guarded by
    * the intercepted command buffer.
    */
   struct vn_feedback_slot *slot =
      vn_event_from_handle(ev_handle)->feedback_slot;
   if (!slot)
      return;

   STATIC_ASSERT(sizeof(*slot->status) == 4);
   const VkDeviceSize status_size = 4;

   /* order the fill after earlier host and transfer writes to the slot */
   const VkBufferMemoryBarrier2 barrier_before = {
      .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
      .srcStageMask = src_stage_mask | VK_PIPELINE_STAGE_HOST_BIT |
                      VK_PIPELINE_STAGE_TRANSFER_BIT,
      .srcAccessMask =
         VK_ACCESS_HOST_WRITE_BIT | VK_ACCESS_TRANSFER_WRITE_BIT,
      .dstStageMask = VK_PIPELINE_STAGE_TRANSFER_BIT,
      .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
      .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
      .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
      .buffer = slot->buf_handle,
      .offset = slot->offset,
      .size = status_size,
   };
   const VkDependencyInfo dep_before = {
      .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
      .dependencyFlags = 0,
      .bufferMemoryBarrierCount = 1,
      .pBufferMemoryBarriers = &barrier_before,
   };
   vn_cmd_buffer_memory_barrier(cmd_handle, &dep_before, sync2);

   vn_CmdFillBuffer(cmd_handle, slot->buf_handle, slot->offset, status_size,
                    status);

   /* make the fill available to host reads and writes */
   const VkBufferMemoryBarrier2 barrier_after = {
      .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
      .srcStageMask = VK_PIPELINE_STAGE_TRANSFER_BIT,
      .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
      .dstStageMask = VK_PIPELINE_STAGE_HOST_BIT,
      .dstAccessMask = VK_ACCESS_HOST_READ_BIT | VK_ACCESS_HOST_WRITE_BIT,
      .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
      .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
      .buffer = slot->buf_handle,
      .offset = slot->offset,
      .size = status_size,
   };
   const VkDependencyInfo dep_after = {
      .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
      .dependencyFlags = 0,
      .bufferMemoryBarrierCount = 1,
      .pBufferMemoryBarriers = &barrier_after,
   };
   vn_cmd_buffer_memory_barrier(cmd_handle, &dep_after, sync2);
}

/* Record a buffer memory barrier from transfer writes to host reads and
 * writes over [offset, offset + size) of `buffer`, so that a preceding
 * transfer write to a feedback buffer can be observed by the host.
 */
static inline void
vn_feedback_cmd_record_flush_barrier(VkCommandBuffer cmd_handle,
                                     VkBuffer buffer,
                                     VkDeviceSize offset,
                                     VkDeviceSize size)
{
   const VkBufferMemoryBarrier transfer_to_host = {
      .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
      .pNext = NULL,
      .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
      .dstAccessMask = VK_ACCESS_HOST_READ_BIT | VK_ACCESS_HOST_WRITE_BIT,
      .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
      .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
      .buffer = buffer,
      .offset = offset,
      .size = size,
   };

   vn_CmdPipelineBarrier(cmd_handle,
                         /* srcStageMask */ VK_PIPELINE_STAGE_TRANSFER_BIT,
                         /* dstStageMask */ VK_PIPELINE_STAGE_HOST_BIT,
                         /* dependencyFlags */ 0,
                         /* memory barriers */ 0, NULL,
                         /* buffer barriers */ 1, &transfer_to_host,
                         /* image barriers */ 0, NULL);
}

/* Record a complete feedback command buffer into cmd_handle.
 *
 * The recorded sequence is: begin, a pipeline barrier ordering the feedback
 * write after prior work, the feedback write itself, a flush barrier making
 * the write visible to the host, end.
 *
 * With a non-NULL src_slot (timeline semaphore feedback) the write is an
 * 8-byte copy of the counter from src_slot to dst_slot. With a NULL src_slot
 * (fence feedback) the write is a 4-byte fill of dst_slot with VK_SUCCESS.
 *
 * Returns the result of vn_BeginCommandBuffer when that fails, otherwise the
 * result of vn_EndCommandBuffer.
 */
static VkResult
vn_feedback_cmd_record(VkCommandBuffer cmd_handle,
                       struct vn_feedback_slot *dst_slot,
                       struct vn_feedback_slot *src_slot)
{
   STATIC_ASSERT(sizeof(*dst_slot->status) == 4);
   STATIC_ASSERT(sizeof(*dst_slot->counter) == 8);
   STATIC_ASSERT(sizeof(*src_slot->counter) == 8);

   /* slot size is 8 bytes for timeline semaphore and 4 bytes fence.
    * src slot is non-null for timeline semaphore.
    */
   const VkDeviceSize buf_size = src_slot ? 8 : 4;

   static const VkCommandBufferBeginInfo begin_info = {
      .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
      .pNext = NULL,
      .flags = 0,
      .pInheritanceInfo = NULL,
   };
   VkResult result = vn_BeginCommandBuffer(cmd_handle, &begin_info);
   if (result != VK_SUCCESS)
      return result;

   static const VkMemoryBarrier mem_barrier_before = {
      .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
      .pNext = NULL,
      /* make pending writes available to stay close to signal op */
      .srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT,
      /* no need to make all memory visible for feedback update */
      .dstAccessMask = 0,
   };

   const VkBufferMemoryBarrier buf_barrier_before = {
      .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
      .pNext = NULL,
      /* slot memory has been made available via mem_barrier_before */
      .srcAccessMask = 0,
      .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
      .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
      .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
      .buffer = dst_slot->buf_handle,
      .offset = dst_slot->offset,
      .size = buf_size,
   };

   /* host writes for src_slots should implicitly be made visible upon
    * QueueSubmit call */
   vn_CmdPipelineBarrier(cmd_handle, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
                         VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 1,
                         &mem_barrier_before, 1, &buf_barrier_before, 0,
                         NULL);

   /* If passed a src_slot, timeline semaphore feedback records a
    * cmd to copy the counter value from the src slot to the dst slot.
    * If src_slot is NULL, then fence feedback records a cmd to fill
    * the dst slot with VK_SUCCESS.
    */
   if (src_slot) {
      assert(src_slot->type == VN_FEEDBACK_TYPE_SEMAPHORE);
      assert(dst_slot->type == VN_FEEDBACK_TYPE_SEMAPHORE);

      const VkBufferCopy buffer_copy = {
         .srcOffset = src_slot->offset,
         .dstOffset = dst_slot->offset,
         .size = buf_size,
      };
      vn_CmdCopyBuffer(cmd_handle, src_slot->buf_handle, dst_slot->buf_handle,
                       1, &buffer_copy);
   } else {
      assert(dst_slot->type == VN_FEEDBACK_TYPE_FENCE);

      vn_CmdFillBuffer(cmd_handle, dst_slot->buf_handle, dst_slot->offset,
                       buf_size, VK_SUCCESS);
   }

   /* make the transfer write to dst_slot visible to the host */
   vn_feedback_cmd_record_flush_barrier(cmd_handle, dst_slot->buf_handle,
                                        dst_slot->offset, buf_size);

   return vn_EndCommandBuffer(cmd_handle);
}

/* Allocate a semaphore feedback cmd for dst_slot.
 *
 * A source slot is taken from the device feedback pool and one feedback
 * command buffer copying from that source slot to dst_slot is recorded per
 * device queue family, each from the matching entry of dev->fb_cmd_pools.
 *
 * Returns NULL on any failure after releasing whatever was acquired.
 */
struct vn_semaphore_feedback_cmd *
vn_semaphore_feedback_cmd_alloc(struct vn_device *dev,
                                struct vn_feedback_slot *dst_slot)
{
   const VkAllocationCallbacks *alloc = &dev->base.base.alloc;
   VkDevice dev_handle = vn_device_to_handle(dev);
   struct vn_semaphore_feedback_cmd *sfb_cmd;
   VkCommandBuffer *cmd_handles;
   uint32_t recorded = 0;

   /* the struct and its command buffer handle array share one allocation */
   VK_MULTIALLOC(ma);
   vk_multialloc_add(&ma, &sfb_cmd, __typeof__(*sfb_cmd), 1);
   vk_multialloc_add(&ma, &cmd_handles, __typeof__(*cmd_handles),
                     dev->queue_family_count);
   if (!vk_multialloc_zalloc(&ma, alloc, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT))
      return NULL;

   struct vn_feedback_slot *src_slot =
      vn_feedback_pool_alloc(&dev->feedback_pool, VN_FEEDBACK_TYPE_SEMAPHORE);
   if (!src_slot)
      goto fail_free_cmd;

   for (; recorded < dev->queue_family_count; recorded++) {
      const VkResult result = vn_feedback_cmd_alloc(
         dev_handle, &dev->fb_cmd_pools[recorded], dst_slot, src_slot,
         &cmd_handles[recorded]);
      if (result != VK_SUCCESS)
         goto fail_free_recorded;
   }

   sfb_cmd->cmd_handles = cmd_handles;
   sfb_cmd->src_slot = src_slot;
   return sfb_cmd;

fail_free_recorded:
   /* only the command buffers allocated before the failure are freed */
   for (uint32_t i = 0; i < recorded; i++) {
      vn_feedback_cmd_free(dev_handle, &dev->fb_cmd_pools[i],
                           cmd_handles[i]);
   }
   vn_feedback_pool_free(&dev->feedback_pool, src_slot);

fail_free_cmd:
   vk_free(alloc, sfb_cmd);
   return NULL;
}

void
vn_semaphore_feedback_cmd_free(struct vn_device *dev,
                               struct vn_semaphore_feedback_cmd *sfb_cmd)
{
   const VkAllocationCallbacks *alloc = &dev->base.base.alloc;

   for (uint32_t i = 0; i < dev->queue_family_count; i++) {
      vn_feedback_cmd_free(vn_device_to_handle(dev), &dev->fb_cmd_pools[i],
                           sfb_cmd->cmd_handles[i]);
   }

   vn_feedback_pool_free(&dev->feedback_pool, sfb_cmd->src_slot);
   vk_free(alloc, sfb_cmd);
}

/* Record the feedback update for `count` queries starting at `query` of the
 * given query pool into cmd_handle.
 *
 * The pool's feedback buffer holds one slot per query, laid out back to back
 * (see slot_size below). When `copy` is true the query results are copied
 * into the slots with vn_CmdCopyQueryPoolResults; otherwise the slots are
 * filled with zero. In both cases the write is preceded by an ordering
 * barrier and followed by a flush barrier towards the host.
 *
 * The pool must have a feedback buffer (asserted).
 */
static void
vn_query_feedback_cmd_record_internal(VkCommandBuffer cmd_handle,
                                      VkQueryPool pool_handle,
                                      uint32_t query,
                                      uint32_t count,
                                      bool copy)
{
   struct vn_query_pool *pool = vn_query_pool_from_handle(pool_handle);
   assert(pool->fb_buf);

   /* Results are always 64 bit and include availability bit (also 64 bit) */
   const VkDeviceSize slot_size = (pool->result_array_size * 8) + 8;
   const VkDeviceSize offset = slot_size * query;
   const VkDeviceSize buf_size = slot_size * count;

   /* The first synchronization scope of vkCmdCopyQueryPoolResults does not
    * include the query feedback buffer. Insert a barrier to ensure ordering
    * against feedback buffer fill cmd injected in vkCmdResetQueryPool.
    *
    * The second synchronization scope of vkCmdResetQueryPool does not include
    * the query feedback buffer. Insert a barrier to ensure ordering against
    * prior cmds referencing the queries.
    *
    * For srcAccessMask, VK_ACCESS_TRANSFER_WRITE_BIT is sufficient since the
    * gpu cache invalidation for feedback buffer fill in vkResetQueryPool is
    * done implicitly via queue submission.
    */
   const VkPipelineStageFlags src_stage_mask =
      copy ? VK_PIPELINE_STAGE_TRANSFER_BIT
           : VK_PIPELINE_STAGE_ALL_COMMANDS_BIT;

   const VkBufferMemoryBarrier buf_barrier_before = {
      .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
      .pNext = NULL,
      .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
      .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
      .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
      .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
      .buffer = pool->fb_buf->buf_handle,
      .offset = offset,
      .size = buf_size,
   };
   vn_CmdPipelineBarrier(cmd_handle, src_stage_mask,
                         VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 0, NULL, 1,
                         &buf_barrier_before, 0, NULL);

   if (copy) {
      /* Per spec: "The first synchronization scope includes all commands
       * which reference the queries in queryPool indicated by query that
       * occur earlier in submission order. If flags does not include
       * VK_QUERY_RESULT_WAIT_BIT, vkCmdEndQueryIndexedEXT,
       * vkCmdWriteTimestamp2, vkCmdEndQuery, and vkCmdWriteTimestamp are
       * excluded from this scope."
       *
       * Set VK_QUERY_RESULT_WAIT_BIT to ensure ordering after
       * vkCmdEndQuery or vkCmdWriteTimestamp makes the query available.
       *
       * Set VK_QUERY_RESULT_64_BIT as we can convert it to 32 bit if app
       * requested that.
       *
       * Per spec: "vkCmdCopyQueryPoolResults is considered to be a transfer
       * operation, and its writes to buffer memory must be synchronized using
       * VK_PIPELINE_STAGE_TRANSFER_BIT and VK_ACCESS_TRANSFER_WRITE_BIT
       * before using the results."
       *
       * So we can reuse the flush barrier after this copy cmd.
       */
      vn_CmdCopyQueryPoolResults(cmd_handle, pool_handle, query, count,
                                 pool->fb_buf->buf_handle, offset, slot_size,
                                 VK_QUERY_RESULT_WITH_AVAILABILITY_BIT |
                                    VK_QUERY_RESULT_64_BIT |
                                    VK_QUERY_RESULT_WAIT_BIT);
   } else {
      /* zero the results and availability values of the affected slots */
      vn_CmdFillBuffer(cmd_handle, pool->fb_buf->buf_handle, offset, buf_size,
                       0);
   }

   vn_feedback_cmd_record_flush_barrier(cmd_handle, pool->fb_buf->buf_handle,
                                        offset, buf_size);
}

/* Record one feedback update per entry of query_records into the command
 * buffer of qfb_cmd, bracketed by begin/end.
 *
 * query_records must not be empty (asserted). Returns the result of
 * vn_BeginCommandBuffer when that fails, otherwise the result of
 * vn_EndCommandBuffer. dev_handle is not used by this function.
 */
static VkResult
vn_query_feedback_cmd_record(VkDevice dev_handle,
                             struct list_head *query_records,
                             struct vn_query_feedback_cmd *qfb_cmd)
{
   assert(!list_is_empty(query_records));

   VkCommandBuffer cmd_handle = qfb_cmd->cmd_handle;

   static const VkCommandBufferBeginInfo begin_info = {
      .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
   };
   const VkResult begin_result =
      vn_BeginCommandBuffer(cmd_handle, &begin_info);
   if (begin_result != VK_SUCCESS)
      return begin_result;

   list_for_each_entry_safe(struct vn_cmd_query_record, rec, query_records,
                            head) {
      VkQueryPool pool_handle = vn_query_pool_to_handle(rec->query_pool);

      vn_query_feedback_cmd_record_internal(cmd_handle, pool_handle,
                                            rec->query, rec->query_count,
                                            rec->copy);
   }

   return vn_EndCommandBuffer(cmd_handle);
}

/* Obtain a query feedback cmd from fb_cmd_pool and record the feedback
 * updates for query_records into it.
 *
 * A cmd from the pool's free list is reused (after resetting its command
 * buffer) when one exists; otherwise a new cmd and command buffer are
 * allocated. The whole operation runs under the pool mutex.
 *
 * On success the cmd is stored in *out_qfb_cmd and VK_SUCCESS is returned.
 * If recording fails, the cmd is put on the pool's free list and the error
 * is returned.
 */
VkResult
vn_query_feedback_cmd_alloc(VkDevice dev_handle,
                            struct vn_feedback_cmd_pool *fb_cmd_pool,
                            struct list_head *query_records,
                            struct vn_query_feedback_cmd **out_qfb_cmd)
{
   struct vn_query_feedback_cmd *qfb_cmd;
   VkResult result;

   simple_mtx_lock(&fb_cmd_pool->mutex);

   if (!list_is_empty(&fb_cmd_pool->free_qfb_cmds)) {
      /* recycle a previously freed cmd */
      qfb_cmd = list_first_entry(&fb_cmd_pool->free_qfb_cmds,
                                 struct vn_query_feedback_cmd, head);
      list_del(&qfb_cmd->head);
      /* NOTE(review): the result of this reset is not checked */
      vn_ResetCommandBuffer(qfb_cmd->cmd_handle, 0);
   } else {
      struct vn_command_pool *cmd_pool =
         vn_command_pool_from_handle(fb_cmd_pool->pool_handle);

      qfb_cmd = vk_alloc(&cmd_pool->allocator, sizeof(*qfb_cmd),
                         VN_DEFAULT_ALIGN, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
      if (!qfb_cmd) {
         result = VK_ERROR_OUT_OF_HOST_MEMORY;
         goto unlock;
      }

      const VkCommandBufferAllocateInfo alloc_info = {
         .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
         .commandPool = fb_cmd_pool->pool_handle,
         .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
         .commandBufferCount = 1,
      };
      VkCommandBuffer new_cmd_handle;
      result =
         vn_AllocateCommandBuffers(dev_handle, &alloc_info, &new_cmd_handle);
      if (result != VK_SUCCESS) {
         vk_free(&cmd_pool->allocator, qfb_cmd);
         goto unlock;
      }

      qfb_cmd->fb_cmd_pool = fb_cmd_pool;
      qfb_cmd->cmd_handle = new_cmd_handle;
   }

   result = vn_query_feedback_cmd_record(dev_handle, query_records, qfb_cmd);
   if (result != VK_SUCCESS) {
      /* keep the cmd around for a later attempt instead of destroying it */
      list_add(&qfb_cmd->head, &fb_cmd_pool->free_qfb_cmds);
      goto unlock;
   }

   *out_qfb_cmd = qfb_cmd;

unlock:
   simple_mtx_unlock(&fb_cmd_pool->mutex);

   return result;
}

void
vn_query_feedback_cmd_free(struct vn_query_feedback_cmd *qfb_cmd)
{
   simple_mtx_lock(&qfb_cmd->fb_cmd_pool->mutex);
   list_add(&qfb_cmd->head, &qfb_cmd->fb_cmd_pool->free_qfb_cmds);
   simple_mtx_unlock(&qfb_cmd->fb_cmd_pool->mutex);
}

/* Allocate a primary command buffer from fb_cmd_pool and record the feedback
 * update from src_slot (may be NULL) to dst_slot into it, all under the pool
 * mutex.
 *
 * On success the command buffer is stored in *out_cmd_handle and VK_SUCCESS
 * is returned. If recording fails the command buffer is freed again; on any
 * failure *out_cmd_handle is left untouched and the error is returned.
 */
VkResult
vn_feedback_cmd_alloc(VkDevice dev_handle,
                      struct vn_feedback_cmd_pool *fb_cmd_pool,
                      struct vn_feedback_slot *dst_slot,
                      struct vn_feedback_slot *src_slot,
                      VkCommandBuffer *out_cmd_handle)
{
   VkCommandPool pool_handle = fb_cmd_pool->pool_handle;
   const VkCommandBufferAllocateInfo alloc_info = {
      .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
      .pNext = NULL,
      .commandPool = pool_handle,
      .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
      .commandBufferCount = 1,
   };
   VkCommandBuffer new_cmd;
   VkResult result;

   simple_mtx_lock(&fb_cmd_pool->mutex);

   result = vn_AllocateCommandBuffers(dev_handle, &alloc_info, &new_cmd);
   if (result == VK_SUCCESS) {
      result = vn_feedback_cmd_record(new_cmd, dst_slot, src_slot);
      if (result == VK_SUCCESS)
         *out_cmd_handle = new_cmd;
      else
         vn_FreeCommandBuffers(dev_handle, pool_handle, 1, &new_cmd);
   }

   simple_mtx_unlock(&fb_cmd_pool->mutex);

   return result;
}

/* Free a feedback command buffer back to the command pool of fb_cmd_pool.
 * The call is made while holding the pool mutex.
 */
void
vn_feedback_cmd_free(VkDevice dev_handle,
                     struct vn_feedback_cmd_pool *fb_cmd_pool,
                     VkCommandBuffer cmd_handle)
{
   simple_mtx_lock(&fb_cmd_pool->mutex);

   vn_FreeCommandBuffers(dev_handle, fb_cmd_pool->pool_handle,
                         /* commandBufferCount */ 1, &cmd_handle);

   simple_mtx_unlock(&fb_cmd_pool->mutex);
}

VkResult
vn_feedback_cmd_pools_init(struct vn_device *dev)
{
   const VkAllocationCallbacks *alloc = &dev->base.base.alloc;
   VkDevice dev_handle = vn_device_to_handle(dev);
   struct vn_feedback_cmd_pool *fb_cmd_pools;
   VkCommandPoolCreateInfo info = {
      .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
      .pNext = NULL,
      .flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT,
   };

   if (VN_PERF(NO_FENCE_FEEDBACK) && VN_PERF(NO_SEMAPHORE_FEEDBACK) &&
       VN_PERF(NO_QUERY_FEEDBACK))
      return VK_SUCCESS;

   assert(dev->queue_family_count);

   fb_cmd_pools =
      vk_zalloc(alloc, sizeof(*fb_cmd_pools) * dev->queue_family_count,
                VN_DEFAULT_ALIGN, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
   if (!fb_cmd_pools)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   for (uint32_t i = 0; i < dev->queue_family_count; i++) {
      VkResult result;

      info.queueFamilyIndex = dev->queue_families[i];
      result = vn_CreateCommandPool(dev_handle, &info, alloc,
                                    &fb_cmd_pools[i].pool_handle);
      if (result != VK_SUCCESS) {
         for (uint32_t j = 0; j < i; j++) {
            vn_DestroyCommandPool(dev_handle, fb_cmd_pools[j].pool_handle,
                                  alloc);
            simple_mtx_destroy(&fb_cmd_pools[j].mutex);
         }

         vk_free(alloc, fb_cmd_pools);
         return result;
      }

      simple_mtx_init(&fb_cmd_pools[i].mutex, mtx_plain);
      list_inithead(&fb_cmd_pools[i].free_qfb_cmds);
   }

   dev->fb_cmd_pools = fb_cmd_pools;

   return VK_SUCCESS;
}

void
vn_feedback_cmd_pools_fini(struct vn_device *dev)
{
   const VkAllocationCallbacks *alloc = &dev->base.base.alloc;
   VkDevice dev_handle = vn_device_to_handle(dev);

   if (!dev->fb_cmd_pools)
      return;

   for (uint32_t i = 0; i < dev->queue_family_count; i++) {
      list_for_each_entry_safe(struct vn_query_feedback_cmd, feedback_cmd,
                               &dev->fb_cmd_pools[i].free_qfb_cmds, head)
         vk_free(alloc, feedback_cmd);

      vn_DestroyCommandPool(dev_handle, dev->fb_cmd_pools[i].pool_handle,
                            alloc);
      simple_mtx_destroy(&dev->fb_cmd_pools[i].mutex);
   }

   vk_free(alloc, dev->fb_cmd_pools);
}
