/*
 * Copyright © 2022 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "i915/anv_batch_chain.h"
#include "anv_private.h"
#include "anv_measure.h"

#include "perf/intel_perf.h"
#include "util/u_debug.h"

#include "drm-uapi/i915_drm.h"

struct anv_execbuf {
   struct drm_i915_gem_execbuffer2           execbuf;

   struct drm_i915_gem_execbuffer_ext_timeline_fences timeline_fences;

   struct drm_i915_gem_exec_object2 *        objects;
   uint32_t                                  bo_count;
   uint32_t                                  bo_array_length;
   struct anv_bo **                          bos;

   uint32_t                                  syncobj_count;
   uint32_t                                  syncobj_array_length;
   struct drm_i915_gem_exec_fence *          syncobjs;
   uint64_t *                                syncobj_values;

   uint32_t                                  cmd_buffer_count;
   struct anv_query_pool                     *perf_query_pool;

   const VkAllocationCallbacks *             alloc;
   VkSystemAllocationScope                   alloc_scope;
};

static void
anv_execbuf_finish(struct anv_execbuf *exec)
{
   vk_free(exec->alloc, exec->syncobjs);
   vk_free(exec->alloc, exec->syncobj_values);
   vk_free(exec->alloc, exec->objects);
   vk_free(exec->alloc, exec->bos);
}

static void
anv_execbuf_add_ext(struct anv_execbuf *exec,
                    uint32_t ext_name,
                    struct i915_user_extension *ext)
{
   __u64 *iter = &exec->execbuf.cliprects_ptr;

   exec->execbuf.flags |= I915_EXEC_USE_EXTENSIONS;

   while (*iter != 0) {
      iter = (__u64 *) &((struct i915_user_extension *)(uintptr_t)*iter)->next_extension;
   }

   ext->name = ext_name;

   *iter = (uintptr_t) ext;
}

static VkResult
anv_execbuf_add_bo_bitset(struct anv_device *device,
                          struct anv_execbuf *exec,
                          uint32_t dep_words,
                          BITSET_WORD *deps,
                          uint32_t extra_flags);

static VkResult
anv_execbuf_add_bo(struct anv_device *device,
                   struct anv_execbuf *exec,
                   struct anv_bo *bo,
                   struct anv_reloc_list *relocs,
                   uint32_t extra_flags)
{
   struct drm_i915_gem_exec_object2 *obj = NULL;

   if (bo->exec_obj_index < exec->bo_count &&
       exec->bos[bo->exec_obj_index] == bo)
      obj = &exec->objects[bo->exec_obj_index];

   if (obj == NULL) {
      /* We've never seen this one before.  Add it to the list and assign
       * an id that we can use later.
       */
      if (exec->bo_count >= exec->bo_array_length) {
         uint32_t new_len = exec->objects ? exec->bo_array_length * 2 : 64;

         struct drm_i915_gem_exec_object2 *new_objects =
            vk_realloc(exec->alloc, exec->objects,
                       new_len * sizeof(*new_objects), 8, exec->alloc_scope);
         if (new_objects == NULL)
            return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

         exec->objects = new_objects;

         struct anv_bo **new_bos =
            vk_realloc(exec->alloc, exec->bos, new_len * sizeof(*new_bos), 8,
                       exec->alloc_scope);
         if (new_bos == NULL)
            return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

         exec->bos = new_bos;
         exec->bo_array_length = new_len;
      }

      assert(exec->bo_count < exec->bo_array_length);

      bo->exec_obj_index = exec->bo_count++;
      obj = &exec->objects[bo->exec_obj_index];
      exec->bos[bo->exec_obj_index] = bo;

      obj->handle = bo->gem_handle;
      obj->relocation_count = 0;
      obj->relocs_ptr = 0;
      obj->alignment = 0;
      obj->offset = bo->offset;
      obj->flags = bo->flags | extra_flags;
      obj->rsvd1 = 0;
      obj->rsvd2 = 0;
   }

   if (extra_flags & EXEC_OBJECT_WRITE) {
      obj->flags |= EXEC_OBJECT_WRITE;
      obj->flags &= ~EXEC_OBJECT_ASYNC;
   }

   if (relocs != NULL) {
      return anv_execbuf_add_bo_bitset(device, exec, relocs->dep_words,
                                       relocs->deps, extra_flags);
   }

   return VK_SUCCESS;
}

/* Add BO dependencies to execbuf */
static VkResult
anv_execbuf_add_bo_bitset(struct anv_device *device,
                          struct anv_execbuf *exec,
                          uint32_t dep_words,
                          BITSET_WORD *deps,
                          uint32_t extra_flags)
{
   for (uint32_t w = 0; w < dep_words; w++) {
      BITSET_WORD mask = deps[w];
      while (mask) {
         int i = u_bit_scan(&mask);
         uint32_t gem_handle = w * BITSET_WORDBITS + i;
         struct anv_bo *bo = anv_device_lookup_bo(device, gem_handle);
         assert(bo->refcount > 0);
         VkResult result =
            anv_execbuf_add_bo(device, exec, bo, NULL, extra_flags);
         if (result != VK_SUCCESS)
            return result;
      }
   }

   return VK_SUCCESS;
}

static VkResult
anv_execbuf_add_syncobj(struct anv_device *device,
                        struct anv_execbuf *exec,
                        uint32_t syncobj,
                        uint32_t flags,
                        uint64_t timeline_value)
{
   if (exec->syncobj_count >= exec->syncobj_array_length) {
      uint32_t new_len = MAX2(exec->syncobj_array_length * 2, 16);

      struct drm_i915_gem_exec_fence *new_syncobjs =
         vk_realloc(exec->alloc, exec->syncobjs,
                    new_len * sizeof(*new_syncobjs), 8, exec->alloc_scope);
      if (new_syncobjs == NULL)
         return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

      exec->syncobjs = new_syncobjs;

      if (exec->syncobj_values) {
         uint64_t *new_syncobj_values =
            vk_realloc(exec->alloc, exec->syncobj_values,
                       new_len * sizeof(*new_syncobj_values), 8,
                       exec->alloc_scope);
         if (new_syncobj_values == NULL)
            return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

         exec->syncobj_values = new_syncobj_values;
      }

      exec->syncobj_array_length = new_len;
   }

   if (timeline_value && !exec->syncobj_values) {
      exec->syncobj_values =
         vk_zalloc(exec->alloc, exec->syncobj_array_length *
                                sizeof(*exec->syncobj_values),
                   8, exec->alloc_scope);
      if (!exec->syncobj_values)
         return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
   }

   exec->syncobjs[exec->syncobj_count] = (struct drm_i915_gem_exec_fence) {
      .handle = syncobj,
      .flags = flags,
   };
   if (exec->syncobj_values)
      exec->syncobj_values[exec->syncobj_count] = timeline_value;

   exec->syncobj_count++;

   return VK_SUCCESS;
}

static VkResult
anv_execbuf_add_sync(struct anv_device *device,
                     struct anv_execbuf *execbuf,
                     struct vk_sync *sync,
                     bool is_signal,
                     uint64_t value)
{
   /* It's illegal to signal a timeline with value 0 because that's never
    * higher than the current value.  A timeline wait on value 0 is always
    * trivial because 0 <= uint64_t always.
    */
   if ((sync->flags & VK_SYNC_IS_TIMELINE) && value == 0)
      return VK_SUCCESS;

   if (vk_sync_is_anv_bo_sync(sync)) {
      struct anv_bo_sync *bo_sync =
         container_of(sync, struct anv_bo_sync, sync);

      assert(is_signal == (bo_sync->state == ANV_BO_SYNC_STATE_RESET));

      return anv_execbuf_add_bo(device, execbuf, bo_sync->bo, NULL,
                                is_signal ? EXEC_OBJECT_WRITE : 0);
   } else if (vk_sync_type_is_drm_syncobj(sync->type)) {
      struct vk_drm_syncobj *syncobj = vk_sync_as_drm_syncobj(sync);

      if (!(sync->flags & VK_SYNC_IS_TIMELINE))
         value = 0;

      return anv_execbuf_add_syncobj(device, execbuf, syncobj->syncobj,
                                     is_signal ? I915_EXEC_FENCE_SIGNAL :
                                                 I915_EXEC_FENCE_WAIT,
                                     value);
   }

   unreachable("Invalid sync type");
}

static VkResult
setup_execbuf_for_cmd_buffer(struct anv_execbuf *execbuf,
                             struct anv_cmd_buffer *cmd_buffer)
{
   VkResult result;
   /* Add surface dependencies (BOs) to the execbuf */
   result = anv_execbuf_add_bo_bitset(cmd_buffer->device, execbuf,
                                      cmd_buffer->surface_relocs.dep_words,
                                      cmd_buffer->surface_relocs.deps, 0);
   if (result != VK_SUCCESS)
      return result;

   /* First, we walk over all of the bos we've seen and add them and their
    * relocations to the validate list.
    */
   struct anv_batch_bo **bbo;
   u_vector_foreach(bbo, &cmd_buffer->seen_bbos) {
      result = anv_execbuf_add_bo(cmd_buffer->device, execbuf,
                                  (*bbo)->bo, &(*bbo)->relocs, 0);
      if (result != VK_SUCCESS)
         return result;
   }

   struct anv_bo **bo_entry;
   u_vector_foreach(bo_entry, &cmd_buffer->dynamic_bos) {
      result = anv_execbuf_add_bo(cmd_buffer->device, execbuf,
                                  *bo_entry, NULL, 0);
      if (result != VK_SUCCESS)
         return result;
   }

   return VK_SUCCESS;
}

static VkResult
pin_state_pool(struct anv_device *device,
               struct anv_execbuf *execbuf,
               struct anv_state_pool *pool)
{
   anv_block_pool_foreach_bo(bo, &pool->block_pool) {
      VkResult result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0);
      if (result != VK_SUCCESS)
         return result;
   }

   return VK_SUCCESS;
}

static void
get_context_and_exec_flags(struct anv_queue *queue,
                           bool is_companion_rcs_batch,
                           uint64_t *exec_flags,
                           uint32_t *context_id)
{
   assert(queue != NULL);

   struct anv_device *device = queue->device;

   /** Submit batch to index 0 which is the main virtual engine */
   *exec_flags = device->physical->has_vm_control ? 0 : queue->exec_flags;

   *context_id = device->physical->has_vm_control ?
                 is_companion_rcs_batch ?
                 queue->companion_rcs_id :
                 queue->context_id :
                 device->context_id;
}

static VkResult
anv_execbuf_add_trtt_bos(struct anv_device *device,
                         struct anv_execbuf *execbuf)
{
   struct anv_trtt *trtt = &device->trtt;
   VkResult result = VK_SUCCESS;

   /* If l3_addr is zero we're not using TR-TT, there's no bo to add. */
   if (!trtt->l3_addr)
      return VK_SUCCESS;

   simple_mtx_lock(&trtt->mutex);

   for (int i = 0; i < trtt->num_page_table_bos; i++) {
      result = anv_execbuf_add_bo(device, execbuf, trtt->page_table_bos[i],
                                  NULL, 0);
      if (result != VK_SUCCESS)
         goto out;
   }

out:
   simple_mtx_unlock(&trtt->mutex);
   return result;
}

static VkResult
setup_execbuf_for_cmd_buffers(struct anv_execbuf *execbuf,
                              struct anv_queue *queue,
                              struct anv_cmd_buffer **cmd_buffers,
                              uint32_t num_cmd_buffers)
{
   struct anv_device *device = queue->device;
   VkResult result;

   if (unlikely(device->physical->measure_device.config)) {
      for (uint32_t i = 0; i < num_cmd_buffers; i++)
         anv_measure_submit(cmd_buffers[i]);
   }

   /* Edit the tail of the command buffers to chain them all together if they
    * can be.
    */
   anv_cmd_buffer_chain_command_buffers(cmd_buffers, num_cmd_buffers);

   for (uint32_t i = 0; i < num_cmd_buffers; i++) {
      result = setup_execbuf_for_cmd_buffer(execbuf, cmd_buffers[i]);
      if (result != VK_SUCCESS)
         return result;
   }

   /* Add all the global BOs to the object list for softpin case. */
   result = pin_state_pool(device, execbuf, &device->scratch_surface_state_pool);
   if (result != VK_SUCCESS)
      return result;

   if (device->physical->va.bindless_surface_state_pool.size > 0) {
      result = pin_state_pool(device, execbuf, &device->bindless_surface_state_pool);
      if (result != VK_SUCCESS)
         return result;
   }

   if (device->physical->va.indirect_push_descriptor_pool.size > 0) {
      result = pin_state_pool(device, execbuf, &device->indirect_push_descriptor_pool);
      if (result != VK_SUCCESS)
         return result;
   }

   result = pin_state_pool(device, execbuf, &device->internal_surface_state_pool);
   if (result != VK_SUCCESS)
      return result;

   result = pin_state_pool(device, execbuf, &device->dynamic_state_pool);
   if (result != VK_SUCCESS)
      return result;

   result = pin_state_pool(device, execbuf, &device->general_state_pool);
   if (result != VK_SUCCESS)
      return result;

   result = pin_state_pool(device, execbuf, &device->instruction_state_pool);
   if (result != VK_SUCCESS)
      return result;

   result = pin_state_pool(device, execbuf, &device->binding_table_pool);
   if (result != VK_SUCCESS)
      return result;

   if (device->physical->va.aux_tt_pool.size > 0) {
      result = pin_state_pool(device, execbuf, &device->aux_tt_pool);
      if (result != VK_SUCCESS)
         return result;
   }

   if (device->physical->va.push_descriptor_buffer_pool.size > 0) {
      result = pin_state_pool(device, execbuf, &device->push_descriptor_buffer_pool);
      if (result != VK_SUCCESS)
         return result;
   }

   /* Add the BOs for all user allocated memory objects because we can't
    * track after binding updates of VK_EXT_descriptor_indexing and due to how
    * sparse resources work.
    */
   list_for_each_entry(struct anv_device_memory, mem,
                       &device->memory_objects, link) {
      result = anv_execbuf_add_bo(device, execbuf, mem->bo, NULL, 0);
      if (result != VK_SUCCESS)
         return result;
   }

   result = anv_execbuf_add_trtt_bos(device, execbuf);
   if (result != VK_SUCCESS)
      return result;

   /* Add all the private BOs from images because we can't track after binding
    * updates of VK_EXT_descriptor_indexing.
    */
   list_for_each_entry(struct anv_image, image,
                       &device->image_private_objects, link) {
      struct anv_bo *private_bo =
         image->bindings[ANV_IMAGE_MEMORY_BINDING_PRIVATE].address.bo;
      result = anv_execbuf_add_bo(device, execbuf, private_bo, NULL, 0);
      if (result != VK_SUCCESS)
         return result;
   }

   struct list_head *batch_bo = &cmd_buffers[0]->batch_bos;
   struct anv_batch_bo *first_batch_bo =
      list_first_entry(batch_bo, struct anv_batch_bo, link);

   /* The kernel requires that the last entry in the validation list be the
    * batch buffer to execute.  We can simply swap the element
    * corresponding to the first batch_bo in the chain with the last
    * element in the list.
    */
   if (first_batch_bo->bo->exec_obj_index != execbuf->bo_count - 1) {
      uint32_t idx = first_batch_bo->bo->exec_obj_index;
      uint32_t last_idx = execbuf->bo_count - 1;

      struct drm_i915_gem_exec_object2 tmp_obj = execbuf->objects[idx];
      assert(execbuf->bos[idx] == first_batch_bo->bo);

      execbuf->objects[idx] = execbuf->objects[last_idx];
      execbuf->bos[idx] = execbuf->bos[last_idx];
      execbuf->bos[idx]->exec_obj_index = idx;

      execbuf->objects[last_idx] = tmp_obj;
      execbuf->bos[last_idx] = first_batch_bo->bo;
      first_batch_bo->bo->exec_obj_index = last_idx;
   }

#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
   if (device->physical->memory.need_flush &&
       anv_bo_needs_host_cache_flush(device->batch_bo_pool.bo_alloc_flags))
      anv_cmd_buffer_clflush(cmd_buffers, num_cmd_buffers);
#endif

   assert(!cmd_buffers[0]->is_companion_rcs_cmd_buffer || device->physical->has_vm_control);
   uint64_t exec_flags = 0;
   uint32_t context_id;
   get_context_and_exec_flags(queue, cmd_buffers[0]->is_companion_rcs_cmd_buffer,
                              &exec_flags, &context_id);

   execbuf->execbuf = (struct drm_i915_gem_execbuffer2) {
      .buffers_ptr = (uintptr_t) execbuf->objects,
      .buffer_count = execbuf->bo_count,
      .batch_start_offset = 0,
      .batch_len = 0,
      .cliprects_ptr = 0,
      .num_cliprects = 0,
      .DR1 = 0,
      .DR4 = 0,
      .flags = I915_EXEC_NO_RELOC |
               I915_EXEC_HANDLE_LUT |
               exec_flags,
      .rsvd1 = context_id,
      .rsvd2 = 0,
   };

   return VK_SUCCESS;
}

static VkResult
setup_empty_execbuf(struct anv_execbuf *execbuf, struct anv_queue *queue)
{
   struct anv_device *device = queue->device;
   VkResult result = anv_execbuf_add_bo(device, execbuf,
                                        device->trivial_batch_bo,
                                        NULL, 0);
   if (result != VK_SUCCESS)
      return result;

   uint64_t exec_flags = 0;
   uint32_t context_id;
   get_context_and_exec_flags(queue, false, &exec_flags, &context_id);

   execbuf->execbuf = (struct drm_i915_gem_execbuffer2) {
      .buffers_ptr = (uintptr_t) execbuf->objects,
      .buffer_count = execbuf->bo_count,
      .batch_start_offset = 0,
      .batch_len = 8, /* GFX7_MI_BATCH_BUFFER_END and NOOP */
      .flags = I915_EXEC_HANDLE_LUT | exec_flags | I915_EXEC_NO_RELOC,
      .rsvd1 = context_id,
      .rsvd2 = 0,
   };

   return VK_SUCCESS;
}

static void
setup_execbuf_fence_params(struct anv_execbuf *execbuf)
{
   if (execbuf->syncobj_values) {
      execbuf->timeline_fences.fence_count = execbuf->syncobj_count;
      execbuf->timeline_fences.handles_ptr = (uintptr_t)execbuf->syncobjs;
      execbuf->timeline_fences.values_ptr = (uintptr_t)execbuf->syncobj_values;
      anv_execbuf_add_ext(execbuf,
                          DRM_I915_GEM_EXECBUFFER_EXT_TIMELINE_FENCES,
                          &execbuf->timeline_fences.base);
   } else if (execbuf->syncobjs) {
      execbuf->execbuf.flags |= I915_EXEC_FENCE_ARRAY;
      execbuf->execbuf.num_cliprects = execbuf->syncobj_count;
      execbuf->execbuf.cliprects_ptr = (uintptr_t)execbuf->syncobjs;
   }
}

static VkResult
setup_async_execbuf(struct anv_execbuf *execbuf,
                    struct anv_async_submit *submit,
                    uint32_t wait_count,
                    const struct vk_sync_wait *waits,
                    uint32_t signal_count,
                    const struct vk_sync_signal *signals)
{
   struct anv_queue *queue = submit->queue;
   struct anv_device *device = queue->device;

   /* Always add the workaround BO as it includes a driver identifier for the
    * error_state.
    */
   VkResult result = anv_execbuf_add_bo(device, execbuf,
                                        device->workaround_bo,
                                        NULL, 0);
   if (result != VK_SUCCESS)
      return result;

   util_dynarray_foreach(&submit->batch_bos, struct anv_bo *, _bo) {
      struct anv_bo *bo = *_bo;

      result = anv_execbuf_add_bo(device, execbuf, bo,
                                  &submit->relocs, 0);
      if (result != VK_SUCCESS)
         return result;

#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
      if (device->physical->memory.need_flush &&
          anv_bo_needs_host_cache_flush(bo->alloc_flags))
         intel_flush_range(bo->map, bo->size);
#endif
   }

   for (uint32_t i = 0; i < wait_count; i++) {
      result = anv_execbuf_add_sync(device, execbuf,
                                    waits[i].sync,
                                    false /* is_signal */,
                                    waits[i].wait_value);
      if (result != VK_SUCCESS)
         return result;
   }
   for (uint32_t i = 0; i < signal_count; i++) {
      result = anv_execbuf_add_sync(device, execbuf,
                                    signals[i].sync,
                                    true /* is_signal */,
                                    signals[i].signal_value);
      if (result != VK_SUCCESS)
         return result;
   }
   if (submit->signal.sync) {
      result = anv_execbuf_add_sync(device, execbuf,
                                    submit->signal.sync,
                                    true /* is_signal */,
                                    submit->signal.signal_value);
      if (result != VK_SUCCESS)
         return result;
   }
   if (queue->sync) {
      result = anv_execbuf_add_sync(device, execbuf,
                                    queue->sync,
                                    true /* is_signal */,
                                    0 /* signal_value */);
      if (result != VK_SUCCESS)
         return result;
   }

   struct anv_bo *batch_bo =
      *util_dynarray_element(&submit->batch_bos, struct anv_bo *, 0);
   if (batch_bo->exec_obj_index != execbuf->bo_count - 1) {
      uint32_t idx = batch_bo->exec_obj_index;
      uint32_t last_idx = execbuf->bo_count - 1;

      struct drm_i915_gem_exec_object2 tmp_obj = execbuf->objects[idx];
      assert(execbuf->bos[idx] == batch_bo);

      execbuf->objects[idx] = execbuf->objects[last_idx];
      execbuf->bos[idx] = execbuf->bos[last_idx];
      execbuf->bos[idx]->exec_obj_index = idx;

      execbuf->objects[last_idx] = tmp_obj;
      execbuf->bos[last_idx] = batch_bo;
      batch_bo->exec_obj_index = last_idx;
   }

   uint64_t exec_flags = 0;
   uint32_t context_id;
   get_context_and_exec_flags(queue, submit->use_companion_rcs,
                              &exec_flags, &context_id);

   execbuf->execbuf = (struct drm_i915_gem_execbuffer2) {
      .buffers_ptr = (uintptr_t) execbuf->objects,
      .buffer_count = execbuf->bo_count,
      .batch_start_offset = 0,
      .flags = I915_EXEC_NO_RELOC |
               I915_EXEC_HANDLE_LUT |
               exec_flags,
      .rsvd1 = context_id,
      .rsvd2 = 0,
   };

   setup_execbuf_fence_params(execbuf);

   return VK_SUCCESS;
}

static int
anv_gem_execbuffer(struct anv_device *device,
                   struct drm_i915_gem_execbuffer2 *execbuf)
{
   int ret;
   const unsigned long request = (execbuf->flags & I915_EXEC_FENCE_OUT) ?
      DRM_IOCTL_I915_GEM_EXECBUFFER2_WR :
      DRM_IOCTL_I915_GEM_EXECBUFFER2;

   do {
      ret = intel_ioctl(device->fd, request, execbuf);
   } while (ret && errno == ENOMEM);

   return ret;
}

static void
anv_i915_debug_submit(const struct anv_execbuf *execbuf)
{
   uint32_t total_size_kb = 0, total_vram_only_size_kb = 0;
   for (uint32_t i = 0; i < execbuf->bo_count; i++) {
      const struct anv_bo *bo = execbuf->bos[i];
      total_size_kb += bo->size / 1024;
      if (anv_bo_is_vram_only(bo))
         total_vram_only_size_kb += bo->size / 1024;
   }

   fprintf(stderr, "Batch offset=0x%x len=0x%x on queue 0 (aperture: %.1fMb, %.1fMb VRAM only)\n",
           execbuf->execbuf.batch_start_offset, execbuf->execbuf.batch_len,
           (float)total_size_kb / 1024.0f,
           (float)total_vram_only_size_kb / 1024.0f);
   for (uint32_t i = 0; i < execbuf->bo_count; i++) {
      const struct anv_bo *bo = execbuf->bos[i];

      fprintf(stderr, "   BO: addr=0x%016"PRIx64"-0x%016"PRIx64" size=%7"PRIu64
              "KB handle=%05u capture=%u vram_only=%u name=%s\n",
              bo->offset, bo->offset + bo->size - 1, bo->size / 1024,
              bo->gem_handle, (bo->flags & EXEC_OBJECT_CAPTURE) != 0,
              anv_bo_is_vram_only(bo), bo->name);
   }
}

VkResult
i915_queue_exec_async(struct anv_async_submit *submit,
                      uint32_t wait_count,
                      const struct vk_sync_wait *waits,
                      uint32_t signal_count,
                      const struct vk_sync_signal *signals)
{
   assert(util_dynarray_num_elements(&submit->batch_bos,
                                     struct anv_bo *) > 0);

   struct anv_queue *queue = submit->queue;
   struct anv_device *device = queue->device;
   struct anv_execbuf execbuf = {
      .alloc = &device->vk.alloc,
      .alloc_scope = VK_SYSTEM_ALLOCATION_SCOPE_DEVICE,
   };

   VkResult result = setup_async_execbuf(&execbuf, submit,
                                         wait_count, waits,
                                         signal_count, signals);
   if (result != VK_SUCCESS)
      goto error;

   if (INTEL_DEBUG(DEBUG_SUBMIT))
      anv_i915_debug_submit(&execbuf);

   ANV_RMV(bos_gtt_map, device, execbuf.bos, execbuf.bo_count);

   int ret = queue->device->info->no_hw ? 0 :
      anv_gem_execbuffer(queue->device, &execbuf.execbuf);
   if (ret)
      result = vk_queue_set_lost(&queue->vk, "execbuf2 failed: %m");

   result = anv_queue_post_submit(queue, result);

 error:
   anv_execbuf_finish(&execbuf);

   return result;
}

static VkResult
i915_companion_rcs_queue_exec_locked(struct anv_queue *queue,
                                     struct anv_cmd_buffer *companion_rcs_cmd_buffer,
                                     uint32_t wait_count,
                                     const struct vk_sync_wait *waits)
{
   struct anv_device *device = queue->device;
   struct anv_execbuf execbuf = {
      .alloc = &queue->device->vk.alloc,
      .alloc_scope = VK_SYSTEM_ALLOCATION_SCOPE_DEVICE,
   };

   /* Always add the workaround BO as it includes a driver identifier for the
    * error_state.
    */
   VkResult result =
      anv_execbuf_add_bo(device, &execbuf, device->workaround_bo, NULL, 0);
   if (result != VK_SUCCESS)
      goto error;

   for (uint32_t i = 0; i < wait_count; i++) {
      result = anv_execbuf_add_sync(device, &execbuf,
                                    waits[i].sync,
                                    false /* is_signal */,
                                    waits[i].wait_value);
      if (result != VK_SUCCESS)
         goto error;
   }

   if (queue->companion_sync) {
      result = anv_execbuf_add_sync(device, &execbuf,
                                    queue->companion_sync,
                                    true /* is_signal */, 0);
      if (result != VK_SUCCESS)
         goto error;
   }

   result = setup_execbuf_for_cmd_buffers(&execbuf, queue,
                                          &companion_rcs_cmd_buffer, 1);
   if (result != VK_SUCCESS)
      goto error;

   if (INTEL_DEBUG(DEBUG_SUBMIT))
      anv_i915_debug_submit(&execbuf);

   anv_cmd_buffer_exec_batch_debug(queue, 1, &companion_rcs_cmd_buffer, NULL, 0);

   setup_execbuf_fence_params(&execbuf);

   ANV_RMV(bos_gtt_map, device, execbuf.bos, execbuf.bo_count);

   int ret = queue->device->info->no_hw ? 0 :
      anv_gem_execbuffer(queue->device, &execbuf.execbuf);
   if (ret) {
      anv_i915_debug_submit(&execbuf);
      result = vk_queue_set_lost(&queue->vk, "execbuf2 failed: %m");
   }

 error:
   anv_execbuf_finish(&execbuf);
   return result;
}

VkResult
i915_queue_exec_locked(struct anv_queue *queue,
                       uint32_t wait_count,
                       const struct vk_sync_wait *waits,
                       uint32_t cmd_buffer_count,
                       struct anv_cmd_buffer **cmd_buffers,
                       uint32_t signal_count,
                       const struct vk_sync_signal *signals,
                       struct anv_query_pool *perf_query_pool,
                       uint32_t perf_query_pass,
                       struct anv_utrace_submit *utrace_submit)
{
   struct anv_device *device = queue->device;
   struct anv_execbuf execbuf = {
      .alloc = &queue->device->vk.alloc,
      .alloc_scope = VK_SYSTEM_ALLOCATION_SCOPE_DEVICE,
   };
   VkResult result;

   /* If there is a utrace submission but no batch, it means there are no
    * commands to run for utrace. But we still have to signal the associated
    * syncs, so add them to the submission.
    */
   if (utrace_submit &&
       util_dynarray_num_elements(&utrace_submit->base.batch_bos,
                                  struct anv_bo *) == 0) {
      result = anv_execbuf_add_sync(device, &execbuf,
                                    utrace_submit->base.signal.sync,
                                    true /* is_signal */,
                                    utrace_submit->base.signal.signal_value);
      if (result != VK_SUCCESS)
         goto error;

      /* Avoid doing a submission after the application's batch since there
       * are no commands.
       */
      utrace_submit = NULL;
   }

   /* Always add the workaround BO as it includes a driver identifier for the
    * error_state.
    */
   result =
      anv_execbuf_add_bo(device, &execbuf, device->workaround_bo, NULL, 0);
   if (result != VK_SUCCESS)
      goto error;

   if (device->printf.bo) {
      result =
         anv_execbuf_add_bo(device, &execbuf, device->printf.bo, NULL, 0);
      if (result != VK_SUCCESS)
         goto error;
   }

   for (uint32_t i = 0; i < wait_count; i++) {
      result = anv_execbuf_add_sync(device, &execbuf,
                                    waits[i].sync,
                                    false /* is_signal */,
                                    waits[i].wait_value);
      if (result != VK_SUCCESS)
         goto error;
   }

   for (uint32_t i = 0; i < signal_count; i++) {
      result = anv_execbuf_add_sync(device, &execbuf,
                                    signals[i].sync,
                                    true /* is_signal */,
                                    signals[i].signal_value);
      if (result != VK_SUCCESS)
         goto error;
   }

   if (queue->sync) {
      result = anv_execbuf_add_sync(device, &execbuf,
                                    queue->sync,
                                    true /* is_signal */,
                                    0 /* signal_value */);
      if (result != VK_SUCCESS)
         goto error;
   }

   if (cmd_buffer_count) {
      result = setup_execbuf_for_cmd_buffers(&execbuf, queue, cmd_buffers,
                                             cmd_buffer_count);
   } else {
      result = setup_empty_execbuf(&execbuf, queue);
   }

   if (result != VK_SUCCESS)
      goto error;

   const bool has_perf_query = perf_query_pool && cmd_buffer_count;

   if (INTEL_DEBUG(DEBUG_SUBMIT))
      anv_i915_debug_submit(&execbuf);

   anv_cmd_buffer_exec_batch_debug(queue, cmd_buffer_count, cmd_buffers,
                                   perf_query_pool, perf_query_pass);

   setup_execbuf_fence_params(&execbuf);

   if (has_perf_query) {
      assert(perf_query_pass < perf_query_pool->n_passes);
      struct intel_perf_query_info *query_info =
         perf_query_pool->pass_query[perf_query_pass];

      /* Some performance queries just the pipeline statistic HW, no need for
       * OA in that case, so no need to reconfigure.
       */
      if (!INTEL_DEBUG(DEBUG_NO_OACONFIG) &&
          (query_info->kind == INTEL_PERF_QUERY_TYPE_OA ||
           query_info->kind == INTEL_PERF_QUERY_TYPE_RAW)) {
         int ret = intel_perf_stream_set_metrics_id(device->physical->perf,
                                                    device->perf_fd,
                                                    query_info->oa_metrics_set_id);
         if (ret < 0) {
            result = vk_device_set_lost(&device->vk,
                                        "i915-perf config failed: %s",
                                        strerror(errno));
         }
      }

      struct anv_bo *pass_batch_bo = perf_query_pool->bo;

      struct drm_i915_gem_exec_object2 query_pass_object = {
         .handle = pass_batch_bo->gem_handle,
         .offset = pass_batch_bo->offset,
         .flags  = pass_batch_bo->flags,
      };

      uint64_t exec_flags = 0;
      uint32_t context_id;
      get_context_and_exec_flags(queue, false, &exec_flags, &context_id);

      struct drm_i915_gem_execbuffer2 query_pass_execbuf = {
         .buffers_ptr = (uintptr_t) &query_pass_object,
         .buffer_count = 1,
         .batch_start_offset = khr_perf_query_preamble_offset(perf_query_pool,
                                                              perf_query_pass),
         .flags = I915_EXEC_HANDLE_LUT | exec_flags,
         .rsvd1 = context_id,
      };

      int ret = queue->device->info->no_hw ? 0 :
         anv_gem_execbuffer(queue->device, &query_pass_execbuf);
      if (ret)
         result = vk_queue_set_lost(&queue->vk, "execbuf2 failed: %m");
   }

   ANV_RMV(bos_gtt_map, device, execbuf.bos, execbuf.bo_count);

   int ret = queue->device->info->no_hw ? 0 :
      anv_gem_execbuffer(queue->device, &execbuf.execbuf);
   if (ret) {
      anv_i915_debug_submit(&execbuf);
      result = vk_queue_set_lost(&queue->vk, "execbuf2 failed: %m");
   }

   if (cmd_buffer_count != 0 && cmd_buffers[0]->companion_rcs_cmd_buffer) {
      struct anv_cmd_buffer *companion_rcs_cmd_buffer =
         cmd_buffers[0]->companion_rcs_cmd_buffer;
      assert(companion_rcs_cmd_buffer->is_companion_rcs_cmd_buffer);
      assert(cmd_buffer_count == 1);
      result = i915_companion_rcs_queue_exec_locked(queue,
                                                    cmd_buffers[0]->companion_rcs_cmd_buffer, wait_count,
                                                    waits);
   }

   result = anv_queue_post_submit(queue, result);

 error:
   anv_execbuf_finish(&execbuf);

   if (result == VK_SUCCESS && utrace_submit) {
      struct vk_sync_signal signal = {
         .sync = utrace_submit->base.signal.sync,
         .signal_value = utrace_submit->base.signal.signal_value,
      };
      result = i915_queue_exec_async(&utrace_submit->base, 0, NULL, 1, &signal);
   }

   return result;
}
