/*
 * Copyright © 2021 Collabora Ltd.
 *
 * Derived from tu_image.c which is:
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 * Copyright © 2015 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 */

#include "vk_cmd_enqueue_entrypoints.h"
#include "vk_common_entrypoints.h"

#include "panvk_buffer.h"
#include "panvk_cmd_alloc.h"
#include "panvk_cmd_buffer.h"
#include "panvk_device.h"
#include "panvk_entrypoints.h"
#include "panvk_instance.h"
#include "panvk_macros.h"
#include "panvk_physical_device.h"
#include "panvk_priv_bo.h"
#include "panvk_queue.h"

#include "genxml/decode.h"
#include "genxml/gen_macros.h"

#include "kmod/pan_kmod.h"
#include "pan_props.h"
#include "pan_samples.h"

static void *
panvk_kmod_zalloc(const struct pan_kmod_allocator *allocator, size_t size,
                  bool transient)
{
   const VkAllocationCallbacks *vkalloc = allocator->priv;

   void *obj = vk_zalloc(vkalloc, size, 8,
                         transient ? VK_SYSTEM_ALLOCATION_SCOPE_COMMAND
                                   : VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);

   /* We force errno to -ENOMEM on host allocation failures so we can properly
    * report it back as VK_ERROR_OUT_OF_HOST_MEMORY. */
   errno = obj ? 0 : -ENOMEM;

   return obj;
}

/* pan_kmod allocator hook: release memory through the VkAllocationCallbacks
 * stored in allocator->priv.
 */
static void
panvk_kmod_free(const struct pan_kmod_allocator *allocator, void *data)
{
   const VkAllocationCallbacks *vkalloc = allocator->priv;

   /* Plain call: a void function must not return an expression in ISO C. */
   vk_free(vkalloc, data);
}

static void
panvk_device_init_mempools(struct panvk_device *dev)
{
   struct panvk_pool_properties rw_pool_props = {
      .create_flags = 0,
      .slab_size = 16 * 1024,
      .label = "Device RW cached memory pool",
      .owns_bos = false,
      .needs_locking = true,
      .prealloc = false,
   };

   panvk_pool_init(&dev->mempools.rw, dev, NULL, &rw_pool_props);

   struct panvk_pool_properties rw_nc_pool_props = {
      .create_flags = PAN_KMOD_BO_FLAG_GPU_UNCACHED,
      .slab_size = 16 * 1024,
      .label = "Device RW uncached memory pool",
      .owns_bos = false,
      .needs_locking = true,
      .prealloc = false,
   };

   panvk_pool_init(&dev->mempools.rw_nc, dev, NULL, &rw_nc_pool_props);

   struct panvk_pool_properties exec_pool_props = {
      .create_flags = PAN_KMOD_BO_FLAG_EXECUTABLE,
      .slab_size = 16 * 1024,
      .label = "Device executable memory pool (shaders)",
      .owns_bos = false,
      .needs_locking = true,
      .prealloc = false,
   };

   panvk_pool_init(&dev->mempools.exec, dev, NULL, &exec_pool_props);
}

/* Tear down every device-wide memory pool set up by
 * panvk_device_init_mempools(): the cached RW pool, the uncached RW pool and
 * the executable pool.
 */
static void
panvk_device_cleanup_mempools(struct panvk_device *dev)
{
   panvk_pool_cleanup(&dev->mempools.rw);
   /* The uncached pool is initialized alongside the other two and must be
    * released as well, otherwise it is leaked on device destruction. */
   panvk_pool_cleanup(&dev->mempools.rw_nc);
   panvk_pool_cleanup(&dev->mempools.exec);
}

/* vk_meta cmd_bind_map_buffer hook.
 *
 * Allocates buffer->vk.size bytes (64-byte aligned) from the command
 * buffer's "desc" allocation, points the buffer's device address at it and
 * hands the matching CPU pointer back through map_out.
 *
 * The meta argument is not used.
 *
 * Returns VK_ERROR_OUT_OF_DEVICE_MEMORY if the allocation has no GPU
 * address, VK_SUCCESS otherwise.
 */
static VkResult
panvk_meta_cmd_bind_map_buffer(struct vk_command_buffer *cmd,
                               struct vk_meta_device *meta, VkBuffer buf,
                               void **map_out)
{
   VK_FROM_HANDLE(panvk_buffer, target, buf);
   struct panvk_cmd_buffer *panvk_cmd =
      container_of(cmd, struct panvk_cmd_buffer, vk);

   struct panfrost_ptr backing =
      panvk_cmd_alloc_dev_mem(panvk_cmd, desc, target->vk.size, 64);

   if (backing.gpu) {
      target->dev_addr = backing.gpu;
      *map_out = backing.cpu;
      return VK_SUCCESS;
   }

   return VK_ERROR_OUT_OF_DEVICE_MEMORY;
}

/* Initialize the vk_meta state embedded in the device.
 *
 * Returns the vk_meta_device_init() error if it fails, VK_SUCCESS otherwise.
 */
static VkResult
panvk_meta_init(struct panvk_device *device)
{
   const struct vk_physical_device *pdev = device->vk.physical;
   VkResult result;

   result = vk_meta_device_init(&device->vk, &device->meta);
   if (result != VK_SUCCESS)
      return result;

   device->meta.cmd_bind_map_buffer = panvk_meta_cmd_bind_map_buffer;
   device->meta.max_bind_map_buffer_size_B = 64 * 1024;
   device->meta.use_stencil_export = true;

   /* Assume a maximum of 1024 bytes per workgroup and choose the workgroup
    * size accordingly: entry i gets 1024 >> i, capped by the X component of
    * maxComputeWorkGroupSize. */
   const uint32_t max_wg_size_x = pdev->properties.maxComputeWorkGroupSize[0];
   uint32_t idx = 0;

   while (idx < ARRAY_SIZE(device->meta.buffer_access.optimal_wg_size)) {
      device->meta.buffer_access.optimal_wg_size[idx] =
         MIN2(1024 >> idx, max_wg_size_x);
      idx++;
   }

#if PAN_ARCH <= 7
   /* Descriptor-copy meta state is only set up on arch <= 7. */
   panvk_per_arch(meta_desc_copy_init)(device);
#endif

   return VK_SUCCESS;
}

/* Undo panvk_meta_init(): drop the arch <= 7 descriptor-copy state first,
 * then finish the common vk_meta device state.
 */
static void
panvk_meta_cleanup(struct panvk_device *device)
{
#if PAN_ARCH <= 7
   /* Counterpart of the meta_desc_copy_init() call made at init time. */
   panvk_per_arch(meta_desc_copy_cleanup)(device);
#endif

   /* Counterpart of vk_meta_device_init(). */
   vk_meta_device_finish(&device->vk, &device->meta);
}

static void
panvk_preload_blitter_init(struct panvk_device *device)
{
   const struct panvk_physical_device *physical_device =
      to_panvk_physical_device(device->vk.physical);

   struct panvk_pool_properties bin_pool_props = {
      .create_flags = PAN_KMOD_BO_FLAG_EXECUTABLE,
      .slab_size = 16 * 1024,
      .label = "panvk_meta blitter binary pool",
      .owns_bos = true,
      .needs_locking = false,
      .prealloc = false,
   };
   panvk_pool_init(&device->blitter.bin_pool, device, NULL, &bin_pool_props);

   struct panvk_pool_properties desc_pool_props = {
      .create_flags = 0,
      .slab_size = 16 * 1024,
      .label = "panvk_meta blitter descriptor pool",
      .owns_bos = true,
      .needs_locking = false,
      .prealloc = false,
   };
   panvk_pool_init(&device->blitter.desc_pool, device, NULL, &desc_pool_props);

   pan_blend_shader_cache_init(&device->blitter.blend_shader_cache,
                               physical_device->kmod.props.gpu_prod_id);
   GENX(pan_blitter_cache_init)
   (&device->blitter.cache, physical_device->kmod.props.gpu_prod_id,
    &device->blitter.blend_shader_cache, &device->blitter.bin_pool.base,
    &device->blitter.desc_pool.base);
}

/* Undo panvk_preload_blitter_init(), releasing things in the reverse order
 * of their creation: blitter cache, blend shader cache, then the descriptor
 * and binary pools.
 */
static void
panvk_preload_blitter_cleanup(struct panvk_device *device)
{
   /* Caches go first, they were created on top of the pools. */
   GENX(pan_blitter_cache_cleanup)(&device->blitter.cache);
   pan_blend_shader_cache_cleanup(&device->blitter.blend_shader_cache);

   /* Then the pools themselves. */
   panvk_pool_cleanup(&device->blitter.desc_pool);
   panvk_pool_cleanup(&device->blitter.bin_pool);
}

/* Always reserve the lower 32MB (0x2000000 bytes) of the GPU virtual address
 * space. create_device() uses this value, after clamping it to the usable VA
 * range, as the start of the user VA range. */
#define PANVK_VA_RESERVE_BOTTOM 0x2000000ull

/* Create a panvk logical device.
 *
 * physical_device: physical device the logical device is created on.
 * pCreateInfo:     Vulkan device creation info; its queue create infos drive
 *                  the per-family queue arrays allocated here.
 * pAllocator:      optional allocation callbacks, used for the device object
 *                  and forwarded to vk_device_init().
 * pDevice:         receives the new device handle on success only.
 *
 * Returns VK_SUCCESS, or the error of the first step that failed. On failure
 * everything set up so far is released through the goto cleanup chain at the
 * end of the function, which unwinds in reverse order of initialization.
 */
VkResult
panvk_per_arch(create_device)(struct panvk_physical_device *physical_device,
                              const VkDeviceCreateInfo *pCreateInfo,
                              const VkAllocationCallbacks *pAllocator,
                              VkDevice *pDevice)
{
   struct panvk_instance *instance =
      to_panvk_instance(physical_device->vk.instance);
   VkResult result;
   struct panvk_device *device;

   /* Zero-initialized, so every field not explicitly set below (queues,
    * queue counts, debug.decode_ctx, private BOs, ...) starts out as
    * 0/NULL. */
   device = vk_zalloc2(&instance->vk.alloc, pAllocator, sizeof(*device), 8,
                       VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
   if (!device)
      return vk_error(physical_device, VK_ERROR_OUT_OF_HOST_MEMORY);

   struct vk_device_dispatch_table dispatch_table;

   /* For secondary command buffer support, overwrite any command entrypoints
    * in the main device-level dispatch table with
    * vk_cmd_enqueue_unless_primary_Cmd*.
    */
   vk_device_dispatch_table_from_entrypoints(
      &dispatch_table, &vk_cmd_enqueue_unless_primary_device_entrypoints, true);

   vk_device_dispatch_table_from_entrypoints(
      &dispatch_table, &panvk_per_arch(device_entrypoints), false);
   vk_device_dispatch_table_from_entrypoints(&dispatch_table,
                                             &panvk_device_entrypoints, false);
   vk_device_dispatch_table_from_entrypoints(&dispatch_table,
                                             &wsi_device_entrypoints, false);

   /* Populate our primary cmd_dispatch table. */
   vk_device_dispatch_table_from_entrypoints(
      &device->cmd_dispatch, &panvk_per_arch(device_entrypoints), true);
   vk_device_dispatch_table_from_entrypoints(&device->cmd_dispatch,
                                             &panvk_device_entrypoints, false);
   vk_device_dispatch_table_from_entrypoints(
      &device->cmd_dispatch, &vk_common_device_entrypoints, false);

   result = vk_device_init(&device->vk, &physical_device->vk, &dispatch_table,
                           pCreateInfo, pAllocator);
   if (result != VK_SUCCESS)
      goto err_free_dev;

   /* Must be done after vk_device_init() because this function memset(0) the
    * whole struct.
    */
   device->vk.command_dispatch_table = &device->cmd_dispatch;
   device->vk.command_buffer_ops = &panvk_per_arch(cmd_buffer_ops);
   device->vk.shader_ops = &panvk_per_arch(device_shader_ops);
   device->vk.submit_mode = VK_QUEUE_SUBMIT_MODE_THREADED_ON_DEMAND;

   /* Route kmod host allocations through the device's Vulkan allocator. */
   device->kmod.allocator = (struct pan_kmod_allocator){
      .zalloc = panvk_kmod_zalloc,
      .free = panvk_kmod_free,
      .priv = &device->vk.alloc,
   };

   /* The kmod device gets its own duplicate of the physical device FD and
    * is created with the OWNS_FD flag. */
   device->kmod.dev =
      pan_kmod_dev_create(dup(physical_device->kmod.dev->fd),
                          PAN_KMOD_DEV_FLAG_OWNS_FD, &device->kmod.allocator);

   if (!device->kmod.dev) {
      result = vk_errorf(instance, panvk_errno_to_vk_error(), "cannot create device");
      goto err_finish_dev;
   }

   /* Only created when one of the trace/sync/dump debug flags is set;
    * stays NULL otherwise. */
   if (instance->debug_flags &
       (PANVK_DEBUG_TRACE | PANVK_DEBUG_SYNC | PANVK_DEBUG_DUMP))
      device->debug.decode_ctx = pandecode_create_context(false);

   /* 32bit address space, with the lower 32MB reserved. We clamp
    * things so it matches kmod VA range limitations.
    */
   uint64_t user_va_start = panfrost_clamp_to_usable_va_range(
      device->kmod.dev, PANVK_VA_RESERVE_BOTTOM);
   uint64_t user_va_end =
      panfrost_clamp_to_usable_va_range(device->kmod.dev, 1ull << 32);
   uint32_t vm_flags = PAN_ARCH <= 7 ? PAN_KMOD_VM_FLAG_AUTO_VA : 0;

   util_vma_heap_init(&device->as.heap, user_va_start,
                      user_va_end - user_va_start);

   device->kmod.vm =
      pan_kmod_vm_create(device->kmod.dev, vm_flags,
                         user_va_start, user_va_end - user_va_start);

   if (!device->kmod.vm) {
      result = vk_error(physical_device, VK_ERROR_OUT_OF_HOST_MEMORY);
      /* The VA heap and (possibly) the decode context already exist at this
       * point and must be released too. */
      goto err_finish_heap;
   }

   panvk_device_init_mempools(device);

#if PAN_ARCH <= 9
   device->tiler_heap = panvk_priv_bo_create(
      device, 128 * 1024 * 1024,
      PAN_KMOD_BO_FLAG_NO_MMAP | PAN_KMOD_BO_FLAG_ALLOC_ON_FAULT,
      VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);

   if (!device->tiler_heap) {
      result = vk_error(physical_device, VK_ERROR_OUT_OF_HOST_MEMORY);
      goto err_free_priv_bos;
   }
#endif

   device->sample_positions =
      panvk_priv_bo_create(device, panfrost_sample_positions_buffer_size(), 0,
                           VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);

   if (!device->sample_positions) {
      result = vk_error(physical_device, VK_ERROR_OUT_OF_HOST_MEMORY);
      goto err_free_priv_bos;
   }

   panfrost_upload_sample_positions(device->sample_positions->addr.host);

   vk_device_set_drm_fd(&device->vk, device->kmod.dev->fd);

   result = panvk_per_arch(blend_shader_cache_init)(device);

   if (result != VK_SUCCESS)
      goto err_free_priv_bos;

   panvk_preload_blitter_init(device);

   result = panvk_meta_init(device);
   if (result != VK_SUCCESS)
      goto err_cleanup_blitter;

   /* One zero-filled queue array per queue create info, indexed by queue
    * family. */
   for (unsigned i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
      const VkDeviceQueueCreateInfo *queue_create =
         &pCreateInfo->pQueueCreateInfos[i];
      uint32_t qfi = queue_create->queueFamilyIndex;
      device->queues[qfi] =
         vk_alloc(&device->vk.alloc,
                  queue_create->queueCount * sizeof(struct panvk_queue), 8,
                  VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
      if (!device->queues[qfi]) {
         result = VK_ERROR_OUT_OF_HOST_MEMORY;
         goto err_finish_queues;
      }

      memset(device->queues[qfi], 0,
             queue_create->queueCount * sizeof(struct panvk_queue));

      /* NOTE(review): the count is published before any queue of this family
       * is initialized, so if queue_init() fails part-way the error path
       * below calls queue_finish() on zero-filled entries as well - confirm
       * queue_finish() tolerates that. */
      device->queue_count[qfi] = queue_create->queueCount;

      for (unsigned q = 0; q < queue_create->queueCount; q++) {
         result = panvk_per_arch(queue_init)(device, &device->queues[qfi][q], q,
                                             queue_create);
         if (result != VK_SUCCESS)
            goto err_finish_queues;
      }
   }

   *pDevice = panvk_device_to_handle(device);
   return VK_SUCCESS;

   /* Error unwinding: each label releases what was set up after the previous
    * label's resources, then falls through to the next one. */
err_finish_queues:
   for (unsigned i = 0; i < PANVK_MAX_QUEUE_FAMILIES; i++) {
      for (unsigned q = 0; q < device->queue_count[i]; q++)
         panvk_per_arch(queue_finish)(&device->queues[i][q]);
      if (device->queue_count[i])
         vk_object_free(&device->vk, NULL, device->queues[i]);
   }

   panvk_meta_cleanup(device);

err_cleanup_blitter:
   panvk_preload_blitter_cleanup(device);
   panvk_per_arch(blend_shader_cache_cleanup)(device);

err_free_priv_bos:
   panvk_priv_bo_unref(device->sample_positions);
   panvk_priv_bo_unref(device->tiler_heap);
   panvk_device_cleanup_mempools(device);
   pan_kmod_vm_destroy(device->kmod.vm);

err_finish_heap:
   /* Reached both from the fall-through above and directly when VM creation
    * fails: the heap was initialized before the VM was created. */
   util_vma_heap_finish(&device->as.heap);

   /* Same teardown order as destroy_device(): the decode context goes away
    * right before the kmod device. */
   if (device->debug.decode_ctx)
      pandecode_destroy_context(device->debug.decode_ctx);

   pan_kmod_dev_destroy(device->kmod.dev);

err_finish_dev:
   vk_device_finish(&device->vk);

err_free_dev:
   vk_free(&device->vk.alloc, device);
   return result;
}

/* Destroy a device created by create_device(), releasing its resources in
 * the reverse order of their creation. A NULL device is a no-op.
 *
 * pAllocator is not referenced in this function.
 */
void
panvk_per_arch(destroy_device)(struct panvk_device *device,
                               const VkAllocationCallbacks *pAllocator)
{
   if (device == NULL)
      return;

   /* Queues: finish every queue of each populated family, then free the
    * per-family array. */
   for (unsigned family = 0; family < PANVK_MAX_QUEUE_FAMILIES; family++) {
      if (!device->queue_count[family])
         continue;

      for (unsigned idx = 0; idx < device->queue_count[family]; idx++)
         panvk_per_arch(queue_finish)(&device->queues[family][idx]);

      vk_object_free(&device->vk, NULL, device->queues[family]);
   }

   /* Meta, blitter and blend shader state. */
   panvk_meta_cleanup(device);
   panvk_preload_blitter_cleanup(device);
   panvk_per_arch(blend_shader_cache_cleanup)(device);

   /* Device-private BOs and memory pools. */
   panvk_priv_bo_unref(device->tiler_heap);
   panvk_priv_bo_unref(device->sample_positions);
   panvk_device_cleanup_mempools(device);

   /* GPU VM and the VA heap describing its address range. */
   pan_kmod_vm_destroy(device->kmod.vm);
   util_vma_heap_finish(&device->as.heap);

   /* The decode context only exists when a debug flag requested it. */
   if (device->debug.decode_ctx != NULL)
      pandecode_destroy_context(device->debug.decode_ctx);

   /* Finally the kmod device, the common Vulkan device state and the device
    * object itself. */
   pan_kmod_dev_destroy(device->kmod.dev);
   vk_device_finish(&device->vk);
   vk_free(&device->vk.alloc, device);
}

/* vkGetRenderAreaGranularity entrypoint: always reports a fixed 32x32
 * granularity. The device and renderPass arguments are not used.
 */
VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(GetRenderAreaGranularity)(VkDevice device,
                                         VkRenderPass renderPass,
                                         VkExtent2D *pGranularity)
{
   pGranularity->width = 32;
   pGranularity->height = 32;
}

/* vkGetRenderingAreaGranularityKHR entrypoint: always reports a fixed 32x32
 * granularity. The _device and pRenderingAreaInfo arguments are not used.
 */
VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(GetRenderingAreaGranularityKHR)(
   VkDevice _device, const VkRenderingAreaInfoKHR *pRenderingAreaInfo,
   VkExtent2D *pGranularity)
{
   pGranularity->width = 32;
   pGranularity->height = 32;
}
