/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 */

#ifndef RADV_DEVICE_H
#define RADV_DEVICE_H

#include "ac_descriptors.h"
#include "ac_spm.h"
#include "ac_sqtt.h"

#include "util/mesa-blake3.h"

#include "radv_pipeline.h"
#include "radv_printf.h"
#include "radv_queue.h"
#include "radv_radeon_winsys.h"
#include "radv_rra.h"
#include "radv_shader.h"

#include "vk_device.h"
#include "vk_texcompress_astc.h"
#include "vk_texcompress_etc2.h"

#define RADV_NUM_HW_CTX (RADEON_CTX_PRIORITY_REALTIME + 1)

struct radv_image_view;

enum radv_dispatch_table {
   RADV_DEVICE_DISPATCH_TABLE,
   RADV_ANNOTATE_DISPATCH_TABLE,
   RADV_APP_DISPATCH_TABLE,
   RADV_RGP_DISPATCH_TABLE,
   RADV_RRA_DISPATCH_TABLE,
   RADV_RMV_DISPATCH_TABLE,
   RADV_CTX_ROLL_DISPATCH_TABLE,
   RADV_DISPATCH_TABLE_COUNT,
};

struct radv_layer_dispatch_tables {
   struct vk_device_dispatch_table annotate;
   struct vk_device_dispatch_table app;
   struct vk_device_dispatch_table rgp;
   struct vk_device_dispatch_table rra;
   struct vk_device_dispatch_table rmv;
   struct vk_device_dispatch_table ctx_roll;
};

struct radv_device_cache_key {
   uint32_t disable_trunc_coord : 1;
   uint32_t image_2d_view_of_3d : 1;
   uint32_t mesh_shader_queries : 1;
   uint32_t primitives_generated_query : 1;
};

enum radv_force_vrs {
   RADV_FORCE_VRS_1x1 = 0,
   RADV_FORCE_VRS_2x2,
   RADV_FORCE_VRS_2x1,
   RADV_FORCE_VRS_1x2,
};

struct radv_notifier {
   int fd;
   int watch;
   bool quit;
   thrd_t thread;
};

struct radv_meta_state {
   VkAllocationCallbacks alloc;

   VkPipelineCache cache;
   uint32_t initial_cache_entries;

   /*
    * For on-demand pipeline creation, makes sure that
    * only one thread tries to build a pipeline at the same time.
    */
   mtx_t mtx;

   /**
    * Use array element `i` for images with `2^i` samples.
    */
   struct {
      VkPipeline color_pipelines[NUM_META_FS_KEYS];
   } color_clear[MAX_SAMPLES_LOG2][MAX_RTS];

   struct {
      VkPipeline depth_only_pipeline[NUM_DEPTH_CLEAR_PIPELINES];
      VkPipeline stencil_only_pipeline[NUM_DEPTH_CLEAR_PIPELINES];
      VkPipeline depthstencil_pipeline[NUM_DEPTH_CLEAR_PIPELINES];

      VkPipeline depth_only_unrestricted_pipeline[NUM_DEPTH_CLEAR_PIPELINES];
      VkPipeline stencil_only_unrestricted_pipeline[NUM_DEPTH_CLEAR_PIPELINES];
      VkPipeline depthstencil_unrestricted_pipeline[NUM_DEPTH_CLEAR_PIPELINES];
   } ds_clear[MAX_SAMPLES_LOG2];

   VkPipelineLayout clear_color_p_layout;
   VkPipelineLayout clear_depth_p_layout;
   VkPipelineLayout clear_depth_unrestricted_p_layout;

   /* Optimized compute fast HTILE clear for stencil or depth only. */
   VkPipeline clear_htile_mask_pipeline;
   VkPipelineLayout clear_htile_mask_p_layout;
   VkDescriptorSetLayout clear_htile_mask_ds_layout;

   /* Copy VRS into HTILE. */
   VkPipeline copy_vrs_htile_pipeline;
   VkPipelineLayout copy_vrs_htile_p_layout;
   VkDescriptorSetLayout copy_vrs_htile_ds_layout;

   /* Clear DCC with comp-to-single. */
   VkPipeline clear_dcc_comp_to_single_pipeline[2]; /* 0: 1x, 1: 2x/4x/8x */
   VkPipelineLayout clear_dcc_comp_to_single_p_layout;
   VkDescriptorSetLayout clear_dcc_comp_to_single_ds_layout;

   struct {
      /** Pipeline that blits from a 1D image. */
      VkPipeline pipeline_1d_src[NUM_META_FS_KEYS];

      /** Pipeline that blits from a 2D image. */
      VkPipeline pipeline_2d_src[NUM_META_FS_KEYS];

      /** Pipeline that blits from a 3D image. */
      VkPipeline pipeline_3d_src[NUM_META_FS_KEYS];

      VkPipeline depth_only_1d_pipeline;
      VkPipeline depth_only_2d_pipeline;
      VkPipeline depth_only_3d_pipeline;

      VkPipeline stencil_only_1d_pipeline;
      VkPipeline stencil_only_2d_pipeline;
      VkPipeline stencil_only_3d_pipeline;
      VkPipelineLayout pipeline_layout;
      VkDescriptorSetLayout ds_layout;
   } blit;

   struct {
      VkPipelineLayout p_layouts[5];
      VkDescriptorSetLayout ds_layouts[5];
      VkPipeline pipelines[5][NUM_META_FS_KEYS];

      VkPipeline depth_only_pipeline[5];

      VkPipeline stencil_only_pipeline[5];
   } blit2d[MAX_SAMPLES_LOG2];

   struct {
      VkPipelineLayout img_p_layout;
      VkDescriptorSetLayout img_ds_layout;
      VkPipeline pipeline;
      VkPipeline pipeline_3d;
   } itob;
   struct {
      VkPipelineLayout img_p_layout;
      VkDescriptorSetLayout img_ds_layout;
      VkPipeline pipeline;
      VkPipeline pipeline_3d;
   } btoi;
   struct {
      VkPipelineLayout img_p_layout;
      VkDescriptorSetLayout img_ds_layout;
      VkPipeline pipeline;
   } btoi_r32g32b32;
   struct {
      VkPipelineLayout img_p_layout;
      VkDescriptorSetLayout img_ds_layout;
      VkPipeline pipeline[MAX_SAMPLES_LOG2];
      VkPipeline pipeline_2d_3d;
      VkPipeline pipeline_3d_2d;
      VkPipeline pipeline_3d_3d;
   } itoi;
   struct {
      VkPipelineLayout img_p_layout;
      VkDescriptorSetLayout img_ds_layout;
      VkPipeline pipeline;
   } itoi_r32g32b32;
   struct {
      VkPipelineLayout img_p_layout;
      VkDescriptorSetLayout img_ds_layout;
      VkPipeline pipeline[MAX_SAMPLES_LOG2];
      VkPipeline pipeline_3d;
   } cleari;
   struct {
      VkPipelineLayout img_p_layout;
      VkDescriptorSetLayout img_ds_layout;
      VkPipeline pipeline;
   } cleari_r32g32b32;
   struct {
      VkPipelineLayout p_layout;
      VkDescriptorSetLayout ds_layout;
      VkPipeline pipeline[MAX_SAMPLES_LOG2];
   } fmask_copy;

   struct {
      VkPipelineLayout p_layout;
      VkPipeline pipeline[NUM_META_FS_KEYS];
   } resolve;

   struct {
      VkDescriptorSetLayout ds_layout;
      VkPipelineLayout p_layout;
      struct {
         VkPipeline pipeline;
         VkPipeline i_pipeline;
         VkPipeline srgb_pipeline;
      } rc[MAX_SAMPLES_LOG2];

      VkPipeline depth_zero_pipeline;
      struct {
         VkPipeline average_pipeline;
         VkPipeline max_pipeline;
         VkPipeline min_pipeline;
      } depth[MAX_SAMPLES_LOG2];

      VkPipeline stencil_zero_pipeline;
      struct {
         VkPipeline max_pipeline;
         VkPipeline min_pipeline;
      } stencil[MAX_SAMPLES_LOG2];
   } resolve_compute;

   struct {
      VkDescriptorSetLayout ds_layout;
      VkPipelineLayout p_layout;

      struct {
         VkPipeline pipeline[NUM_META_FS_KEYS];
      } rc[MAX_SAMPLES_LOG2];

      VkPipeline depth_zero_pipeline;
      struct {
         VkPipeline average_pipeline;
         VkPipeline max_pipeline;
         VkPipeline min_pipeline;
      } depth[MAX_SAMPLES_LOG2];

      VkPipeline stencil_zero_pipeline;
      struct {
         VkPipeline max_pipeline;
         VkPipeline min_pipeline;
      } stencil[MAX_SAMPLES_LOG2];
   } resolve_fragment;

   struct {
      VkPipelineLayout p_layout;
      VkPipeline decompress_pipeline[MAX_SAMPLES_LOG2];
   } depth_decomp;

   VkDescriptorSetLayout expand_depth_stencil_compute_ds_layout;
   VkPipelineLayout expand_depth_stencil_compute_p_layout;
   VkPipeline expand_depth_stencil_compute_pipeline;

   struct {
      VkPipelineLayout p_layout;
      VkPipeline cmask_eliminate_pipeline;
      VkPipeline fmask_decompress_pipeline;
      VkPipeline dcc_decompress_pipeline;

      VkDescriptorSetLayout dcc_decompress_compute_ds_layout;
      VkPipelineLayout dcc_decompress_compute_p_layout;
      VkPipeline dcc_decompress_compute_pipeline;
   } fast_clear_flush;

   struct {
      VkPipelineLayout fill_p_layout;
      VkPipelineLayout copy_p_layout;
      VkPipeline fill_pipeline;
      VkPipeline copy_pipeline;
   } buffer;

   struct {
      VkDescriptorSetLayout ds_layout;
      VkPipelineLayout p_layout;
      VkPipeline occlusion_query_pipeline;
      VkPipeline pipeline_statistics_query_pipeline;
      VkPipeline tfb_query_pipeline;
      VkPipeline timestamp_query_pipeline;
      VkPipeline pg_query_pipeline;
      VkPipeline ms_prim_gen_query_pipeline;
   } query;

   struct {
      VkDescriptorSetLayout ds_layout;
      VkPipelineLayout p_layout;
      VkPipeline pipeline[MAX_SAMPLES_LOG2];
   } fmask_expand;

   struct {
      VkDescriptorSetLayout ds_layout;
      VkPipelineLayout p_layout;
      VkPipeline pipeline[32];
   } dcc_retile;

   struct {
      VkPipelineLayout leaf_p_layout;
      VkPipeline leaf_pipeline;
      VkPipeline leaf_updateable_pipeline;
      VkPipelineLayout morton_p_layout;
      VkPipeline morton_pipeline;
      VkPipelineLayout lbvh_main_p_layout;
      VkPipeline lbvh_main_pipeline;
      VkPipelineLayout lbvh_generate_ir_p_layout;
      VkPipeline lbvh_generate_ir_pipeline;
      VkPipelineLayout ploc_p_layout;
      VkPipeline ploc_pipeline;
      VkPipelineLayout encode_p_layout;
      VkPipeline encode_pipeline;
      VkPipeline encode_compact_pipeline;
      VkPipelineLayout header_p_layout;
      VkPipeline header_pipeline;
      VkPipelineLayout update_p_layout;
      VkPipeline update_pipeline;
      VkPipelineLayout copy_p_layout;
      VkPipeline copy_pipeline;

      struct radix_sort_vk *radix_sort;

      struct {
         VkBuffer buffer;
         VkDeviceMemory memory;
         VkAccelerationStructureKHR accel_struct;
      } null;
   } accel_struct_build;

   struct vk_texcompress_etc2_state etc_decode;

   struct vk_texcompress_astc_state *astc_decode;

   struct {
      VkDescriptorSetLayout ds_layout;
      VkPipelineLayout p_layout;
   } dgc_prepare;
};

struct radv_memory_trace_data {
   /* ID of the PTE update event in ftrace data */
   uint16_t ftrace_update_ptes_id;

   uint32_t num_cpus;
   int *pipe_fds;
};

struct radv_sqtt_timestamp {
   uint8_t *map;
   unsigned offset;
   uint64_t size;
   struct radeon_winsys_bo *bo;
   struct list_head list;
};

#define RADV_BORDER_COLOR_COUNT       4096
#define RADV_BORDER_COLOR_BUFFER_SIZE (sizeof(VkClearColorValue) * RADV_BORDER_COLOR_COUNT)

struct radv_device_border_color_data {
   bool used[RADV_BORDER_COLOR_COUNT];

   struct radeon_winsys_bo *bo;
   VkClearColorValue *colors_gpu_ptr;

   /* Mutex is required to guarantee vkCreateSampler thread safety
    * given that we are writing to a buffer and checking color occupation */
   mtx_t mutex;
};

struct radv_pso_cache_stats {
   uint32_t hits;
   uint32_t misses;
};

struct radv_device {
   struct vk_device vk;

   struct radeon_winsys *ws;

   struct radv_layer_dispatch_tables layer_dispatch;

   struct radeon_winsys_ctx *hw_ctx[RADV_NUM_HW_CTX];
   struct radv_meta_state meta_state;

   struct radv_queue *queues[RADV_MAX_QUEUE_FAMILIES];
   int queue_count[RADV_MAX_QUEUE_FAMILIES];

   bool pbb_allowed;
   uint32_t scratch_waves;
   uint32_t dispatch_initiator;
   uint32_t dispatch_initiator_task;

   /* MSAA sample locations.
    * The first index is the sample index.
    * The second index is the coordinate: X, Y. */
   float sample_locations_1x[1][2];
   float sample_locations_2x[2][2];
   float sample_locations_4x[4][2];
   float sample_locations_8x[8][2];

   /* GFX7 and later */
   uint32_t gfx_init_size_dw;
   struct radeon_winsys_bo *gfx_init;

   struct radeon_winsys_bo *trace_bo;
   struct radv_trace_data *trace_data;

   /* Whether to keep shader debug info, for debugging. */
   bool keep_shader_info;

   /* Backup in-memory cache to be used if the app doesn't provide one */
   struct vk_pipeline_cache *mem_cache;

   /*
    * use different counters so MSAA MRTs get consecutive surface indices,
    * even if MASK is allocated in between.
    */
   uint32_t image_mrt_offset_counter;
   uint32_t fmask_mrt_offset_counter;

   struct list_head shader_arenas;
   struct hash_table_u64 *capture_replay_arena_vas;
   unsigned shader_arena_shift;
   uint8_t shader_free_list_mask;
   struct radv_shader_free_list shader_free_list;
   struct radv_shader_free_list capture_replay_free_list;
   struct list_head shader_block_obj_pool;
   mtx_t shader_arena_mutex;

   mtx_t shader_upload_hw_ctx_mutex;
   struct radeon_winsys_ctx *shader_upload_hw_ctx;
   VkSemaphore shader_upload_sem;
   uint64_t shader_upload_seq;
   struct list_head shader_dma_submissions;
   mtx_t shader_dma_submission_list_mutex;
   cnd_t shader_dma_submission_list_cond;

   /* Whether to DMA shaders to invisible VRAM or to upload directly through BAR. */
   bool shader_use_invisible_vram;

   /* Whether to inline the compute dispatch size in user sgprs. */
   bool load_grid_size_from_user_sgpr;

   /* Whether the driver uses a global BO list. */
   bool use_global_bo_list;

   /* Whether anisotropy is forced with RADV_TEX_ANISO (-1 is disabled). */
   int force_aniso;

   /* Always disable TRUNC_COORD. */
   bool disable_trunc_coord;

   struct radv_device_border_color_data border_color_data;

   /* Thread trace. */
   struct ac_sqtt sqtt;
   bool sqtt_enabled;
   bool sqtt_triggered;

   /* SQTT timestamps for queue events. */
   simple_mtx_t sqtt_timestamp_mtx;
   struct radv_sqtt_timestamp sqtt_timestamp;

   /* SQTT timed cmd buffers. */
   simple_mtx_t sqtt_command_pool_mtx;
   struct vk_command_pool *sqtt_command_pool[2];

   /* Memory trace. */
   struct radv_memory_trace_data memory_trace;

   /* SPM. */
   struct ac_spm spm;

   /* Radeon Raytracing Analyzer trace. */
   struct radv_rra_trace_data rra_trace;

   FILE *ctx_roll_file;
   simple_mtx_t ctx_roll_mtx;

   /* Trap handler. */
   struct radv_shader *trap_handler_shader;
   struct radeon_winsys_bo *tma_bo; /* Trap Memory Address */
   uint32_t *tma_ptr;

   /* Overallocation. */
   bool overallocation_disallowed;
   uint64_t allocated_memory_size[VK_MAX_MEMORY_HEAPS];
   mtx_t overallocation_mutex;

   /* RADV_FORCE_VRS. */
   struct radv_notifier notifier;
   enum radv_force_vrs force_vrs;

   /* Depth image for VRS when not bound by the app. */
   struct {
      struct radv_image *image;
      struct radv_buffer *buffer; /* HTILE */
      struct radv_device_memory *mem;
   } vrs;

   /* Prime blit sdma queue */
   struct radv_queue *private_sdma_queue;

   struct radv_shader_part_cache vs_prologs;
   struct radv_shader_part *simple_vs_prologs[MAX_VERTEX_ATTRIBS];
   struct radv_shader_part *instance_rate_vs_prologs[816];

   struct radv_shader_part_cache ps_epilogs;

   simple_mtx_t trace_mtx;

   /* Whether per-vertex VRS is forced. */
   bool force_vrs_enabled;

   simple_mtx_t pstate_mtx;
   unsigned pstate_cnt;

   /* BO to contain some performance counter helpers:
    * - A lock for profiling cmdbuffers.
    * - a temporary fence for the end query synchronization.
    * - the pass to use for profiling. (as an array of bools)
    */
   struct radeon_winsys_bo *perf_counter_bo;

   /* Interleaved lock/unlock commandbuffers for perfcounter passes. */
   struct radeon_cmdbuf **perf_counter_lock_cs;

   bool uses_shadow_regs;

   struct hash_table *rt_handles;
   simple_mtx_t rt_handles_mtx;

   struct radv_printf_data printf;

   struct radv_device_cache_key cache_key;
   blake3_hash cache_hash;

   /* Not NULL if a GPU hang report has been generated for VK_EXT_device_fault. */
   char *gpu_hang_report;

   /* For indirect compute pipeline binds with DGC only. */
   simple_mtx_t compute_scratch_mtx;
   uint32_t compute_scratch_size_per_wave;
   uint32_t compute_scratch_waves;

   /* PSO cache stats */
   simple_mtx_t pso_cache_stats_mtx;
   struct radv_pso_cache_stats pso_cache_stats[RADV_PIPELINE_TYPE_COUNT];
};

VK_DEFINE_HANDLE_CASTS(radv_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE)

static inline struct radv_physical_device *
radv_device_physical(const struct radv_device *dev)
{
   return (struct radv_physical_device *)dev->vk.physical;
}

static inline bool
radv_uses_device_generated_commands(const struct radv_device *device)
{
   return device->vk.enabled_features.deviceGeneratedCommandsNV || device->vk.enabled_features.deviceGeneratedCompute;
}

static inline bool
radv_uses_primitives_generated_query(const struct radv_device *device)
{
   return device->vk.enabled_features.primitivesGeneratedQuery ||
          device->vk.enabled_features.primitivesGeneratedQueryWithRasterizerDiscard ||
          device->vk.enabled_features.primitivesGeneratedQueryWithNonZeroStreams;
}

static inline bool
radv_uses_image_float32_atomics(const struct radv_device *device)
{
   return device->vk.enabled_features.shaderImageFloat32Atomics ||
          device->vk.enabled_features.sparseImageFloat32Atomics ||
          device->vk.enabled_features.shaderImageFloat32AtomicMinMax ||
          device->vk.enabled_features.sparseImageFloat32AtomicMinMax;
}

VkResult radv_device_init_vrs_state(struct radv_device *device);

unsigned radv_get_default_max_sample_dist(int log_samples);

void radv_emit_default_sample_locations(const struct radv_physical_device *pdev, struct radeon_cmdbuf *cs,
                                        int nr_samples);

bool radv_get_memory_fd(struct radv_device *device, struct radv_device_memory *memory, int *pFD);

unsigned radv_get_dcc_max_uncompressed_block_size(const struct radv_device *device, const struct radv_image *image);

struct radv_color_buffer_info {
   struct ac_cb_surface ac;
};

struct radv_ds_buffer_info {
   struct ac_ds_surface ac;

   uint32_t db_render_override2;
   uint32_t db_render_control;
};

void radv_initialise_color_surface(struct radv_device *device, struct radv_color_buffer_info *cb,
                                   struct radv_image_view *iview);

void radv_initialise_vrs_surface(struct radv_image *image, struct radv_buffer *htile_buffer,
                                 struct radv_ds_buffer_info *ds);


void radv_initialise_ds_surface(const struct radv_device *device, struct radv_ds_buffer_info *ds,
                                struct radv_image_view *iview, VkImageAspectFlags ds_aspects);

void radv_gfx11_set_db_render_control(const struct radv_device *device, unsigned num_samples,
                                      unsigned *db_render_control);

bool radv_device_set_pstate(struct radv_device *device, bool enable);

bool radv_device_acquire_performance_counters(struct radv_device *device);

void radv_device_release_performance_counters(struct radv_device *device);

#endif /* RADV_DEVICE_H */
