/*
 * Copyright 2024 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#ifndef AC_NIR_META_H
#define AC_NIR_META_H

#include "ac_gpu_info.h"
#include "nir.h"
#include "util/box.h"

union ac_ps_resolve_key {
   struct {
      bool use_aco:1;
      bool src_is_array:1;
      uint8_t log_samples:2;
      uint8_t last_src_channel:2; /* this shouldn't be greater than last_dst_channel */
      uint8_t last_dst_channel:2;
      bool x_clamp_to_edge:1;
      bool y_clamp_to_edge:1;
      bool a16:1;
      bool d16:1;
   };
   uint64_t key; /* use with hash_table_u64 */
};

/* Only immutable settings. */
struct ac_ps_resolve_options {
   const nir_shader_compiler_options *nir_options;
   const struct radeon_info *info;
   bool use_aco;     /* global driver setting */
   bool no_fmask;    /* FMASK disabled by a debug option, ignored on GFX11+ */
   bool print_key;   /* print ac_ps_resolve_key into stderr */
};

nir_shader *
ac_create_resolve_ps(const struct ac_ps_resolve_options *options,
                      const union ac_ps_resolve_key *key);

/* Universal optimized compute shader for image blits and clears. */
#define SI_MAX_COMPUTE_BLIT_LANE_SIZE  16
#define SI_MAX_COMPUTE_BLIT_SAMPLES    8

/* This describes all possible variants of the compute blit shader. */
union ac_cs_blit_key {
   struct {
      bool use_aco:1;
      /* Workgroup settings. */
      uint8_t wg_dim:2; /* 1, 2, or 3 */
      bool has_start_xyz:1;
      /* The size of a block of pixels that a single thread will process. */
      uint8_t log_lane_width:3;
      uint8_t log_lane_height:2;
      uint8_t log_lane_depth:2;
      /* Declaration modifiers. */
      bool is_clear:1;
      bool src_is_1d:1;
      bool dst_is_1d:1;
      bool src_is_msaa:1;
      bool dst_is_msaa:1;
      bool src_has_z:1;
      bool dst_has_z:1;
      bool a16:1;
      bool d16:1;
      uint8_t log_samples:2;
      bool sample0_only:1; /* src is MSAA, dst is not MSAA, log2_samples is ignored */
      /* Source coordinate modifiers. */
      bool x_clamp_to_edge:1;
      bool y_clamp_to_edge:1;
      bool flip_x:1;
      bool flip_y:1;
      /* Output modifiers. */
      bool sint_to_uint:1;
      bool uint_to_sint:1;
      bool dst_is_srgb:1;
      bool use_integer_one:1;
      uint8_t last_src_channel:2; /* this shouldn't be greater than last_dst_channel */
      uint8_t last_dst_channel:2;
   };
   uint64_t key;
};

struct ac_cs_blit_options {
   /* Global options. */
   const nir_shader_compiler_options *nir_options;
   const struct radeon_info *info;
   bool use_aco;        /* global driver setting */
   bool no_fmask;       /* FMASK disabled by a global debug option, ignored on GFX11+ */
   bool print_key;      /* print ac_ps_resolve_key into stderr */
   bool fail_if_slow;   /* fail if a gfx blit is faster, set to false on compute queues */

   bool is_nested;      /* for internal use, don't set */
};

struct ac_cs_blit_description
{
   struct {
      struct radeon_surf *surf;
      uint8_t dim;            /* 1 = 1D texture, 2 = 2D texture, 3 = 3D texture */
      bool is_array;          /* array or cube texture */
      unsigned width0;        /* level 0 width */
      unsigned height0;       /* level 0 height */
      uint8_t num_samples;
      uint8_t level;
      struct pipe_box box;       /* negative width, height only legal for src */
      enum pipe_format format;   /* format reinterpretation */
   } dst, src;

   bool is_gfx_queue;
   bool dst_has_dcc;
   bool sample0_only;                  /* copy sample 0 instead of resolving */
   union pipe_color_union clear_color; /* if src.surf == NULL, this is the clear color */
};

/* Dispatch parameters generated by the blit. */
struct ac_cs_blit_dispatch {
   union ac_cs_blit_key shader_key;
   uint32_t user_data[8];        /* for nir_intrinsic_load_user_data_amd */

   unsigned wg_size[3];          /* variable workgroup size (NUM_THREAD_FULL) */
   unsigned last_wg_size[3];     /* workgroup size of the last workgroup (NUM_THREAD_PARTIAL) */
   unsigned num_workgroups[3];   /* DISPATCH_DIRECT parameters */
};

struct ac_cs_blit_dispatches {
   unsigned num_dispatches;
   struct ac_cs_blit_dispatch dispatches[7];
};

nir_shader *
ac_create_blit_cs(const struct ac_cs_blit_options *options, const union ac_cs_blit_key *key);

bool
ac_prepare_compute_blit(const struct ac_cs_blit_options *options,
                        const struct ac_cs_blit_description *blit,
                        struct ac_cs_blit_dispatches *dispatches);

/* clear_buffer/copy_buffer compute shader. */
union ac_cs_clear_copy_buffer_key {
   struct {
      bool is_clear:1;
      unsigned dwords_per_thread:3; /* 1..4 allowed */
      bool clear_value_size_is_12:1;
      bool src_is_sparse:1;
      /* Unaligned clears and copies. */
      unsigned src_align_offset:2; /* how much is the source address unaligned */
      unsigned dst_align_offset:4; /* the first thread shouldn't write this many bytes */
      unsigned dst_last_thread_bytes:4; /* if non-zero, the last thread should write this many bytes */
      bool dst_single_thread_unaligned:1; /* only 1 thread executes, both previous fields apply */
      bool has_start_thread:1; /* whether the first few threads should be skipped, making later
                                  waves start on a 256B boundary */
   };
   uint64_t key;
};

struct ac_cs_clear_copy_buffer_options {
   const nir_shader_compiler_options *nir_options;
   const struct radeon_info *info;
   bool print_key;      /* print the shader key into stderr */
   bool fail_if_slow;   /* fail if a gfx blit is faster, set to false on compute queues */
};

struct ac_cs_clear_copy_buffer_info {
   unsigned dst_offset;
   unsigned src_offset;
   unsigned size;
   unsigned clear_value_size;
   uint32_t clear_value[4];
   unsigned dwords_per_thread;   /* Set to 0 to let the code choose the optimal value. */
   bool render_condition_enabled;
   bool dst_is_vram;
   bool src_is_vram;
   bool src_is_sparse;
};

struct ac_cs_clear_copy_buffer_dispatch {
   union ac_cs_clear_copy_buffer_key shader_key;
   uint32_t user_data[6];        /* for nir_intrinsic_load_user_data_amd */
   unsigned num_ssbos;
   unsigned workgroup_size;
   unsigned num_threads;

   struct {
      unsigned offset;
      unsigned size;
   } ssbo[2];
};

nir_shader *
ac_create_clear_copy_buffer_cs(struct ac_cs_clear_copy_buffer_options *options,
                               union ac_cs_clear_copy_buffer_key *key);

bool
ac_prepare_cs_clear_copy_buffer(const struct ac_cs_clear_copy_buffer_options *options,
                                const struct ac_cs_clear_copy_buffer_info *info,
                                struct ac_cs_clear_copy_buffer_dispatch *out);

#endif
