/*
 * Copyright (C) 2021 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *   Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
 *   Boris Brezillon <boris.brezillon@collabora.com>
 */

#include "util/macros.h"

#include "genxml/gen_macros.h"

#include "pan_desc.h"
#include "pan_encoder.h"
#include "pan_texture.h"

/*
 * Translate a DRM format modifier into the block format value written into
 * the hardware descriptors.
 *
 * Linear and 16x16 U-interleaved modifiers map directly. AFBC modifiers are
 * only accepted when the matching check is compiled in (non-tiled AFBC for
 * PAN_ARCH >= 5, tiled AFBC for PAN_ARCH >= 7). For AFRC (PAN_ARCH >= 10) the
 * function returns 0. Any other modifier is treated as unreachable.
 */
static unsigned
mod_to_block_fmt(uint64_t mod)
{
   if (mod == DRM_FORMAT_MOD_LINEAR)
      return MALI_BLOCK_FORMAT_LINEAR;

   if (mod == DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED)
      return MALI_BLOCK_FORMAT_TILED_U_INTERLEAVED;

#if PAN_ARCH >= 5
   if (drm_is_afbc(mod) && !(mod & AFBC_FORMAT_MOD_TILED))
      return MALI_BLOCK_FORMAT_AFBC;
#endif
#if PAN_ARCH >= 7
   if (drm_is_afbc(mod) && (mod & AFBC_FORMAT_MOD_TILED))
      return MALI_BLOCK_FORMAT_AFBC_TILED;
#endif
#if PAN_ARCH >= 10
   if (drm_is_afrc(mod))
      return 0; /* Reserved field for AFRC state */
#endif

   unreachable("Unsupported modifer");
}

/*
 * Pick the MSAA mode for an image view by comparing the sample count
 * reported by pan_image_view_get_nr_samples() with the view's own
 * nr_samples:
 *
 *  - reported count > 1:           MALI_MSAA_LAYERED
 *  - view count > reported count:  MALI_MSAA_AVERAGE
 *  - otherwise (both equal to 1):  MALI_MSAA_SINGLE
 */
static enum mali_msaa
mali_sampling_mode(const struct pan_image_view *view)
{
   const unsigned image_samples = pan_image_view_get_nr_samples(view);

   if (image_samples > 1) {
      /* Layered mode expects matching counts and a non-zero surface stride */
      assert(view->nr_samples == image_samples);
      assert(view->planes[0]->layout.slices[0].surface_stride != 0);
      return MALI_MSAA_LAYERED;
   } else if (view->nr_samples > image_samples) {
      /* The view asks for more samples than pan_image_view_get_nr_samples()
       * reports */
      assert(image_samples == 1);
      return MALI_MSAA_AVERAGE;
   } else {
      assert(view->nr_samples == image_samples);
      assert(view->nr_samples == 1);
      return MALI_MSAA_SINGLE;
   }
}

/*
 * Select the render target whose CRC buffer should be used for this frame.
 *
 * @fb         framebuffer description (render targets, extent, size)
 * @tile_size  effective tile size in pixels (width * height)
 *
 * Returns the index of the chosen render target, or -1 when CRC must not be
 * used (tile size other than 16x16, or no suitable render target).
 *
 * On PAN_ARCH <= 6 only a lone render target 0 is considered. On newer
 * architectures every render target is a candidate; one whose CRC data is
 * already valid is preferred, and a candidate with invalid CRC data is only
 * accepted when the whole framebuffer is being rendered.
 */
int
GENX(pan_select_crc_rt)(const struct pan_fb_info *fb, unsigned tile_size)
{
   /* Disable CRC when the tile size is not 16x16. In the hardware, CRC
    * tiles are the same size as the tiles of the framebuffer. However,
    * our code only handles 16x16 tiles. Therefore under the current
    * implementation, we must disable CRC when 16x16 tiles are not used.
    *
    * This may hurt performance. However, smaller tile sizes are rare, and
    * CRCs are more expensive at smaller tile sizes, reducing the benefit.
    * Restricting CRC to 16x16 should work in practice.
    */
   if (tile_size != 16 * 16) {
      assert(tile_size < 16 * 16);
      return -1;
   }

#if PAN_ARCH <= 6
   if (fb->rt_count == 1 && fb->rts[0].view && !fb->rts[0].discard &&
       pan_image_view_has_crc(fb->rts[0].view))
      return 0;

   return -1;
#else
   /* Whether the render extent covers the whole framebuffer. This does not
    * depend on the render target, so compute it once. */
   const bool full = !fb->extent.minx && !fb->extent.miny &&
                     fb->extent.maxx == (fb->width - 1) &&
                     fb->extent.maxy == (fb->height - 1);

   bool best_rt_valid = false;
   int best_rt = -1;

   for (unsigned i = 0; i < fb->rt_count; i++) {
      /* Skip render targets that are absent, discarded, or have no CRC.
       * The discard flag must be the one of the render target under
       * consideration, not that of render target 0. */
      if (!fb->rts[i].view || fb->rts[i].discard ||
          !pan_image_view_has_crc(fb->rts[i].view))
         continue;

      bool valid = *(fb->rts[i].crc_valid);

      /* Invalid CRC data is only usable if this frame rewrites it fully */
      if (!full && !valid)
         continue;

      if (best_rt < 0 || (valid && !best_rt_valid)) {
         best_rt = i;
         best_rt_valid = valid;
      }

      /* A valid candidate cannot be improved upon */
      if (valid)
         break;
   }

   return best_rt;
#endif
}

static enum mali_zs_format
translate_zs_format(enum pipe_format in)
{
   switch (in) {
   case PIPE_FORMAT_Z16_UNORM:
      return MALI_ZS_FORMAT_D16;
   case PIPE_FORMAT_Z24_UNORM_S8_UINT:
      return MALI_ZS_FORMAT_D24S8;
   case PIPE_FORMAT_Z24X8_UNORM:
      return MALI_ZS_FORMAT_D24X8;
   case PIPE_FORMAT_Z32_FLOAT:
      return MALI_ZS_FORMAT_D32;
#if PAN_ARCH <= 7
   case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
      return MALI_ZS_FORMAT_D32_S8X24;
#endif
   default:
      unreachable("Unsupported depth/stencil format.");
   }
}

#if PAN_ARCH >= 5
static enum mali_s_format
translate_s_format(enum pipe_format in)
{
   switch (in) {
   case PIPE_FORMAT_S8_UINT:
      return MALI_S_FORMAT_S8;
   case PIPE_FORMAT_Z24_UNORM_S8_UINT:
   case PIPE_FORMAT_X24S8_UINT:
      return MALI_S_FORMAT_X24S8;

#if PAN_ARCH <= 7
   case PIPE_FORMAT_S8_UINT_Z24_UNORM:
   case PIPE_FORMAT_S8X24_UINT:
      return MALI_S_FORMAT_S8X24;
   case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
   case PIPE_FORMAT_X32_S8X24_UINT:
      return MALI_S_FORMAT_X32_S8X24;
#endif

   default:
      unreachable("Unsupported stencil format.");
   }
}

static void
pan_prepare_s(const struct pan_fb_info *fb, unsigned layer_idx,
              struct MALI_ZS_CRC_EXTENSION *ext)
{
   const struct pan_image_view *s = fb->zs.view.s;

   if (!s)
      return;

   const struct pan_image *image = pan_image_view_get_zs_image(s);
   unsigned level = s->first_level;

   ext->s_msaa = mali_sampling_mode(s);

   struct pan_surface surf;
   pan_iview_get_surface(s, 0, layer_idx, 0, &surf);

   assert(image->layout.modifier ==
             DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED ||
          image->layout.modifier == DRM_FORMAT_MOD_LINEAR);
   ext->s_writeback_base = surf.data;
   ext->s_writeback_row_stride = image->layout.slices[level].row_stride;
   ext->s_writeback_surface_stride =
      (pan_image_view_get_nr_samples(s) > 1)
         ? image->layout.slices[level].surface_stride
         : 0;
   ext->s_block_format = mod_to_block_fmt(image->layout.modifier);
   ext->s_write_format = translate_s_format(s->format);
}

static void
pan_prepare_zs(const struct pan_fb_info *fb, unsigned layer_idx,
               struct MALI_ZS_CRC_EXTENSION *ext)
{
   const struct pan_image_view *zs = fb->zs.view.zs;

   if (!zs)
      return;

   const struct pan_image *image = pan_image_view_get_zs_image(zs);
   unsigned level = zs->first_level;

   ext->zs_msaa = mali_sampling_mode(zs);

   struct pan_surface surf;
   pan_iview_get_surface(zs, 0, layer_idx, 0, &surf);
   UNUSED const struct pan_image_slice_layout *slice =
      &image->layout.slices[level];

   if (drm_is_afbc(image->layout.modifier)) {
#if PAN_ARCH >= 9
      ext->zs_writeback_base = surf.afbc.header;
      ext->zs_writeback_row_stride = slice->row_stride;
      /* TODO: surface stride? */
      ext->zs_afbc_body_offset = surf.afbc.body - surf.afbc.header;

      /* TODO: stencil AFBC? */
#else
#if PAN_ARCH >= 6
      ext->zs_afbc_row_stride =
         pan_afbc_stride_blocks(image->layout.modifier, slice->row_stride);
#else
      ext->zs_block_format = MALI_BLOCK_FORMAT_AFBC;
      ext->zs_afbc_body_size = 0x1000;
      ext->zs_afbc_chunk_size = 9;
      ext->zs_afbc_sparse = true;
#endif

      ext->zs_afbc_header = surf.afbc.header;
      ext->zs_afbc_body = surf.afbc.body;
#endif
   } else {
      assert(image->layout.modifier ==
                DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED ||
             image->layout.modifier == DRM_FORMAT_MOD_LINEAR);

      /* TODO: Z32F(S8) support, which is always linear */

      ext->zs_writeback_base = surf.data;
      ext->zs_writeback_row_stride = image->layout.slices[level].row_stride;
      ext->zs_writeback_surface_stride =
         (pan_image_view_get_nr_samples(zs) > 1)
            ? image->layout.slices[level].surface_stride
            : 0;
   }

   ext->zs_block_format = mod_to_block_fmt(image->layout.modifier);
   ext->zs_write_format = translate_zs_format(zs->format);
   if (ext->zs_write_format == MALI_ZS_FORMAT_D24S8)
      ext->s_writeback_base = ext->zs_writeback_base;
}

/*
 * Fill the CRC fields of a ZS_CRC_EXTENSION for render target rt_crc.
 * A negative rt_crc means no CRC render target was selected, in which case
 * nothing is written.
 */
static void
pan_prepare_crc(const struct pan_fb_info *fb, int rt_crc,
                struct MALI_ZS_CRC_EXTENSION *ext)
{
   if (rt_crc < 0)
      return;

   assert(rt_crc < fb->rt_count);

   const struct pan_image_view *view = fb->rts[rt_crc].view;
   const struct pan_image *image = pan_image_view_get_rt_image(view);
   const struct pan_image_slice_layout *slice =
      &image->layout.slices[view->first_level];

   /* The CRC data sits at slice->crc.offset from the image's base address */
   ext->crc_row_stride = slice->crc.stride;
   ext->crc_base =
      image->data.base + image->data.offset + slice->crc.offset;

#if PAN_ARCH >= 7
   ext->crc_render_target = rt_crc;

   if (fb->rts[rt_crc].clear) {
      /* Built from the first clear word: the word itself, its low 16 bits
       * repeated at bit 32, and the two top bits set */
      uint32_t first_word = fb->rts[rt_crc].clear_value[0];
      ext->crc_clear_color = first_word | 0xc000000000000000 |
                             (((uint64_t)first_word & 0xffff) << 32);
   }
#endif
}

static void
pan_emit_zs_crc_ext(const struct pan_fb_info *fb, unsigned layer_idx,
                    int rt_crc, void *zs_crc_ext)
{
   pan_pack(zs_crc_ext, ZS_CRC_EXTENSION, cfg) {
      pan_prepare_crc(fb, rt_crc, &cfg);
      cfg.zs_clean_pixel_write_enable = fb->zs.clear.z || fb->zs.clear.s;
      pan_prepare_zs(fb, layer_idx, &cfg);
      pan_prepare_s(fb, layer_idx, &cfg);
   }
}

/* Size in bytes of one pixel of the given format as stored in the tile
 * buffer */

static unsigned
pan_bytes_per_pixel_tib(enum pipe_format format)
{
   const struct pan_blendable_format *blendable =
     GENX(panfrost_blendable_format_from_pipe_format)(format);

   /* Non-blendable formats are kept raw, with the block size rounded up to
    * the next power of two */
   if (!blendable->internal)
      return util_next_power_of_two(util_format_get_blocksize(format));

   /* Blendable formats always occupy 32 bits in the tile buffer; the extra
    * bits are used as padding or to dither */
   return 4;
}

/*
 * Sum, over all bound render targets, the tile buffer bytes needed per
 * pixel: the per-sample tile buffer size of the format multiplied by the
 * view's sample count. Unbound render targets contribute nothing.
 */
static unsigned
pan_cbuf_bytes_per_pixel(const struct pan_fb_info *fb)
{
   unsigned total = 0;

   for (int i = 0; i < fb->rt_count; ++i) {
      const struct pan_image_view *view = fb->rts[i].view;

      if (view)
         total += pan_bytes_per_pixel_tib(view->format) * view->nr_samples;
   }

   return total;
}

/*
 * Pick the largest power-of-two tile size (in pixels) that fits within the
 * tile buffer budget, i.e. the largest power of two such that
 *
 *      (bytes per pixel) * (pixels per tile) <= (tile buffer bytes)
 *
 * The budget is itself a power of two, so the result is obtained by shifting
 * it right by the ceiling of log2(bytes per pixel).
 */
static unsigned
pan_select_max_tile_size(unsigned tile_buffer_bytes, unsigned bytes_per_pixel)
{
   assert(util_is_power_of_two_nonzero(tile_buffer_bytes));
   assert(tile_buffer_bytes >= 1024);

   const unsigned log2_bpp = util_logbase2_ceil(bytes_per_pixel);

   return tile_buffer_bytes >> log2_bpp;
}

/*
 * Return the RAW colour writeback format matching a pixel size given in
 * bits. Sizes without a matching RAW format are treated as unreachable.
 */
static enum mali_color_format
pan_mfbd_raw_format(unsigned bits)
{
   switch (bits) {
   case 8:
      return MALI_COLOR_FORMAT_RAW8;
   case 16:
      return MALI_COLOR_FORMAT_RAW16;
   case 24:
      return MALI_COLOR_FORMAT_RAW24;
   case 32:
      return MALI_COLOR_FORMAT_RAW32;
   case 48:
      return MALI_COLOR_FORMAT_RAW48;
   case 64:
      return MALI_COLOR_FORMAT_RAW64;
   case 96:
      return MALI_COLOR_FORMAT_RAW96;
   case 128:
      return MALI_COLOR_FORMAT_RAW128;
   case 192:
      return MALI_COLOR_FORMAT_RAW192;
   case 256:
      return MALI_COLOR_FORMAT_RAW256;
   case 384:
      return MALI_COLOR_FORMAT_RAW384;
   case 512:
      return MALI_COLOR_FORMAT_RAW512;
   case 768:
      return MALI_COLOR_FORMAT_RAW768;
   case 1024:
      return MALI_COLOR_FORMAT_RAW1024;
   case 1536:
      return MALI_COLOR_FORMAT_RAW1536;
   case 2048:
      return MALI_COLOR_FORMAT_RAW2048;
   default:
      unreachable("invalid raw bpp");
   }
}

static void
pan_rt_init_format(const struct pan_image_view *rt,
                   struct MALI_RENDER_TARGET *cfg)
{
   /* Explode details on the format */

   const struct util_format_description *desc =
      util_format_description(rt->format);

   /* The swizzle for rendering is inverted from texturing */

   unsigned char swizzle[4] = {
      PIPE_SWIZZLE_X,
      PIPE_SWIZZLE_Y,
      PIPE_SWIZZLE_Z,
      PIPE_SWIZZLE_W,
   };

   /* Fill in accordingly, defaulting to 8-bit UNORM */

   if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB)
      cfg->srgb = true;

   struct pan_blendable_format fmt =
      *GENX(panfrost_blendable_format_from_pipe_format)(rt->format);
   enum mali_color_format writeback_format;

   if (fmt.internal) {
      cfg->internal_format = fmt.internal;
      writeback_format = fmt.writeback;
      panfrost_invert_swizzle(desc->swizzle, swizzle);
   } else {
      /* Construct RAW internal/writeback, where internal is
       * specified logarithmically (round to next power-of-two).
       * Offset specified from RAW8, where 8 = 2^3 */

      unsigned bits = desc->block.bits;
      unsigned offset = util_logbase2_ceil(bits) - 3;
      assert(offset <= 4);

      cfg->internal_format = MALI_COLOR_BUFFER_INTERNAL_FORMAT_RAW8 + offset;
      writeback_format = pan_mfbd_raw_format(bits);
   }

#if PAN_ARCH >= 10
   const struct pan_image *image = pan_image_view_get_rt_image(rt);

   if (drm_is_afrc(image->layout.modifier))
      cfg->afrc.writeback_format = writeback_format;
   else
      cfg->writeback_format = writeback_format;
#else
   cfg->writeback_format = writeback_format;
#endif

   cfg->swizzle = panfrost_translate_swizzle_4(swizzle);
}

/*
 * Fill a RENDER_TARGET descriptor for render target rt_idx of the
 * framebuffer.
 *
 * fb          framebuffer description
 * layer_idx   layer of the view to render to
 * rt_idx      index into fb->rts
 * cbuf_offset value written to the internal_buffer_offset field
 * cfg         unpacked descriptor to fill in
 *
 * If the render target has no view or is discarded, only the clear state,
 * a default internal format and the buffer offset are set, and writeback is
 * left disabled. Otherwise the writeback layout is described according to
 * the image modifier (AFBC, AFRC on PAN_ARCH >= 10, or linear/U-interleaved).
 */
static void
pan_prepare_rt(const struct pan_fb_info *fb, unsigned layer_idx,
               unsigned rt_idx, unsigned cbuf_offset,
               struct MALI_RENDER_TARGET *cfg)
{
   cfg->clean_pixel_write_enable = fb->rts[rt_idx].clear;
   cfg->internal_buffer_offset = cbuf_offset;
   if (fb->rts[rt_idx].clear) {
      cfg->clear.color_0 = fb->rts[rt_idx].clear_value[0];
      cfg->clear.color_1 = fb->rts[rt_idx].clear_value[1];
      cfg->clear.color_2 = fb->rts[rt_idx].clear_value[2];
      cfg->clear.color_3 = fb->rts[rt_idx].clear_value[3];
   }

   const struct pan_image_view *rt = fb->rts[rt_idx].view;
   if (!rt || fb->rts[rt_idx].discard) {
      /* Nothing to write back: set up a placeholder target and bail out
       * before write_enable is touched. internal_buffer_offset was already
       * assigned above; the store is repeated here. */
      cfg->internal_format = MALI_COLOR_BUFFER_INTERNAL_FORMAT_R8G8B8A8;
      cfg->internal_buffer_offset = cbuf_offset;
#if PAN_ARCH >= 7
      cfg->writeback_block_format = MALI_BLOCK_FORMAT_TILED_U_INTERLEAVED;
      cfg->dithering_enable = true;
#endif
      return;
   }

   const struct pan_image *image = pan_image_view_get_rt_image(rt);

   /* write_enable is not set for AFRC images */
   if (!drm_is_afrc(image->layout.modifier))
      cfg->write_enable = true;

   cfg->dithering_enable = true;

   unsigned level = rt->first_level;
   /* Number of addressable layers: image depth for 3D views, the layer
    * range of the view otherwise. Only used by the assertion below. */
   ASSERTED unsigned layer_count = rt->dim == MALI_TEXTURE_DIMENSION_3D
                                      ? rt->planes[0]->layout.depth
                                      : rt->last_layer - rt->first_layer + 1;

   assert(rt->last_level == rt->first_level);
   assert(layer_idx < layer_count);

   int row_stride = image->layout.slices[level].row_stride;

   /* Only set layer_stride for layered MSAA rendering */

   unsigned layer_stride = (pan_image_view_get_nr_samples(rt) > 1)
                              ? image->layout.slices[level].surface_stride
                              : 0;

   cfg->writeback_msaa = mali_sampling_mode(rt);

   pan_rt_init_format(rt, cfg);

   cfg->writeback_block_format = mod_to_block_fmt(image->layout.modifier);

   struct pan_surface surf;
   pan_iview_get_surface(rt, 0, layer_idx, 0, &surf);

   if (drm_is_afbc(image->layout.modifier)) {
#if PAN_ARCH >= 9
      /* PAN_ARCH >= 9: the body is given as an offset from the header */
      if (image->layout.modifier & AFBC_FORMAT_MOD_YTR)
         cfg->afbc.yuv_transform = true;

      cfg->afbc.wide_block = panfrost_afbc_is_wide(image->layout.modifier);
      cfg->afbc.header = surf.afbc.header;
      cfg->afbc.body_offset = surf.afbc.body - surf.afbc.header;
      assert(surf.afbc.body >= surf.afbc.header);

      cfg->afbc.compression_mode = GENX(pan_afbc_compression_mode)(rt->format);
      cfg->afbc.row_stride = row_stride;
#else
      /* Before PAN_ARCH 9: header and body are separate pointers */
      const struct pan_image_slice_layout *slice = &image->layout.slices[level];

#if PAN_ARCH >= 6
      cfg->afbc.row_stride =
         pan_afbc_stride_blocks(image->layout.modifier, slice->row_stride);
      cfg->afbc.afbc_wide_block_enable =
         panfrost_afbc_is_wide(image->layout.modifier);
#else
      cfg->afbc.chunk_size = 9;
      cfg->afbc.sparse = true;
      cfg->afbc.body_size = slice->afbc.body_size;
#endif

      cfg->afbc.header = surf.afbc.header;
      cfg->afbc.body = surf.afbc.body;

      if (image->layout.modifier & AFBC_FORMAT_MOD_YTR)
         cfg->afbc.yuv_transform_enable = true;
#endif
#if PAN_ARCH >= 10
   } else if (drm_is_afrc(image->layout.modifier)) {
      /* AFRC: block size and format are derived from the modifier; the
       * surface itself is described through the rgb section */
      struct pan_afrc_format_info finfo =
         panfrost_afrc_get_format_info(image->layout.format);

      cfg->writeback_mode = MALI_WRITEBACK_MODE_AFRC_RGB;
      cfg->afrc.block_size =
         GENX(pan_afrc_block_size)(image->layout.modifier, 0);
      cfg->afrc.format =
         GENX(pan_afrc_format)(finfo, image->layout.modifier, 0);

      cfg->rgb.base = surf.data;
      cfg->rgb.row_stride = row_stride;
      cfg->rgb.surface_stride = layer_stride;
#endif
   } else {
      /* Uncompressed: linear or 16x16 U-interleaved */
      assert(image->layout.modifier == DRM_FORMAT_MOD_LINEAR ||
             image->layout.modifier ==
                DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED);
      cfg->rgb.base = surf.data;
      cfg->rgb.row_stride = row_stride;
      cfg->rgb.surface_stride = layer_stride;
   }
}
#endif

/*
 * Pack a LOCAL_STORAGE descriptor at out from the thread-local (tls) and
 * workgroup-local (wls) storage described by info. Either kind of storage is
 * left unconfigured when its size is zero.
 */
void
GENX(pan_emit_tls)(const struct pan_tls_info *info, void *out)
{
   pan_pack(out, LOCAL_STORAGE, cfg) {
      if (info->tls.size) {
         cfg.tls_size = panfrost_get_stack_shift(info->tls.size);

#if PAN_ARCH >= 9
         /* For now, always use packed TLS addressing. This is
          * better for the cache and requires no fix up code in
          * the shader. We may need to revisit this someday for
          * OpenCL generic pointer support.
          */
         assert((info->tls.ptr & 4095) == 0);

         cfg.tls_address_mode = MALI_ADDRESS_MODE_PACKED;
         cfg.tls_base_pointer = info->tls.ptr >> 8;
#else
         cfg.tls_base_pointer = info->tls.ptr;
#endif
      }

      if (!info->wls.size) {
         cfg.wls_instances = MALI_LOCAL_STORAGE_NO_WORKGROUP_MEM;
      } else {
         /* The WLS buffer must be 4096-byte aligned and must not straddle a
          * 4GB boundary (first and last byte share their upper 32 bits) */
         assert(!(info->wls.ptr & 4095));
         assert((info->wls.ptr & 0xffffffff00000000ULL) ==
                ((info->wls.ptr + info->wls.size - 1) & 0xffffffff00000000ULL));

         unsigned adjusted_size = pan_wls_adjust_size(info->wls.size);

         cfg.wls_base_pointer = info->wls.ptr;
         cfg.wls_instances = info->wls.instances;
         cfg.wls_size_scale = util_logbase2(adjusted_size) + 1;
      }
   }
}

#if PAN_ARCH <= 5
/*
 * Pack the Midgard TILER_CONTEXT descriptor at out.
 *
 * When the tiler is disabled, the minimum header size is used and the heap
 * range is empty, starting and ending at the polygon list. Otherwise the
 * hierarchy mask and polygon list sizes are computed from the framebuffer
 * size and vertex count, and the heap range comes from the tiler context.
 */
static void
pan_emit_midgard_tiler(const struct pan_fb_info *fb,
                       const struct pan_tiler_context *tiler_ctx, void *out)
{
   const bool hierarchical = !tiler_ctx->midgard.no_hierarchical_tiling;

   assert(tiler_ctx->midgard.polygon_list);

   pan_pack(out, TILER_CONTEXT, cfg) {
      unsigned header_size;

      if (!tiler_ctx->midgard.disable) {
         cfg.hierarchy_mask = panfrost_choose_hierarchy_mask(
            fb->width, fb->height, tiler_ctx->midgard.vertex_count,
            hierarchical);
         header_size = panfrost_tiler_header_size(
            fb->width, fb->height, cfg.hierarchy_mask, hierarchical);
         cfg.polygon_list_size = panfrost_tiler_full_size(
            fb->width, fb->height, cfg.hierarchy_mask, hierarchical);
         cfg.heap_start = tiler_ctx->midgard.heap.start;
         cfg.heap_end = cfg.heap_start + tiler_ctx->midgard.heap.size;
      } else {
         header_size = MALI_MIDGARD_TILER_MINIMUM_HEADER_SIZE;

         if (hierarchical) {
            cfg.hierarchy_mask = MALI_MIDGARD_TILER_DISABLED;
            cfg.polygon_list_size = header_size;
         } else {
            /* Non-hierarchical mode uses 4 bytes past the header */
            cfg.hierarchy_mask = MALI_MIDGARD_TILER_USER;
            cfg.polygon_list_size = header_size + 4;
         }

         cfg.heap_start = tiler_ctx->midgard.polygon_list;
         cfg.heap_end = tiler_ctx->midgard.polygon_list;
      }

      /* The polygon list body directly follows its header */
      cfg.polygon_list = tiler_ctx->midgard.polygon_list;
      cfg.polygon_list_body = cfg.polygon_list + header_size;
   }
}
#endif

#if PAN_ARCH >= 5
/*
 * Pack a RENDER_TARGET descriptor for render target idx at out. All fields
 * are filled in by pan_prepare_rt(); cbuf_offset is forwarded to it
 * unchanged.
 */
static void
pan_emit_rt(const struct pan_fb_info *fb, unsigned layer_idx,
            unsigned idx, unsigned cbuf_offset, void *out)
{
   pan_pack(out, RENDER_TARGET, cfg) {
      pan_prepare_rt(fb, layer_idx, idx, cbuf_offset, &cfg);
   }
}

#if PAN_ARCH >= 6
/* Workaround for issue TSIX-2033, which affects all Bifrost and Valhall
 * GPUs:
 *
 *      Forcing clean_tile_writes breaks INTERSECT readbacks
 *
 * When clean tile writes are forced, replace the INTERSECT frame shader mode
 * with ALWAYS. INTERSECT is only a hint that the hardware may ignore, so the
 * substitution can cost performance but cannot affect correctness. Every
 * other mode is returned unchanged. */

static enum mali_pre_post_frame_shader_mode
pan_fix_frame_shader_mode(enum mali_pre_post_frame_shader_mode mode,
                          bool force_clean_tile)
{
   const bool needs_workaround =
      force_clean_tile && mode == MALI_PRE_POST_FRAME_SHADER_MODE_INTERSECT;

   return needs_workaround ? MALI_PRE_POST_FRAME_SHADER_MODE_ALWAYS : mode;
}

/* The hardware writes clean tiles, whatever clean_tile_write_enable says,
 * when the effective tile size differs from the superblock size of an
 * enabled AFBC render target. Report whether the given view triggers this.
 * Views of non-AFBC images never do. */

static bool
pan_force_clean_write_rt(const struct pan_image_view *rt, unsigned tile_size)
{
   const struct pan_image *image = pan_image_view_get_rt_image(rt);

   if (drm_is_afbc(image->layout.modifier)) {
      unsigned sb_width =
         panfrost_afbc_superblock_width(image->layout.modifier);

      assert(sb_width >= 16);
      assert(tile_size <= 16 * 16);

      /* They only match when both are 16x16 */
      return sb_width != 16 || tile_size != 16 * 16;
   }

   return false;
}

/*
 * Report whether any attachment that is written back (bound and not
 * discarded) forces clean tile writes at the given tile size. Colour render
 * targets, the depth/stencil view and the separate stencil view are checked.
 */
static bool
pan_force_clean_write(const struct pan_fb_info *fb, unsigned tile_size)
{
   /* Maximum tile size */
   assert(tile_size <= 16 * 16);

   for (unsigned rt = 0; rt < fb->rt_count; ++rt) {
      const struct pan_image_view *view = fb->rts[rt].view;

      if (!view || fb->rts[rt].discard)
         continue;

      if (pan_force_clean_write_rt(view, tile_size))
         return true;
   }

   const struct pan_image_view *zs = fb->zs.view.zs;
   const struct pan_image_view *s = fb->zs.view.s;

   if (zs && !fb->zs.discard.z && pan_force_clean_write_rt(zs, tile_size))
      return true;

   return s && !fb->zs.discard.s && pan_force_clean_write_rt(s, tile_size);
}

#endif

/*
 * Emit a multi-target framebuffer descriptor (PAN_ARCH >= 5) at out.
 *
 * The memory at out receives, in order: the FRAMEBUFFER descriptor, an
 * optional ZS_CRC_EXTENSION (when a depth/stencil view is bound or a CRC
 * render target was selected), then one RENDER_TARGET descriptor per render
 * target (at least one, even when fb->rt_count is 0).
 *
 * fb         framebuffer description
 * layer_idx  layer to render to
 * tls        local storage info (only emitted here on PAN_ARCH <= 5)
 * tiler_ctx  tiler context
 * out        destination for the descriptors
 *
 * Returns the first word of the packed FRAMEBUFFER_POINTER tag.
 *
 * Side effect: updates the crc_valid flags referenced by the render targets.
 * The selected CRC target becomes valid on a full-frame render, and every
 * other bound render target is marked invalid.
 */
unsigned
GENX(pan_emit_fbd)(const struct pan_fb_info *fb, unsigned layer_idx,
                   const struct pan_tls_info *tls,
                   const struct pan_tiler_context *tiler_ctx, void *out)
{
   void *fbd = out;
   /* Render targets follow the framebuffer descriptor (and the extension,
    * accounted for further down) */
   void *rtd = out + pan_size(FRAMEBUFFER);

#if PAN_ARCH <= 5
   GENX(pan_emit_tls)(tls, pan_section_ptr(fbd, FRAMEBUFFER, LOCAL_STORAGE));
#endif

   unsigned bytes_per_pixel = pan_cbuf_bytes_per_pixel(fb);
   unsigned tile_size =
      pan_select_max_tile_size(fb->tile_buf_budget, bytes_per_pixel);

   /* Clamp tile size to hardware limits */
   tile_size = MIN2(tile_size, 16 * 16);
   assert(tile_size >= 4 * 4);

   /* Colour buffer allocations must be 1K aligned. */
   unsigned cbuf_allocation = ALIGN_POT(bytes_per_pixel * tile_size, 1024);
   assert(cbuf_allocation <= fb->tile_buf_budget && "tile too big");

   int crc_rt = GENX(pan_select_crc_rt)(fb, tile_size);
   bool has_zs_crc_ext = (fb->zs.view.zs || fb->zs.view.s || crc_rt >= 0);

   pan_section_pack(fbd, FRAMEBUFFER, PARAMETERS, cfg) {
#if PAN_ARCH >= 6
      bool force_clean_write = pan_force_clean_write(fb, tile_size);

      cfg.sample_locations = fb->sample_positions;
      /* Frame shader modes go through the TSIX-2033 workaround */
      cfg.pre_frame_0 = pan_fix_frame_shader_mode(fb->bifrost.pre_post.modes[0],
                                                  force_clean_write);
      cfg.pre_frame_1 = pan_fix_frame_shader_mode(fb->bifrost.pre_post.modes[1],
                                                  force_clean_write);
      cfg.post_frame = pan_fix_frame_shader_mode(fb->bifrost.pre_post.modes[2],
                                                 force_clean_write);
      cfg.frame_shader_dcds = fb->bifrost.pre_post.dcds.gpu;
      cfg.tiler =
         PAN_ARCH >= 9 ? tiler_ctx->valhall.desc : tiler_ctx->bifrost.desc;
#endif
      cfg.width = fb->width;
      cfg.height = fb->height;
      cfg.bound_max_x = fb->width - 1;
      cfg.bound_max_y = fb->height - 1;

      cfg.effective_tile_size = tile_size;
      cfg.tie_break_rule = MALI_TIE_BREAK_RULE_MINUS_180_IN_0_OUT;
      cfg.render_target_count = MAX2(fb->rt_count, 1);

      /* Default to 24 bit depth if there's no surface. */
      cfg.z_internal_format =
         fb->zs.view.zs ? panfrost_get_z_internal_format(fb->zs.view.zs->format)
                        : MALI_Z_INTERNAL_FORMAT_D24;

      cfg.z_clear = fb->zs.clear_value.depth;
      cfg.s_clear = fb->zs.clear_value.stencil;
      cfg.color_buffer_allocation = cbuf_allocation;

      /* The force_samples setting dictates the sample-count that is used
       * for rasterization, and works like D3D11's ForcedSampleCount feature:
       *
       * - If force_samples == 0: Let nr_samples dictate sample count
       * - If force_samples == 1: force single-sampled rasterization
       * - If force_samples > 1: force multi-sampled rasterization
       *
       * This can be used to read SYSTEM_VALUE_SAMPLE_MASK_IN from the
       * fragment shader, even when performing single-sampled rendering.
       */
      if (!fb->force_samples) {
         cfg.sample_count = fb->nr_samples;
         cfg.sample_pattern = pan_sample_pattern(fb->nr_samples);
      } else if (fb->force_samples == 1) {
         cfg.sample_count = fb->nr_samples;
         cfg.sample_pattern = pan_sample_pattern(1);
      } else {
         cfg.sample_count = 1;
         cfg.sample_pattern = pan_sample_pattern(fb->force_samples);
      }

      cfg.z_write_enable = (fb->zs.view.zs && !fb->zs.discard.z);
      cfg.s_write_enable = (fb->zs.view.s && !fb->zs.discard.s);
      cfg.has_zs_crc_extension = has_zs_crc_ext;

      if (crc_rt >= 0) {
         bool *valid = fb->rts[crc_rt].crc_valid;
         /* Whether the render extent covers the whole framebuffer */
         bool full = !fb->extent.minx && !fb->extent.miny &&
                     fb->extent.maxx == (fb->width - 1) &&
                     fb->extent.maxy == (fb->height - 1);

         cfg.crc_read_enable = *valid;

         /* If the data is currently invalid, still write CRC
          * data if we are doing a full write, so that it is
          * valid for next time. */
         cfg.crc_write_enable = *valid || full;

         *valid |= full;
      }

#if PAN_ARCH >= 9
      cfg.point_sprite_coord_origin_max_y = fb->sprite_coord_origin;
      cfg.first_provoking_vertex = fb->first_provoking_vertex;

      /* internal_layer_index is used to select the right primitive list in the
       * tiler context, and frame_arg is the value that's passed to the fragment
       * shader through r62-r63, which we use to pass gl_Layer. Since the
       * layer_idx only takes 8-bits, we might use the extra 56-bits we have
       * in frame_argument to pass other information to the fragment shader at
       * some point. */
      cfg.internal_layer_index = layer_idx;
      cfg.frame_argument = layer_idx;
#endif
   }

#if PAN_ARCH >= 6
   pan_section_pack(fbd, FRAMEBUFFER, PADDING, padding)
      ;
#else
   pan_emit_midgard_tiler(fb, tiler_ctx,
                          pan_section_ptr(fbd, FRAMEBUFFER, TILER));

   /* All weights set to 0, nothing to do here */
   pan_section_pack(fbd, FRAMEBUFFER, TILER_WEIGHTS, w)
      ;
#endif

   /* The extension, when present, sits between the framebuffer descriptor
    * and the render targets */
   if (has_zs_crc_ext) {
      pan_emit_zs_crc_ext(fb, layer_idx, crc_rt, out + pan_size(FRAMEBUFFER));
      rtd += pan_size(ZS_CRC_EXTENSION);
   }

   unsigned rt_count = MAX2(fb->rt_count, 1);
   unsigned cbuf_offset = 0;
   for (unsigned i = 0; i < rt_count; i++) {
      pan_emit_rt(fb, layer_idx, i, cbuf_offset, rtd);
      rtd += pan_size(RENDER_TARGET);
      if (!fb->rts[i].view)
         continue;

      /* Advance the tile buffer offset past this render target */
      cbuf_offset += pan_bytes_per_pixel_tib(fb->rts[i].view->format) *
                     tile_size * pan_image_view_get_nr_samples(fb->rts[i].view);

      /* Every bound render target other than the CRC one has its CRC flag
       * cleared */
      if (i != crc_rt)
         *(fb->rts[i].crc_valid) = false;
   }

   struct mali_framebuffer_pointer_packed tag;
   pan_pack(tag.opaque, FRAMEBUFFER_POINTER, cfg) {
      cfg.zs_crc_extension_present = has_zs_crc_ext;
      cfg.render_target_count = MAX2(fb->rt_count, 1);
   }
   return tag.opaque[0];
}
#else /* PAN_ARCH == 4 */
/*
 * Return the single-target framebuffer colour writeback format used for a
 * raw (non-blendable) format of the given size in bits. Sizes without a
 * matching format are treated as unreachable.
 */
static enum mali_color_format
pan_sfbd_raw_format(unsigned bits)
{
   switch (bits) {
   case 16:
      return MALI_COLOR_FORMAT_1_16B_CHANNEL;
   case 32:
      return MALI_COLOR_FORMAT_1_32B_CHANNEL;
   case 48:
      return MALI_COLOR_FORMAT_3_16B_CHANNELS;
   case 64:
      return MALI_COLOR_FORMAT_2_32B_CHANNELS;
   case 96:
      return MALI_COLOR_FORMAT_3_32B_CHANNELS;
   case 128:
      return MALI_COLOR_FORMAT_4_32B_CHANNELS;
   default:
      unreachable("invalid raw bpp");
   }
}
/*
 * Emit a single-target framebuffer descriptor (PAN_ARCH == 4) at fbd.
 *
 * fb         framebuffer description; at most one render target is allowed
 * layer_idx  not used by this variant (surfaces are always fetched for
 *            layer 0)
 * tls        local storage info, emitted into the LOCAL_STORAGE section
 * tiler_ctx  tiler context, emitted into the TILER section
 * fbd        destination for the descriptor
 *
 * Always returns 0.
 */
unsigned
GENX(pan_emit_fbd)(const struct pan_fb_info *fb, unsigned layer_idx,
                   const struct pan_tls_info *tls,
                   const struct pan_tiler_context *tiler_ctx, void *fbd)
{
   assert(fb->rt_count <= 1);

   GENX(pan_emit_tls)(tls, pan_section_ptr(fbd, FRAMEBUFFER, LOCAL_STORAGE));
   pan_section_pack(fbd, FRAMEBUFFER, PARAMETERS, cfg) {
      cfg.bound_max_x = fb->width - 1;
      cfg.bound_max_y = fb->height - 1;
      cfg.dithering_enable = true;
      cfg.clean_pixel_write_enable = true;
      cfg.tie_break_rule = MALI_TIE_BREAK_RULE_MINUS_180_IN_0_OUT;
      if (fb->rts[0].clear) {
         cfg.clear_color_0 = fb->rts[0].clear_value[0];
         cfg.clear_color_1 = fb->rts[0].clear_value[1];
         cfg.clear_color_2 = fb->rts[0].clear_value[2];
         cfg.clear_color_3 = fb->rts[0].clear_value[3];
      }

      if (fb->zs.clear.z)
         cfg.z_clear = fb->zs.clear_value.depth;

      if (fb->zs.clear.s)
         cfg.s_clear = fb->zs.clear_value.stencil;

      /* Colour attachment, if one is actually bound */
      if (fb->rt_count && fb->rts[0].view) {
         const struct pan_image_view *rt = fb->rts[0].view;
         const struct pan_image *image = pan_image_view_get_rt_image(rt);

         const struct util_format_description *desc =
            util_format_description(rt->format);

         /* The swizzle for rendering is inverted from texturing */
         unsigned char swizzle[4];
         panfrost_invert_swizzle(desc->swizzle, swizzle);
         cfg.swizzle = panfrost_translate_swizzle_4(swizzle);

         struct pan_blendable_format fmt =
            *GENX(panfrost_blendable_format_from_pipe_format)(rt->format);

         if (fmt.internal) {
            cfg.internal_format = fmt.internal;
            cfg.color_writeback_format = fmt.writeback;
         } else {
            /* Construct RAW internal/writeback */
            unsigned bits = desc->block.bits;

            cfg.internal_format = MALI_COLOR_BUFFER_INTERNAL_FORMAT_RAW_VALUE;
            cfg.color_writeback_format = pan_sfbd_raw_format(bits);
         }

         unsigned level = rt->first_level;
         struct pan_surface surf;

         pan_iview_get_surface(rt, 0, 0, 0, &surf);

         cfg.color_write_enable = !fb->rts[0].discard;
         cfg.color_writeback.base = surf.data;
         cfg.color_writeback.row_stride =
            image->layout.slices[level].row_stride;

         /* Only uncompressed layouts are accepted here */
         cfg.color_block_format = mod_to_block_fmt(image->layout.modifier);
         assert(cfg.color_block_format == MALI_BLOCK_FORMAT_LINEAR ||
                cfg.color_block_format ==
                   MALI_BLOCK_FORMAT_TILED_U_INTERLEAVED);

         if (pan_image_view_has_crc(rt)) {
            const struct pan_image_slice_layout *slice =
               &image->layout.slices[level];

            cfg.crc_buffer.row_stride = slice->crc.stride;
            cfg.crc_buffer.base =
               image->data.base + image->data.offset + slice->crc.offset;
         }
      }

      /* Depth/stencil attachment */
      if (fb->zs.view.zs) {
         const struct pan_image_view *zs = fb->zs.view.zs;
         const struct pan_image *image = pan_image_view_get_zs_image(zs);
         unsigned level = zs->first_level;
         struct pan_surface surf;

         pan_iview_get_surface(zs, 0, 0, 0, &surf);

         cfg.zs_write_enable = !fb->zs.discard.z;
         cfg.zs_writeback.base = surf.data;
         cfg.zs_writeback.row_stride = image->layout.slices[level].row_stride;
         cfg.zs_block_format = mod_to_block_fmt(image->layout.modifier);
         assert(cfg.zs_block_format == MALI_BLOCK_FORMAT_LINEAR ||
                cfg.zs_block_format == MALI_BLOCK_FORMAT_TILED_U_INTERLEAVED);

         cfg.zs_format = translate_zs_format(zs->format);
      }

      cfg.sample_count = fb->nr_samples;

      /* mali_sampling_mode() dereferences the view, so only call it when a
       * colour view is actually bound. A render target slot may be counted
       * in rt_count and still have no view, which is why the colour setup
       * above tests the view too. Without a view the field keeps its
       * default. */
      if (fb->rt_count && fb->rts[0].view)
         cfg.msaa = mali_sampling_mode(fb->rts[0].view);
   }

   pan_emit_midgard_tiler(fb, tiler_ctx,
                          pan_section_ptr(fbd, FRAMEBUFFER, TILER));

   /* All weights set to 0, nothing to do here */
   pan_section_pack(fbd, FRAMEBUFFER, TILER_WEIGHTS, w)
      ;

   pan_section_pack(fbd, FRAMEBUFFER, PADDING_1, padding)
      ;
   pan_section_pack(fbd, FRAMEBUFFER, PADDING_2, padding)
      ;
   return 0;
}
#endif

#if PAN_ARCH <= 9
/*
 * Pack the PAYLOAD section of a FRAGMENT_JOB at out: the framebuffer
 * descriptor pointer, the render bounds, and (PAN_ARCH >= 5) the optional
 * tile enable map.
 */
void
GENX(pan_emit_fragment_job_payload)(const struct pan_fb_info *fb, mali_ptr fbd,
                                    void *out)
{
   pan_section_pack(out, FRAGMENT_JOB, PAYLOAD, payload) {
      payload.framebuffer = fbd;

      /* The framebuffer extent is shifted right by MALI_TILE_SHIFT to form
       * the bounds */
      payload.bound_min_x = fb->extent.minx >> MALI_TILE_SHIFT;
      payload.bound_max_x = fb->extent.maxx >> MALI_TILE_SHIFT;
      payload.bound_min_y = fb->extent.miny >> MALI_TILE_SHIFT;
      payload.bound_max_y = fb->extent.maxy >> MALI_TILE_SHIFT;

#if PAN_ARCH >= 5
      /* The tile enable map is only programmed when one is provided */
      if (fb->tile_map.base) {
         payload.tile_enable_map = fb->tile_map.base;
         payload.tile_enable_map_row_stride = fb->tile_map.stride;
         payload.has_tile_enable_map = true;
      }
#endif
   }
}
#endif
