/*
 * Copyright © 2020 Valve Corporation
 * SPDX-License-Identifier: MIT
 */


#include "crashdec.h"


static void
dump_mem_pool_reg_write(unsigned reg, uint32_t data, unsigned context,
                        bool pipe)
{
   /* TODO deal better somehow w/ 64b regs: */
   struct regacc r = {
         .rnn = pipe ? rnn_pipe : NULL,
         .regbase = reg,
         .value = data,
   };
   if (pipe) {
      struct rnndecaddrinfo *info = rnn_reginfo(rnn_pipe, reg);
      printf("\t\twrite %s (%02x) pipe\n", info->name, reg);

      if (!strcmp(info->typeinfo->name, "void")) {
         /* registers that ignore their payload */
      } else {
         printf("\t\t\t");
         dump_register(&r);
      }
      rnn_reginfo_free(info);
   } else {
      printf("\t\twrite %s (%05x)", regname(reg, 1), reg);

      if (is_a6xx()) {
         printf(" context %d", context);
      }

      printf("\n");
      dump_register_val(&r, 2);
   }
}

static void
dump_mem_pool_chunk(const uint32_t *chunk)
{
   struct __attribute__((packed)) {
      bool reg0_enabled : 1;
      bool reg1_enabled : 1;
      uint32_t data0 : 32;
      uint32_t data1 : 32;
      uint32_t reg0 : 18;
      uint32_t reg1 : 18;
      bool reg0_pipe : 1;
      bool reg1_pipe : 1;
      uint32_t reg0_context : 1;
      uint32_t reg1_context : 1;
      uint32_t padding : 22;
   } fields;

   memcpy(&fields, chunk, 4 * sizeof(uint32_t));

   if (fields.reg0_enabled) {
      dump_mem_pool_reg_write(fields.reg0, fields.data0, fields.reg0_context,
                              fields.reg0_pipe);
   }

   if (fields.reg1_enabled) {
      dump_mem_pool_reg_write(fields.reg1, fields.data1, fields.reg1_context,
                              fields.reg1_pipe);
   }
}

void
dump_cp_mem_pool(uint32_t *mempool)
{
   /* The mem pool is a shared pool of memory used for storing in-flight
    * register writes. There are 6 different queues, one for each
    * cluster. Writing to $data (or for some special registers, $addr)
    * pushes data onto the appropriate queue, and each queue is pulled
    * from by the appropriate cluster. The queues are thus written to
    * in-order, but may be read out-of-order.
    *
    * The queues are conceptually divided into 128-bit "chunks", and the
    * read and write pointers are in units of chunks.  These chunks are
    * organized internally into 8-chunk "blocks", and memory is allocated
    * dynamically in terms of blocks. Each queue is represented as a
    * singly-linked list of blocks, as well as 3-bit start/end chunk
    * pointers that point within the first/last block.  The next pointers
    * are located in a separate array, rather than inline.
    */

   /* TODO: The firmware CP_MEM_POOL save/restore routines do something
    * like:
    *
    * cread $02, [ $00 + 0 ]
    * and $02, $02, 0x118
    * ...
    * brne $02, 0, #label
    * mov $03, 0x2000
    * mov $03, 0x1000
    * label:
    * ...
    *
    * I think that control register 0 is the GPU version, and some
    * versions have a smaller mem pool. It seems some models have a mem
    * pool that's half the size, and a bunch of offsets are shifted
    * accordingly. Unfortunately the kernel driver's dumping code doesn't
    * seem to take this into account, even the downstream android driver,
    * and we don't know which versions 0x8, 0x10, or 0x100 correspond
    * to. Or maybe we can use CP_DBG_MEM_POOL_SIZE to figure this out?
    */
   bool small_mem_pool = false;

   /* The array of next pointers for each block. */
   const uint32_t *next_pointers =
      small_mem_pool ? &mempool[0x800] : &mempool[0x1000];

   /* Maximum number of blocks in the pool, also the size of the pointers
    * array.
    */
   const int num_blocks = small_mem_pool ? 0x30 : 0x80;

   /* Number of queues */
   const unsigned num_queues = is_a6xx() ? 6 : 7;

   /* Unfortunately the per-queue state is a little more complicated than
    * a simple pair of begin/end pointers. Instead of a single beginning
    * block, there are *two*, with the property that either the two are
    * equal or the second is the "next" of the first. Similarly there are
    * two end blocks. Thus the queue either looks like this:
    *
    * A -> B -> ... -> C -> D
    *
    * Or like this, or some combination:
    *
    * A/B -> ... -> C/D
    *
    * However, there's only one beginning/end chunk offset. Now the
    * question is, which of A or B is the actual start? I.e. is the chunk
    * offset an offset inside A or B? It depends. I'll show a typical read
    * cycle, starting here (read pointer marked with a *) with a chunk
    * offset of 0:
    *
    *	  A                    B
    *  _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _
    * |_|_|_|_|_|_|_|_| -> |*|_|_|_|_|_|_|_| -> |_|_|_|_|_|_|_|_|
    *
    * Once the pointer advances far enough, the hardware decides to free
    * A, after which the read-side state looks like:
    *
    *	(free)                A/B
    *  _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _
    * |_|_|_|_|_|_|_|_|    |_|_|_|*|_|_|_|_| -> |_|_|_|_|_|_|_|_|
    *
    * Then after advancing the pointer a bit more, the hardware fetches
    * the "next" pointer for A and stores it in B:
    *
    *	(free)                 A                     B
    *  _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _
    * |_|_|_|_|_|_|_|_|    |_|_|_|_|_|_|_|*| -> |_|_|_|_|_|_|_|_|
    *
    * Then the read pointer advances into B, at which point we've come
    * back to the first state having advanced a whole block:
    *
    *	(free)                 A                     B
    *  _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _
    * |_|_|_|_|_|_|_|_|    |_|_|_|_|_|_|_|_| -> |*|_|_|_|_|_|_|_|
    *
    *
    * There is a similar cycle for the write pointer. Now, the question
    * is, how do we know which state we're in? We need to know this to
    * know whether the pointer (*) is in A or B if they're different. It
    * seems like there should be some bit somewhere describing this, but
    * after lots of experimentation I've come up empty-handed. For now we
    * assume that if the pointer is in the first half, then we're in
    * either the first or second state and use B, and otherwise we're in
    * the second or third state and use A. So far I haven't seen anything
    * that violates this assumption.
    */

   struct {
      uint32_t unk0;
      uint32_t padding0[7]; /* Mirrors of unk0 */

      struct {
         uint32_t chunk : 3;
         uint32_t first_block : 32 - 3;
      } writer[6];
      uint32_t padding1[2]; /* Mirror of writer[5] */

      uint32_t unk1;
      uint32_t padding2[7]; /* Mirrors of unk1 */

      uint32_t writer_second_block[7];
      uint32_t padding3[1];

      uint32_t unk2[7];
      uint32_t padding4[1];

      struct {
         uint32_t chunk : 3;
         uint32_t first_block : 32 - 3;
      } reader[7];
      uint32_t padding5[1]; /* Mirror of reader[5] */

      uint32_t unk3;
      uint32_t padding6[7]; /* Mirrors of unk3 */

      uint32_t reader_second_block[7];
      uint32_t padding7[1];

      uint32_t block_count[7];
      uint32_t padding[1];

      uint32_t unk4;
      uint32_t padding9[7]; /* Mirrors of unk4 */
   } data1;

   const uint32_t *data1_ptr =
      small_mem_pool ? &mempool[0xc00] : &mempool[0x1800];
   memcpy(&data1, data1_ptr, sizeof(data1));

   /* Based on the kernel, the first dword is the mem pool size (in
    * blocks?) and mirrors CP_MEM_POOL_DBG_SIZE.
    */
   const uint32_t *data2_ptr =
      small_mem_pool ? &mempool[0x1000] : &mempool[0x2000];
   const int data2_size = 0x60;

   /* This seems to be the size of each queue in chunks. */
   const uint32_t *queue_sizes = &data2_ptr[0x18];

   printf("\tdata2:\n");
   dump_hex_ascii(data2_ptr, 4 * data2_size, 1);

   /* These seem to be some kind of counter of allocated/deallocated blocks */
   if (verbose) {
      printf("\tunk0: %x\n", data1.unk0);
      printf("\tunk1: %x\n", data1.unk1);
      printf("\tunk3: %x\n", data1.unk3);
      printf("\tunk4: %x\n\n", data1.unk4);
   }

   for (int queue = 0; queue < num_queues; queue++) {
      const char *cluster_names_a6xx[6] = {"FE",   "SP_VS", "PC_VS",
                                           "GRAS", "SP_PS", "PS"};
      const char *cluster_names_a7xx[7] = {"FE",   "SP_VS", "PC_VS",
                                           "GRAS", "SP_PS", "VPC_PS", "PS"};
      printf("\tCLUSTER_%s:\n\n",
             is_a6xx() ? cluster_names_a6xx[queue] : cluster_names_a7xx[queue]);

      if (verbose) {
         printf("\t\twriter_first_block: 0x%x\n",
                data1.writer[queue].first_block);
         printf("\t\twriter_second_block: 0x%x\n",
                data1.writer_second_block[queue]);
         printf("\t\twriter_chunk: %d\n", data1.writer[queue].chunk);
         printf("\t\treader_first_block: 0x%x\n",
                data1.reader[queue].first_block);
         printf("\t\treader_second_block: 0x%x\n",
                data1.reader_second_block[queue]);
         printf("\t\treader_chunk: %d\n", data1.reader[queue].chunk);
         printf("\t\tblock_count: %d\n", data1.block_count[queue]);
         printf("\t\tunk2: 0x%x\n", data1.unk2[queue]);
         printf("\t\tqueue_size: %d\n\n", queue_sizes[queue]);
      }

      uint32_t cur_chunk = data1.reader[queue].chunk;
      uint32_t cur_block = cur_chunk > 3 ? data1.reader[queue].first_block
                                         : data1.reader_second_block[queue];
      uint32_t last_chunk = data1.writer[queue].chunk;
      uint32_t last_block = last_chunk > 3 ? data1.writer[queue].first_block
                                           : data1.writer_second_block[queue];

      if (verbose)
         printf("\tblock %x\n", cur_block);
      if (cur_block >= num_blocks) {
         fprintf(stderr, "block %x too large\n", cur_block);
         exit(1);
      }
      unsigned calculated_queue_size = 0;
      while (cur_block != last_block || cur_chunk != last_chunk) {
         calculated_queue_size++;
         uint32_t *chunk_ptr = &mempool[cur_block * 0x20 + cur_chunk * 4];

         dump_mem_pool_chunk(chunk_ptr);

         printf("\t%05x: %08x %08x %08x %08x\n",
                4 * (cur_block * 0x20 + cur_chunk + 4), chunk_ptr[0],
                chunk_ptr[1], chunk_ptr[2], chunk_ptr[3]);

         cur_chunk++;
         if (cur_chunk == 8) {
            cur_block = next_pointers[cur_block];
            if (verbose)
               printf("\tblock %x\n", cur_block);
            if (cur_block >= num_blocks) {
               fprintf(stderr, "block %x too large\n", cur_block);
               exit(1);
            }
            cur_chunk = 0;
         }
      }
      if (calculated_queue_size != queue_sizes[queue]) {
         printf("\t\tCALCULATED SIZE %d DOES NOT MATCH!\n",
                calculated_queue_size);
      }
      printf("\n");
   }
}

