/*
 * Copyright © 2022 Google, Inc.
 * Copyright © 2022 Valve Corporation
 * SPDX-License-Identifier: MIT
 */

#include "util/macros.h"
#include "crashdec.h"
#include "cffdec.h"

#define MAX_PREFETCH_IBS 4

/* CP_INDIRECT_BUFFER contains an optimization to read ahead and start
 * fetching up to 3 subsequent CP_INDIRECT_BUFFER contents into the ROQ before
 * starting to execute the current IB. This effectively combines them into one
 * CP_INDIRECT_BUFFER. The result is that if the ROQ is fast enough and
 * prefetches some of the extra IBs before the first IB finishes, the ROQ may
 * be in a different IB than the CP is processing. That is, normally we'd have
 * a situation like this:
 *
 *    CP_INDIRECT_BUFFER
 *       ...
 *       CP_FOO <- PFP/SQE is reading from here
 *       ...
 *       CP_BAR <- ROQ has prefetched up to here
 *
 * where CP_IB*_BASE and CP_IB*_REM_SIZE point to CP_BAR and the difference
 * between CP_FOO and CP_BAR is given by CP_ROQ_AVAIL_IBn::REM, but instead we
 * may get a situation like this:
 *
 *   CP_INDIRECT_BUFFER
 *      ...
 *      CP_FOO <- PFP/SQE is reading here
 *      ...
 *   CP_INDIRECT_BUFFER
 *      ...
 *      CP_BAR <- ROQ has prefetched up to here
 *
 * in this case, the "rem" we get with CP_ROQ_AVAIL_IBn::REM added will be
 * larger than the size of the second IB, indicating that we need to back up
 * to the IB before it. This can theoretically even happen recursively with
 * IB2:
 *
 * CP_INDIRECT_BUFFER:
 *    ...
 *    CP_INDIRECT_BUFFER:
 *       ...
 *       CP_FOO <- PFP/SQE IB2 is reading here
 *       ...
 * CP_INDIRECT_BUFFER:
 *    CP_INDIRECT_BUFFER:
 *       ...
 *       CP_BAR <- ROQ IB2 has prefetched up to here
 *       ...
 * CP_BAZ <- PFP/SQE IB1 is reading here
 *
 * Here the ROQ has prefetched the second IB1, then when processing the IB2 at
 * the end of the first IB1 it peeks ahead in ROQ and sees another IB2 right
 * afterward in the second IB1 and starts prefetching that too, so that the
 * ROQ is in a different IB1 *and* IB2 from the CP.
 *
 * To account for this when locating the position that the SQE was at in the
 * cmdstream at the time of the crash, we do a pre-pass scanning the
 * CP_INDIRECT_BUFFER packets, keeping a history of previous IB's so that we
 * can backtrack (because CP_IBn_BASE can be several IB's ahead of SQE).  Once
 * we find the IB1 position that is being read into ROQ, we backtrack until
 * we find the IB1 position that SQE is at, and (roughly) repeat the process
 * in IB2.  This has one calculation in that we need to start scanning for the
 * CP_INDIRECT_BUFFER to IB2 from before the detected IB1 position.
 */

struct ib {
   uint64_t ibaddr;
   uint32_t ibsize;
};

struct prefetch_state {
   struct ib history[MAX_PREFETCH_IBS];
   unsigned num, next;
};

static void
push_ib(struct prefetch_state *s, struct ib *ib)
{
   s->history[s->next++ % ARRAY_SIZE(s->history)] = *ib;
   s->num = MIN2(s->num + 1, ARRAY_SIZE(s->history));
}

static struct ib *
get_ib(struct prefetch_state *s, int n)
{
   if ((n >= s->num) || (n < 0))
      return NULL;
   int idx = s->next - (s->num - n);
   return &s->history[idx % ARRAY_SIZE(s->history)];
}

static void
reset_state(struct prefetch_state *s)
{
   s->num = s->next = 0;
}

/**
 * Once we find the ROQ prefetch position, work backwards to find the SQE
 * position.
 */
static struct ib *
reverse_prefetch(struct prefetch_state *s, int lvl)
{
   unsigned rem = options.ibs[lvl].rem;

   for (int n = s->num - 1; n >= 0; n--) {
      struct ib *ib = get_ib(s, n);
      if (ib->ibsize > rem) {
         options.ibs[lvl].crash_found = 1;
         options.ibs[lvl].base = ib->ibaddr;
         options.ibs[lvl].rem = rem;

         return ib;
      }
      rem -= ib->ibsize;
   }

   return NULL;
}

/**
 * Scan cmdstream looking for CP_INDIRECT_BUFFER packets, tracking history
 * of consecutive CP_INDIRECT_BUFFER packets, until we find the one that
 * matches CP_IBn_BASE.
 */
static struct ib *
scan_cmdstream(struct prefetch_state *s, int lvl, uint32_t *dwords, uint32_t sizedwords)
{
   int dwords_left = sizedwords;
   uint32_t count = 0; /* dword count including packet header */
   uint32_t val;

   while (dwords_left > 0) {
      if (pkt_is_opcode(dwords[0], &val, &count)) {
         if (!strcmp(pktname(val), "CP_INDIRECT_BUFFER")) {
            uint64_t ibaddr;
            uint32_t ibsize;

            parse_cp_indirect(&dwords[1], count - 1, &ibaddr, &ibsize);
            push_ib(s, &(struct ib){ ibaddr, ibsize });

            /* If we've found the IB indicated by CP_IBn_BASE, then we can
             * search backwards from here to find the SQE position:
             */
            if (ibaddr == options.ibs[lvl].base)
               return reverse_prefetch(s, lvl);

            goto next_pkt;
         }
      } else if (pkt_is_regwrite(dwords[0], &val, &count)) {
      } else {
         count = find_next_packet(dwords, dwords_left);
      }

      /* prefetch only happens across consecutive CP_INDIRECT_BUFFER, so
       * any other packet resets the state:
       */
      reset_state(s);

next_pkt:
      dwords += count;
      dwords_left -= count;
   }

   return NULL;
}

void
handle_prefetch(uint32_t *dwords, uint32_t sizedwords)
{
   struct prefetch_state rb_state = {};
   struct ib *ib1 = scan_cmdstream(&rb_state, 1, dwords, sizedwords);

   if (!ib1)
      return;

   /* If the gpu crashed in IB1, we can skip the rest: */
   if (!options.ibs[2].rem)
      return;

   struct prefetch_state ib1_state = {};

   /* Once we find the actual IB1 position, we need to find the IB2 position.
    * But because IB2 prefetch can span IB1 CP_INDIRECT_BUFFER targets.  But
    * there are a limited # of buffers that can be prefetched, and we already
    * have a history of enough  RB->IB1 IB's, so we can simply scan forward
    * from our oldest history entry until we find the IB2 match..
    */
   for (int n = 0; n < rb_state.num; n++) {
      struct ib *ib = get_ib(&rb_state, n);
      uint32_t *ibaddr = hostptr(ib->ibaddr);
      if (!ibaddr)
         break;
      struct ib *ib2 = scan_cmdstream(&ib1_state, 2, ibaddr, ib->ibsize);

      /* If the crash happens in IB2, but IB1 has a sequence of CP_INDIRECT_BUFFER's
       * then IB1 could actually be further ahead than IB2, ie:
       *
       *    IB1:CP_INDIRECT_BUFFER
       *        IB2: .. crash somewhere in here ..
       *    IB1:CP_INDIRECT_BUFFER
       *    IB1:CP_INDIRECT_BUFFER  <-- detected IB1 position
       *
       * Our logic for detecting the correct IB1 position is not incorrect.
       * It is just that SQE has already consumed some additional IB's.  So
       * reset the IB1 crash position back to the oldest RB->IB1 IB that we
       * remember.
       *
       * This isn't *quite* correct, but cffdec will only mark the crash when
       * it finds the location in IB2 if we've determined that the crash is
       * in IB2, but will only consider the address in IB2 if it has seen the
       * IB1 base.
       *
       * The main case we are trying to account for here is GMEM mode crash in
       * IB2 which *isn't* the first bin/tile.  Ie. the crash happens later
       * than the first time we encounter the IB2 crash address.
       *
       * This approach works in practice because there will be some other pkts
       * in IB1 to setup for the next tile, breaking up prefetch.
       */
      if (ib2) {
         assert(options.ibs[2].crash_found);
         struct ib *first_rb_ib = get_ib(&rb_state, 0);

         options.ibs[1].base = first_rb_ib->ibaddr;
         options.ibs[1].rem = first_rb_ib->ibsize;

         break;
      }

      if (ib == ib1)
         break;
   }
}
