/*
 * Copyright (C) 2019-2021 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors (Collabora):
 *    Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
 */

#include "compiler.h"

/* Bifrost texture operations have a `skip` bit, instructing helper invocations
 * to skip execution. Each clause has a `terminate_discarded_threads` bit,
 * which will terminate helper invocations.
 *
 * The terminate bit should be set on the last clause requiring helper
 * invocations. Without control flow, that's the last source-order instruction;
 * with control flow, there may be multiple such instructions (with ifs) or no
 * such instruction (with loops).
 *
 * The skip bit should be set unless the value of this instruction is required
 * by a future instruction requiring helper invocations. Consider:
 *
 *      0 = texture ...
 *      1 = fmul 0, #10
 *      2 = dfdx 1
 *      store 2
 *
 * Since the derivative calculation 2 requires helper invocations, the value 1
 * must be calculated by helper invocations, and since it depends on 0, 0 must
 * be calculated by helpers. Hence the texture op does NOT have the skip bit
 * set, and the clause containing the derivative has the terminate bit set.
 *
 * Calculating the terminate bit occurs by forward dataflow analysis to
 * determine which blocks require helper invocations. A block requires
 * invocations in if any of its instructions use helper invocations, or if it
 * depends on a block that requires invocation. With that analysis, the
 * terminate bit is set on the last instruction using invocations within any
 * block that does *not* require invocations out.
 *
 * Likewise, calculating the execute bit requires backward dataflow analysis
 * with union as the join operation and the generating set being the union of
 * sources of instructions writing executed values. The skip bit is the inverse
 * of the execute bit.
 */

static bool
bi_has_skip_bit(enum bi_opcode op)
{
   switch (op) {
   case BI_OPCODE_TEX_SINGLE:
   case BI_OPCODE_TEXC:
   case BI_OPCODE_TEXC_DUAL:
   case BI_OPCODE_TEXS_2D_F16:
   case BI_OPCODE_TEXS_2D_F32:
   case BI_OPCODE_TEXS_CUBE_F16:
   case BI_OPCODE_TEXS_CUBE_F32:
   case BI_OPCODE_VAR_TEX_F16:
   case BI_OPCODE_VAR_TEX_F32:
      return true;
   default:
      return false;
   }
}

/* Does a given instruction require helper threads to be active (because it
 * reads from other subgroup lanes)? This only applies to fragment shaders.
 * Other shader stages do not have a notion of helper threads. */

bool
bi_instr_uses_helpers(bi_instr *I)
{
   switch (I->op) {
   case BI_OPCODE_TEXC:
   case BI_OPCODE_TEXC_DUAL:
   case BI_OPCODE_TEXS_2D_F16:
   case BI_OPCODE_TEXS_2D_F32:
   case BI_OPCODE_TEXS_CUBE_F16:
   case BI_OPCODE_TEXS_CUBE_F32:
   case BI_OPCODE_VAR_TEX_F16:
   case BI_OPCODE_VAR_TEX_F32:
      return !I->lod_mode; /* set for zero, clear for computed */
   case BI_OPCODE_TEX_SINGLE:
      return (I->va_lod_mode == BI_VA_LOD_MODE_COMPUTED_LOD) ||
             (I->va_lod_mode == BI_VA_LOD_MODE_COMPUTED_BIAS);
   case BI_OPCODE_CLPER_I32:
   case BI_OPCODE_CLPER_OLD_I32:
      /* Fragment shaders require helpers to implement derivatives.
       * Other shader stages don't have helpers at all */
      return true;
   default:
      return false;
   }
}

static void
bi_add_branch_compare_values(const bi_instr *I, BITSET_WORD *deps)
{
   switch (I->op) {
   case BI_OPCODE_BRANCHZI:
   case BI_OPCODE_BRANCHC_I16:
   case BI_OPCODE_BRANCHC_I32:
      BITSET_SET(deps, I->src[0].value);
      break;
   case BI_OPCODE_BRANCH_F16:
   case BI_OPCODE_BRANCH_F32:
   case BI_OPCODE_BRANCH_I16:
   case BI_OPCODE_BRANCH_I32:
   case BI_OPCODE_BRANCH_S16:
   case BI_OPCODE_BRANCH_S32:
   case BI_OPCODE_BRANCH_U16:
   case BI_OPCODE_BRANCH_U32:
   case BI_OPCODE_BRANCHZ_F16:
   case BI_OPCODE_BRANCHZ_F32:
   case BI_OPCODE_BRANCHZ_I16:
   case BI_OPCODE_BRANCHZ_I32:
   case BI_OPCODE_BRANCHZ_S16:
   case BI_OPCODE_BRANCHZ_S32:
   case BI_OPCODE_BRANCHZ_U16:
   case BI_OPCODE_BRANCHZ_U32:
      BITSET_SET(deps, I->src[0].value);
      BITSET_SET(deps, I->src[1].value);
      break;
   default:
      break;
   }
}

/* Does a block use helpers directly */
static bool
bi_block_uses_helpers(bi_block *block)
{
   bi_foreach_instr_in_block(block, I) {
      if (bi_instr_uses_helpers(I))
         return true;
   }

   return false;
}

bool
bi_block_terminates_helpers(bi_block *block)
{
   /* Can't terminate if a successor needs helpers */
   bi_foreach_successor(block, succ) {
      if (succ->pass_flags & 1)
         return false;
   }

   /* Otherwise we terminate */
   return true;
}

/*
 * Propagate the pass flag up the control flow graph by performing depth-first
 * search on the directed control flow graph.
 */
static void
bi_propagate_pass_flag(bi_block *block)
{
   block->pass_flags = 1;

   bi_foreach_predecessor(block, pred) {
      if ((*pred)->pass_flags == 0)
         bi_propagate_pass_flag(*pred);
   }
}

void
bi_analyze_helper_terminate(bi_context *ctx)
{
   /* Other shader stages do not have a notion of helper threads, so we
    * can skip the analysis. Don't run for blend shaders, either, since
    * they run in the context of another shader that we don't see. */
   if (ctx->stage != MESA_SHADER_FRAGMENT || ctx->inputs->is_blend)
      return;

   /* Clear flags */
   bi_foreach_block(ctx, block)
      block->pass_flags = 0;

   /* For each block, check if it uses helpers and propagate that fact if
    * so. We walk in reverse order to minimize the number of blocks tested:
    * if the (unique) last block uses helpers, only that block is tested.
    */
   bi_foreach_block_rev(ctx, block) {
      if (block->pass_flags == 0 && bi_block_uses_helpers(block))
         bi_propagate_pass_flag(block);
   }
}

void
bi_mark_clauses_td(bi_context *ctx)
{
   if (ctx->stage != MESA_SHADER_FRAGMENT || ctx->inputs->is_blend)
      return;

   /* Finally, mark clauses requiring helpers */
   bi_foreach_block(ctx, block) {
      /* At the end, there are helpers iff we don't terminate */
      bool helpers = !bi_block_terminates_helpers(block);

      bi_foreach_clause_in_block_rev(block, clause) {
         bi_foreach_instr_in_clause_rev(block, clause, I) {
            helpers |= bi_instr_uses_helpers(I);
         }

         clause->td = !helpers;
      }
   }
}

static bool
bi_helper_block_update(BITSET_WORD *deps, bi_block *block)
{
   bool progress = false;

   bi_foreach_instr_in_block_rev(block, I) {
      /* If a destination is required by helper invocation... */
      bi_foreach_dest(I, d) {
         if (!BITSET_TEST(deps, I->dest[d].value))
            continue;

         /* ...so are the sources */
         bi_foreach_ssa_src(I, s) {
            progress |= !BITSET_TEST(deps, I->src[s].value);
            BITSET_SET(deps, I->src[s].value);
         }

         break;
      }
   }

   return progress;
}

void
bi_analyze_helper_requirements(bi_context *ctx)
{
   BITSET_WORD *deps = calloc(sizeof(BITSET_WORD), ctx->ssa_alloc);

   /* Initialize with the sources of instructions consuming
    * derivatives and the sources of conditional branch instructions */

   bi_foreach_instr_global(ctx, I) {
      if (bi_instr_uses_helpers(I)) {
         bi_foreach_ssa_src(I, s)
            BITSET_SET(deps, I->src[s].value);
      } else {
         bi_add_branch_compare_values(I, deps);
      }
   }

   /* Propagate that up */
   bool progress;
   do {
      progress = false;
      bi_foreach_block_rev(ctx, block)
         progress |= bi_helper_block_update(deps, block);
   } while (progress);

   /* Set the execute bits */

   bi_foreach_instr_global(ctx, I) {
      if (!bi_has_skip_bit(I->op))
         continue;

      bool exec = false;

      bi_foreach_dest(I, d)
         exec |= BITSET_TEST(deps, I->dest[d].value);

      I->skip = !exec;
   }

   free(deps);
}
