/*
 * Copyright © 2024 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "executor.h"

/*
 * VG(x) guards Valgrind-only code: when HAVE_VALGRIND is defined the
 * Valgrind headers are included and VG(x) expands to x itself; otherwise
 * VG(x) expands to the no-op expression ((void)0) and x is discarded.
 */
#ifdef HAVE_VALGRIND
#include <valgrind.h>
#include <memcheck.h>
#define VG(x) x
#else
#define VG(x) ((void)0)
#endif

/*
 * These must be defined before the genxml pack header below is included.
 * NOTE(review): presumably the generated pack functions use them as the
 * address type, the address-combining hook and the type behind their
 * first ("user data") argument -- confirm against genX_pack.h.
 */
#define __gen_address_type executor_address
#define __gen_combine_address executor_combine_address
#define __gen_user_data void

#include "intel/genxml/gen_macros.h"
#include "intel/genxml/genX_pack.h"

/* Token-pasting helpers that derive the <cmd>_length, <cmd>_header and
 * <cmd>_pack identifiers from a command name.
 */
#define __executor_cmd_length(cmd) cmd ## _length
#define __executor_cmd_header(cmd) cmd ## _header
#define __executor_cmd_pack(cmd) cmd ## _pack

/*
 * executor_batch_emit(cmd, name) { ...fill in name.Field...; }
 *
 * Emits one command into the batch.  The macro expands to a `for` statement
 * whose body is the statement that follows the macro invocation:
 *
 *  - The init clause declares `struct cmd name`, initialized from the
 *    command's header, and `_dst`, a pointer to `<cmd>_length * 4` bytes
 *    obtained from executor_alloc_bytes() on `ec->bo.batch`.  A variable
 *    named `ec` must therefore be in scope wherever the macro is used.
 *  - The condition is `_dst != NULL`, so if the allocation yields NULL the
 *    body is skipped and nothing is packed.
 *  - The iteration expression runs after the body: it packs `name` into
 *    `_dst` (passing 0 as the pack function's first argument), optionally
 *    asks Valgrind to check that the packed bytes are defined, and then
 *    sets `_dst` to NULL, which makes the loop terminate after exactly one
 *    pass.
 *
 * When no fields need to be set, the invocation must still be followed by
 * a statement -- a lone `;` serves as the empty loop body.
 *
 * Relies on the GNU statement-expression and __builtin_expect extensions.
 */
#define executor_batch_emit(cmd, name)                                               \
   for (struct cmd name = { __executor_cmd_header(cmd) },                            \
        *_dst = executor_alloc_bytes(&ec->bo.batch, __executor_cmd_length(cmd) * 4); \
        __builtin_expect(_dst != NULL, 1);                                           \
        ({ __executor_cmd_pack(cmd)(0, _dst, &name);                                 \
           VG(VALGRIND_CHECK_MEM_IS_DEFINED(_dst, __executor_cmd_length(cmd) * 4));  \
           _dst = NULL;                                                              \
         }))

/*
 * Append a PIPE_CONTROL command to the batch of `ec`.
 *
 * The command requests a command streamer stall and a pipe control flush;
 * on GFX_VER >= 12 it additionally enables the HDC pipeline flush.
 */
static void
emit_pipe_control(executor_context *ec)
{
   executor_batch_emit(GENX(PIPE_CONTROL), flush) {
      flush.CommandStreamerStallEnable = true;
      flush.PipeControlFlushEnable     = true;
#if GFX_VER >= 12
      flush.HDCPipelineFlushEnable     = true;
#endif
   }
}

/*
 * Append a STATE_BASE_ADDRESS command to the batch of `ec`.
 *
 * The general, dynamic, instruction and indirect-object regions are all
 * programmed identically: a zero base address, a buffer size of 0xfffff,
 * and both the address and size "modify enable" bits set.  Every MOCS
 * field in the command is set to the caller-supplied `mocs`.
 */
static void
emit_state_base_address(executor_context *ec, uint32_t mocs)
{
   /* Use the full address for everything. */
   const executor_address zero_base = {0};
   const uint32_t max_size          = 0xfffff; /* (1 << 20) - 1 */

   executor_batch_emit(GENX(STATE_BASE_ADDRESS), state) {
      /* Base addresses. */
      state.GeneralStateBaseAddress   = zero_base;
      state.DynamicStateBaseAddress   = zero_base;
      state.InstructionBaseAddress    = zero_base;
      state.IndirectObjectBaseAddress = zero_base;

      state.GeneralStateBaseAddressModifyEnable   = true;
      state.DynamicStateBaseAddressModifyEnable   = true;
      state.InstructionBaseAddressModifyEnable    = true;
      state.IndirectObjectBaseAddressModifyEnable = true;

      /* Buffer sizes. */
      state.GeneralStateBufferSize   = max_size;
      state.DynamicStateBufferSize   = max_size;
      state.InstructionBufferSize    = max_size;
      state.IndirectObjectBufferSize = max_size;

      state.GeneralStateBufferSizeModifyEnable   = true;
      state.DynamicStateBufferSizeModifyEnable   = true;
      state.InstructionBuffersizeModifyEnable    = true;
      state.IndirectObjectBufferSizeModifyEnable = true;

      /* Memory object control state, identical for every field. */
      state.GeneralStateMOCS            = mocs;
      state.DynamicStateMOCS            = mocs;
      state.InstructionMOCS             = mocs;
      state.IndirectObjectMOCS          = mocs;
      state.SurfaceStateMOCS            = mocs;
      state.StatelessDataPortAccessMOCS = mocs;
#if GFX_VER >= 11
      state.BindlessSamplerStateMOCS    = mocs;
#endif
      state.BindlessSurfaceStateMOCS    = mocs;

#if GFX_VERx10 >= 125
      state.L1CacheControl = L1CC_WB;
#endif
   }
}

/*
 * Build the batch that runs the kernel described by `params` once.
 *
 * The kernel binary (params->kernel_bin, params->kernel_size bytes) is
 * copied into ec->bo.extra.  Commands are then written to ec->bo.batch,
 * starting at a 256-aligned allocation whose offset is recorded in
 * ec->batch_start.  The emitted sequence is:
 *
 *   PIPE_CONTROL
 *   PIPELINE_SELECT (GPGPU) + PIPE_CONTROL       (GFX_VERx10 < 200 only)
 *   STATE_BASE_ADDRESS
 *   STATE_COMPUTE_MODE + CFE_STATE               (GFX_VERx10 >= 125)
 *     or MEDIA_VFE_STATE                         (older)
 *   PIPE_CONTROL
 *   COMPUTE_WALKER                               (GFX_VERx10 >= 125)
 *     or MEDIA_INTERFACE_DESCRIPTOR_LOAD + GPGPU_WALKER + MEDIA_STATE_FLUSH
 *   PIPE_CONTROL
 *   MI_BATCH_BUFFER_END
 *
 * A single 1x1x1 thread group with one thread is dispatched.
 */
void
genX(emit_execute)(executor_context *ec, const executor_params *params)
{
   /* Upload the kernel and remember where it lives. */
   uint32_t *kernel = executor_alloc_bytes(&ec->bo.extra, params->kernel_size);
   memcpy(kernel, params->kernel_bin, params->kernel_size);
   executor_address kernel_addr = executor_address_of_ptr(&ec->bo.extra, kernel);

   /* TODO: Let SIMD be a parameter. */

   struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = {
      .KernelStartPointer = kernel_addr.offset,
      .NumberofThreadsinGPGPUThreadGroup = 1,
   };

   /* Zero-sized, 256-aligned allocation: only used to obtain the aligned
    * position at which the commands below begin.
    */
   void *b = executor_alloc_bytes_aligned(&ec->bo.batch, 0, 256);
   ec->batch_start = executor_address_of_ptr(&ec->bo.batch, b).offset;

   emit_pipe_control(ec);

#if GFX_VERx10 < 200
   executor_batch_emit(GENX(PIPELINE_SELECT), ps) {
      ps.PipelineSelection = GPGPU;
      ps.MaskBits = 0x3;
   }
   emit_pipe_control(ec);
#endif

   const uint32_t mocs = isl_mocs(ec->isl_dev, 0, false);

   emit_state_base_address(ec, mocs);

#if GFX_VERx10 >= 125
   executor_batch_emit(GENX(STATE_COMPUTE_MODE), cm) {
      cm.Mask1 = 0xffff;
#if GFX_VERx10 >= 200
      cm.Mask2 = 0xffff;
#endif
   }

   executor_batch_emit(GENX(CFE_STATE), cfe) {
      cfe.MaximumNumberofThreads = 64;
   }
#else
   executor_batch_emit(GENX(MEDIA_VFE_STATE), vfe) {
      vfe.NumberofURBEntries = 2;
      vfe.MaximumNumberofThreads = 64;
   }
#endif

   emit_pipe_control(ec);

#if GFX_VERx10 >= 125
   /* The interface descriptor is embedded directly in the walker. */
   executor_batch_emit(GENX(COMPUTE_WALKER), cw) {
#if GFX_VERx10 >= 200
      cw.SIMDSize                = 1;
      cw.MessageSIMD             = 1;
#endif
      cw.ThreadGroupIDXDimension = 1;
      cw.ThreadGroupIDYDimension = 1;
      cw.ThreadGroupIDZDimension = 1;
      cw.ExecutionMask           = 0xFFFFFFFF;
      cw.PostSync.MOCS           = mocs;
      cw.InterfaceDescriptor     = desc;
   }
#else
   /* The interface descriptor is packed into ec->bo.extra and loaded by
    * offset.  The same byte count is used for the allocation and for the
    * total length reported to the hardware.
    */
   const uint32_t idd_size = 8 * 4;

   uint32_t *idd = executor_alloc_bytes_aligned(&ec->bo.extra, idd_size, 256);
   GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, idd, &desc);

   executor_address idd_addr = executor_address_of_ptr(&ec->bo.extra, idd);

   executor_batch_emit(GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), load) {
      load.InterfaceDescriptorDataStartAddress = idd_addr.offset;
      load.InterfaceDescriptorTotalLength      = idd_size;
   }

   executor_batch_emit(GENX(GPGPU_WALKER), gw) {
      gw.ThreadGroupIDXDimension = 1;
      gw.ThreadGroupIDYDimension = 1;
      gw.ThreadGroupIDZDimension = 1;
      gw.RightExecutionMask      = 0xFFFFFFFF;
      gw.BottomExecutionMask     = 0xFFFFFFFF;
   }

   /* No fields to set: the ';' is the (empty) body of the emit loop. */
   executor_batch_emit(GENX(MEDIA_STATE_FLUSH), msf);
#endif

   emit_pipe_control(ec);

   /* No fields to set: the ';' is the (empty) body of the emit loop. */
   executor_batch_emit(GENX(MI_BATCH_BUFFER_END), end);
}
