/*
 * Copyright © 2022 Imagination Technologies Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#include "pvr_device_info.h"
#include "pvr_pds.h"
#include "pvr_rogue_pds_defs.h"
#include "pvr_rogue_pds_disasm.h"
#include "pvr_rogue_pds_encode.h"
#include "util/log.h"
#include "util/macros.h"

/* Extract the upper (H32) or lower (L32) 32 bits of a 64-bit value.
 *
 * The operand is widened to uint64_t before shifting so that passing a
 * narrower integer never shifts by the full width of its type (which would be
 * undefined behaviour), and the whole expansion is parenthesised so the macros
 * compose safely inside larger expressions.
 */
#define H32(X) ((uint32_t)(((uint64_t)(X) >> 32U) & 0xFFFFFFFFUL))
#define L32(X) ((uint32_t)((uint64_t)(X) & 0xFFFFFFFFUL))

/*****************************************************************************
 Macro definitions
*****************************************************************************/

/* NOTE(review): presumably log2 of the dword size in bytes, for converting
 * between byte and dword units - confirm against its users.
 */
#define PVR_PDS_DWORD_SHIFT 2

/* Allocation ranges checked by pvr_pds_get_constants() and
 * pvr_pds_get_temps(): constant allocations must end at or below
 * PVR_PDS_CONSTANTS_BLOCK_SIZE, temp allocations at or below
 * PVR_PDS_TEMPS_BLOCK_BASE + PVR_PDS_TEMPS_BLOCK_SIZE.
 */
#define PVR_PDS_CONSTANTS_BLOCK_BASE 0
#define PVR_PDS_CONSTANTS_BLOCK_SIZE 128
#define PVR_PDS_TEMPS_BLOCK_BASE 128
#define PVR_PDS_TEMPS_BLOCK_SIZE 32

/* Maximum ST/LD transfer counts, defined as aliases of the corresponding
 * instruction field masks.
 */
#define PVR_ROGUE_PDSINST_ST_COUNT4_MAX_SIZE PVR_ROGUE_PDSINST_ST_COUNT4_MASK
#define PVR_ROGUE_PDSINST_LD_COUNT8_MAX_SIZE PVR_ROGUE_PDSINST_LD_COUNT8_MASK

/* Map PDS temp registers to the CDM values they contain. Work-group IDs are
 * only available in the coefficient sync task.
 */
#define PVR_PDS_CDM_WORK_GROUP_ID_X 0
#define PVR_PDS_CDM_WORK_GROUP_ID_Y 1
#define PVR_PDS_CDM_WORK_GROUP_ID_Z 2
/* Local IDs are available in every task. */
#define PVR_PDS_CDM_LOCAL_ID_X 0
#define PVR_PDS_CDM_LOCAL_ID_YZ 1

/* DOUTW dword selectors. These are used as indices into dword_mask_const[];
 * PVR_PDS_DOUTW_MAXMASK is the number of selectors (the table size).
 */
#define PVR_PDS_DOUTW_LOWER32 0x0
#define PVR_PDS_DOUTW_UPPER32 0x1
#define PVR_PDS_DOUTW_LOWER64 0x2
#define PVR_PDS_DOUTW_LOWER128 0x3
#define PVR_PDS_DOUTW_MAXMASK 0x4

#define ROGUE_PDS_FIXED_PIXEL_SHADER_DATA_SIZE 8U
#define PDS_ROGUE_TA_STATE_PDS_ADDR_ALIGNSIZE (16U)

/*****************************************************************************
 Static variables
*****************************************************************************/

/* DOUTW SRC1 BSIZE field value for each PVR_PDS_DOUTW_* selector. Indexed by
 * the dword_mask argument of pvr_pds_encode_doutw_src1().
 */
static const uint32_t dword_mask_const[PVR_PDS_DOUTW_MAXMASK] = {
   /* [PVR_PDS_DOUTW_LOWER32] */
   PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_BSIZE_LOWER,
   /* [PVR_PDS_DOUTW_UPPER32] */
   PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_BSIZE_UPPER,
   /* [PVR_PDS_DOUTW_LOWER64] */
   PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_BSIZE_ALL64,
   /* [PVR_PDS_DOUTW_LOWER128] - same BSIZE value as LOWER64. */
   PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_BSIZE_ALL64
};

/* DOUTW SRC1 cache-mode field values, indexed as [row][cached ? 1 : 0].
 *
 * pvr_pds_encode_doutw_src1() selects row 0 when the slc_mcu_cache_controls
 * feature is present and row 1 (all zeroes) otherwise. Within a row, entry 0
 * is the bypass value and entry 1 the cached value.
 */
static const uint32_t cache_control_const[2][2] = {
   { PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_CMODE_BYPASS,
     PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_CMODE_CACHED },
   { 0, 0 }
};

/*****************************************************************************
 Function definitions
*****************************************************************************/

/**
 * Encodes the 64-bit SRC0 operand of a PDS LD instruction.
 *
 * \param dest Destination register; masked with
 *             PVR_ROGUE_PDSINST_REGS64TP_MASK.
 * \param count8 Transfer count; masked with PVR_ROGUE_PDSINST_LD_COUNT8_MASK.
 * \param src_add Source address; masked with PVR_ROGUE_PDSINST_LD_SRCADD_MASK.
 * \param cached Selects the CACHED rather than the BYPASS cache mode(s).
 * \param dev_info PVR device info structure.
 * \returns The encoded SRC0 value.
 */
uint64_t pvr_pds_encode_ld_src0(uint64_t dest,
                                uint64_t count8,
                                uint64_t src_add,
                                bool cached,
                                const struct pvr_device_info *dev_info)
{
   /* The CMODE field is always programmed. */
   uint64_t src0 = cached ? PVR_ROGUE_PDSINST_LD_LD_SRC0_CMODE_CACHED
                          : PVR_ROGUE_PDSINST_LD_LD_SRC0_CMODE_BYPASS;

   /* The SLC mode field is only programmed when the feature is present. */
   if (PVR_HAS_FEATURE(dev_info, slc_mcu_cache_controls)) {
      if (cached)
         src0 |= PVR_ROGUE_PDSINST_LD_LD_SRC0_SLCMODE_CACHED;
      else
         src0 |= PVR_ROGUE_PDSINST_LD_LD_SRC0_SLCMODE_BYPASS;
   }

   src0 |= (dest & PVR_ROGUE_PDSINST_REGS64TP_MASK)
           << PVR_ROGUE_PDSINST_LD_LD_SRC0_DEST_SHIFT;
   src0 |= (count8 & PVR_ROGUE_PDSINST_LD_COUNT8_MASK)
           << PVR_ROGUE_PDSINST_LD_LD_SRC0_COUNT8_SHIFT;
   src0 |= (src_add & PVR_ROGUE_PDSINST_LD_SRCADD_MASK)
           << PVR_ROGUE_PDSINST_LD_LD_SRC0_SRCADD_SHIFT;

   return src0;
}

uint64_t pvr_pds_encode_st_src0(uint64_t src,
                                uint64_t count4,
                                uint64_t dst_add,
                                bool write_through,
                                const struct pvr_device_info *device_info)
{
   uint64_t encoded = 0;

   if (device_info->features.has_slc_mcu_cache_controls) {
      encoded |= (write_through
                     ? PVR_ROGUE_PDSINST_ST_ST_SRC0_SLCMODE_WRITE_THROUGH
                     : PVR_ROGUE_PDSINST_ST_ST_SRC0_SLCMODE_WRITE_BACK);
   }

   encoded |= ((dst_add & PVR_ROGUE_PDSINST_ST_SRCADD_MASK)
               << PVR_ROGUE_PDSINST_ST_ST_SRC0_DSTADD_SHIFT);
   encoded |= ((count4 & PVR_ROGUE_PDSINST_ST_COUNT4_MASK)
               << PVR_ROGUE_PDSINST_ST_ST_SRC0_COUNT4_SHIFT);
   encoded |= (write_through ? PVR_ROGUE_PDSINST_ST_ST_SRC0_CMODE_WRITE_THROUGH
                             : PVR_ROGUE_PDSINST_ST_ST_SRC0_CMODE_WRITE_BACK);
   encoded |= ((src & PVR_ROGUE_PDSINST_REGS32TP_MASK)
               << PVR_ROGUE_PDSINST_ST_ST_SRC0_SRC_SHIFT);

   return encoded;
}

/**
 * Encodes the 32-bit SRC1 control word of a DOUTW.
 *
 * \param dest Destination offset, placed at the SRC1 AO field position.
 * \param dword_mask One of the PVR_PDS_DOUTW_* selectors; used as an index
 *                   into dword_mask_const[].
 * \param flags Extra SRC1 bits OR'd into the result unchanged.
 * \param cached Selects the cached rather than the bypass cache mode when the
 *               device has slc_mcu_cache_controls.
 * \param dev_info PVR device info structure.
 * \returns The encoded SRC1 value.
 */
static ALWAYS_INLINE uint32_t
pvr_pds_encode_doutw_src1(uint32_t dest,
                          uint32_t dword_mask,
                          uint32_t flags,
                          bool cached,
                          const struct pvr_device_info *dev_info)
{
   /* dword_mask indexes dword_mask_const[]; reject selectors past the end of
    * the table rather than reading out of bounds.
    */
   assert(dword_mask < sizeof(dword_mask_const) / sizeof(dword_mask_const[0]));

   /* Selectors above LOWER64 need dest to be a multiple of 4, LOWER64 needs a
    * multiple of 2, and the 32-bit selectors have no alignment requirement.
    */
   assert(((dword_mask > PVR_PDS_DOUTW_LOWER64) && ((dest & 3) == 0)) ||
          ((dword_mask == PVR_PDS_DOUTW_LOWER64) && ((dest & 1) == 0)) ||
          (dword_mask < PVR_PDS_DOUTW_LOWER64));

   uint32_t encoded =
      (dest << PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_AO_SHIFT);

   encoded |= dword_mask_const[dword_mask];

   encoded |= flags;

   /* Row 0 holds the real cache-mode values, row 1 is all zeroes for devices
    * without slc_mcu_cache_controls.
    */
   encoded |=
      cache_control_const[PVR_HAS_FEATURE(dev_info, slc_mcu_cache_controls) ? 0
                                                                            : 1]
                         [cached ? 1 : 0];
   return encoded;
}

/**
 * Encodes a DOUTW instruction.
 *
 * Forwards its arguments to pvr_pds_inst_encode_dout() with the destination
 * fixed to PVR_ROGUE_PDSINST_DSTDOUT_DOUTW.
 *
 * \param cc Forwarded as the cc argument.
 * \param end Forwarded as the end argument.
 * \param src1 Forwarded as the src1 argument.
 * \param src0 Forwarded as the src0 argument.
 * \returns The encoded instruction.
 */
static ALWAYS_INLINE uint32_t pvr_pds_encode_doutw64(uint32_t cc,
                                                     uint32_t end,
                                                     uint32_t src1,
                                                     uint32_t src0)
{
   return pvr_pds_inst_encode_dout(
      cc, end, src1, src0, PVR_ROGUE_PDSINST_DSTDOUT_DOUTW);
}

/**
 * Encodes a DOUTU instruction.
 *
 * Forwards to pvr_pds_inst_encode_dout() with src1 fixed to 0 and the
 * destination fixed to PVR_ROGUE_PDSINST_DSTDOUT_DOUTU.
 *
 * \param cc Forwarded as the cc argument.
 * \param end Forwarded as the end argument.
 * \param src0 Forwarded as the src0 argument.
 * \returns The encoded instruction.
 */
static ALWAYS_INLINE uint32_t pvr_pds_encode_doutu(uint32_t cc,
                                                   uint32_t end,
                                                   uint32_t src0)
{
   return pvr_pds_inst_encode_dout(
      cc, end, /* src1 */ 0, src0, PVR_ROGUE_PDSINST_DSTDOUT_DOUTU);
}

/**
 * Encodes a DOUTC instruction.
 *
 * Forwards to pvr_pds_inst_encode_dout() with both sources fixed to 0 and the
 * destination fixed to PVR_ROGUE_PDSINST_DSTDOUT_DOUTC.
 *
 * \param cc Forwarded as the cc argument.
 * \param end Forwarded as the end argument.
 * \returns The encoded instruction.
 */
static ALWAYS_INLINE uint32_t pvr_pds_inst_encode_doutc(uint32_t cc,
                                                        uint32_t end)
{
   return pvr_pds_inst_encode_dout(
      cc, end, /* src1 */ 0, /* src0 */ 0, PVR_ROGUE_PDSINST_DSTDOUT_DOUTC);
}

/**
 * Encodes a DOUTD instruction.
 *
 * Forwards its arguments to pvr_pds_inst_encode_dout() with the destination
 * fixed to PVR_ROGUE_PDSINST_DSTDOUT_DOUTD.
 *
 * \param cc Forwarded as the cc argument.
 * \param end Forwarded as the end argument.
 * \param src1 Forwarded as the src1 argument.
 * \param src0 Forwarded as the src0 argument.
 * \returns The encoded instruction.
 */
static ALWAYS_INLINE uint32_t pvr_pds_encode_doutd(uint32_t cc,
                                                   uint32_t end,
                                                   uint32_t src1,
                                                   uint32_t src0)
{
   return pvr_pds_inst_encode_dout(
      cc, end, src1, src0, PVR_ROGUE_PDSINST_DSTDOUT_DOUTD);
}

/**
 * Encodes a DOUTI instruction.
 *
 * Forwards to pvr_pds_inst_encode_dout() with src1 fixed to 0 and the
 * destination fixed to PVR_ROGUE_PDSINST_DSTDOUT_DOUTI.
 *
 * \param cc Forwarded as the cc argument.
 * \param end Forwarded as the end argument.
 * \param src0 Forwarded as the src0 argument.
 * \returns The encoded instruction.
 */
static ALWAYS_INLINE uint32_t pvr_pds_encode_douti(uint32_t cc,
                                                   uint32_t end,
                                                   uint32_t src0)
{
   return pvr_pds_inst_encode_dout(
      cc, end, /* src1 */ 0, src0, PVR_ROGUE_PDSINST_DSTDOUT_DOUTI);
}

/**
 * Encodes a BRA instruction with a signed relative address.
 *
 * \param srcc Forwarded as the srcc argument.
 * \param neg Forwarded as the neg argument.
 * \param setc Forwarded as the setc argument.
 * \param relative_address Signed branch offset.
 * \returns The encoded instruction.
 */
static ALWAYS_INLINE uint32_t pvr_pds_encode_bra(uint32_t srcc,
                                                 uint32_t neg,
                                                 uint32_t setc,
                                                 int32_t relative_address)
{
   /* The underlying encoder is passed an unsigned value, so the signed offset
    * is converted to uint32_t before the call.
    */
   const uint32_t address_bits = (uint32_t)relative_address;

   return pvr_pds_inst_encode_bra(srcc, neg, setc, address_bits);
}

/**
 * Allocates a run of constants and advances the allocation cursor.
 *
 * Requests for an even number of constants are placed on an even (64-bit)
 * index; requests for an odd number start at the current cursor.
 *
 * \param next_constant In/out allocation cursor; on return it points just past
 *                      the allocated run.
 * \param num_constants The number of constants required.
 * \param count In/out running total; incremented by the number of constants
 *              consumed, including any alignment padding.
 * \return The index of the first allocated constant.
 */
static uint32_t pvr_pds_get_constants(uint32_t *next_constant,
                                      uint32_t num_constants,
                                      uint32_t *count)
{
   const uint32_t previous = *next_constant;
   const uint32_t start =
      (num_constants & 1) ? previous : ((previous + 1) & ~1);
   const uint32_t end = start + num_constants;

   /* Account for both the requested constants and any padding skipped. */
   *count += end - previous;
   *next_constant = end;

   assert(end <= PVR_PDS_CONSTANTS_BLOCK_SIZE);

   return start;
}

/**
 * Allocates a run of temps and advances the allocation cursor.
 *
 * Requests for an even number of temps are placed on an even (64-bit) index;
 * requests for an odd number start at the current cursor.
 *
 * \param next_temp In/out allocation cursor; on return it points just past the
 *                  allocated run.
 * \param num_temps The number of temps required.
 * \param count In/out running total; incremented by the number of temps
 *              consumed, including any alignment padding.
 * \return The index of the first allocated temp.
 */
static uint32_t
pvr_pds_get_temps(uint32_t *next_temp, uint32_t num_temps, uint32_t *count)
{
   const uint32_t first_free = *next_temp;
   uint32_t temp = first_free;

   /* An even-sized request has to begin on a 64-bit boundary. */
   if ((num_temps & 1) == 0)
      temp = (first_free + 1) & ~1;

   /* Account for both the requested temps and any padding skipped. */
   *count += (temp + num_temps) - first_free;
   *next_temp = temp + num_temps;

   assert((temp + num_temps) <=
          (PVR_PDS_TEMPS_BLOCK_SIZE + PVR_PDS_TEMPS_BLOCK_BASE));

   return temp;
}

/**
 * Stores a 32-bit constant into the data block.
 *
 * \param data_block Pointer to data block to write to.
 * \param index Dword index within the data block to write to.
 * \param dword0 The 32-bit constant to write.
 */
static void
pvr_pds_write_constant32(uint32_t *data_block, uint32_t index, uint32_t dword0)
{
   /* The index must fit within the 32-bit constant register range. */
   assert(index <= (PVR_ROGUE_PDSINST_REGS32_CONST32_UPPER -
                    PVR_ROGUE_PDSINST_REGS32_CONST32_LOWER));

   uint32_t *const slot = &data_block[index];
   *slot = dword0;

   PVR_PDS_PRINT_DATA("WriteConstant32", (uint64_t)dword0, index);
}

/**
 * Write a 64-bit constant, supplied as two halves, indexed by the long range.
 *
 * The lower half is stored at \p index and the upper half at \p index + 1,
 * the same layout pvr_pds_write_wide_constant() produces.
 *
 * \param data_block Pointer to data block to write to.
 * \param index Dword index within the data to write to; must be even.
 * \param dword0 Lower half of the 64 bit constant.
 * \param dword1 Upper half of the 64 bit constant.
 */
static void pvr_pds_write_constant64(uint32_t *data_block,
                                     uint32_t index,
                                     uint32_t dword0,
                                     uint32_t dword1)
{
   /* Has to be on 64 bit boundary. */
   assert((index & 1) == 0);

   /* Check range. */
   assert((index >> 1) <= (PVR_ROGUE_PDSINST_REGS64_CONST64_UPPER -
                           PVR_ROGUE_PDSINST_REGS64_CONST64_LOWER));

   data_block[index + 0] = dword0;
   data_block[index + 1] = dword1;

   /* Report the value with dword1 as the upper half so the debug output
    * matches what was stored (and what pvr_pds_write_wide_constant() would
    * print for the same data).
    */
   PVR_PDS_PRINT_DATA("WriteConstant64",
                      ((uint64_t)dword1 << 32) | (uint64_t)dword0,
                      index);
}

/**
 * Stores a 64-bit constant, supplied as a single wide word, into the data
 * block.
 *
 * The lower 32 bits go to \p index and the upper 32 bits to \p index + 1.
 *
 * \param data_block Pointer to data block to write to.
 * \param index Dword index within the data to write to; must be even.
 * \param word The 64-bit constant to write.
 */
static void
pvr_pds_write_wide_constant(uint32_t *data_block, uint32_t index, uint64_t word)
{
   const uint32_t low = L32(word);
   const uint32_t high = H32(word);

   /* 64-bit constants must start on an even dword index. */
   assert((index & 1) == 0);

   /* The 64-bit register number must fit within the CONST64 range. */
   assert((index >> 1) <= (PVR_ROGUE_PDSINST_REGS64_CONST64_UPPER -
                           PVR_ROGUE_PDSINST_REGS64_CONST64_LOWER));

   data_block[index] = low;
   data_block[index + 1] = high;

   PVR_PDS_PRINT_DATA("WriteWideConstant", word, index);
}

/**
 * Stores a 64-bit DMA address into the data block.
 *
 * When the device has slc_mcu_cache_controls, the DOUTD SRC0 SLCMODE_CACHED
 * bits are OR'd into the address before it is stored.
 *
 * \param data_block Pointer to data block to write to.
 * \param index Dword index within the data to write to; must be even.
 * \param address The DMA address to write.
 * \param coherent Not used by this function.
 * \param dev_info PVR device info structure.
 */
static void pvr_pds_write_dma_address(uint32_t *data_block,
                                      uint32_t index,
                                      uint64_t address,
                                      bool coherent,
                                      const struct pvr_device_info *dev_info)
{
   /* 64-bit constants must start on an even dword index. */
   assert((index & 1) == 0);

   /* The 64-bit register number must fit within the CONST64 range. */
   assert((index >> 1) <= (PVR_ROGUE_PDSINST_REGS64_CONST64_UPPER -
                           PVR_ROGUE_PDSINST_REGS64_CONST64_LOWER));

   uint64_t encoded_address = address;
   if (PVR_HAS_FEATURE(dev_info, slc_mcu_cache_controls)) {
      encoded_address |=
         PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTD_SRC0_SLCMODE_CACHED;
   }

   data_block[index] = L32(encoded_address);
   data_block[index + 1] = H32(encoded_address);

   PVR_PDS_PRINT_DATA("WriteDMAAddress", encoded_address, index);
}

/**
 * External API to append a 64-bit constant to an existing data segment
 * allocation.
 *
 * The current value of \p data_size is used as the allocation cursor, so the
 * new constant is placed at the first suitably aligned index at or after it.
 *
 * \param constants Pointer to start of data segment.
 * \param constant_value Value to write to the constant.
 * \param data_size In/out number of constants allocated; incremented by the
 *                  number consumed, including any alignment padding.
 * \returns The index of the constant that was written.
 */
uint32_t pvr_pds_append_constant64(uint32_t *constants,
                                   uint64_t constant_value,
                                   uint32_t *data_size)
{
   /* The cursor is a local copy; only data_size is updated for the caller. */
   uint32_t cursor = *data_size;
   const uint32_t slot = pvr_pds_get_constants(&cursor, 2, data_size);

   pvr_pds_write_wide_constant(constants, slot, constant_value);

   return slot;
}

/**
 * Resets a pixel shader SA program description to all-zero bytes.
 *
 * \param program Program structure to clear.
 */
void pvr_pds_pixel_shader_sa_initialize(
   struct pvr_pds_pixel_shader_sa_program *program)
{
   memset(program, 0, sizeof *program);
}

/**
 * Encode a DMA burst.
 *
 * Writes exactly one control word to \p dma_control[0] and one address word to
 * \p dma_address[0].
 *
 * \param dma_control Receives the DMA control word.
 * \param dma_address Receives the DMA address word.
 * \param dest_offset Destination offset in the attribute.
 * \param dma_size The size of the DMA in words.
 * \param src_address Source address for the burst.
 * \param last Last DMA in program; sets the LAST_EN bit in the control word.
 * \param dev_info PVR device info structure.
 * \returns The number of DMA transfers required (always 1).
 */
uint32_t pvr_pds_encode_dma_burst(uint32_t *dma_control,
                                  uint64_t *dma_address,
                                  uint32_t dest_offset,
                                  uint32_t dma_size,
                                  uint64_t src_address,
                                  bool last,
                                  const struct pvr_device_info *dev_info)
{
   /* Control word: burst size, destination offset, then the fixed
    * cached / common-store flags.
    */
   uint32_t control =
      dma_size << PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTD_SRC1_BSIZE_SHIFT;
   control |= dest_offset << PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTD_SRC1_AO_SHIFT;
   control |= PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTD_SRC1_CMODE_CACHED |
              PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTD_SRC1_DEST_COMMON_STORE;
   if (last)
      control |= PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTD_SRC1_LAST_EN;

   dma_control[0] = control;

   /* Address word, tagged with the SLC mode when the device supports it. */
   uint64_t address = src_address;
   if (PVR_HAS_FEATURE(dev_info, slc_mcu_cache_controls))
      address |= PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTD_SRC0_SLCMODE_CACHED;

   dma_address[0] = address;

   /* Force to 1 DMA. */
   return 1;
}

/* FIXME: use the csbgen interface and pvr_csb_pack. */
/**
 * Sets up the USC control words for a DOUTU.
 *
 * src0 is cleared, the execution address is set through
 * pvr_set_usc_execution_address64(), and the temps, sample rate and dual-phase
 * fields are then OR'd in.
 *
 * \param usc_task_control USC task control structure to be setup.
 * \param execution_address USC execution virtual address.
 * \param usc_temps Number of USC temps; rounded up to whole
 *                  PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTU_SRC0_TEMPS_ALIGNSIZE
 *                  blocks before being encoded.
 * \param sample_rate Sample rate for the DOUTU.
 * \param phase_rate_change Phase rate change for the DOUTU; sets DUAL_PHASE_EN.
 */
void pvr_pds_setup_doutu(struct pvr_pds_usc_task_control *usc_task_control,
                         uint64_t execution_address,
                         uint32_t usc_temps,
                         uint32_t sample_rate,
                         bool phase_rate_change)
{
   uint64_t fields = 0;

   usc_task_control->src0 = UINT64_C(0);
   pvr_set_usc_execution_address64(&(usc_task_control->src0),
                                   execution_address);

   if (usc_temps > 0) {
      /* Temps are allocated in whole blocks, so round the request up. */
      const uint32_t temp_blocks =
         DIV_ROUND_UP(usc_temps,
                      PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTU_SRC0_TEMPS_ALIGNSIZE);

      /* The block count must fit the field, otherwise temps would be lost. */
      assert((temp_blocks & PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTU_TEMPS_MASK) ==
             temp_blocks);

      fields |= ((uint64_t)(temp_blocks &
                            PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTU_TEMPS_MASK))
                << PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTU_SRC0_TEMPS_SHIFT;
   }

   if (sample_rate > 0) {
      fields |= ((uint64_t)sample_rate)
                << PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTU_SRC0_SAMPLE_RATE_SHIFT;
   }

   if (phase_rate_change)
      fields |= PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTU_SRC0_DUAL_PHASE_EN;

   usc_task_control->src0 |= fields;
}

/**
 * Generates the PDS pixel event program.
 *
 * The data segment holds, in allocation order: the 64-bit DOUTU SRC0, one
 * 64-bit emit word pair per DOUTW, and one 32-bit DOUTW SRC1 control word per
 * pair. The code segment is one DOUTW per pair followed by a terminating
 * DOUTU.
 *
 * The data_segment, data_size and code_size fields of \p program are updated
 * for every \p gen_mode.
 *
 * \param program Pointer to the PDS pixel event program.
 * \param buffer Pointer to the buffer for the program.
 * \param gen_mode Generate either a data segment or code segment.
 * \param dev_info PVR device info structure.
 * \returns Pointer to just beyond the written data or code, or NULL for any
 *          other \p gen_mode.
 */
uint32_t *
pvr_pds_generate_pixel_event(struct pvr_pds_event_program *restrict program,
                             uint32_t *restrict buffer,
                             enum pvr_pds_generate_mode gen_mode,
                             const struct pvr_device_info *dev_info)
{
   uint32_t *const data_base = buffer;
   uint32_t alloc_cursor = PVR_PDS_CONSTANTS_BLOCK_BASE;
   uint32_t allocated = 0;

   /* Lay out the constants: 64-bit words first, then the 32-bit words. */
   const uint32_t doutu_src0_const =
      pvr_pds_get_constants(&alloc_cursor, 2, &allocated);
   const uint32_t emit_words_const =
      pvr_pds_get_constants(&alloc_cursor,
                            (2 * program->num_emit_word_pairs),
                            &allocated);
   const uint32_t doutw_src1_const =
      pvr_pds_get_constants(&alloc_cursor,
                            program->num_emit_word_pairs,
                            &allocated);

   switch (gen_mode) {
   case PDS_GENERATE_DATA_SEGMENT:
      /* 64-bit Src0 for the DOUTU. */
      pvr_pds_write_wide_constant(data_base,
                                  doutu_src0_const,
                                  program->task_control.src0);

      /* Emit words for end of tile program. */
      for (uint32_t pair = 0; pair < program->num_emit_word_pairs; pair++) {
         const uint32_t word = 2 * pair;

         pvr_pds_write_constant64(data_base,
                                  emit_words_const + word,
                                  program->emit_words[word + 0],
                                  program->emit_words[word + 1]);
      }

      /* DOUTW control words; only the final one carries LAST_EN. */
      for (uint32_t pair = 0; pair < program->num_emit_word_pairs; pair++) {
         const bool is_last = pair == (program->num_emit_word_pairs - 1);
         uint32_t doutw = pvr_pds_encode_doutw_src1(
            (2 * pair),
            PVR_PDS_DOUTW_LOWER64,
            PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_DEST_COMMON_STORE,
            false,
            dev_info);

         if (is_last)
            doutw |= PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_LAST_EN;

         pvr_pds_write_constant32(data_base, doutw_src1_const + pair, doutw);
      }
      break;

   case PDS_GENERATE_CODE_SEGMENT:
      /* DOUTW the state into the shared register. SRC1 is the 32-bit control
       * word constant, SRC0 the 64-bit emit word constant.
       */
      for (uint32_t pair = 0; pair < program->num_emit_word_pairs; pair++) {
         *buffer++ =
            pvr_pds_encode_doutw64(/* cc */ 0,
                                   /* END */ 0,
                                   /* SRC1 */ (doutw_src1_const + pair),
                                   /* SRC0 */
                                   (emit_words_const + (2 * pair)) >> 1);
      }

      /* Kick the USC. */
      *buffer++ = pvr_pds_encode_doutu(/* cc */ 0,
                                       /* END */ 1,
                                       /* SRC0 */ doutu_src0_const >> 1);
      break;

   default:
      break;
   }

   /* One DOUTW per emit word pair plus the final DOUTU. */
   const uint32_t code_size = 1 + program->num_emit_word_pairs;

   /* Save the data segment pointer and sizes. */
   program->data_segment = data_base;
   program->data_size = allocated;
   program->code_size = code_size;

   switch (gen_mode) {
   case PDS_GENERATE_DATA_SEGMENT:
      return (data_base + alloc_cursor);
   case PDS_GENERATE_CODE_SEGMENT:
      return buffer;
   default:
      return NULL;
   }
}

/**
 * Reports whether any vertex stream has its instance_data flag set.
 *
 * \param streams Streams contained in the vertex shader.
 * \param num_streams Number of vertex streams.
 * \returns true if at least one of the given vertex streams contains instance
 *          data, otherwise false (including when \p num_streams is 0).
 */
static bool pvr_pds_vertex_streams_contains_instance_data(
   const struct pvr_pds_vertex_stream *streams,
   uint32_t num_streams)
{
   uint32_t idx = 0;

   /* Stop at the first stream flagged as carrying instance data. */
   while (idx < num_streams) {
      if (streams[idx].instance_data)
         return true;

      idx++;
   }

   return false;
}

/**
 * Allocates one or two constants for the PDS vertex shader, where the first
 * (num_backs * 8) constants are divided into banks of 8.
 *
 * Once the cursor is at or past the banked region the request is handed to
 * pvr_pds_get_constants(). Inside the banked region the cursor is stepped as
 * follows:
 *  - at the start of a bank: the constant is placed there and the cursor
 *    advances by 1 (single) or 8 (pair);
 *  - elsewhere, single constant: the constant is placed at the cursor, which
 *    then advances by 7;
 *  - elsewhere, pair: the cursor first advances by 7 and the pair is placed
 *    there; if that lands at or past the banked region the cursor advances by
 *    2 and \p count is incremented by 2, otherwise the cursor advances by 8.
 *
 * \param num_backs Number of banks (NOTE(review): presumably a typo for
 *                  "num_banks").
 * \param next_constant In/out allocation cursor.
 * \param num_constants Number of constants required; must be 1 or 2.
 * \param count In/out running total of constants allocated; within the banked
 *              region it is only updated in the spill case described above.
 * \returns The index of the allocated constant.
 */
static uint32_t pvr_pds_get_bank_based_constants(uint32_t num_backs,
                                                 uint32_t *next_constant,
                                                 uint32_t num_constants,
                                                 uint32_t *count)
{
   const uint32_t banked_limit = num_backs << 3;
   uint32_t constant;

   assert(num_constants == 1 || num_constants == 2);

   /* Past the banked region: fall back to the linear allocator. */
   if (*next_constant >= banked_limit)
      return pvr_pds_get_constants(next_constant, num_constants, count);

   const bool at_bank_start = (*next_constant % 8) == 0;

   /* A single constant always lands on the current cursor. */
   if (num_constants == 1) {
      constant = *next_constant;
      *next_constant += at_bank_start ? 1 : 7;
      return constant;
   }

   /* A pair at the start of a bank: step a full bank ahead. */
   if (at_bank_start) {
      constant = *next_constant;
      *next_constant += 8;
      return constant;
   }

   /* A pair elsewhere: advance the cursor by 7 before placing it. */
   *next_constant += 7;
   constant = *next_constant;

   if (constant >= banked_limit) {
      *next_constant += 2;
      *count += 2;
   } else {
      *next_constant += 8;
   }

   return constant;
}

/**
 * Generates a PDS program to load USC vertex inputs from one or more vertex
 * buffers, each potentially containing multiple elements, followed by a DOUTU
 * to execute the USC.
 *
 * \param program Pointer to the description of the program which should be
 *                generated.
 * \param buffer Pointer to buffer that receives the output of this function.
 *               Will either be the data segment or code segment depending on
 *               gen_mode.
 * \param gen_mode Which part to generate, either data segment or
 *                 code segment. If PDS_GENERATE_SIZES is specified, nothing is
 *                 written, but size information in program is updated.
 * \param dev_info PVR device info structure.
 * \returns Pointer to just beyond the buffer for the data - i.e the value
 *          of the buffer after writing its contents.
 */
/* FIXME: Implement PDS_GENERATE_CODEDATA_SEGMENTS? */
uint32_t *
pvr_pds_vertex_shader(struct pvr_pds_vertex_shader_program *restrict program,
                      uint32_t *restrict buffer,
                      enum pvr_pds_generate_mode gen_mode,
                      const struct pvr_device_info *dev_info)
{
   uint32_t next_constant = PVR_PDS_CONSTANTS_BLOCK_BASE;
   uint32_t next_stream_constant;
   uint32_t next_temp;
   uint32_t usc_control_constant64;
   uint32_t stride_constant32 = 0;
   uint32_t dma_address_constant64 = 0;
   uint32_t dma_control_constant64;
   uint32_t multiplier_constant32 = 0;
   uint32_t base_instance_const32 = 0;

   uint32_t temp = 0;
   uint32_t index_temp64 = 0;
   uint32_t num_vertices_temp64 = 0;
   uint32_t pre_index_temp = (uint32_t)(-1);
   bool first_ddmadt = true;
   uint32_t input_register0;
   uint32_t input_register1;
   uint32_t input_register2;

   struct pvr_pds_vertex_stream *vertex_stream;
   struct pvr_pds_vertex_element *vertex_element;
   uint32_t shift_2s_comp;

   uint32_t data_size = 0;
   uint32_t code_size = 0;
   uint32_t temps_used = 0;

   bool direct_writes_needed = false;

   uint32_t consts_size = 0;
   uint32_t vertex_id_control_word_const32 = 0;
   uint32_t instance_id_control_word_const32 = 0;
   uint32_t instance_id_modifier_word_const32 = 0;
   uint32_t geometry_id_control_word_const64 = 0;
   uint32_t empty_dma_control_constant64 = 0;

   bool any_instanced_stream =
      pvr_pds_vertex_streams_contains_instance_data(program->streams,
                                                    program->num_streams);

   uint32_t base_instance_register = 0;
   uint32_t ddmadt_enables = 0;

   bool issue_empty_ddmad = false;
   uint32_t last_stream_index = program->num_streams - 1;
   bool current_p0 = false;
   uint32_t skip_stream_flag = 0;

   /* Generate the PDS vertex shader data. */

#if MESA_DEBUG
   if (gen_mode == PDS_GENERATE_DATA_SEGMENT) {
      for (uint32_t i = 0; i < program->data_size; i++)
         buffer[i] = 0xDEADBEEF;
   }
#endif

   /* Generate the PDS vertex shader program */
   next_temp = PVR_PDS_TEMPS_BLOCK_BASE;
   /* IR0 is in first 32-bit temp, temp[0].32, vertex_Index. */
   input_register0 = pvr_pds_get_temps(&next_temp, 1, &temps_used);
   /* IR1 is in second 32-bit temp, temp[1].32, instance_ID. */
   input_register1 = pvr_pds_get_temps(&next_temp, 1, &temps_used);

   if (program->iterate_remap_id)
      input_register2 = pvr_pds_get_temps(&next_temp, 1, &temps_used);
   else
      input_register2 = 0; /* Not used, but need to silence the compiler. */

   /* Generate the PDS vertex shader code. The constants in the data block are
    * arranged as follows:
    *
    * 64 bit bank 0      64 bit bank 1          64 bit bank 2    64 bit bank 3
    * Not used (tmps)    Stride | Multiplier    Address          Control
    */

   /* Find out how many constants are needed by streams. */
   for (uint32_t stream = 0; stream < program->num_streams; stream++) {
      pvr_pds_get_constants(&next_constant,
                            8 * program->streams[stream].num_elements,
                            &consts_size);
   }

   /* If there are no vertex streams allocate the first bank for USC Code
    * Address.
    */
   if (consts_size == 0)
      pvr_pds_get_constants(&next_constant, 2, &consts_size);
   else
      next_constant = 8;

   direct_writes_needed = program->iterate_instance_id ||
                          program->iterate_vtx_id || program->iterate_remap_id;

   if (!PVR_HAS_FEATURE(dev_info, pds_ddmadt)) {
      /* Evaluate what config of DDMAD should be used for each stream. */
      for (uint32_t stream = 0; stream < program->num_streams; stream++) {
         vertex_stream = &program->streams[stream];

         if (vertex_stream->use_ddmadt) {
            ddmadt_enables |= (1 << stream);

            /* The condition for index value is:
             * index * stride + size <= bufferSize (all in unit of byte)
             */
            if (vertex_stream->stride == 0) {
               if (vertex_stream->elements[0].size <=
                   vertex_stream->buffer_size_in_bytes) {
                  /* index can be any value -> no need to use DDMADT. */
                  ddmadt_enables &= (~(1 << stream));
               } else {
                  /* No index works -> no need to issue DDMAD instruction.
                   */
                  skip_stream_flag |= (1 << stream);
               }
            } else {
               /* index * stride + size <= bufferSize
                *
                * can be converted to:
                * index <= (bufferSize - size) / stride
                *
                * where maximum index is:
                * integer((bufferSize - size) / stride).
                */
               if (vertex_stream->buffer_size_in_bytes <
                   vertex_stream->elements[0].size) {
                  /* No index works -> no need to issue DDMAD instruction.
                   */
                  skip_stream_flag |= (1 << stream);
               } else {
                  uint32_t max_index = (vertex_stream->buffer_size_in_bytes -
                                        vertex_stream->elements[0].size) /
                                       vertex_stream->stride;
                  if (max_index == 0xFFFFFFFFu) {
                     /* No need to use DDMADT as all possible indices can
                      * pass the test.
                      */
                     ddmadt_enables &= (~(1 << stream));
                  } else {
                     /* In this case, test condition can be changed to
                      * index < max_index + 1.
                      */
                     program->streams[stream].num_vertices =
                        pvr_pds_get_bank_based_constants(program->num_streams,
                                                         &next_constant,
                                                         1,
                                                         &consts_size);

                     if (gen_mode == PDS_GENERATE_DATA_SEGMENT) {
                        pvr_pds_write_constant32(
                           buffer,
                           program->streams[stream].num_vertices,
                           max_index + 1);
                     }
                  }
               }
            }
         }

         if ((skip_stream_flag & (1 << stream)) == 0) {
            issue_empty_ddmad = (ddmadt_enables & (1 << stream)) != 0;
            last_stream_index = stream;
         }
      }
   } else {
      if (program->num_streams > 0 &&
          program->streams[program->num_streams - 1].use_ddmadt) {
         issue_empty_ddmad = true;
      }
   }

   if (direct_writes_needed)
      issue_empty_ddmad = false;

   if (issue_empty_ddmad) {
      /* An empty DMA control const (DMA size = 0) is required in case the
       * last DDMAD is predicated out and its last flag never takes effect.
       */
      empty_dma_control_constant64 =
         pvr_pds_get_bank_based_constants(program->num_streams,
                                          &next_constant,
                                          2,
                                          &consts_size);
   }

   /* Assign constants for non stream or base instance if there is any
    * instanced stream.
    */
   if (direct_writes_needed || any_instanced_stream ||
       program->instance_id_modifier) {
      if (program->iterate_vtx_id) {
         vertex_id_control_word_const32 =
            pvr_pds_get_bank_based_constants(program->num_streams,
                                             &next_constant,
                                             1,
                                             &consts_size);
      }

      if (program->iterate_instance_id || program->instance_id_modifier) {
         if (program->instance_id_modifier == 0) {
            instance_id_control_word_const32 =
               pvr_pds_get_bank_based_constants(program->num_streams,
                                                &next_constant,
                                                1,
                                                &consts_size);
         } else {
            instance_id_modifier_word_const32 =
               pvr_pds_get_bank_based_constants(program->num_streams,
                                                &next_constant,
                                                1,
                                                &consts_size);
            if ((instance_id_modifier_word_const32 % 2) == 0) {
               instance_id_control_word_const32 =
                  pvr_pds_get_bank_based_constants(program->num_streams,
                                                   &next_constant,
                                                   1,
                                                   &consts_size);
            } else {
               instance_id_control_word_const32 =
                  instance_id_modifier_word_const32;
               instance_id_modifier_word_const32 =
                  pvr_pds_get_bank_based_constants(program->num_streams,
                                                   &next_constant,
                                                   1,
                                                   &consts_size);
            }
         }
      }

      if (program->base_instance != 0) {
         base_instance_const32 =
            pvr_pds_get_bank_based_constants(program->num_streams,
                                             &next_constant,
                                             1,
                                             &consts_size);
      }

      if (program->iterate_remap_id) {
         geometry_id_control_word_const64 =
            pvr_pds_get_bank_based_constants(program->num_streams,
                                             &next_constant,
                                             2,
                                             &consts_size);
      }
   }

   if (program->instance_id_modifier != 0) {
      /* This instanceID modifier is used when a draw array instanced call
       * sourcing from client data cannot fit into vertex buffer and needs to
       * be broken down into several draw calls.
       */

      code_size += 1;

      if (gen_mode == PDS_GENERATE_DATA_SEGMENT) {
         pvr_pds_write_constant32(buffer,
                                  instance_id_modifier_word_const32,
                                  program->instance_id_modifier);
      } else if (gen_mode == PDS_GENERATE_CODE_SEGMENT) {
         *buffer++ = pvr_pds_inst_encode_add32(
            /* cc */ 0x0,
            /* ALUM */ 0, /* Unsigned */
            /* SNA */ 0, /* Add */
            /* SRC0 32b */ instance_id_modifier_word_const32,
            /* SRC1 32b */ input_register1,
            /* DST 32b */ input_register1);
      }
   }

   /* Adjust instanceID if necessary. */
   if (any_instanced_stream || program->iterate_instance_id) {
      if (program->base_instance != 0) {
         assert(!program->draw_indirect);

         if (gen_mode == PDS_GENERATE_DATA_SEGMENT) {
            pvr_pds_write_constant32(buffer,
                                     base_instance_const32,
                                     program->base_instance);
         }

         base_instance_register = base_instance_const32;
      }

      if (program->draw_indirect) {
         assert((program->instance_id_modifier == 0) &&
                (program->base_instance == 0));

         base_instance_register = PVR_ROGUE_PDSINST_REGS32_PTEMP32_LOWER + 1;
      }
   }

   next_constant = next_stream_constant = PVR_PDS_CONSTANTS_BLOCK_BASE;
   usc_control_constant64 =
      pvr_pds_get_constants(&next_stream_constant, 2, &data_size);

   for (uint32_t stream = 0; stream < program->num_streams; stream++) {
      bool instance_data_with_base_instance;

      if ((!PVR_HAS_FEATURE(dev_info, pds_ddmadt)) &&
          ((skip_stream_flag & (1 << stream)) != 0)) {
         continue;
      }

      vertex_stream = &program->streams[stream];

      instance_data_with_base_instance =
         ((vertex_stream->instance_data) &&
          ((program->base_instance > 0) || (program->draw_indirect)));

      /* Get all 8 32-bit constants at once, only 6 for first stream due to
       * USC constants.
       */
      if (stream == 0) {
         stride_constant32 =
            pvr_pds_get_constants(&next_stream_constant, 6, &data_size);
      } else {
         next_constant =
            pvr_pds_get_constants(&next_stream_constant, 8, &data_size);

         /* Skip bank 0. */
         stride_constant32 = next_constant + 2;
      }

      multiplier_constant32 = stride_constant32 + 1;

      if (gen_mode == PDS_GENERATE_DATA_SEGMENT) {
         pvr_pds_write_constant32(buffer,
                                  stride_constant32,
                                  vertex_stream->stride);

         /* Vertex stream frequency multiplier. */
         if (vertex_stream->multiplier)
            pvr_pds_write_constant32(buffer,
                                     multiplier_constant32,
                                     vertex_stream->multiplier);
      }

      /* Update the code size count and temps count for the above code
       * segment.
       */
      if (vertex_stream->current_state) {
         code_size += 1;
         temp = pvr_pds_get_temps(&next_temp, 1, &temps_used); /* 32-bit */
      } else {
         unsigned int num_temps_required = 0;

         if (vertex_stream->multiplier) {
            num_temps_required += 2;
            code_size += 3;

            if (vertex_stream->shift) {
               code_size += 1;

               if ((int32_t)vertex_stream->shift > 0)
                  code_size += 1;
            }
         } else if (vertex_stream->shift) {
            code_size += 1;
            num_temps_required += 1;
         } else if (instance_data_with_base_instance) {
            num_temps_required += 1;
         }

         if (num_temps_required != 0) {
            temp = pvr_pds_get_temps(&next_temp,
                                     num_temps_required,
                                     &temps_used); /* 64-bit */
         } else {
            temp = vertex_stream->instance_data ? input_register1
                                                : input_register0;
         }

         if (instance_data_with_base_instance)
            code_size += 1;
      }

      /* The real code segment. */
      if (gen_mode == PDS_GENERATE_CODE_SEGMENT) {
         /* If it's current state stream, then index = 0 always. */
         if (vertex_stream->current_state) {
            /* Put zero in temp. */
            *buffer++ = pvr_pds_inst_encode_limm(0, temp, 0, 0);
         } else if (vertex_stream->multiplier) {
            /* old: Iout = (Iin * (Multiplier+2^24)) >> (Shift+24)
             * new: Iout = (Iin * Multiplier) >> (shift+31)
             */

            /* Put zero in temp. Need zero for add part of the following
             * MAD. MAD source is 64 bit, so need two LIMMs.
             */
            *buffer++ = pvr_pds_inst_encode_limm(0, temp, 0, 0);
            /* Put zero in temp. Need zero for add part of the following
             * MAD.
             */
            *buffer++ = pvr_pds_inst_encode_limm(0, temp + 1, 0, 0);

            /* old: (Iin * (Multiplier+2^24))
             * new: (Iin * Multiplier)
             */
            *buffer++ = pvr_rogue_inst_encode_mad(
               0, /* Sign of add is positive. */
               0, /* Unsigned ALU mode */
               0, /* Unconditional */
               multiplier_constant32,
               vertex_stream->instance_data ? input_register1 : input_register0,
               temp / 2,
               temp / 2);

            if (vertex_stream->shift) {
               int32_t shift = (int32_t)vertex_stream->shift;

               /* new: >> (shift + 31) */
               shift += 31;
               shift *= -1;

               if (shift < -31) {
                  /* >> (31) */
                  shift_2s_comp = 0xFFFE1;
                  *buffer++ = pvr_pds_inst_encode_sftlp64(
                     /* cc */ 0,
                     /* LOP */ PVR_ROGUE_PDSINST_LOP_NONE,
                     /* IM */ 1, /*  enable immediate */
                     /* SRC0 */ temp / 2,
                     /* SRC1 */ input_register0, /* This won't be used in
                                                  * a shift operation.
                                                  */
                     /* SRC2 (Shift) */ shift_2s_comp,
                     /* DST */ temp / 2);
                  shift += 31;
               }

               /* old: >> (Shift+24)
                * new: >> (shift + 31)
                */
               shift_2s_comp = *((uint32_t *)&shift);
               *buffer++ = pvr_pds_inst_encode_sftlp64(
                  /* cc */ 0,
                  /* LOP */ PVR_ROGUE_PDSINST_LOP_NONE,
                  /* IM */ 1, /*enable immediate */
                  /* SRC0 */ temp / 2,
                  /* SRC1 */ input_register0, /* This won't be used in
                                               * a shift operation.
                                               */
                  /* SRC2 (Shift) */ shift_2s_comp,
                  /* DST */ temp / 2);
            }

            if (instance_data_with_base_instance) {
               *buffer++ =
                  pvr_pds_inst_encode_add32(0, /* cc */
                                            0, /* ALNUM */
                                            0, /* SNA */
                                            base_instance_register, /* src0
                                                                     */
                                            temp, /* src1 */
                                            temp /* dst */
                  );
            }
         } else { /* NOT vertex_stream->multiplier */
            if (vertex_stream->shift) {
               /* Shift Index/InstanceNum Right by shift bits. Put result
                * in a Temp.
                */

               /* 2's complement of shift as this will be a right shift. */
               shift_2s_comp = ~(vertex_stream->shift) + 1;

               *buffer++ = pvr_pds_inst_encode_sftlp32(
                  /* IM */ 1, /*  enable immediate. */
                  /* cc */ 0,
                  /* LOP */ PVR_ROGUE_PDSINST_LOP_NONE,
                  /* SRC0 */ vertex_stream->instance_data ? input_register1
                                                          : input_register0,
                  /* SRC1 */ input_register0, /* This won't be used in
                                               * a shift operation.
                                               */
                  /* SRC2 (Shift) */ shift_2s_comp,
                  /* DST */ temp);

               if (instance_data_with_base_instance) {
                  *buffer++ =
                     pvr_pds_inst_encode_add32(0, /* cc */
                                               0, /* ALNUM */
                                               0, /* SNA */
                                               base_instance_register, /* src0
                                                                        */
                                               temp, /* src1 */
                                               temp /* dst */
                     );
               }
            } else {
               if (instance_data_with_base_instance) {
                  *buffer++ =
                     pvr_pds_inst_encode_add32(0, /* cc */
                                               0, /* ALNUM */
                                               0, /* SNA */
                                               base_instance_register, /* src0
                                                                        */
                                               input_register1, /* src1 */
                                               temp /* dst */
                     );
               } else {
                  /* If the shift instruction doesn't happen, use the IR
                   * directly into the following MAD.
                   */
                  temp = vertex_stream->instance_data ? input_register1
                                                      : input_register0;
               }
            }
         }
      }

      if (PVR_HAS_FEATURE(dev_info, pds_ddmadt)) {
         if (vertex_stream->use_ddmadt)
            ddmadt_enables |= (1 << stream);
      } else {
         if ((ddmadt_enables & (1 << stream)) != 0) {
            /* Emulate what DDMADT does for range checking. */
            if (first_ddmadt) {
               /* Get 64-bit temps so that the current index can be compared
                * against the allowed number of vertices.
                */
               index_temp64 =
                  pvr_pds_get_temps(&next_temp, 2, &temps_used); /* 64-bit
                                                                  */
               num_vertices_temp64 =
                  pvr_pds_get_temps(&next_temp, 2, &temps_used); /* 64-bit
                                                                  */

               index_temp64 -= PVR_ROGUE_PDSINST_REGS32_TEMP32_LOWER;
               num_vertices_temp64 -= PVR_ROGUE_PDSINST_REGS32_TEMP32_LOWER;

               code_size += 3;
               current_p0 = true;
            }

            code_size += (temp == pre_index_temp ? 1 : 2);

            if (gen_mode == PDS_GENERATE_CODE_SEGMENT) {
               if (first_ddmadt) {
                  /* Set predicate to be P0. */
                  *buffer++ = pvr_pds_encode_bra(
                     PVR_ROGUE_PDSINST_PREDICATE_KEEP, /* SRCCC
                                                        */
                     0, /* Neg */
                     PVR_ROGUE_PDSINST_PREDICATE_P0, /* SETCC
                                                      */
                     1); /* Addr */

                  *buffer++ =
                     pvr_pds_inst_encode_limm(0, index_temp64 + 1, 0, 0);
                  *buffer++ =
                     pvr_pds_inst_encode_limm(0, num_vertices_temp64 + 1, 0, 0);
               }

               if (temp != pre_index_temp) {
                  *buffer++ = pvr_pds_inst_encode_sftlp32(
                     /* IM */ 1, /*  enable immediate. */
                     /* cc */ 0,
                     /* LOP */ PVR_ROGUE_PDSINST_LOP_NONE,
                     /* SRC0 */ temp - PVR_ROGUE_PDSINST_REGS32_TEMP32_LOWER,
                     /* SRC1 */ 0,
                     /* SRC2 (Shift) */ 0,
                     /* DST */ index_temp64);
               }

               *buffer++ = pvr_pds_inst_encode_sftlp32(
                  /* IM */ 1, /*  enable immediate. */
                  /* cc */ 0,
                  /* LOP */ PVR_ROGUE_PDSINST_LOP_OR,
                  /* SRC0 */ num_vertices_temp64 + 1,
                  /* SRC1 */ vertex_stream->num_vertices,
                  /* SRC2 (Shift) */ 0,
                  /* DST */ num_vertices_temp64);
            }

            first_ddmadt = false;

            pre_index_temp = temp;
         }
      }

      /* Process the elements in the stream. */
      for (uint32_t element = 0; element < vertex_stream->num_elements;
           element++) {
         bool terminate = false;

         vertex_element = &vertex_stream->elements[element];
         /* Check if last DDMAD needs terminate or not. */
         if ((element == (vertex_stream->num_elements - 1)) &&
             (stream == last_stream_index)) {
            terminate = !issue_empty_ddmad && !direct_writes_needed;
         }

         /* Get a new set of constants for this element. */
         if (element) {
            /* Get all 8 32 bit constants at once. */
            next_constant =
               pvr_pds_get_constants(&next_stream_constant, 8, &data_size);
         }

         dma_address_constant64 = next_constant + 4;
         dma_control_constant64 = dma_address_constant64 + 2;

         if (vertex_element->component_size == 0) {
            /* Standard DMA.
             *
             * Write the DMA transfer control words into the PDS data
             * section.
             *
             * DMA Address is 40-bit.
             */

            if (gen_mode == PDS_GENERATE_DATA_SEGMENT) {
               uint32_t dma_control_word;
               uint64_t dma_control_word64 = 0;
               uint32_t dma_size;

               /* Write the address to the constant. */
               pvr_pds_write_dma_address(buffer,
                                         dma_address_constant64,
                                         vertex_stream->address +
                                            (uint64_t)vertex_element->offset,
                                         false,
                                         dev_info);
               {
                  if (program->stream_patch_offsets) {
                     program
                        ->stream_patch_offsets[program->num_stream_patches++] =
                        (stream << 16) | (dma_address_constant64 >> 1);
                  }
               }

               /* Size is in bytes - round up to nearest 32 bit word. */
               dma_size =
                  (vertex_element->size + (1 << PVR_PDS_DWORD_SHIFT) - 1) >>
                  PVR_PDS_DWORD_SHIFT;

               assert(dma_size <= PVR_ROGUE_PDSINST_DDMAD_FIELDS_BSIZE_UPPER);

               /* Set up the dma transfer control word. */
               dma_control_word =
                  dma_size << PVR_ROGUE_PDSINST_DDMAD_FIELDS_SRC3_BSIZE_SHIFT;

               dma_control_word |=
                  vertex_element->reg
                  << PVR_ROGUE_PDSINST_DDMAD_FIELDS_SRC3_AO_SHIFT;

               dma_control_word |=
                  PVR_ROGUE_PDSINST_DDMAD_FIELDS_SRC3_DEST_UNIFIED_STORE |
                  PVR_ROGUE_PDSINST_DDMAD_FIELDS_SRC3_CMODE_CACHED;

               if (PVR_HAS_FEATURE(dev_info, pds_ddmadt)) {
                  if ((ddmadt_enables & (1 << stream)) != 0) {
                     assert(
                        ((((uint64_t)vertex_stream->buffer_size_in_bytes
                           << PVR_ROGUE_PDSINST_DDMAD_FIELDS_SRC3_MSIZE_SHIFT) &
                          ~PVR_ROGUE_PDSINST_DDMAD_FIELDS_SRC3_MSIZE_CLRMSK) >>
                         PVR_ROGUE_PDSINST_DDMAD_FIELDS_SRC3_MSIZE_SHIFT) ==
                        (uint64_t)vertex_stream->buffer_size_in_bytes);
                     dma_control_word64 =
                        (PVR_ROGUE_PDSINST_DDMAD_FIELDS_SRC3_TEST_EN |
                         (((uint64_t)vertex_stream->buffer_size_in_bytes
                           << PVR_ROGUE_PDSINST_DDMAD_FIELDS_SRC3_MSIZE_SHIFT) &
                          ~PVR_ROGUE_PDSINST_DDMAD_FIELDS_SRC3_MSIZE_CLRMSK));
                  }
               }
               /* If this is the last dma then also set the last flag. */
               if (terminate) {
                  dma_control_word |=
                     PVR_ROGUE_PDSINST_DDMAD_FIELDS_SRC3_LAST_EN;
               }

               /* Write the 32-Bit SRC3 word to a 64-bit constant as per
                * spec.
                */
               pvr_pds_write_wide_constant(buffer,
                                           dma_control_constant64,
                                           dma_control_word64 |
                                              (uint64_t)dma_control_word);
            }

            if (gen_mode == PDS_GENERATE_CODE_SEGMENT) {
               if (!PVR_HAS_FEATURE(dev_info, pds_ddmadt)) {
                  if ((ddmadt_enables & (1 << stream)) != 0) {
                     *buffer++ = pvr_pds_inst_encode_cmp(
                        0, /* cc enable */
                        PVR_ROGUE_PDSINST_COP_LT, /* Operation */
                        index_temp64 >> 1, /* SRC0 (REGS64TP) */
                        (num_vertices_temp64 >> 1) +
                           PVR_ROGUE_PDSINST_REGS64_TEMP64_LOWER); /* SRC1
                                                                      (REGS64)
                                                                    */
                  }
               }
               /* Multiply by the vertex stream stride and add the base
                * followed by a DOUTD.
                *
                * dmad32 (C0 * T0) + C1, C2
                * src0 = stride  src1 = index  src2 = baseaddr src3 =
                * doutd part
                */

               uint32_t cc;
               if (PVR_HAS_FEATURE(dev_info, pds_ddmadt))
                  cc = 0;
               else
                  cc = (ddmadt_enables & (1 << stream)) != 0 ? 1 : 0;

               *buffer++ = pvr_pds_inst_encode_ddmad(
                  /* cc */ cc,
                  /* END */ 0,
                  /* SRC0 */ stride_constant32, /* Stride 32-bit*/
                  /* SRC1 */ temp, /* Index 32-bit*/
                  /* SRC2 64-bit */ dma_address_constant64 >> 1, /* Stream
                                                                  * Address
                                                                  * +
                                                                  * Offset
                                                                  */
                  /* SRC3 64-bit */ dma_control_constant64 >> 1 /* DMA
                                                                 * Transfer
                                                                 * Control
                                                                 * Word.
                                                                 */
               );
            }

            if ((!PVR_HAS_FEATURE(dev_info, pds_ddmadt)) &&
                ((ddmadt_enables & (1 << stream)) != 0)) {
               code_size += 1;
            }
            code_size += 1;
         } else {
            /* Repeat DMA.
             *
             * Write the DMA transfer control words into the PDS data
             * section.
             *
             * DMA address is 40-bit.
             */

            if (gen_mode == PDS_GENERATE_DATA_SEGMENT) {
               uint32_t dma_control_word;

               /* Write the address to the constant. */
               pvr_pds_write_dma_address(buffer,
                                         dma_address_constant64,
                                         vertex_stream->address +
                                            (uint64_t)vertex_element->offset,
                                         false,
                                         dev_info);

               /* Set up the DMA transfer control word. */
               dma_control_word =
                  vertex_element->size
                  << PVR_ROGUE_PDSINST_DDMAD_FIELDS_SRC3_BSIZE_SHIFT;

               dma_control_word |=
                  vertex_element->reg
                  << PVR_ROGUE_PDSINST_DDMAD_FIELDS_SRC3_AO_SHIFT;

               switch (vertex_element->component_size) {
               case 4: {
                  dma_control_word |=
                     PVR_ROGUE_PDSINST_DDMAD_FIELDS_SRC3_WORDSIZE_FOUR;
                  break;
               }
               case 3: {
                  dma_control_word |=
                     PVR_ROGUE_PDSINST_DDMAD_FIELDS_SRC3_WORDSIZE_THREE;
                  break;
               }
               case 2: {
                  dma_control_word |=
                     PVR_ROGUE_PDSINST_DDMAD_FIELDS_SRC3_WORDSIZE_TWO;
                  break;
               }
               default: {
                  dma_control_word |=
                     PVR_ROGUE_PDSINST_DDMAD_FIELDS_SRC3_WORDSIZE_ONE;
                  break;
               }
               }

               dma_control_word |=
                  PVR_ROGUE_PDSINST_DDMAD_FIELDS_SRC3_REPEAT_REPEAT;

               dma_control_word |=
                  PVR_ROGUE_PDSINST_DDMAD_FIELDS_SRC3_DEST_UNIFIED_STORE |
                  PVR_ROGUE_PDSINST_DDMAD_FIELDS_SRC3_CMODE_CACHED;

               /* If this is the last dma then also set the last flag. */
               if (terminate) {
                  dma_control_word |=
                     PVR_ROGUE_PDSINST_DDMAD_FIELDS_SRC3_LAST_EN;
               }

               /* Write the 32-Bit SRC3 word to a 64-bit constant as per
                * spec.
                */
               pvr_pds_write_wide_constant(buffer,
                                           dma_control_constant64,
                                           (uint64_t)dma_control_word);
            }

            if (gen_mode == PDS_GENERATE_CODE_SEGMENT) {
               /* Multiply by the vertex stream stride and add the base
                * followed by a DOUTD.
                *
                * dmad32 (C0 * T0) + C1, C2
                * src0 = stride  src1 = index  src2 = baseaddr src3 =
                * doutd part
                */
               *buffer++ = pvr_pds_inst_encode_ddmad(
                  /* cc */ 0,
                  /* END */ 0,
                  /* SRC0 */ stride_constant32, /* Stride 32-bit*/
                  /* SRC1 */ temp, /* Index 32-bit*/
                  /* SRC2 64-bit */ dma_address_constant64 >> 1, /* Stream
                                                                  * Address
                                                                  * +
                                                                  * Offset.
                                                                  */
                  /* SRC3 64-bit */ dma_control_constant64 >> 1 /* DMA
                                                                 * Transfer
                                                                 * Control
                                                                 * Word.
                                                                 */
               );
            }

            code_size += 1;
         } /* End of repeat DMA. */
      } /* Element loop */
   } /* Stream loop */

   if (issue_empty_ddmad) {
      /* Issue an empty last DDMAD, always executed. */
      if (gen_mode == PDS_GENERATE_DATA_SEGMENT) {
         pvr_pds_write_wide_constant(
            buffer,
            empty_dma_control_constant64,
            PVR_ROGUE_PDSINST_DDMAD_FIELDS_SRC3_LAST_EN);
      }

      code_size += 1;

      if (gen_mode == PDS_GENERATE_CODE_SEGMENT) {
         *buffer++ = pvr_pds_inst_encode_ddmad(
            /* cc */ 0,
            /* END */ 0,
            /* SRC0 */ stride_constant32, /* Stride 32-bit*/
            /* SRC1 */ temp, /* Index 32-bit*/
            /* SRC2 64-bit */ dma_address_constant64 >> 1, /* Stream
                                                            *Address +
                                                            *Offset.
                                                            */
            /* SRC3 64-bit */ empty_dma_control_constant64 >> 1 /* DMA
                                                                 * Transfer
                                                                 * Control
                                                                 * Word.
                                                                 */
         );
      }
   }

   if (!PVR_HAS_FEATURE(dev_info, pds_ddmadt)) {
      if (current_p0) {
         code_size += 1;

         if (gen_mode == PDS_GENERATE_CODE_SEGMENT) {
            /* Revert predicate back to IF0 which is required by DOUTU. */
            *buffer++ =
               pvr_pds_encode_bra(PVR_ROGUE_PDSINST_PREDICATE_KEEP, /* SRCCC
                                                                     */
                                  0, /* Neg */
                                  PVR_ROGUE_PDSINST_PREDICATE_IF0, /* SETCC
                                                                    */
                                  1); /* Addr */
         }
      }
   }
   /* Send VertexID if requested. */
   if (program->iterate_vtx_id) {
      if (program->draw_indirect) {
         if (gen_mode == PDS_GENERATE_CODE_SEGMENT) {
            *buffer++ = pvr_pds_inst_encode_add32(
               /* cc */ 0x0,
               /* ALUM */ 0, /* Unsigned */
               /* SNA */ 1, /* Minus */
               /* SRC0 32b */ input_register0, /* vertexID */
               /* SRC1 32b */ PVR_ROGUE_PDSINST_REGS32_PTEMP32_LOWER, /* base
                                                                       * vertexID.
                                                                       */
               /* DST 32b */ input_register0);
         }

         code_size += 1;
      }

      if (gen_mode == PDS_GENERATE_DATA_SEGMENT) {
         uint32_t doutw = pvr_pds_encode_doutw_src1(
            program->vtx_id_register,
            PVR_PDS_DOUTW_LOWER32,
            PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_DEST_UNIFIED_STORE,
            false,
            dev_info);

         if (!program->iterate_instance_id && !program->iterate_remap_id)
            doutw |= PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_LAST_EN;

         pvr_pds_write_constant32(buffer,
                                  vertex_id_control_word_const32,
                                  doutw);
      } else if (gen_mode == PDS_GENERATE_CODE_SEGMENT) {
         *buffer++ = pvr_pds_encode_doutw64(
            /* cc */ 0,
            /* END */ 0,
            /* SRC1 */ vertex_id_control_word_const32, /* DOUTW 32-bit Src1
                                                        */
            /* SRC0 */ input_register0 >> 1); /* DOUTW 64-bit Src0 */
      }

      code_size += 1;
   }

   /* Send InstanceID if requested. */
   if (program->iterate_instance_id) {
      if (gen_mode == PDS_GENERATE_DATA_SEGMENT) {
         uint32_t doutw = pvr_pds_encode_doutw_src1(
            program->instance_id_register,
            PVR_PDS_DOUTW_UPPER32,
            PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_DEST_UNIFIED_STORE,
            true,
            dev_info);

         if (!program->iterate_remap_id)
            doutw |= PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_LAST_EN;

         pvr_pds_write_constant32(buffer,
                                  instance_id_control_word_const32,
                                  doutw);
      } else if (gen_mode == PDS_GENERATE_CODE_SEGMENT) {
         *buffer++ = pvr_pds_encode_doutw64(
            /* cc */ 0,
            /* END */ 0,
            /* SRC1 */ instance_id_control_word_const32, /* DOUTW 32-bit Src1 */
            /* SRC0 */ input_register1 >> 1); /* DOUTW 64-bit Src0 */
      }

      code_size += 1;
   }

   /* Send remapped index number to vi0. */
   if (program->iterate_remap_id) {
      if (gen_mode == PDS_GENERATE_DATA_SEGMENT) {
         uint32_t doutw = pvr_pds_encode_doutw_src1(
            0 /* vi0 */,
            PVR_PDS_DOUTW_LOWER32,
            PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_DEST_UNIFIED_STORE |
               PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_LAST_EN,
            false,
            dev_info);

         pvr_pds_write_constant64(buffer,
                                  geometry_id_control_word_const64,
                                  doutw,
                                  0);
      } else if (gen_mode == PDS_GENERATE_CODE_SEGMENT) {
         *buffer++ = pvr_pds_encode_doutw64(
            /* cc */ 0,
            /* END */ 0,
            /* SRC1 */ geometry_id_control_word_const64, /* DOUTW 32-bit
                                                          * Src1
                                                          */
            /* SRC0 */ input_register2 >> 1); /* DOUTW 64-bit Src0 */
      }

      code_size += 1;
   }

   /* Copy the USC task control words to constants. */
   if (gen_mode == PDS_GENERATE_DATA_SEGMENT) {
      pvr_pds_write_wide_constant(buffer,
                                  usc_control_constant64,
                                  program->usc_task_control.src0); /* 64-bit
                                                                    * Src0
                                                                    */
      if (program->stream_patch_offsets) {
         /* USC TaskControl is always the first patch. */
         program->stream_patch_offsets[0] = usc_control_constant64 >> 1;
      }
   }

   if (gen_mode == PDS_GENERATE_CODE_SEGMENT) {
      /* Conditionally (if last in task) issue the task to the USC
       * (if0) DOUTU src1=USC Code Base address, src2=DOUTU word 2.
       */

      *buffer++ = pvr_pds_encode_doutu(
         /* cc */ 1,
         /* END */ 1,
         /* SRC0 */ usc_control_constant64 >> 1); /* DOUTU 64-bit Src0 */

      /* End the program if the Dout did not already end it. */
      *buffer++ = pvr_pds_inst_encode_halt(0);
   }

   code_size += 2;

   if (gen_mode == PDS_GENERATE_DATA_SEGMENT) {
      /* Set the data segment pointer and ensure we return 1 past the buffer
       * ptr.
       */
      program->data_segment = buffer;

      buffer += consts_size;
   }

   program->temps_used = temps_used;
   program->data_size = consts_size;
   program->code_size = code_size;
   program->ddmadt_enables = ddmadt_enables;
   if (!PVR_HAS_FEATURE(dev_info, pds_ddmadt))
      program->skip_stream_flag = skip_stream_flag;

   return buffer;
}

/**
 * Generates a PDS program to load USC compute shader global/local/workgroup
 * sizes/ids and then a DOUTU to execute the USC.
 *
 * The function is meant to be called once per generation mode on the same
 * program description:
 *
 *  - PDS_GENERATE_SIZES: nothing is written to buffer. The code size is
 *    accumulated in bytes (sizeof(uint32_t) per instruction) and stored in
 *    program->code_size, and program->coeff_update_task_branch_size is set to
 *    the number of instructions that make up the coefficient update part of
 *    the program.
 *  - PDS_GENERATE_DATA_SEGMENT: the constants are written to buffer and
 *    program->data_segment is set to buffer.
 *  - PDS_GENERATE_CODE_SEGMENT: the instructions are written to buffer.
 *    program->coeff_update_task_branch_size must already be non-zero (this is
 *    asserted), i.e. a PDS_GENERATE_SIZES pass must have been run first. The
 *    value is left untouched by this mode so the program description can be
 *    reused for further code generation passes.
 *
 * In every mode program->temps_used, program->highest_temp and
 * program->data_size are updated, as are the constant/temp offsets the
 * program description exposes for later patching.
 *
 * \param program Pointer to description of the program that should be
 *                generated.
 * \param buffer Pointer to buffer that receives the output of this function.
 *               This will be either the data segment, or the code depending on
 *               gen_mode.
 * \param gen_mode Which part to generate, either data segment or code segment.
 *                 If PDS_GENERATE_SIZES is specified, nothing is written, but
 *                 size information in program is updated.
 * \param dev_info PVR device info struct.
 * \returns Pointer to just beyond the buffer for the data - i.e. the value of
 *          the buffer after writing its contents.
 */
uint32_t *
pvr_pds_compute_shader(struct pvr_pds_compute_shader_program *restrict program,
                       uint32_t *restrict buffer,
                       enum pvr_pds_generate_mode gen_mode,
                       const struct pvr_device_info *dev_info)
{
   /* Constant/temp indices are zero-initialised so that no path can ever read
    * an indeterminate value, even though each one is only consumed under the
    * same condition that allocates it.
    */
   uint32_t usc_control_constant64 = 0;
   uint32_t usc_control_constant64_coeff_update = 0;
   uint32_t zero_constant64 = 0;

   uint32_t data_size = 0;
   uint32_t code_size = 0;
   uint32_t temps_used = 0;
   uint32_t doutw = 0;

   uint32_t barrier_ctrl_word = 0;
   uint32_t barrier_ctrl_word2 = 0;

   /* Even though there are 3 IDs for local and global we only need max one
    * DOUTW for local, and two for global.
    */
   uint32_t work_group_id_ctrl_words[2] = { 0 };
   uint32_t local_id_ctrl_word = 0;
   uint32_t local_input_register = 0;

   /* Constants used by the conditional rendering sequence: the source of the
    * predicate LD and the value the loaded predicate is XORed with.
    */
   uint64_t predicate_ld_src0_constant = 0;
   uint32_t cond_render_negate_constant = 0;

   uint32_t cond_render_pred_temp = 0;
   uint32_t cond_render_negate_temp = 0;

   /* 2x 64 bit registers that will mask out the Predicate load. */
   uint32_t cond_render_pred_mask_constant = 0;

#if MESA_DEBUG
   /* Poison the data segment; relies on program->data_size from an earlier
    * pass.
    */
   if (gen_mode == PDS_GENERATE_DATA_SEGMENT) {
      for (uint32_t j = 0; j < program->data_size; j++)
         buffer[j] = 0xDEADBEEF;
   }
#endif

   /* All the compute input registers are in temps. */
   temps_used += PVR_PDS_NUM_COMPUTE_INPUT_REGS;

   uint32_t next_temp = PVR_PDS_TEMPS_BLOCK_BASE + temps_used;

   uint32_t next_constant = PVR_PDS_CONSTANTS_BLOCK_BASE;

   /* A 64-bit zero constant is shared by the barrier initialisation, the
    * persistent temp clear and the conditional rendering sequence.
    */
   const bool needs_zero_constant =
      (program->barrier_coefficient != PVR_PDS_COMPUTE_INPUT_REG_UNUSED) ||
      (program->clear_pds_barrier) ||
      (program->kick_usc && program->conditional_render);

   /* Allocate all constants and temps. This is done identically in every
    * generation mode so that the layout is the same for sizes, data and code.
    */
   if (program->kick_usc) {
      /* Copy the USC task control words to constants. */
      usc_control_constant64 =
         pvr_pds_get_constants(&next_constant, 2, &data_size);
   }

   if (program->has_coefficient_update_task) {
      usc_control_constant64_coeff_update =
         pvr_pds_get_constants(&next_constant, 2, &data_size);
   }

   if (program->conditional_render) {
      predicate_ld_src0_constant =
         pvr_pds_get_constants(&next_constant, 2, &data_size);
      cond_render_negate_constant =
         pvr_pds_get_constants(&next_constant, 2, &data_size);
      cond_render_pred_mask_constant =
         pvr_pds_get_constants(&next_constant, 4, &data_size);

      /* LD will load a 64 bit value. */
      cond_render_pred_temp = pvr_pds_get_temps(&next_temp, 4, &temps_used);
      cond_render_negate_temp = pvr_pds_get_temps(&next_temp, 2, &temps_used);

      /* Export the offsets so the caller can locate these values later. */
      program->cond_render_const_offset_in_dwords = predicate_ld_src0_constant;
      program->cond_render_pred_temp = cond_render_pred_temp;
   }

   if (needs_zero_constant)
      zero_constant64 = pvr_pds_get_constants(&next_constant, 2, &data_size);

   if (program->barrier_coefficient != PVR_PDS_COMPUTE_INPUT_REG_UNUSED) {
      barrier_ctrl_word = pvr_pds_get_constants(&next_constant, 1, &data_size);
      if (PVR_HAS_QUIRK(dev_info, 51210)) {
         barrier_ctrl_word2 =
            pvr_pds_get_constants(&next_constant, 1, &data_size);
      }
   }

   if (program->work_group_input_regs[0] != PVR_PDS_COMPUTE_INPUT_REG_UNUSED ||
       program->work_group_input_regs[1] != PVR_PDS_COMPUTE_INPUT_REG_UNUSED) {
      work_group_id_ctrl_words[0] =
         pvr_pds_get_constants(&next_constant, 1, &data_size);
   }

   if (program->work_group_input_regs[2] != PVR_PDS_COMPUTE_INPUT_REG_UNUSED) {
      work_group_id_ctrl_words[1] =
         pvr_pds_get_constants(&next_constant, 1, &data_size);
   }

   if ((program->local_input_regs[0] != PVR_PDS_COMPUTE_INPUT_REG_UNUSED) ||
       (program->local_input_regs[1] != PVR_PDS_COMPUTE_INPUT_REG_UNUSED) ||
       (program->local_input_regs[2] != PVR_PDS_COMPUTE_INPUT_REG_UNUSED)) {
      local_id_ctrl_word = pvr_pds_get_constants(&next_constant, 1, &data_size);
   }

   if (program->add_base_workgroup) {
      /* One constant per used work-group component; only the offset is
       * recorded here, no value is written by this function.
       */
      for (uint32_t workgroup_component = 0; workgroup_component < 3;
           workgroup_component++) {
         if (program->work_group_input_regs[workgroup_component] !=
             PVR_PDS_COMPUTE_INPUT_REG_UNUSED) {
            program
               ->base_workgroup_constant_offset_in_dwords[workgroup_component] =
               pvr_pds_get_constants(&next_constant, 1, &data_size);
         }
      }
   }

   if (gen_mode == PDS_GENERATE_DATA_SEGMENT) {
      if (program->kick_usc) {
         /* Src0 for DOUTU */
         pvr_pds_write_wide_constant(buffer,
                                     usc_control_constant64,
                                     program->usc_task_control.src0); /* 64-bit
                                                                       * Src0.
                                                                       */
      }

      if (program->has_coefficient_update_task) {
         /* Src0 for DOUTU. */
         pvr_pds_write_wide_constant(
            buffer,
            usc_control_constant64_coeff_update,
            program->usc_task_control_coeff_update.src0); /* 64-bit Src0 */
      }

      if (needs_zero_constant) {
         pvr_pds_write_wide_constant(buffer, zero_constant64, 0); /* 64-bit
                                                                   * Src0
                                                                   */
      }

      if (program->barrier_coefficient != PVR_PDS_COMPUTE_INPUT_REG_UNUSED) {
         if (PVR_HAS_QUIRK(dev_info, 51210)) {
            /* Write the constant for the second coefficient register write
             * (barrier_coefficient + 4).
             */
            doutw = pvr_pds_encode_doutw_src1(
               program->barrier_coefficient + 4,
               PVR_PDS_DOUTW_LOWER64,
               PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_DEST_COMMON_STORE,
               true,
               dev_info);
            pvr_pds_write_constant32(buffer, barrier_ctrl_word2, doutw);
         }
         /* Write the constant for the coefficient register write. */
         doutw = pvr_pds_encode_doutw_src1(
            program->barrier_coefficient,
            PVR_PDS_DOUTW_LOWER64,
            PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_DEST_COMMON_STORE,
            true,
            dev_info);

         /* Check whether the barrier is going to be the last DOUTW done by
          * the coefficient sync task.
          */
         if ((program->work_group_input_regs[0] ==
              PVR_PDS_COMPUTE_INPUT_REG_UNUSED) &&
             (program->work_group_input_regs[1] ==
              PVR_PDS_COMPUTE_INPUT_REG_UNUSED) &&
             (program->work_group_input_regs[2] ==
              PVR_PDS_COMPUTE_INPUT_REG_UNUSED)) {
            doutw |= PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_LAST_EN;
         }

         pvr_pds_write_constant32(buffer, barrier_ctrl_word, doutw);
      }

      /* If we want work-group id X, see if we also want work-group id Y. */
      if (program->work_group_input_regs[0] !=
             PVR_PDS_COMPUTE_INPUT_REG_UNUSED &&
          program->work_group_input_regs[1] !=
             PVR_PDS_COMPUTE_INPUT_REG_UNUSED) {
         /* Make sure we are going to DOUTW them into adjacent registers
          * otherwise we can't do it in one.
          */
         assert(program->work_group_input_regs[1] ==
                (program->work_group_input_regs[0] + 1));

         doutw = pvr_pds_encode_doutw_src1(
            program->work_group_input_regs[0],
            PVR_PDS_DOUTW_LOWER64,
            PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_DEST_COMMON_STORE,
            true,
            dev_info);

         /* If we don't want the Z work-group id then this is the last one.
          */
         if (program->work_group_input_regs[2] ==
             PVR_PDS_COMPUTE_INPUT_REG_UNUSED) {
            doutw |= PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_LAST_EN;
         }

         pvr_pds_write_constant32(buffer, work_group_id_ctrl_words[0], doutw);
      }
      /* If we only want one of X or Y then handle them separately. */
      else {
         if (program->work_group_input_regs[0] !=
             PVR_PDS_COMPUTE_INPUT_REG_UNUSED) {
            doutw = pvr_pds_encode_doutw_src1(
               program->work_group_input_regs[0],
               PVR_PDS_DOUTW_LOWER32,
               PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_DEST_COMMON_STORE,
               true,
               dev_info);

            /* If we don't want the Z work-group id then this is the last
             * one.
             */
            if (program->work_group_input_regs[2] ==
                PVR_PDS_COMPUTE_INPUT_REG_UNUSED) {
               doutw |= PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_LAST_EN;
            }

            pvr_pds_write_constant32(buffer,
                                     work_group_id_ctrl_words[0],
                                     doutw);
         } else if (program->work_group_input_regs[1] !=
                    PVR_PDS_COMPUTE_INPUT_REG_UNUSED) {
            doutw = pvr_pds_encode_doutw_src1(
               program->work_group_input_regs[1],
               PVR_PDS_DOUTW_UPPER32,
               PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_DEST_COMMON_STORE,
               true,
               dev_info);

            /* If we don't want the Z work-group id then this is the last
             * one.
             */
            if (program->work_group_input_regs[2] ==
                PVR_PDS_COMPUTE_INPUT_REG_UNUSED) {
               doutw |= PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_LAST_EN;
            }

            pvr_pds_write_constant32(buffer,
                                     work_group_id_ctrl_words[0],
                                     doutw);
         }
      }

      /* Handle work-group id Z; when present it is always the last DOUTW of
       * the work-group id writes.
       */
      if (program->work_group_input_regs[2] !=
          PVR_PDS_COMPUTE_INPUT_REG_UNUSED) {
         doutw = pvr_pds_encode_doutw_src1(
            program->work_group_input_regs[2],
            PVR_PDS_DOUTW_UPPER32,
            PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_DEST_COMMON_STORE |
               PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_LAST_EN,
            true,
            dev_info);

         pvr_pds_write_constant32(buffer, work_group_id_ctrl_words[1], doutw);
      }

      /* Handle the local IDs. */
      if ((program->local_input_regs[1] != PVR_PDS_COMPUTE_INPUT_REG_UNUSED) ||
          (program->local_input_regs[2] != PVR_PDS_COMPUTE_INPUT_REG_UNUSED)) {
         uint32_t dest_reg;

         /* If we want local id Y and Z make sure the compiler wants them in
          * the same register.
          */
         if (!program->flattened_work_groups) {
            if ((program->local_input_regs[1] !=
                 PVR_PDS_COMPUTE_INPUT_REG_UNUSED) &&
                (program->local_input_regs[2] !=
                 PVR_PDS_COMPUTE_INPUT_REG_UNUSED)) {
               assert(program->local_input_regs[1] ==
                      program->local_input_regs[2]);
            }
         }

         if (program->local_input_regs[1] != PVR_PDS_COMPUTE_INPUT_REG_UNUSED)
            dest_reg = program->local_input_regs[1];
         else
            dest_reg = program->local_input_regs[2];

         /* If we want local id X and (Y or Z) then we can do that in a
          * single 64-bit DOUTW.
          */
         if (program->local_input_regs[0] != PVR_PDS_COMPUTE_INPUT_REG_UNUSED) {
            assert(dest_reg == (program->local_input_regs[0] + 1));

            doutw = pvr_pds_encode_doutw_src1(
               program->local_input_regs[0],
               PVR_PDS_DOUTW_LOWER64,
               PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_DEST_UNIFIED_STORE,
               true,
               dev_info);

            doutw |= PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_LAST_EN;

            pvr_pds_write_constant32(buffer, local_id_ctrl_word, doutw);
         }
         /* Otherwise just DMA in Y and Z together in a single 32-bit DOUTW.
          */
         else {
            doutw = pvr_pds_encode_doutw_src1(
               dest_reg,
               PVR_PDS_DOUTW_UPPER32,
               PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_DEST_UNIFIED_STORE,
               true,
               dev_info);

            doutw |= PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_LAST_EN;

            pvr_pds_write_constant32(buffer, local_id_ctrl_word, doutw);
         }
      }
      /* If we don't want Y or Z then just DMA in X in a single 32-bit DOUTW.
       */
      else if (program->local_input_regs[0] !=
               PVR_PDS_COMPUTE_INPUT_REG_UNUSED) {
         doutw = pvr_pds_encode_doutw_src1(
            program->local_input_regs[0],
            PVR_PDS_DOUTW_LOWER32,
            PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_DEST_UNIFIED_STORE |
               PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_LAST_EN,
            true,
            dev_info);

         pvr_pds_write_constant32(buffer, local_id_ctrl_word, doutw);
      }
   }

   if (gen_mode == PDS_GENERATE_CODE_SEGMENT ||
       gen_mode == PDS_GENERATE_SIZES) {
      const bool encode = (gen_mode == PDS_GENERATE_CODE_SEGMENT);

/* Emit one instruction when encoding, otherwise only account for its size in
 * bytes. X is not evaluated in the sizing pass. Wrapped in do/while (0) so
 * the macro behaves as a single statement.
 */
#define APPEND(X)                       \
   do {                                 \
      if (encode) {                     \
         *buffer = (X);                 \
         buffer++;                      \
      } else {                          \
         code_size += sizeof(uint32_t); \
      }                                 \
   } while (0)

      /* Assert that coeff_update_task_branch_size is > 0 because if it is 0
       * then we will be doing an infinite loop.
       */
      if (gen_mode == PDS_GENERATE_CODE_SEGMENT)
         assert(program->coeff_update_task_branch_size > 0);

      /* Test whether this is the coefficient update task or not. */
      APPEND(
         pvr_pds_encode_bra(PVR_ROGUE_PDSINST_PREDICATE_IF1, /* SRCC */
                            PVR_ROGUE_PDSINST_NEG_ENABLE, /* NEG */
                            PVR_ROGUE_PDSINST_PREDICATE_KEEP, /* SETC */
                            program->coeff_update_task_branch_size /* ADDR */));

      /* Do we need to initialize the barrier coefficient? */
      if (program->barrier_coefficient != PVR_PDS_COMPUTE_INPUT_REG_UNUSED) {
         if (PVR_HAS_QUIRK(dev_info, 51210)) {
            /* Initialize the second barrier coefficient registers to zero.
             */
            APPEND(pvr_pds_encode_doutw64(0, /* cc */
                                          0, /* END */
                                          barrier_ctrl_word2, /* SRC1 */
                                          zero_constant64 >> 1)); /* SRC0 */
         }
         /* Initialize the coefficient register to zero. */
         APPEND(pvr_pds_encode_doutw64(0, /* cc */
                                       0, /* END */
                                       barrier_ctrl_word, /* SRC1 */
                                       zero_constant64 >> 1)); /* SRC0 */
      }

      if (program->add_base_workgroup) {
         /* Offsets of the X, Y and Z work-group id temps relative to
          * PVR_PDS_CDM_WORK_GROUP_ID_X.
          */
         const uint32_t temp_values[3] = { 0, 1, 3 };
         for (uint32_t workgroup_component = 0; workgroup_component < 3;
              workgroup_component++) {
            if (program->work_group_input_regs[workgroup_component] ==
                PVR_PDS_COMPUTE_INPUT_REG_UNUSED)
               continue;

            /* temp = base workgroup constant + temp */
            APPEND(pvr_pds_inst_encode_add32(
               /* cc */ 0x0,
               /* ALUM */ 0,
               /* SNA */ 0,
               /* SRC0 (R32)*/ PVR_ROGUE_PDSINST_REGS32_CONST32_LOWER +
                  program->base_workgroup_constant_offset_in_dwords
                     [workgroup_component],
               /* SRC1 (R32)*/ PVR_ROGUE_PDSINST_REGS32_TEMP32_LOWER +
                  PVR_PDS_CDM_WORK_GROUP_ID_X +
                  temp_values[workgroup_component],
               /* DST  (R32TP)*/ PVR_ROGUE_PDSINST_REGS32TP_TEMP32_LOWER +
                  PVR_PDS_CDM_WORK_GROUP_ID_X +
                  temp_values[workgroup_component]));
         }
      }

      /* If we are going to put the work-group IDs in coefficients then we
       * just need to do the DOUTWs.
       */
      if ((program->work_group_input_regs[0] !=
           PVR_PDS_COMPUTE_INPUT_REG_UNUSED) ||
          (program->work_group_input_regs[1] !=
           PVR_PDS_COMPUTE_INPUT_REG_UNUSED)) {
         uint32_t dest_reg;

         if (program->work_group_input_regs[0] !=
             PVR_PDS_COMPUTE_INPUT_REG_UNUSED) {
            dest_reg = PVR_PDS_TEMPS_BLOCK_BASE + PVR_PDS_CDM_WORK_GROUP_ID_X;
         } else {
            dest_reg = PVR_PDS_TEMPS_BLOCK_BASE + PVR_PDS_CDM_WORK_GROUP_ID_Y;
         }

         APPEND(pvr_pds_encode_doutw64(0, /* cc */
                                       0, /* END */
                                       work_group_id_ctrl_words[0], /* SRC1
                                                                     */
                                       dest_reg >> 1)); /* SRC0 */
      }

      if (program->work_group_input_regs[2] !=
          PVR_PDS_COMPUTE_INPUT_REG_UNUSED) {
         APPEND(pvr_pds_encode_doutw64(
            0, /* cc */
            0, /* END */
            work_group_id_ctrl_words[1], /* SRC1 */
            (PVR_PDS_TEMPS_BLOCK_BASE + PVR_PDS_CDM_WORK_GROUP_ID_Z) >>
               1)); /* SRC0 */
      }

      /* Issue the task to the USC. */
      if (program->kick_usc && program->has_coefficient_update_task) {
         APPEND(pvr_pds_encode_doutu(0, /* cc */
                                     1, /* END */
                                     usc_control_constant64_coeff_update >>
                                        1)); /* SRC0; DOUTU 64-bit Src0 */
      }

      /* Encode a HALT */
      APPEND(pvr_pds_inst_encode_halt(0));

      /* Set the branch size used to skip the coefficient sync task.
       *
       * code_size is only accumulated by the sizing pass. When encoding it
       * stays at 0, so the value recorded by the sizing pass must not be
       * overwritten here or the program description could not be used for
       * another code generation pass.
       */
      if (!encode)
         program->coeff_update_task_branch_size = code_size / sizeof(uint32_t);

      /* DOUTW in the local IDs. */

      /* Whenever X is wanted (alone or together with Y/Z) the DOUTW source
       * starts at the X temp, otherwise it is the combined YZ temp.
       */
      if (program->local_input_regs[0] != PVR_PDS_COMPUTE_INPUT_REG_UNUSED) {
         local_input_register =
            PVR_PDS_TEMPS_BLOCK_BASE + PVR_PDS_CDM_LOCAL_ID_X;
      } else if (program->local_input_regs[1] !=
                    PVR_PDS_COMPUTE_INPUT_REG_UNUSED ||
                 program->local_input_regs[2] !=
                    PVR_PDS_COMPUTE_INPUT_REG_UNUSED) {
         local_input_register =
            PVR_PDS_TEMPS_BLOCK_BASE + PVR_PDS_CDM_LOCAL_ID_YZ;
      }

      if ((program->local_input_regs[0] != PVR_PDS_COMPUTE_INPUT_REG_UNUSED) ||
          (program->local_input_regs[1] != PVR_PDS_COMPUTE_INPUT_REG_UNUSED) ||
          (program->local_input_regs[2] != PVR_PDS_COMPUTE_INPUT_REG_UNUSED)) {
         APPEND(pvr_pds_encode_doutw64(0, /* cc */
                                       0, /* END */
                                       local_id_ctrl_word, /* SRC1 */
                                       local_input_register >> 1)); /* SRC0
                                                                     */
      }

      if (program->clear_pds_barrier) {
         /* Zero the persistent temp (SW fence for context switch). */
         APPEND(pvr_pds_inst_encode_add64(
            0, /* cc */
            PVR_ROGUE_PDSINST_ALUM_UNSIGNED,
            PVR_ROGUE_PDSINST_MAD_SNA_ADD,
            PVR_ROGUE_PDSINST_REGS64_CONST64_LOWER +
               (zero_constant64 >> 1), /* src0 = 0 */
            PVR_ROGUE_PDSINST_REGS64_CONST64_LOWER +
               (zero_constant64 >> 1), /* src1 = 0 */
            PVR_ROGUE_PDSINST_REGS64TP_PTEMP64_LOWER + 0)); /* dest =
                                                             * ptemp64[0]
                                                             */
      }

      /* If this is a fence, issue the DOUTC. */
      if (program->fence) {
         APPEND(pvr_pds_inst_encode_doutc(0, /* cc */
                                          0 /* END */));
      }

      if (program->kick_usc) {
         if (program->conditional_render) {
            /* Skip if coefficient update task.
             *
             * NOTE(review): 16 appears to be the number of instructions from
             * this BRA up to the DOUTU below; keep it in sync if the sequence
             * in between changes.
             */
            APPEND(pvr_pds_inst_encode_bra(PVR_ROGUE_PDSINST_PREDICATE_IF1,
                                           0,
                                           PVR_ROGUE_PDSINST_PREDICATE_KEEP,
                                           16));

            /* Load the predicate. */
            APPEND(pvr_pds_inst_encode_ld(0, predicate_ld_src0_constant >> 1));

            /* Load negate constant into temp for CMP. */
            APPEND(pvr_pds_inst_encode_add64(
               0, /* cc */
               PVR_ROGUE_PDSINST_ALUM_UNSIGNED,
               PVR_ROGUE_PDSINST_MAD_SNA_ADD,
               PVR_ROGUE_PDSINST_REGS64_CONST64_LOWER +
                  (cond_render_negate_constant >> 1), /* src0 = negate
                                                       * constant
                                                       */
               PVR_ROGUE_PDSINST_REGS64_CONST64_LOWER +
                  (zero_constant64 >> 1), /* src1 = 0 */
               PVR_ROGUE_PDSINST_REGS64TP_TEMP64_LOWER +
                  (cond_render_negate_temp >> 1))); /* dest = negate temp */

            APPEND(pvr_pds_inst_encode_wdf(0));

            /* Mask each of the four loaded dwords and OR them all into the
             * first predicate temp.
             */
            for (uint32_t i = 0; i < 4; i++) {
               APPEND(pvr_pds_inst_encode_sftlp32(
                  1, /* enable immediate */
                  0, /* cc */
                  PVR_ROGUE_PDSINST_LOP_AND, /* LOP */
                  cond_render_pred_temp + i, /* SRC0 */
                  cond_render_pred_mask_constant + i, /* SRC1 */
                  0, /* SRC2 (Shift) */
                  cond_render_pred_temp + i)); /* DST */

               APPEND(
                  pvr_pds_inst_encode_sftlp32(1, /* enable immediate */
                                              0, /* cc */
                                              PVR_ROGUE_PDSINST_LOP_OR, /* LOP
                                                                         */
                                              cond_render_pred_temp + i, /* SRC0
                                                                          */
                                              cond_render_pred_temp, /* SRC1 */
                                              0, /* SRC2 (Shift) */
                                              cond_render_pred_temp)); /* DST */
            }

            APPEND(pvr_pds_inst_encode_limm(0, /* cc */
                                            cond_render_pred_temp + 1, /* SRC1
                                                                        */
                                            0, /* SRC0 */
                                            0)); /* GLOBALREG */

            /* Apply the negate value to the combined predicate. */
            APPEND(pvr_pds_inst_encode_sftlp32(1, /* enable immediate */
                                               0, /* cc */
                                               PVR_ROGUE_PDSINST_LOP_XOR, /* LOP
                                                                           */
                                               cond_render_pred_temp, /* SRC0 */
                                               cond_render_negate_temp, /* SRC1
                                                                         */
                                               0, /* SRC2 (Shift) */
                                               cond_render_pred_temp)); /* DST
                                                                         */

            /* Check that the predicate is 0. */
            APPEND(pvr_pds_inst_encode_cmpi(
               0, /* cc */
               PVR_ROGUE_PDSINST_COP_EQ, /* LOP */
               (cond_render_pred_temp >> 1) +
                  PVR_ROGUE_PDSINST_REGS64TP_TEMP64_LOWER, /* SRC0 */
               0)); /* SRC1 */

            /* If predicate is 0, skip DOUTU. */
            APPEND(pvr_pds_inst_encode_bra(
               PVR_ROGUE_PDSINST_PREDICATE_P0, /* SRCC:
                                                  P0 */
               0, /* NEG */
               PVR_ROGUE_PDSINST_PREDICATE_KEEP, /* SETC:
                                                    keep
                                                  */
               2));
         }

         /* Issue the task to the USC.
          * DoutU src1=USC Code Base address, src2=doutu word 2.
          */
         APPEND(pvr_pds_encode_doutu(1, /* cc */
                                     1, /* END */
                                     usc_control_constant64 >> 1)); /* SRC0;
                                                                     * DOUTU
                                                                     * 64-bit
                                                                     * Src0.
                                                                     */
      }

      /* End the program if the Dout did not already end it. */
      APPEND(pvr_pds_inst_encode_halt(0));
#undef APPEND
   }

   if (gen_mode == PDS_GENERATE_DATA_SEGMENT) {
      /* Set the data segment pointer and ensure we return 1 past the buffer
       * ptr.
       */
      program->data_segment = buffer;

      buffer += next_constant;
   }

   /* Require at least one DWORD of PDS data so the program runs. */
   data_size = MAX2(1, data_size);

   program->temps_used = temps_used;
   program->highest_temp = temps_used;
   program->data_size = data_size;
   /* code_size is only meaningful after the sizing pass (it is in bytes). */
   if (gen_mode == PDS_GENERATE_SIZES)
      program->code_size = code_size;

   return buffer;
}

/**
 * Generates the PDS vertex shader secondary attribute data or code block.
 *
 * The generated program optionally zeroes the persistent temp used as the PDS
 * barrier fence, issues DOUTWs for the immediate dword/qword values, DMAs
 * (DOUTD) into USC constants and finishes with an optional DOUTU. When none of
 * the DOUTW/DOUTD/DOUTU instructions are required the program is terminated by
 * an explicit HALT.
 *
 * program->data_size and program->code_size are updated regardless of
 * gen_mode. The buffer is only written to for PDS_GENERATE_DATA_SEGMENT and
 * PDS_GENERATE_CODE_SEGMENT.
 *
 * \param program Pointer to the PDS vertex shader program.
 * \param buffer Pointer to the buffer for the program.
 * \param gen_mode Generate code or data.
 * \param dev_info PVR device information struct.
 * \returns Pointer to just beyond the code/data.
 */
uint32_t *pvr_pds_vertex_shader_sa(
   struct pvr_pds_vertex_shader_sa_program *restrict program,
   uint32_t *restrict buffer,
   enum pvr_pds_generate_mode gen_mode,
   const struct pvr_device_info *dev_info)
{
   uint32_t next_constant;
   uint32_t data_size = 0;
   uint32_t code_size = 0;

   uint32_t usc_control_constant64 = 0;
   uint32_t dma_address_constant64 = 0;
   uint32_t dma_control_constant32 = 0;
   uint32_t doutw_value_constant64 = 0;
   uint32_t doutw_control_constant32 = 0;
   uint32_t fence_constant_word = 0;
   uint32_t *buffer_base;
   uint32_t kick_index;

   /* Number of DOUTW instructions (one per dword or qword value). */
   uint32_t total_num_doutw =
      program->num_dword_doutw + program->num_q_word_doutw;
   /* Number of constants taken by the DOUTW values (qwords take two). */
   uint32_t total_size_dma =
      program->num_dword_doutw + 2 * program->num_q_word_doutw;

   /* No DOUT instruction is emitted that could end the program, so an explicit
    * HALT is required. Shared by code emission and code size accounting so
    * that the two cannot disagree.
    */
   const bool needs_halt = !program->kick_usc &&
                           program->num_dma_kicks == 0 && total_num_doutw == 0;

   next_constant = PVR_PDS_CONSTANTS_BLOCK_BASE;

   /* Copy the DMA control words and USC task control words to constants.
    *
    * Arrange them so that the 64-bit words are together followed by the 32-bit
    * words.
    */
   if (program->kick_usc) {
      usc_control_constant64 =
         pvr_pds_get_constants(&next_constant, 2, &data_size);
   }

   if (program->clear_pds_barrier) {
      fence_constant_word =
         pvr_pds_get_constants(&next_constant, 2, &data_size);
   }
   dma_address_constant64 = pvr_pds_get_constants(&next_constant,
                                                  2 * program->num_dma_kicks,
                                                  &data_size);

   /* Assign all unaligned constants together to avoid alignment issues caused
    * by pvr_pds_get_constants with even allocation sizes.
    *
    * Layout within this allocation: DOUTW values, then DOUTW control words,
    * then DMA control words.
    */
   doutw_value_constant64 = pvr_pds_get_constants(
      &next_constant,
      total_size_dma + total_num_doutw + program->num_dma_kicks,
      &data_size);
   doutw_control_constant32 = doutw_value_constant64 + total_size_dma;
   dma_control_constant32 = doutw_control_constant32 + total_num_doutw;

   if (gen_mode == PDS_GENERATE_DATA_SEGMENT) {
      /* All constants are written relative to the start of the buffer; buffer
       * itself is only advanced so that the return value points past the data.
       */
      buffer_base = buffer;

      if (program->kick_usc) {
         /* DOUTU 64-bit Src0. */
         pvr_pds_write_wide_constant(buffer_base,
                                     usc_control_constant64,
                                     program->usc_task_control.src0);
         buffer += 2;
      }

      if (program->clear_pds_barrier) {
         /* Encode the fence constant src0. Fence barrier is initialized to
          * zero.
          */
         pvr_pds_write_wide_constant(buffer_base, fence_constant_word, 0);
         buffer += 2;
      }

      if (total_num_doutw > 0) {
         /* Qword values come first, followed by the dword values. The LAST
          * flag is set on the final DOUTW only when no DMA follows it.
          */
         for (uint32_t i = 0; i < program->num_q_word_doutw; i++) {
            /* Write the constant for the coefficient register write. */
            pvr_pds_write_constant64(buffer_base,
                                     doutw_value_constant64,
                                     program->q_word_doutw_value[2 * i],
                                     program->q_word_doutw_value[2 * i + 1]);
            pvr_pds_write_constant32(
               buffer_base,
               doutw_control_constant32,
               program->q_word_doutw_control[i] |
                  ((!program->num_dma_kicks && i == total_num_doutw - 1)
                      ? PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_LAST_EN
                      : 0));

            doutw_value_constant64 += 2;
            doutw_control_constant32 += 1;
         }

         for (uint32_t i = 0; i < program->num_dword_doutw; i++) {
            /* Write the constant for the coefficient register write. */
            pvr_pds_write_constant32(buffer_base,
                                     doutw_value_constant64,
                                     program->dword_doutw_value[i]);
            pvr_pds_write_constant32(
               buffer_base,
               doutw_control_constant32,
               program->dword_doutw_control[i] |
                  ((!program->num_dma_kicks && i == program->num_dword_doutw - 1)
                      ? PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_LAST_EN
                      : 0));

            doutw_value_constant64 += 1;
            doutw_control_constant32 += 1;
         }

         buffer += total_size_dma + total_num_doutw;
      }

      if (program->num_dma_kicks == 1) /* Most-common case. */
      {
         /* Src0 for DOUTD - Address. */
         pvr_pds_write_dma_address(buffer_base,
                                   dma_address_constant64,
                                   program->dma_address[0],
                                   false,
                                   dev_info);

         /* Src1 for DOUTD - Control Word. */
         pvr_pds_write_constant32(
            buffer_base,
            dma_control_constant32,
            program->dma_control[0] |
               PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTD_SRC1_LAST_EN);

         /* Move the buffer ptr along as we will return 1 past the buffer. */
         buffer += 3;
      } else if (program->num_dma_kicks > 1) {
         /* All kicks but the final one are written without the LAST flag. */
         for (kick_index = 0; kick_index < program->num_dma_kicks - 1;
              kick_index++) {
            /* Src0 for DOUTD - Address. */
            pvr_pds_write_dma_address(buffer_base,
                                      dma_address_constant64,
                                      program->dma_address[kick_index],
                                      false,
                                      dev_info);

            /* Src1 for DOUTD - Control Word. */
            pvr_pds_write_constant32(buffer_base,
                                     dma_control_constant32,
                                     program->dma_control[kick_index]);
            dma_address_constant64 += 2;
            dma_control_constant32 += 1;
         }

         /* Final kick; kick_index now indexes the last entry. */

         /* Src0 for DOUTD - Address. */
         pvr_pds_write_dma_address(buffer_base,
                                   dma_address_constant64,
                                   program->dma_address[kick_index],
                                   false,
                                   dev_info);

         /* Src1 for DOUTD - Control Word. */
         pvr_pds_write_constant32(
            buffer_base,
            dma_control_constant32,
            program->dma_control[kick_index] |
               PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTD_SRC1_LAST_EN);

         buffer += 3 * program->num_dma_kicks;
      }
   } else if (gen_mode == PDS_GENERATE_CODE_SEGMENT) {
      if (program->clear_pds_barrier) {
         /* Zero the persistent temp (SW fence for context switch):
          * ptemp[0] = fence constant (0) + fence constant (0).
          */
         *buffer++ = pvr_pds_inst_encode_add64(
            0, /* cc */
            PVR_ROGUE_PDSINST_ALUM_UNSIGNED,
            PVR_ROGUE_PDSINST_MAD_SNA_ADD,
            PVR_ROGUE_PDSINST_REGS64_CONST64_LOWER +
               (fence_constant_word >> 1), /* src0 = 0 */
            PVR_ROGUE_PDSINST_REGS64_CONST64_LOWER +
               (fence_constant_word >> 1), /* src1 = 0 */
            PVR_ROGUE_PDSINST_REGS64TP_PTEMP64_LOWER + 0); /* dest = ptemp[0] */
      }

      if (total_num_doutw > 0) {
         /* END is only set on the final DOUTW, and only when neither a DMA
          * nor a USC kick follows it.
          */
         for (uint32_t i = 0; i < program->num_q_word_doutw; i++) {
            /* Set the coefficient register to data value. */
            *buffer++ = pvr_pds_encode_doutw64(
               /* cc */ 0,
               /* END */ !program->num_dma_kicks && !program->kick_usc &&
                  (i == total_num_doutw - 1),
               /* SRC1 */ doutw_control_constant32,
               /* SRC0 */ doutw_value_constant64 >> 1);

            doutw_value_constant64 += 2;
            doutw_control_constant32 += 1;
         }

         for (uint32_t i = 0; i < program->num_dword_doutw; i++) {
            /* Set the coefficient register to data value. */
            *buffer++ = pvr_pds_encode_doutw64(
               /* cc */ 0,
               /* END */ !program->num_dma_kicks && !program->kick_usc &&
                  (i == program->num_dword_doutw - 1),
               /* SRC1 */ doutw_control_constant32,
               /* SRC0 */ doutw_value_constant64 >> 1);

            doutw_value_constant64 += 1;
            doutw_control_constant32 += 1;
         }
      }

      if (program->num_dma_kicks != 0) {
         /* DMA the state into the secondary attributes. */

         if (program->num_dma_kicks == 1) /* Most-common case. */
         {
            *buffer++ = pvr_pds_encode_doutd(
               /* cc */ 0,
               /* END */ !program->kick_usc,
               /* SRC1 */ dma_control_constant32, /* DOUTD 32-bit Src1 */
               /* SRC0 */ dma_address_constant64 >> 1); /* DOUTD 64-bit Src0 */
         } else {
            for (kick_index = 0; kick_index < program->num_dma_kicks;
                 kick_index++) {
               *buffer++ = pvr_pds_encode_doutd(
                  /* cc */ 0,
                  /* END */ (!program->kick_usc) &&
                     (kick_index + 1 == program->num_dma_kicks),
                  /* SRC1 */ dma_control_constant32, /* DOUTD 32-bit Src1 */
                  /* SRC0 */ dma_address_constant64 >> 1); /* DOUTD 64-bit
                                                            * Src0.
                                                            */
               dma_address_constant64 += 2;
               dma_control_constant32 += 1;
            }
         }
      }

      if (program->kick_usc) {
         /* Kick the USC. */
         *buffer++ = pvr_pds_encode_doutu(
            /* cc */ 0,
            /* END */ 1,
            /* SRC0 */ usc_control_constant64 >> 1); /* DOUTU 64-bit Src0 */
      }

      if (needs_halt)
         *buffer++ = pvr_pds_inst_encode_halt(0);
   }

   /* The code size must match what the code segment branch above emits. */
   code_size = program->num_dma_kicks + total_num_doutw;
   if (program->clear_pds_barrier)
      code_size++; /* ADD64 instruction. */

   if (program->kick_usc)
      code_size++; /* DOUTU instruction. */

   /* Account for the explicit HALT. This has to be counted even when the
    * barrier ADD64 is present, otherwise the reported size would be one
    * instruction short of what the code segment actually contains.
    */
   if (needs_halt)
      code_size++;

   program->data_size = data_size;
   program->code_size = code_size;

   return buffer;
}

/**
 * Writes the Uniform Data block for the PDS pixel shader secondary attributes
 * program.
 *
 * \param program Pointer to the PDS pixel shader secondary attributes program.
 * \param buffer Pointer to the buffer for the code/data.
 * \param gen_mode Either code or data can be generated or sizes only updated.
 * \returns Pointer to just beyond the buffer for the program/data.
 */
uint32_t *pvr_pds_pixel_shader_uniform_texture_code(
   struct pvr_pds_pixel_shader_sa_program *restrict program,
   uint32_t *restrict buffer,
   enum pvr_pds_generate_mode gen_mode)
{
   uint32_t *instruction;
   uint32_t code_size = 0;
   uint32_t data_size = 0;
   uint32_t temps_used = 0;
   uint32_t next_constant;

   assert((((uintptr_t)buffer) & (PDS_ROGUE_TA_STATE_PDS_ADDR_ALIGNSIZE - 1)) ==
          0);

   assert((gen_mode == PDS_GENERATE_CODE_SEGMENT && buffer) ||
          gen_mode == PDS_GENERATE_SIZES);

   /* clang-format off */
   /* Shape of code segment (note: clear is different)
    *
    *      Code
    *    +------------+
    *    | BRA if0    |
    *    | DOUTD      |
    *    |  ...       |
    *    | DOUTD.halt |
    *    | uniform    |
    *    | DOUTD      |
    *    |  ...       |
    *    |  ...       |
    *    | DOUTW      |
    *    |  ...       |
    *    |  ...       |
    *    | DOUTU.halt |
    *    | HALT       |
    *    +------------+
    */
   /* clang-format on */
   instruction = buffer;

   next_constant = PVR_PDS_CONSTANTS_BLOCK_BASE;

   /* The clear color can arrive packed in the right form in the first (or
    * first 2) dwords of the shared registers and the program will issue a
    * single doutw for this.
    */
   if (program->clear && program->packed_clear) {
      uint32_t color_constant1 =
         pvr_pds_get_constants(&next_constant, 2, &data_size);

      uint32_t control_word_constant1 =
         pvr_pds_get_constants(&next_constant, 2, &data_size);

      if (gen_mode == PDS_GENERATE_CODE_SEGMENT) {
         /* DOUTW the clear color to the USC constants. Predicate with
          * uniform loading flag (IF0).
          */
         *instruction++ = pvr_pds_encode_doutw64(
            /* cc */ 1, /* Only for uniform loading program. */
            /* END */ program->kick_usc ? 0 : 1, /* Last
                                                  * instruction
                                                  * for a clear.
                                                  */
            /* SRC1 */ control_word_constant1, /* DOUTW 32-bit Src1 */
            /* SRC0 */ color_constant1 >> 1); /* DOUTW 64-bit Src0 */

         code_size += 1;
      }
   } else if (program->clear) {
      uint32_t color_constant1, color_constant2;

      if (program->clear_color_dest_reg & 0x1) {
         uint32_t color_constant3, control_word_constant1,
            control_word_constant2, color_constant4;

         color_constant1 = pvr_pds_get_constants(&next_constant, 1, &data_size);
         color_constant2 = pvr_pds_get_constants(&next_constant, 2, &data_size);
         color_constant3 = pvr_pds_get_constants(&next_constant, 1, &data_size);

         control_word_constant1 =
            pvr_pds_get_constants(&next_constant, 2, &data_size);
         control_word_constant2 =
            pvr_pds_get_constants(&next_constant, 2, &data_size);
         color_constant4 = pvr_pds_get_constants(&next_constant, 2, &data_size);

         if (gen_mode == PDS_GENERATE_CODE_SEGMENT) {
            /* DOUTW the clear color to the USSE constants. Predicate with
             * uniform loading flag (IF0).
             */
            *instruction++ = pvr_pds_encode_doutw64(
               /* cc */ 1, /* Only for Uniform Loading program */
               /* END */ 0,
               /* SRC1 */ control_word_constant1, /* DOUTW 32-bit Src1 */
               /* SRC0 */ color_constant1 >> 1); /* DOUTW 64-bit Src0 */

            *instruction++ = pvr_pds_encode_doutw64(
               /* cc */ 1, /* Only for Uniform Loading program */
               /* END */ 0,
               /* SRC1 */ control_word_constant2, /* DOUTW 32-bit Src1 */
               /* SRC0 */ color_constant2 >> 1); /* DOUTW 64-bit Src0 */

            *instruction++ = pvr_pds_encode_doutw64(
               /* cc */ 1, /* Only for uniform loading program */
               /* END */ program->kick_usc ? 0 : 1, /* Last
                                                     * instruction
                                                     * for a clear.
                                                     */
               /* SRC1 */ color_constant4, /* DOUTW 32-bit Src1 */
               /* SRC0 */ color_constant3 >> 1); /* DOUTW 64-bit Src0 */
         }

         code_size += 3;
      } else {
         uint32_t control_word_constant, control_word_last_constant;

         /* Put the clear color and control words into the first 8
          * constants.
          */
         color_constant1 = pvr_pds_get_constants(&next_constant, 2, &data_size);
         color_constant2 = pvr_pds_get_constants(&next_constant, 2, &data_size);
         control_word_constant =
            pvr_pds_get_constants(&next_constant, 2, &data_size);
         control_word_last_constant =
            pvr_pds_get_constants(&next_constant, 2, &data_size);

         if (gen_mode == PDS_GENERATE_CODE_SEGMENT) {
            /* DOUTW the clear color to the USSE constants. Predicate with
             * uniform loading flag (IF0).
             */
            *instruction++ = pvr_pds_encode_doutw64(
               /* cc */ 1, /* Only for Uniform Loading program */
               /* END */ 0,
               /* SRC1 */ control_word_constant, /* DOUTW 32-bit Src1 */
               /* SRC0 */ color_constant1 >> 1); /* DOUTW 64-bit Src0 */

            *instruction++ = pvr_pds_encode_doutw64(
               /* cc */ 1, /* Only for uniform loading program */
               /* END */ program->kick_usc ? 0 : 1, /* Last
                                                     * instruction
                                                     * for a clear.
                                                     */
               /* SRC1 */ control_word_last_constant, /* DOUTW 32-bit Src1
                                                       */
               /* SRC0 */ color_constant2 >> 1); /* DOUTW 64-bit Src0 */
         }

         code_size += 2;
      }

      if (program->kick_usc) {
         uint32_t doutu_constant64;

         doutu_constant64 =
            pvr_pds_get_constants(&next_constant, 2, &data_size);

         if (gen_mode == PDS_GENERATE_CODE_SEGMENT) {
            /* Issue the task to the USC.
             *
             * dout ds1[constant_use], ds0[constant_use],
             * ds1[constant_use], emit
             */
            *instruction++ = pvr_pds_encode_doutu(
               /* cc */ 0,
               /* END */ 1,
               /* SRC0 */ doutu_constant64 >> 1); /* DOUTU 64-bit Src0
                                                   */
         }

         code_size += 1;
      }

      if (gen_mode == PDS_GENERATE_CODE_SEGMENT) {
         /* End the program. */
         *instruction++ = pvr_pds_inst_encode_halt(0);
      }
      code_size += 1;
   } else {
      uint32_t total_num_doutw =
         program->num_dword_doutw + program->num_q_word_doutw;
      bool both_textures_and_uniforms =
         ((program->num_texture_dma_kicks > 0) &&
          ((program->num_uniform_dma_kicks > 0 || total_num_doutw > 0) ||
           program->kick_usc));
      uint32_t doutu_constant64 = 0;

      if (both_textures_and_uniforms) {
         /* If the size of a PDS data section is 0, the hardware won't run
          * it. We therefore don't need to branch when there is only a
          * texture OR a uniform update program.
          */
         if (gen_mode == PDS_GENERATE_CODE_SEGMENT) {
            uint32_t branch_address =
               MAX2(1 + program->num_texture_dma_kicks, 2);

            /* Use If0 to BRAnch to uniform code. */
            *instruction++ = pvr_pds_encode_bra(
               /* SRCC */ PVR_ROGUE_PDSINST_PREDICATE_IF0,
               /* NEG */ PVR_ROGUE_PDSINST_NEG_DISABLE,
               /* SETC */ PVR_ROGUE_PDSINST_PREDICATE_KEEP,
               /* ADDR */ branch_address);
         }

         code_size += 1;
      }

      if (program->num_texture_dma_kicks > 0) {
         uint32_t dma_address_constant64;
         uint32_t dma_control_constant32;
         /* Allocate 3 constant spaces for each kick. The 64-bit constants
          * come first followed by the 32-bit constants.
          */
         dma_address_constant64 = PVR_PDS_CONSTANTS_BLOCK_BASE;
         dma_control_constant32 =
            dma_address_constant64 + (program->num_texture_dma_kicks * 2);

         for (uint32_t dma = 0; dma < program->num_texture_dma_kicks; dma++) {
            code_size += 1;
            if (gen_mode != PDS_GENERATE_CODE_SEGMENT)
               continue;

            /* DMA the state into the secondary attributes. */
            *instruction++ = pvr_pds_encode_doutd(
               /* cc */ 0,
               /* END */ dma == (program->num_texture_dma_kicks - 1),
               /* SRC1 */ dma_control_constant32, /* DOUT 32-bit Src1 */
               /* SRC0 */ dma_address_constant64 >> 1); /* DOUT
                                                         * 64-bit
                                                         * Src0
                                                         */
            dma_address_constant64 += 2;
            dma_control_constant32 += 1;
         }
      } else if (both_textures_and_uniforms) {
         if (gen_mode == PDS_GENERATE_CODE_SEGMENT) {
            /* End the program. */
            *instruction++ = pvr_pds_inst_encode_halt(0);
         }

         code_size += 1;
      }

      /* Reserve space at the beginning of the data segment for the DOUTU Task
       * Control if one is needed.
       */
      if (program->kick_usc) {
         doutu_constant64 =
            pvr_pds_get_constants(&next_constant, 2, &data_size);
      }

      /* Allocate 3 constant spaces for each DMA and 2 for a USC kick. The
       * 64-bit constants come first followed by the 32-bit constants.
       */
      uint32_t total_size_dma =
         program->num_dword_doutw + 2 * program->num_q_word_doutw;

      uint32_t dma_address_constant64 = pvr_pds_get_constants(
         &next_constant,
         program->num_uniform_dma_kicks * 3 + total_size_dma + total_num_doutw,
         &data_size);
      uint32_t doutw_value_constant64 =
         dma_address_constant64 + program->num_uniform_dma_kicks * 2;
      uint32_t dma_control_constant32 = doutw_value_constant64 + total_size_dma;
      uint32_t doutw_control_constant32 =
         dma_control_constant32 + program->num_uniform_dma_kicks;

      if (total_num_doutw > 0) {
         pvr_pds_get_constants(&next_constant, 0, &data_size);

         if (gen_mode == PDS_GENERATE_CODE_SEGMENT) {
            for (uint32_t i = 0; i < program->num_q_word_doutw; i++) {
               /* Set the coefficient register to data value. */
               *instruction++ = pvr_pds_encode_doutw64(
                  /* cc */ 0,
                  /* END */ !program->num_uniform_dma_kicks &&
                     !program->kick_usc && (i == total_num_doutw - 1),
                  /* SRC1 */ doutw_control_constant32,
                  /* SRC0 */ doutw_value_constant64 >> 1);

               doutw_value_constant64 += 2;
               doutw_control_constant32 += 1;
            }

            for (uint32_t i = 0; i < program->num_dword_doutw; i++) {
               /* Set the coefficient register to data value. */
               *instruction++ = pvr_pds_encode_doutw64(
                  /* cc */ 0,
                  /* END */ !program->num_uniform_dma_kicks &&
                     !program->kick_usc && (i == program->num_dword_doutw - 1),
                  /* SRC1 */ doutw_control_constant32,
                  /* SRC0 */ doutw_value_constant64 >> 1);

               doutw_value_constant64 += 1;
               doutw_control_constant32 += 1;
            }
         }
         code_size += total_num_doutw;
      }

      if (program->num_uniform_dma_kicks > 0) {
         for (uint32_t dma = 0; dma < program->num_uniform_dma_kicks; dma++) {
            code_size += 1;

            if (gen_mode != PDS_GENERATE_CODE_SEGMENT)
               continue;

            bool last_instruction = false;
            if (!program->kick_usc &&
                (dma == program->num_uniform_dma_kicks - 1)) {
               last_instruction = true;
            }
            /* DMA the state into the secondary attributes. */
            *instruction++ = pvr_pds_encode_doutd(
               /* cc */ 0,
               /* END */ last_instruction,
               /* SRC1 */ dma_control_constant32, /* DOUT 32-bit Src1
                                                   */
               /* SRC0 */ dma_address_constant64 >> 1); /* DOUT
                                                         * 64-bit
                                                         * Src0
                                                         */
            dma_address_constant64 += 2;
            dma_control_constant32 += 1;
         }
      }

      if (program->kick_usc) {
         if (gen_mode == PDS_GENERATE_CODE_SEGMENT) {
            /* Issue the task to the USC.
             *
             * dout ds1[constant_use], ds0[constant_use],
             * ds1[constant_use], emit
             */

            *instruction++ = pvr_pds_encode_doutu(
               /* cc */ 0,
               /* END */ 1,
               /* SRC0 */ doutu_constant64 >> 1); /* DOUTU 64-bit Src0 */
         }

         code_size += 1;
      } else if (program->num_uniform_dma_kicks == 0 && total_num_doutw == 0) {
         if (gen_mode == PDS_GENERATE_CODE_SEGMENT) {
            /* End the program. */
            *instruction++ = pvr_pds_inst_encode_halt(0);
         }

         code_size += 1;
      }
   }

   /* Minimum temp count is 1. */
   program->temps_used = MAX2(temps_used, 1);
   program->code_size = code_size;

   if (gen_mode == PDS_GENERATE_CODE_SEGMENT)
      return instruction;
   else
      return NULL;
}

/**
 * Writes the Uniform Data block for the PDS pixel shader secondary attributes
 * program.
 *
 * \param program Pointer to the PDS pixel shader secondary attributes program.
 * \param buffer Pointer to the buffer for the data.
 * \param gen_mode Either data can be generated or sizes only updated; code
 *                 generation is not supported by this function.
 * \param uniform Selects the uniform (true) rather than the texture (false)
 *                data segment; the clear color constants are only laid out
 *                when this is set.
 * \param dev_info PVR device information struct.
 * \returns Pointer to just beyond the buffer for the program/data.
 */
uint32_t *pvr_pds_pixel_shader_uniform_texture_data(
   struct pvr_pds_pixel_shader_sa_program *restrict program,
   uint32_t *restrict buffer,
   enum pvr_pds_generate_mode gen_mode,
   bool uniform,
   const struct pvr_device_info *dev_info)
{
   uint32_t *constants = buffer;
   uint32_t next_constant = PVR_PDS_CONSTANTS_BLOCK_BASE;
   uint32_t temps_used = 0;
   uint32_t data_size = 0;

   assert((((uintptr_t)buffer) & (PDS_ROGUE_TA_STATE_PDS_ADDR_ALIGNSIZE - 1)) ==
          0);

   assert(gen_mode != PDS_GENERATE_CODE_SEGMENT);

   /* Shape of data segment (note: clear is different).
    *
    *        Uniform            Texture
    *    +--------------+   +-------------+
    *    | USC Task   L |   | USC Task  L |
    *    |            H |   |           H |
    *    | DMA1 Src0  L |   | DMA1 Src0 L |
    *    |            H |   |           H |
    *    | DMA2 Src0  L |   |             |
    *    |            H |   |             |
    *    | DMA1 Src1    |   | DMA1 Src1   |
    *    | DMA2 Src1    |   |             |
    *    | DOUTW0 Src1  |   |             |
    *    | DOUTW1 Src1  |   |             |
    *    |   ...        |   |             |
    *    | DOUTWn Srcn  |   |             |
    *    | other data   |   |             |
    *    +--------------+   +-------------+
    */

   /* Generate the PDS pixel shader secondary attributes data.
    *
    * Packed Clear
    * The clear color can arrive packed in the right form in the first (or
    * first 2) dwords of the shared registers and the program will issue a
    * single DOUTW for this.
    */
   if (program->clear && uniform && program->packed_clear) {
      uint32_t color_constant1 =
         pvr_pds_get_constants(&next_constant, 2, &data_size);

      uint32_t control_word_constant1 =
         pvr_pds_get_constants(&next_constant, 2, &data_size);

      if (gen_mode == PDS_GENERATE_DATA_SEGMENT) {
         uint32_t doutw;

         pvr_pds_write_constant64(constants,
                                  color_constant1,
                                  program->clear_color[0],
                                  program->clear_color[1]);

         /* Load into first constant in common store. */
         doutw = pvr_pds_encode_doutw_src1(
            program->clear_color_dest_reg,
            PVR_PDS_DOUTW_LOWER64,
            PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_DEST_COMMON_STORE,
            false,
            dev_info);

         /* Set the last flag. */
         doutw |= PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_LAST_EN;
         pvr_pds_write_constant64(constants, control_word_constant1, doutw, 0);
      }
   } else if (program->clear && uniform) {
      uint32_t color_constant1, color_constant2;

      if (program->clear_color_dest_reg & 0x1) {
         uint32_t color_constant3, control_word_constant1,
            control_word_constant2, color_constant4;

         color_constant1 = pvr_pds_get_constants(&next_constant, 1, &data_size);
         color_constant2 = pvr_pds_get_constants(&next_constant, 2, &data_size);
         color_constant3 = pvr_pds_get_constants(&next_constant, 1, &data_size);

         control_word_constant1 =
            pvr_pds_get_constants(&next_constant, 2, &data_size);
         control_word_constant2 =
            pvr_pds_get_constants(&next_constant, 2, &data_size);
         color_constant4 = pvr_pds_get_constants(&next_constant, 2, &data_size);

         if (gen_mode == PDS_GENERATE_DATA_SEGMENT) {
            uint32_t doutw;

            pvr_pds_write_constant32(constants,
                                     color_constant1,
                                     program->clear_color[0]);

            pvr_pds_write_constant64(constants,
                                     color_constant2,
                                     program->clear_color[1],
                                     program->clear_color[2]);

            pvr_pds_write_constant32(constants,
                                     color_constant3,
                                     program->clear_color[3]);

            /* Load into first constant in common store. */
            doutw = pvr_pds_encode_doutw_src1(
               program->clear_color_dest_reg,
               PVR_PDS_DOUTW_LOWER32,
               PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_DEST_COMMON_STORE,
               false,
               dev_info);

            pvr_pds_write_constant64(constants,
                                     control_word_constant1,
                                     doutw,
                                     0);

            /* Move the destination register along. */
            doutw = pvr_pds_encode_doutw_src1(
               program->clear_color_dest_reg + 1,
               PVR_PDS_DOUTW_LOWER64,
               PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_DEST_COMMON_STORE,
               false,
               dev_info);

            pvr_pds_write_constant64(constants,
                                     control_word_constant2,
                                     doutw,
                                     0);

            /* Move the destination register along. */
            doutw = pvr_pds_encode_doutw_src1(
               program->clear_color_dest_reg + 3,
               PVR_PDS_DOUTW_LOWER32,
               PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_DEST_COMMON_STORE,
               false,
               dev_info);

            /* Set the last flag. */
            doutw |= PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_LAST_EN;
            pvr_pds_write_constant64(constants, color_constant4, doutw, 0);
         }
      } else {
         uint32_t control_word_constant, control_word_last_constant;

         /* Put the clear color and control words into the first 8
          * constants.
          */
         color_constant1 = pvr_pds_get_constants(&next_constant, 2, &data_size);
         color_constant2 = pvr_pds_get_constants(&next_constant, 2, &data_size);
         control_word_constant =
            pvr_pds_get_constants(&next_constant, 2, &data_size);
         control_word_last_constant =
            pvr_pds_get_constants(&next_constant, 2, &data_size);

         if (gen_mode == PDS_GENERATE_DATA_SEGMENT) {
            uint32_t doutw;
            pvr_pds_write_constant64(constants,
                                     color_constant1,
                                     program->clear_color[0],
                                     program->clear_color[1]);

            pvr_pds_write_constant64(constants,
                                     color_constant2,
                                     program->clear_color[2],
                                     program->clear_color[3]);

            /* Load into first constant in common store. */
            doutw = pvr_pds_encode_doutw_src1(
               program->clear_color_dest_reg,
               PVR_PDS_DOUTW_LOWER64,
               PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_DEST_COMMON_STORE,
               false,
               dev_info);

            pvr_pds_write_constant64(constants, control_word_constant, doutw, 0);

            /* Move the destination register along. */
            doutw &= PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_AO_CLRMSK;
            doutw |= (program->clear_color_dest_reg + 2)
                     << PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_AO_SHIFT;

            /* Set the last flag. */
            doutw |= PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_LAST_EN;
            pvr_pds_write_constant64(constants,
                                     control_word_last_constant,
                                     doutw,
                                     0);
         }
      }

      /* Constants for the DOUTU Task Control, if needed. */
      if (program->kick_usc) {
         uint32_t doutu_constant64 =
            pvr_pds_get_constants(&next_constant, 2, &data_size);

         if (gen_mode == PDS_GENERATE_DATA_SEGMENT) {
            pvr_pds_write_wide_constant(
               constants,
               doutu_constant64,
               program->usc_task_control.src0); /* 64-bit
                                                 */
            /* Src0 */
         }
      }
   } else {
      if (uniform) {
         /* Reserve space at the beginning of the data segment for the DOUTU
          * Task Control if one is needed.
          */
         if (program->kick_usc) {
            uint32_t doutu_constant64 =
               pvr_pds_get_constants(&next_constant, 2, &data_size);

            if (gen_mode == PDS_GENERATE_DATA_SEGMENT) {
               pvr_pds_write_wide_constant(
                  constants,
                  doutu_constant64,
                  program->usc_task_control.src0); /* 64-bit Src0 */
            }
         }

         uint32_t total_num_doutw =
            program->num_dword_doutw + program->num_q_word_doutw;
         uint32_t total_size_dma =
            program->num_dword_doutw + 2 * program->num_q_word_doutw;

         /* Allocate 3 constant spaces for each kick. The 64-bit constants
          * come first followed by the 32-bit constants.
          */
         uint32_t dma_address_constant64 =
            pvr_pds_get_constants(&next_constant,
                                  program->num_uniform_dma_kicks * 3 +
                                     total_size_dma + total_num_doutw,
                                  &data_size);
         uint32_t doutw_value_constant64 =
            dma_address_constant64 + program->num_uniform_dma_kicks * 2;
         uint32_t dma_control_constant32 =
            doutw_value_constant64 + total_size_dma;
         uint32_t doutw_control_constant32 =
            dma_control_constant32 + program->num_uniform_dma_kicks;

         if (total_num_doutw > 0) {
            if (gen_mode == PDS_GENERATE_DATA_SEGMENT) {
               for (uint32_t i = 0; i < program->num_q_word_doutw; i++) {
                  pvr_pds_write_constant64(
                     constants,
                     doutw_value_constant64,
                     program->q_word_doutw_value[2 * i],
                     program->q_word_doutw_value[2 * i + 1]);
                  pvr_pds_write_constant32(
                     constants,
                     doutw_control_constant32,
                     program->q_word_doutw_control[i] |
                        ((!program->num_uniform_dma_kicks &&
                          i == total_num_doutw - 1)
                            ? PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_LAST_EN
                            : 0));

                  doutw_value_constant64 += 2;
                  doutw_control_constant32 += 1;
               }

               for (uint32_t i = 0; i < program->num_dword_doutw; i++) {
                  pvr_pds_write_constant32(constants,
                                           doutw_value_constant64,
                                           program->dword_doutw_value[i]);
                  pvr_pds_write_constant32(
                     constants,
                     doutw_control_constant32,
                     program->dword_doutw_control[i] |
                        ((!program->num_uniform_dma_kicks &&
                          i == program->num_dword_doutw - 1)
                            ? PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_LAST_EN
                            : 0));

                  doutw_value_constant64 += 1;
                  doutw_control_constant32 += 1;
               }
            }
         }

         if (program->num_uniform_dma_kicks > 0) {
            uint32_t kick;

            if (gen_mode == PDS_GENERATE_DATA_SEGMENT) {
               for (kick = 0; kick < program->num_uniform_dma_kicks - 1;
                    kick++) {
                  /* Copy the dma control words to constants. */
                  pvr_pds_write_dma_address(constants,
                                            dma_address_constant64,
                                            program->uniform_dma_address[kick],
                                            false,
                                            dev_info);
                  pvr_pds_write_constant32(constants,
                                           dma_control_constant32,
                                           program->uniform_dma_control[kick]);

                  dma_address_constant64 += 2;
                  dma_control_constant32 += 1;
               }

               pvr_pds_write_dma_address(constants,
                                         dma_address_constant64,
                                         program->uniform_dma_address[kick],
                                         false,
                                         dev_info);
               pvr_pds_write_constant32(
                  constants,
                  dma_control_constant32,
                  program->uniform_dma_control[kick] |
                     PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTD_SRC1_LAST_EN);
            }
         }

      } else if (program->num_texture_dma_kicks > 0) {
         /* Allocate 3 constant spaces for each kick. The 64-bit constants
          * come first followed by the 32-bit constants.
          */
         uint32_t dma_address_constant64 =
            pvr_pds_get_constants(&next_constant,
                                  program->num_texture_dma_kicks * 3,
                                  &data_size);
         uint32_t dma_control_constant32 =
            dma_address_constant64 + (program->num_texture_dma_kicks * 2);

         if (gen_mode == PDS_GENERATE_DATA_SEGMENT) {
            uint32_t kick;
            for (kick = 0; kick < program->num_texture_dma_kicks - 1; kick++) {
               /* Copy the DMA control words to constants. */
               pvr_pds_write_dma_address(constants,
                                         dma_address_constant64,
                                         program->texture_dma_address[kick],
                                         false,
                                         dev_info);

               pvr_pds_write_constant32(constants,
                                        dma_control_constant32,
                                        program->texture_dma_control[kick]);

               dma_address_constant64 += 2;
               dma_control_constant32 += 1;
            }

            pvr_pds_write_dma_address(constants,
                                      dma_address_constant64,
                                      program->texture_dma_address[kick],
                                      false,
                                      dev_info);

            pvr_pds_write_constant32(
               constants,
               dma_control_constant32,
               program->texture_dma_control[kick] |
                  PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTD_SRC1_LAST_EN);
         }
      }
   }

   /* Save the data segment pointer and size. */
   program->data_segment = constants;

   /* Minimum temp count is 1. */
   program->temps_used = MAX2(temps_used, 1);
   program->data_size = data_size;

   if (gen_mode == PDS_GENERATE_DATA_SEGMENT)
      return (constants + next_constant);
   else
      return NULL;
}

/**
 * Generates a generic DOUTC PDS program.
 *
 * The data segment consists of two zero-filled 64-bit constants; the code
 * segment is a DOUTC instruction followed by a HALT. data_size and code_size
 * in \p program are reset to zero on entry, then the one matching the
 * generated segment is filled in.
 *
 * \param program Pointer to the PDS fence program to update.
 * \param buffer Pointer to the buffer for the program.
 * \param gen_mode Selects the segment to generate. Nothing is written for
 *                 PDS_GENERATE_SIZES.
 * \returns NULL for PDS_GENERATE_SIZES, otherwise a pointer to just beyond
 *          whatever was written (\p buffer itself when nothing was written).
 */
uint32_t *pvr_pds_generate_doutc(struct pvr_pds_fence_program *restrict program,
                                 uint32_t *restrict buffer,
                                 enum pvr_pds_generate_mode gen_mode)
{
   /* Every call starts from zeroed sizes, whichever mode was requested. */
   program->data_size = 0;
   program->code_size = 0;
   program->data_segment = buffer;

   if (gen_mode == PDS_GENERATE_SIZES)
      return NULL;

   if (gen_mode == PDS_GENERATE_DATA_SEGMENT) {
      /* Constants are counted in dwords from the start of the buffer. */
      uint32_t next_const = PVR_PDS_CONSTANTS_BLOCK_BASE;
      uint32_t num_dwords = 0;
      uint32_t src0_const;
      uint32_t src1_const;

      /* 64-bit Src0, written as zero. */
      src0_const = pvr_pds_get_constants(&next_const, 2, &num_dwords);
      pvr_pds_write_wide_constant(buffer, src0_const, 0);

      /* Src1 control word slot, also written as zero. */
      src1_const = pvr_pds_get_constants(&next_const, 2, &num_dwords);
      pvr_pds_write_constant64(buffer, src1_const, 0, 0);

      program->data_size = num_dwords;

      return buffer + num_dwords;
   }

   if (gen_mode == PDS_GENERATE_CODE_SEGMENT && buffer) {
      uint32_t *inst = buffer;

      *inst++ = pvr_pds_inst_encode_doutc(
         /* cc */ 0,
         /* END */ 0);

      /* End the program. */
      *inst++ = pvr_pds_inst_encode_halt(0);

      /* One dword per instruction emitted above. */
      program->code_size = (uint32_t)(inst - buffer);

      return inst;
   }

   /* Nothing generated: hand back the buffer unchanged. */
   return buffer;
}

/**
 * Generates a generic DOUTW PDS program: one DOUTW per 64-bit value held in
 * control->doutw_data.
 *
 * In the data segment each value is followed by its DOUTW control word; the
 * code segment holds one DOUTW instruction per value and, when
 * control->last_instruction is set, a trailing HALT.
 *
 * \param control Pointer to the DOUTW control structure. data_segment and
 *                data_size are updated when generating data, code_size when
 *                generating code.
 * \param buffer Pointer to the buffer for the program.
 * \param gen_mode Selects which segment to generate.
 * \param dev_info PVR device information structure.
 * \returns Pointer to just beyond the generated data or code; \p buffer itself
 *          when nothing was generated.
 */
uint32_t *pvr_pds_generate_doutw(struct pvr_pds_doutw_control *restrict control,
                                 uint32_t *restrict buffer,
                                 enum pvr_pds_generate_mode gen_mode,
                                 const struct pvr_device_info *dev_info)
{
   uint32_t value_slot[PVR_PDS_MAX_NUM_DOUTW_CONSTANTS];
   uint32_t ctrl_slot[PVR_PDS_MAX_NUM_DOUTW_CONSTANTS];
   uint32_t next_constant = PVR_PDS_CONSTANTS_BLOCK_BASE;
   uint32_t data_size = 0;

   /* The slot tables above cannot hold more entries than this. */
   assert(control->num_const64 <= PVR_PDS_MAX_NUM_DOUTW_CONSTANTS);

   /* Lay out the data region: value, control word, value, control word... */
   for (uint32_t i = 0; i < control->num_const64; i++) {
      value_slot[i] = pvr_pds_get_constants(&next_constant, 2, &data_size);
      ctrl_slot[i] = pvr_pds_get_constants(&next_constant, 2, &data_size);
   }

   if (gen_mode == PDS_GENERATE_DATA_SEGMENT) {
      /* Destination store is the same for every DOUTW of this program. */
      const uint32_t dest =
         (control->dest_store == PDS_COMMON_STORE)
            ? PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_DEST_COMMON_STORE
            : PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_DEST_UNIFIED_STORE;

      /* Data segment points to start of constants. */
      control->data_segment = buffer;

      for (uint32_t i = 0; i < control->num_const64; i++) {
         uint32_t src1;

         pvr_pds_write_constant64(buffer,
                                  value_slot[i],
                                  H32(control->doutw_data[i]),
                                  L32(control->doutw_data[i]));

         /* Loading starts at offset 0 and advances two dwords per value. */
         src1 = pvr_pds_encode_doutw_src1(2 * i,
                                          PVR_PDS_DOUTW_LOWER64,
                                          dest,
                                          false,
                                          dev_info);

         /* Set the last flag for the MCU on the final DOUTW (assume there
          * are no following DOUTD's).
          */
         if (i + 1 == control->num_const64)
            src1 |= PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_LAST_EN;

         pvr_pds_write_constant64(buffer, ctrl_slot[i], src1, 0);
      }

      control->data_size = data_size;

      return buffer + next_constant;
   }

   if (gen_mode == PDS_GENERATE_CODE_SEGMENT && buffer) {
      uint32_t *inst = buffer;

      for (uint32_t i = 0; i < control->num_const64; i++) {
         /* Only the final DOUTW of a terminating program carries END. */
         const bool end_flag = control->last_instruction &&
                               (i + 1 == control->num_const64);

         /* DOUTW the PDS data to the USC constants. */
         *inst++ = pvr_pds_encode_doutw64(
            /* cc */ 0,
            /* END */ end_flag,
            /* SRC1 */ ctrl_slot[i], /* DOUTW 32-bit Src1. */
            /* SRC0 */ value_slot[i] >> 1); /* DOUTW 64-bit Src0. */
      }

      /* End the program. */
      if (control->last_instruction)
         *inst++ = pvr_pds_inst_encode_halt(0);

      /* One dword per instruction emitted above. */
      control->code_size = (uint32_t)(inst - buffer);

      return inst;
   }

   /* Nothing generated: hand back the buffer unchanged. */
   return buffer;
}

/**
 * Generates generic kick DOUTU PDS program in a single data+code block.
 *
 * The data part is the 64-bit USC task control word (src0) written at the
 * start of \p buffer; the code part is a single DOUTU with END set. The
 * program's data_size, code_size and data_segment are updated on every call.
 *
 * \param program Pointer to the PDS kick USC.
 * \param buffer Pointer to the buffer for the program.
 * \param start_next_constant Next constant in data segment. Non-zero if another
 *                            instruction precedes the DOUTU.
 * \param cc_enabled If true then the DOUTU is predicated (cc set).
 * \param gen_mode Either code and data can be generated or sizes only updated.
 * \returns NULL for PDS_GENERATE_SIZES (or an unhandled mode); otherwise a
 *          pointer to just beyond the last thing written.
 */
uint32_t *pvr_pds_kick_usc(struct pvr_pds_kickusc_program *restrict program,
                           uint32_t *restrict buffer,
                           uint32_t start_next_constant,
                           bool cc_enabled,
                           enum pvr_pds_generate_mode gen_mode)
{
   const bool emit_data = (gen_mode == PDS_GENERATE_DATA_SEGMENT) ||
                          (gen_mode == PDS_GENERATE_CODEDATA_SEGMENTS);
   const bool emit_code = (gen_mode == PDS_GENERATE_CODE_SEGMENT) ||
                          (gen_mode == PDS_GENERATE_CODEDATA_SEGMENTS);

   /* The data segment has a fixed size; the code is a single DOUTU. */
   const uint32_t data_dwords = ROGUE_PDS_FIXED_PIXEL_SHADER_DATA_SIZE;
   const uint32_t code_dwords = 1;

   /* Stays 0 when only the code segment is generated. */
   uint32_t src0_const = 0;

   /* Update the program sizes. */
   program->data_size = data_dwords;
   program->code_size = code_dwords;
   program->data_segment = buffer;

   if (gen_mode == PDS_GENERATE_SIZES)
      return NULL;

   if (emit_data) {
      /* Constants are counted in dwords from the start of the buffer. */
      uint32_t next_constant = PVR_PDS_CONSTANTS_BLOCK_BASE;
      uint32_t unused_count = 0;

      /* Copy the USC task control word (64-bit Src0) to constants. */
      src0_const = pvr_pds_get_constants(&next_constant, 2, &unused_count);
      pvr_pds_write_wide_constant(buffer,
                                  src0_const,
                                  program->usc_task_control.src0);

      /* Step over the whole fixed-size data segment. */
      buffer += data_dwords;

      if (!emit_code)
         return buffer;
   }

   /* Execution should never get past here without code to emit. */
   if (!emit_code)
      return NULL;

   /* Issue the task to the USC.
    *
    * dout ds1[constant_use], ds0[constant_use], ds1[constant_use], emit ;
    * halt halt
    */
   *buffer = pvr_pds_encode_doutu(
      /* cc */ cc_enabled,
      /* END */ 1,
      /* SRC0 */ (src0_const + start_next_constant) >> 1); /* DOUTU 64-bit
                                                            * Src0
                                                            */

   /* Return pointer to just after last instruction. */
   return buffer + 1;
}

/**
 * Generates the conditional prologue used when compute barriers are
 * supported: a BRA, a HALT and a second BRA that restores IF0 as the
 * predicate.
 *
 * There is no data segment; only PDS_GENERATE_CODE_SEGMENT writes anything.
 *
 * \param buffer Pointer to the buffer for the program.
 * \param gen_mode Selects which segment to generate.
 * \returns Pointer to just beyond the three instructions for the code
 *          segment, otherwise \p buffer unchanged.
 */
uint32_t *pvr_pds_generate_compute_barrier_conditional(
   uint32_t *buffer,
   enum pvr_pds_generate_mode gen_mode)
{
   /* Compute barriers supported. Need to test for coeff sync task. */

   /* No data segment, and nothing to do for any other mode either. */
   if (gen_mode != PDS_GENERATE_CODE_SEGMENT)
      return buffer;

   /* Test whether this is the coefficient update task or not. */
   buffer[0] = pvr_pds_encode_bra(
      /* SRCC */ PVR_ROGUE_PDSINST_PREDICATE_IF0,
      /* NEG */ PVR_ROGUE_PDSINST_BRA_NEG_DISABLE,
      /* SETC */ PVR_ROGUE_PDSINST_PREDICATE_IF1,
      /* ADDR */ 1);

   /* Encode a HALT. */
   buffer[1] = pvr_pds_inst_encode_halt(1);

   /* Reset the default predicate to IF0. */
   buffer[2] = pvr_pds_encode_bra(
      /* SRCC */ PVR_ROGUE_PDSINST_PREDICATE_IF0,
      /* NEG */ PVR_ROGUE_PDSINST_BRA_NEG_DISABLE,
      /* SETC */ PVR_ROGUE_PDSINST_PREDICATE_IF0,
      /* ADDR */ 1);

   return buffer + 3;
}

/**
 * Generates program to kick the USC task to store shared.
 *
 * The program is the DOUTW sequence described by program->doutw_control
 * followed by the DOUTU kick described by program->usc_task. Data and code
 * are produced by separate calls; the code call reads the DOUTW data size
 * stored in program->doutw_control to locate the DOUTU constant.
 *
 * \param program Pointer to the PDS shared register.
 * \param buffer Pointer to the buffer for the program.
 * \param gen_mode Either code and data can be generated or sizes only updated.
 * \param dev_info PVR device information structure.
 * \returns Pointer to just beyond the buffer for the program, or NULL when
 *          \p gen_mode is neither the data nor the code segment.
 */
uint32_t *pvr_pds_generate_shared_storing_program(
   struct pvr_pds_shared_storing_program *restrict program,
   uint32_t *restrict buffer,
   enum pvr_pds_generate_mode gen_mode,
   const struct pvr_device_info *dev_info)
{
   struct pvr_pds_doutw_control *const doutw = &program->doutw_control;
   struct pvr_pds_kickusc_program *const kick = &program->usc_task;
   uint32_t *cursor;

   switch (gen_mode) {
   case PDS_GENERATE_DATA_SEGMENT:
      /* DOUTW constants first, DOUTU constants straight after them. */
      cursor = pvr_pds_generate_doutw(doutw, buffer, gen_mode, dev_info);
      program->data_size = doutw->data_size;

      cursor =
         pvr_pds_kick_usc(kick, cursor, 0, program->cc_enable, gen_mode);
      program->data_size += kick->data_size;

      return cursor;

   case PDS_GENERATE_CODE_SEGMENT:
      /* doutw	vi1, vi0
       * doutu	ds1[constant_use], ds0[constant_use], ds1[constant_use],
       * emit
       */
      cursor = pvr_pds_generate_doutw(doutw, buffer, gen_mode, dev_info);
      program->code_size = doutw->code_size;

      /* Offset into data segment follows on from doutw data segment. */
      cursor = pvr_pds_kick_usc(kick,
                                cursor,
                                doutw->data_size,
                                program->cc_enable,
                                gen_mode);
      program->code_size += kick->code_size;

      return cursor;

   default:
      /* PDS_GENERATE_SIZES and any other mode generate nothing. */
      return NULL;
   }
}

/**
 * Generates the fence/terminate PDS program built around a DOUTC.
 *
 * Data segment: the DOUTC constants, plus a 64-bit constant of value 1 when
 * PVR_NEED_SW_COMPUTE_PDS_BARRIER(dev_info) holds (its offset is stored in
 * program->fence_constant_word).
 * Code segment: the compute barrier conditional, an optional
 * lock/add64/release/cmpi/bra sequence under the same condition, then the
 * DOUTC program.
 *
 * \param program Pointer to the PDS fence program; data_size or code_size is
 *                updated for the generated segment.
 * \param buffer Pointer to the buffer for the program.
 * \param gen_mode Selects which segment to generate.
 * \param dev_info PVR device information structure.
 * \returns For the data segment, \p buffer itself (not advanced); for the
 *          code segment, a pointer to just beyond the last instruction; NULL
 *          for any other mode.
 */
uint32_t *pvr_pds_generate_fence_terminate_program(
   struct pvr_pds_fence_program *restrict program,
   uint32_t *restrict buffer,
   enum pvr_pds_generate_mode gen_mode,
   const struct pvr_device_info *dev_info)
{
   if (gen_mode == PDS_GENERATE_DATA_SEGMENT) {
      uint32_t num_dwords;

      /* DOUTC sources are not used, but they must be valid. */
      pvr_pds_generate_doutc(program, buffer, PDS_GENERATE_DATA_SEGMENT);
      num_dwords = program->data_size;

      if (PVR_NEED_SW_COMPUTE_PDS_BARRIER(dev_info)) {
         /* Append a 64-bit constant with value 1. Used to increment ptemp.
          * Keep the returned offset into the data segment for the code
          * segment to reference.
          */
         program->fence_constant_word =
            pvr_pds_append_constant64(buffer, 1, &num_dwords);
      }

      program->data_size = num_dwords;

      /* The start of the data segment is returned, not its end. */
      return buffer;
   }

   if (gen_mode == PDS_GENERATE_CODE_SEGMENT) {
      /* The barrier conditional always contributes three instructions. */
      uint32_t num_insts = 3;
      uint32_t *inst = pvr_pds_generate_compute_barrier_conditional(
         buffer,
         PDS_GENERATE_CODE_SEGMENT);

      if (PVR_NEED_SW_COMPUTE_PDS_BARRIER(dev_info)) {
         /* lock */
         *inst++ = pvr_pds_inst_encode_lock(/* cc */ 0);

         /* add64	pt[0], pt[0], #1 */
         *inst++ = pvr_pds_inst_encode_add64(
            /* cc */ 0,
            PVR_ROGUE_PDSINST_ALUM_UNSIGNED,
            PVR_ROGUE_PDSINST_MAD_SNA_ADD,
            /* src0 = ptemp[0] */
            PVR_ROGUE_PDSINST_REGS64_PTEMP64_LOWER + 0,
            /* src1 = 1 */
            PVR_ROGUE_PDSINST_REGS64_CONST64_LOWER +
               (program->fence_constant_word >> 1),
            /* dest = ptemp[0] */
            PVR_ROGUE_PDSINST_REGS64TP_PTEMP64_LOWER + 0);

         /* release */
         *inst++ = pvr_pds_inst_encode_release(/* cc */ 0);

         /* cmp		pt[0] EQ the device's num_clusters feature value */
         *inst++ = pvr_pds_inst_encode_cmpi(
            /* cc */ 0,
            PVR_ROGUE_PDSINST_COP_EQ,
            /* src0 = ptemp[0] */
            PVR_ROGUE_PDSINST_REGS64TP_PTEMP64_LOWER + 0,
            PVR_GET_FEATURE_VALUE(dev_info, num_clusters, 0));

         /* bra		-1 */
         *inst++ = pvr_pds_encode_bra(/* cc */ 0,
                                      /* PVR_ROGUE_PDSINST_BRA_NEG_ENABLE */ 1,
                                      /* PVR_ROGUE_PDSINST_BRA_SETC_P0 */ 0,
                                      /* bra PC */ -1);

         num_insts += 5;
      }

      /* DOUTC; it reports its own instruction count in program->code_size. */
      inst = pvr_pds_generate_doutc(program, inst, PDS_GENERATE_CODE_SEGMENT);
      num_insts += program->code_size;

      program->code_size = num_insts;

      return inst;
   }

   /* Execution should never reach here. */
   return NULL;
}

/**
 * Generates program to kick the USC task to load shared registers from memory.
 *
 * Data segment: the DOUTW constants, the DOUTU constants, then a zeroed
 * 64-bit fence constant. Code segment: an add64 that zeroes ptemp64[0] from
 * that fence constant, followed by the DOUTW and DOUTU instructions.
 *
 * The code segment must be generated after the data segment of the same
 * \p program: the location of the fence constant is re-derived from the data
 * sizes that the data-segment call stored in program->doutw_control and
 * program->usc_task. No state is kept between calls outside \p program, so
 * the function can be used for several programs concurrently.
 *
 * \param program Pointer to the PDS shared register.
 * \param buffer Pointer to the buffer for the program.
 * \param gen_mode Either code and data can be generated or sizes only updated.
 * \param dev_info PVR device information struct.
 * \returns Pointer to just beyond the buffer for the program, or NULL when
 *          \p gen_mode is neither the data nor the code segment.
 */
uint32_t *pvr_pds_generate_compute_shared_loading_program(
   struct pvr_pds_shared_storing_program *restrict program,
   uint32_t *restrict buffer,
   enum pvr_pds_generate_mode gen_mode,
   const struct pvr_device_info *dev_info)
{
   struct pvr_pds_kickusc_program *kick_usc_program = &program->usc_task;
   struct pvr_pds_doutw_control *doutw_control = &program->doutw_control;

   if (gen_mode == PDS_GENERATE_SIZES)
      return NULL;

   if (gen_mode == PDS_GENERATE_DATA_SEGMENT) {
      uint32_t *constants = buffer;
      uint32_t fence_constant_word;
      uint64_t zero_constant64 = 0;
      uint32_t next_constant;
      uint32_t data_size = 0;

      constants = pvr_pds_generate_doutw(doutw_control,
                                         constants,
                                         PDS_GENERATE_DATA_SEGMENT,
                                         dev_info);
      data_size += doutw_control->data_size;

      constants = pvr_pds_kick_usc(kick_usc_program,
                                   constants,
                                   0,
                                   program->cc_enable,
                                   gen_mode);
      data_size += kick_usc_program->data_size;

      /* Copy the fence constant value (64-bit). */
      next_constant = data_size; /* Assumes data words fully packed. */
      fence_constant_word =
         pvr_pds_get_constants(&next_constant, 2, &data_size);

      /* Encode the fence constant src0 (offset measured from start of data
       * buffer). Fence barrier is initialized to zero.
       */
      pvr_pds_write_wide_constant(buffer, fence_constant_word, zero_constant64);

      /* Update the const size.
       *
       * NOTE(review): pvr_pds_get_constants() above already appears to add
       * the allocated dwords to data_size, so this looks like it counts the
       * fence constant twice. Kept as is because callers may size uploads
       * from program->data_size; confirm before changing.
       */
      data_size += 2;
      constants += 2;

      program->data_size = data_size;
      return constants;
   }

   if (gen_mode == PDS_GENERATE_CODE_SEGMENT) {
      /* Generate PDS code segment. */
      uint32_t *instruction = buffer;
      uint32_t fence_constant_word;
      uint32_t next_constant = 0;
      uint32_t scratch_count = 0;
      uint32_t code_size = 0;

      /* Locate the fence constant exactly as the data-segment call did: it
       * is the first 64-bit slot after the DOUTW and DOUTU data, whose sizes
       * that call left in the sub-programs. Deriving it here, instead of
       * remembering it in a function-level static, keeps the function
       * reentrant and stops one program's layout leaking into another's.
       */
      next_constant += doutw_control->data_size;
      next_constant += kick_usc_program->data_size;
      fence_constant_word =
         pvr_pds_get_constants(&next_constant, 2, &scratch_count);

      /* add64	pt0, c0, c0
       * IF [2x Phantoms]
       * add64	pt1, c0, c0
       * st		[constant_mem_addr], pt0, 4
       * ENDIF
       * doutw	vi1, vi0
       * doutu	ds1[constant_use], ds0[constant_use], ds1[constant_use],
       * emit
       *
       * Zero the persistent temp (SW fence for context switch).
       */
      *instruction++ = pvr_pds_inst_encode_add64(
         0, /* cc */
         PVR_ROGUE_PDSINST_ALUM_UNSIGNED,
         PVR_ROGUE_PDSINST_MAD_SNA_ADD,
         PVR_ROGUE_PDSINST_REGS64_CONST64_LOWER +
            (fence_constant_word >> 1), /* src0 = 0 */
         PVR_ROGUE_PDSINST_REGS64_CONST64_LOWER +
            (fence_constant_word >> 1), /* src1 = 0 */
         PVR_ROGUE_PDSINST_REGS64TP_PTEMP64_LOWER + 0); /* dest = ptemp64[0]
                                                         */
      code_size++;

      instruction = pvr_pds_generate_doutw(doutw_control,
                                           instruction,
                                           PDS_GENERATE_CODE_SEGMENT,
                                           dev_info);
      code_size += doutw_control->code_size;

      /* Offset into data segment follows on from doutw data segment. */
      instruction = pvr_pds_kick_usc(kick_usc_program,
                                     instruction,
                                     doutw_control->data_size,
                                     program->cc_enable,
                                     gen_mode);
      code_size += kick_usc_program->code_size;

      program->code_size = code_size;
      return instruction;
   }

   /* Execution should never reach here. */
   return NULL;
}

/**
 * Generates both code and data when gen_mode is not PDS_GENERATE_SIZES.
 * Relies on num_fpu_iterators being initialized for size calculation.
 * Relies on num_fpu_iterators, destination[], and FPU_iterators[] being
 * initialized for program generation.
 *
 * One 64-bit DOUTI source word and one DOUTI instruction are produced per
 * FPU iterator. The constants sit at the front of \p buffer and the
 * instructions start at the next 4-dword boundary after them. The final
 * DOUTI gets the END flag; with zero iterators nothing is emitted.
 *
 * \param program Pointer to the PDS pixel shader program. data_segment,
 *                temps_used, data_size and code_size are updated.
 * \param buffer Pointer to the buffer for the program.
 * \param gen_mode Either code and data can be generated or sizes only updated.
 * \returns Pointer to just beyond the last instruction written, or NULL for
 *          PDS_GENERATE_SIZES.
 */
uint32_t *pvr_pds_coefficient_loading(
   struct pvr_pds_coeff_loading_program *restrict program,
   uint32_t *restrict buffer,
   enum pvr_pds_generate_mode gen_mode)
{
   uint32_t constant;
   uint32_t *instruction;
   uint32_t total_data_size, code_size;

   /* Place constants at the front of the buffer. */
   uint32_t *constants = buffer;
   /* Start counting constants from 0. */
   uint32_t next_constant = PVR_PDS_CONSTANTS_BLOCK_BASE;

   /* Save the data segment pointer and size. */
   program->data_segment = constants;

   /* Two dwords of data and one instruction per iterator. */
   total_data_size = 2 * program->num_fpu_iterators;
   code_size = program->num_fpu_iterators;

   /* Instructions start where constants finished, but we must take note of
    * alignment.
    *
    * 128-bit boundary = 4 dwords.
    */
   total_data_size = ALIGN_POT(total_data_size, 4);
   if (gen_mode != PDS_GENERATE_SIZES) {
      uint32_t data_size = 0;
      uint32_t iterator = 0;

      instruction = buffer + total_data_size;

      while (iterator < program->num_fpu_iterators) {
         uint64_t iterator_word;

         /* Allocate the 64-bit constant holding this DOUTI source word. */
         constant = pvr_pds_get_constants(&next_constant, 2, &data_size);

         /* Write the first iterator. */
         iterator_word =
            (uint64_t)program->FPU_iterators[iterator]
            << PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTI_SRC0_ISSUE0_SHIFT;

         /* Write the destination, then move on to the next iterator. */
         iterator_word |=
            (uint64_t)program->destination[iterator++]
            << PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTI_SRC0_DEST_SHIFT;

         /* If this is the last DOUTI word the "Last Issue" bit should be
          * set.
          */
         if (iterator >= program->num_fpu_iterators) {
            iterator_word |= PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTI_SRC0_ISSUE_EN;
         }

         /* Write the word to the buffer (64-bit Src0). */
         pvr_pds_write_wide_constant(constants, constant, iterator_word);

         /* Write the DOUT instruction. */
         *instruction++ = pvr_pds_encode_douti(
            /* cc */ 0,
            /* END */ 0,
            /* SRC0 */ constant >> 1); /* DOUT Issue word 0 64-bit */
      }

      /* Update the last DOUTI instruction to have the END flag set. With no
       * iterators no instruction was written, and patching the dword before
       * the code start would write outside the buffer, so skip it.
       */
      if (program->num_fpu_iterators > 0)
         *(instruction - 1) |= 1U << PVR_ROGUE_PDSINST_DOUT_END_SHIFT;
   } else {
      instruction = NULL;
   }

   /* Update the data size and code size. Minimum temp count is 1. */
   program->temps_used = 1;
   program->data_size = total_data_size;
   program->code_size = code_size;

   return instruction;
}

/**
 * Generate a single ld/st instruction. This can correspond to one or more
 * real ld/st instructions based on the value of count.
 *
 * Every generated instruction reads its 64-bit SRC0 word from a pair of
 * constants obtained through pvr_pds_get_constants(). The SRC0 words are only
 * written when gen_mode is PDS_GENERATE_DATA_SEGMENT and the instructions are
 * only written when gen_mode is PDS_GENERATE_CODE_SEGMENT.
 *
 * \param ld true to generate load, false to generate store.
 * \param control Cache mode control. When NULL, loads use the cached mode and
 *                stores use the write-through mode.
 * \param temp_index Dest temp for load/source temp for store, in 32bits
 *                   register index. Must be even for loads.
 * \param address Source for load/dest for store in bytes. Must be 16-byte
 *                aligned for loads and 4-byte aligned for stores.
 * \param count Number of dwords for load/store. Must be even for loads.
 * \param next_constant Constant cursor forwarded to pvr_pds_get_constants();
 *                      two constants are requested per generated instruction.
 * \param total_data_size Data size accumulator forwarded to
 *                        pvr_pds_get_constants().
 * \param total_code_size Incremented by the number of instructions generated:
 *                        one per ld/st, plus one wdf per ld/st when
 *                        data_fence is set.
 * \param buffer Pointer to the buffer for the program.
 * \param data_fence Issue data fence.
 * \param gen_mode Either code and data can be generated or sizes only updated.
 * \param dev_info PVR device information structure.
 * \returns Pointer to just beyond the generated code (buffer itself when only
 *          data was generated), or NULL when gen_mode is PDS_GENERATE_SIZES.
 */
uint32_t *pvr_pds_generate_single_ldst_instruction(
   bool ld,
   const struct pvr_pds_ldst_control *control,
   uint32_t temp_index,
   uint64_t address,
   uint32_t count,
   uint32_t *next_constant,
   uint32_t *total_data_size,
   uint32_t *total_code_size,
   uint32_t *restrict buffer,
   bool data_fence,
   enum pvr_pds_generate_mode gen_mode,
   const struct pvr_device_info *dev_info)
{
   /* A single ld/st here does NOT necessarily correspond to a single ld/st
    * instruction; it may need multiple ld/st instructions because each one
    * can only transfer a restricted maximum number of dwords, which may be
    * less than the count passed here.
    */

   uint32_t num_inst;
   /* Number of ld/st instructions actually generated. This can be lower than
    * num_inst on the load path, where an empty trailing load is skipped.
    */
   uint32_t num_emitted = 0;
   uint32_t constant;

   if (ld) {
      /* ld must operate on 64-bit units, and it needs to load from and to
       * 128-bit aligned locations. Apart from the last ld, all the others
       * need to load 2x (x = 1, 2, ...) 64-bit units.
       */
      uint32_t per_inst_count = 0;
      uint32_t last_inst_count;

      assert((gen_mode == PDS_GENERATE_SIZES) ||
             (((count % 2) == 0) && ((address % 16) == 0) &&
              (temp_index % 2) == 0));

      /* From here on count and temp_index are in 64-bit units. */
      count >>= 1;
      temp_index >>= 1;

      /* Find out how many ld instructions are needed and the ld size for all
       * of them.
       */
      if (count <= PVR_ROGUE_PDSINST_LD_COUNT8_MAX_SIZE) {
         num_inst = 1;
         last_inst_count = count;
      } else {
         per_inst_count = PVR_ROGUE_PDSINST_LD_COUNT8_MAX_SIZE;
         if ((per_inst_count % 2) != 0)
            per_inst_count -= 1;

         num_inst = count / per_inst_count;
         last_inst_count = count - per_inst_count * num_inst;
         num_inst += 1;
      }

      /* Generate all the instructions. */
      for (uint32_t i = 0; i < num_inst; i++) {
         /* The trailing load is dropped when nothing is left for it. */
         if ((i == (num_inst - 1)) && (last_inst_count == 0))
            break;

         num_emitted++;

         /* A single load instruction. */
         constant = pvr_pds_get_constants(next_constant, 2, total_data_size);

         if (gen_mode == PDS_GENERATE_DATA_SEGMENT) {
            uint64_t ld_src0 = 0;

            ld_src0 |= (((address >> 2) & PVR_ROGUE_PDSINST_LD_SRCADD_MASK)
                        << PVR_ROGUE_PDSINST_LD_LD_SRC0_SRCADD_SHIFT);
            ld_src0 |= (((uint64_t)((i == num_inst - 1) ? last_inst_count
                                                        : per_inst_count) &
                         PVR_ROGUE_PDSINST_LD_COUNT8_MASK)
                        << PVR_ROGUE_PDSINST_LD_LD_SRC0_COUNT8_SHIFT);
            ld_src0 |= (((uint64_t)temp_index & PVR_ROGUE_PDSINST_REGS64TP_MASK)
                        << PVR_ROGUE_PDSINST_LD_LD_SRC0_DEST_SHIFT);

            if (!control) {
               ld_src0 |= PVR_ROGUE_PDSINST_LD_LD_SRC0_CMODE_CACHED;

               if (PVR_HAS_FEATURE(dev_info, slc_mcu_cache_controls))
                  ld_src0 |= PVR_ROGUE_PDSINST_LD_LD_SRC0_SLCMODE_CACHED;

            } else {
               ld_src0 |= control->cache_control_const;
            }

            /* Write it to the constant. */
            pvr_pds_write_constant64(buffer,
                                     constant,
                                     (uint32_t)(ld_src0),
                                     (uint32_t)(ld_src0 >> 32));

            /* Adjust values for the next ld instruction: temps advance in
             * 64-bit units, the address in bytes (8 per unit).
             */
            temp_index += per_inst_count;
            address += (((uint64_t)(per_inst_count)) << 3);
         }

         if (gen_mode == PDS_GENERATE_CODE_SEGMENT) {
            *buffer++ = pvr_pds_inst_encode_ld(0, constant >> 1);

            if (data_fence)
               *buffer++ = pvr_pds_inst_encode_wdf(0);
         }
      }
   } else {
      /* ST needs the destination memory address to be 32-bit aligned. */
      assert((gen_mode == PDS_GENERATE_SIZES) || ((address % 4) == 0));

      /* Find out how many ST instructions are needed, each ST can only store
       * PVR_ROGUE_PDSINST_ST_COUNT4_MASK number of 32bits.
       */
      num_inst = count / PVR_ROGUE_PDSINST_ST_COUNT4_MAX_SIZE;
      num_inst += ((count % PVR_ROGUE_PDSINST_ST_COUNT4_MAX_SIZE) == 0 ? 0 : 1);

      /* Generate all the instructions. */
      for (uint32_t i = 0; i < num_inst; i++) {
         /* A single store instruction. */
         constant = pvr_pds_get_constants(next_constant, 2, total_data_size);

         if (gen_mode == PDS_GENERATE_DATA_SEGMENT) {
            uint32_t per_inst_count =
               (count <= PVR_ROGUE_PDSINST_ST_COUNT4_MAX_SIZE
                   ? count
                   : PVR_ROGUE_PDSINST_ST_COUNT4_MAX_SIZE);
            uint64_t st_src0 = 0;

            st_src0 |= (((address >> 2) & PVR_ROGUE_PDSINST_ST_SRCADD_MASK)
                        << PVR_ROGUE_PDSINST_ST_ST_SRC0_DSTADD_SHIFT);
            st_src0 |=
               (((uint64_t)per_inst_count & PVR_ROGUE_PDSINST_ST_COUNT4_MASK)
                << PVR_ROGUE_PDSINST_ST_ST_SRC0_COUNT4_SHIFT);
            st_src0 |= (((uint64_t)temp_index & PVR_ROGUE_PDSINST_REGS32TP_MASK)
                        << PVR_ROGUE_PDSINST_ST_ST_SRC0_SRC_SHIFT);

            if (!control) {
               st_src0 |= PVR_ROGUE_PDSINST_ST_ST_SRC0_CMODE_WRITE_THROUGH;

               if (PVR_HAS_FEATURE(dev_info, slc_mcu_cache_controls)) {
                  st_src0 |= PVR_ROGUE_PDSINST_ST_ST_SRC0_SLCMODE_WRITE_THROUGH;
               }

            } else {
               st_src0 |= control->cache_control_const;
            }

            /* Write it to the constant. */
            pvr_pds_write_constant64(buffer,
                                     constant,
                                     (uint32_t)(st_src0),
                                     (uint32_t)(st_src0 >> 32));

            /* Adjust values for the next ST instruction: temps advance in
             * dwords, the address in bytes (4 per dword).
             */
            temp_index += per_inst_count;
            count -= per_inst_count;
            address += (((uint64_t)(per_inst_count)) << 2);
         }

         if (gen_mode == PDS_GENERATE_CODE_SEGMENT) {
            *buffer++ = pvr_pds_inst_encode_st(0, constant >> 1);

            if (data_fence)
               *buffer++ = pvr_pds_inst_encode_wdf(0);
         }
      }

      /* Stores never skip an iteration. */
      num_emitted = num_inst;
   }

   /* Account only for instructions that were really generated, so that the
    * reported code size matches what the code segment pass writes.
    */
   (*total_code_size) += num_emitted;
   if (data_fence)
      (*total_code_size) += num_emitted;

   if (gen_mode != PDS_GENERATE_SIZES)
      return buffer;
   return NULL;
}

/**
 * Build the stream out preparation program: an STMC that clears the stream
 * out buffer overflow flag, one ld/st sequence per buffer with a non-zero
 * device address to move the persistent temps, and a closing WDF + HALT.
 *
 * This must be used in PPP state update.
 *
 * On return program->stream_out_init_pds_data_size (rounded up to 4 dwords)
 * and program->stream_out_init_pds_code_size are updated, whatever gen_mode
 * is.
 *
 * \param program Pointer to the stream out program.
 * \param buffer Pointer to the buffer for the program.
 * \param store_mode If true then the data is stored to memory. If false then
 *                   the data is loaded from memory.
 * \param gen_mode Either code and data can be generated or sizes only updated.
 * \param dev_info PVR device information structure.
 * \returns Pointer to just beyond the buffer for the program, or NULL when
 *          neither the data nor the code segment was requested.
 */
uint32_t *pvr_pds_generate_stream_out_init_program(
   struct pvr_pds_stream_out_init_program *restrict program,
   uint32_t *restrict buffer,
   bool store_mode,
   enum pvr_pds_generate_mode gen_mode,
   const struct pvr_device_info *dev_info)
{
   const bool emit_code = (gen_mode == PDS_GENERATE_CODE_SEGMENT);

   /* Constants are handed out from the base of the constants block. */
   uint32_t const_cursor = PVR_PDS_CONSTANTS_BLOCK_BASE;
   uint32_t data_dwords = 0;
   /* The leading STMC is always part of the program. */
   uint32_t code_dwords = 1;
   /* First persistent temp used by the next buffer. */
   uint32_t ptemp = PVR_ROGUE_PDSINST_REGS32TP_PTEMP32_LOWER;

   if (emit_code) {
      /* Only the global stream out predicate has to be cleared; the other
       * predicates play no part in the stream out buffer overflow test.
       */
      *buffer++ = pvr_pds_inst_encode_stmc(0, 0x10);
   }

   for (uint32_t i = 0; i < program->num_buffers; i++) {
      const uint32_t buffer_dwords = program->pds_buffer_data_size[i];

      if (program->dev_address_for_buffer_data[i] != 0) {
         /* Load (or store) the persistent temps backing this buffer.
          *
          * NOTE: store_mode == true case should be handled by
          * StreamOutTerminate.
          */
         buffer = pvr_pds_generate_single_ldst_instruction(
            /* ld */ !store_mode,
            /* control */ NULL,
            ptemp,
            program->dev_address_for_buffer_data[i],
            buffer_dwords,
            &const_cursor,
            &data_dwords,
            &code_dwords,
            buffer,
            /* data_fence */ false,
            gen_mode,
            dev_info);
      }

      /* Temps are reserved for every buffer, bound or not. */
      ptemp += buffer_dwords;
   }

   if (emit_code) {
      /* Fence the transfers before halting. */
      *buffer++ = pvr_pds_inst_encode_wdf(0);
      *buffer++ = pvr_pds_inst_encode_halt(0);
   }
   code_dwords += 2;

   /* Publish the sizes; the data segment is padded to a 128-bit boundary
    * (4 dwords).
    */
   program->stream_out_init_pds_data_size = ALIGN_POT(data_dwords, 4);
   program->stream_out_init_pds_code_size = code_dwords;

   switch (gen_mode) {
   case PDS_GENERATE_DATA_SEGMENT:
      return buffer + program->stream_out_init_pds_data_size;
   case PDS_GENERATE_CODE_SEGMENT:
      return buffer;
   default:
      return NULL;
   }
}

/**
 * Build the stream out terminate program: a store of
 * pds_persistent_temp_size_to_store dwords of persistent temps to
 * dev_address_for_storing_persistent_temp, followed by WDF + HALT.
 *
 * The store is generated through pvr_pds_generate_single_ldst_instruction(),
 * starting at the first persistent temp and using the default cache mode.
 *
 * The stream out terminate program is used to update the PPP state and the data
 * and code section cannot be separate.
 *
 * On return program->stream_out_terminate_pds_data_size (rounded up to
 * 4 dwords) and program->stream_out_terminate_pds_code_size are updated,
 * whatever gen_mode is.
 *
 * \param program Pointer to the stream out program.
 * \param buffer Pointer to the buffer for the program.
 * \param gen_mode Either code and data can be generated or sizes only updated.
 * \param dev_info PVR device info structure.
 * \returns Pointer to just beyond the buffer for the program, or NULL when
 *          neither the data nor the code segment was requested.
 */
uint32_t *pvr_pds_generate_stream_out_terminate_program(
   struct pvr_pds_stream_out_terminate_program *restrict program,
   uint32_t *restrict buffer,
   enum pvr_pds_generate_mode gen_mode,
   const struct pvr_device_info *dev_info)
{
   /* Constants are handed out from the base of the constants block. */
   uint32_t const_cursor = PVR_PDS_CONSTANTS_BLOCK_BASE;
   uint32_t data_dwords = 0;
   uint32_t code_dwords = 0;

   /* Store the persistent temps to memory. */
   buffer = pvr_pds_generate_single_ldst_instruction(
      /* ld */ false,
      /* control */ NULL,
      PVR_ROGUE_PDSINST_REGS32TP_PTEMP32_LOWER,
      program->dev_address_for_storing_persistent_temp,
      program->pds_persistent_temp_size_to_store,
      &const_cursor,
      &data_dwords,
      &code_dwords,
      buffer,
      /* data_fence */ false,
      gen_mode,
      dev_info);

   /* Closing WDF + HALT. */
   if (gen_mode == PDS_GENERATE_CODE_SEGMENT) {
      *buffer++ = pvr_pds_inst_encode_wdf(0);
      *buffer++ = pvr_pds_inst_encode_halt(0);
   }
   code_dwords += 2;

   /* Publish the sizes; the data segment is padded to a 128-bit boundary
    * (4 dwords).
    */
   program->stream_out_terminate_pds_data_size = ALIGN_POT(data_dwords, 4);
   program->stream_out_terminate_pds_code_size = code_dwords;

   switch (gen_mode) {
   case PDS_GENERATE_DATA_SEGMENT:
      return buffer + program->stream_out_terminate_pds_data_size;
   case PDS_GENERATE_CODE_SEGMENT:
      return buffer;
   default:
      return NULL;
   }
}

/* DrawArrays works in several steps:
 *
 * 1) load data from draw_indirect buffer
 * 2) tweak data to match hardware formats
 * 3) write data to indexblock
 * 4) signal the VDM to continue
 *
 * This is complicated by HW limitations on alignment, as well as a HWBRN.
 *
 * 1) Load data.
 * Loads _must_ be 128-bit aligned. Because there is no such limitation in the
 * spec we must deal with this by choosing an appropriate earlier address and
 * loading enough dwords that we load the entirety of the buffer.
 *
 * if addr & 0xf:
 *   load [addr & ~0xf] 6 dwords -> tmp[0, 1, 2, 3, 4, 5]
 *   data = tmp[(addr & 0xf) >> 2]...
 * else
 *   load [addr] 4 dwords -> tmp[0, 1, 2, 3]
 *   data = tmp[0]...
 *
 *
 * 2) Tweak data.
 * primCount in the spec does not match the encoding of INDEX_INSTANCE_COUNT in
 * the VDM control stream. We must subtract 1 from the loaded primCount.
 *
 * However, there is a HWBRN that disallows the ADD32 instruction from sourcing
 * a tmp that is non-64-bit-aligned. To work around this, we must move primCount
 * into another tmp that has the correct alignment. Note: this is only required
 * when data = tmp[even], as primCount is data+1:
 *
 * if data = tmp[even]:
 *   primCount = data + 1 = tmp[odd] -- not 64-bit aligned!
 * else:
 *   primCount = data + 1 = tmp[even] -- already aligned, don't need workaround.
 *
 * This boils down to:
 *
 * primCount = data[1]
 * primCountSrc = data[1]
 * if brn_present && (data is even):
 *   mov scratch, primCount
 *   primCountSrc = scratch
 * endif
 * sub primCount, primCountSrc, 1
 *
 * 3) Store Data.
 * Write the now-tweaked data over the top of the indexblock.
 * To ensure the write completes before the VDM re-reads the data, we must cause
 * a data hazard by doing a dummy (dummy meaning we don't care about the
 * returned data) load from the same addresses. Again, because the ld must
 * always be 128-bit aligned (note: the ST is dword-aligned), we must ensure the
 * index block is 128-bit aligned. This is the client driver's responsibility.
 *
 * st data[0, 1, 2] -> (idxblock + 4)
 * load [idxblock] 4 dwords
 *
 * 4) Signal the VDM
 * This is simply a DOUTV with a src1 of 0, indicating the VDM should continue
 * where it is currently fenced on a dummy idxblock that has been inserted by
 * the driver.
 */

#include "pvr_draw_indirect_arrays0.h"
#include "pvr_draw_indirect_arrays1.h"
#include "pvr_draw_indirect_arrays2.h"
#include "pvr_draw_indirect_arrays3.h"

#include "pvr_draw_indirect_arrays_base_instance0.h"
#include "pvr_draw_indirect_arrays_base_instance1.h"
#include "pvr_draw_indirect_arrays_base_instance2.h"
#include "pvr_draw_indirect_arrays_base_instance3.h"

#include "pvr_draw_indirect_arrays_base_instance_drawid0.h"
#include "pvr_draw_indirect_arrays_base_instance_drawid1.h"
#include "pvr_draw_indirect_arrays_base_instance_drawid2.h"
#include "pvr_draw_indirect_arrays_base_instance_drawid3.h"

/* Evaluates to the LD SRC0 SLC mode field for `device` (a pointer to a
 * structure with a `features` member): the CACHED encoding when
 * features.has_slc_mcu_cache_controls is set, the BYPASS encoding otherwise.
 *
 * NOTE(review): nothing in this file's visible code references this macro;
 * presumably it is expanded by the di_data helpers from the
 * pvr_draw_indirect_* headers included here - confirm.
 */
#define ENABLE_SLC_MCU_CACHE_CONTROLS(device)        \
   ((device)->features.has_slc_mcu_cache_controls    \
       ? PVR_ROGUE_PDSINST_LD_LD_SRC0_SLCMODE_CACHED \
       : PVR_ROGUE_PDSINST_LD_LD_SRC0_SLCMODE_BYPASS)

/* Local shorthands for pvr_pds_generate_draw_arrays_indirect(). One program
 * object and one set of pvr_write_*() helpers exist per (variant, slot) pair;
 * their names are assembled here by token pasting. `slot` must be a literal
 * 0..3.
 */

/* Address of the precompiled program for alignment slot `slot`, chosen from
 * the base-instance / draw-id options of `prog`.
 */
#define PVR_DRAW_ARRAYS_PSC(prog, slot)                                       \
   ((prog)->support_base_instance                                             \
       ? ((prog)->increment_draw_id                                           \
             ? &pvr_draw_indirect_arrays_base_instance_drawid##slot##_program \
             : &pvr_draw_indirect_arrays_base_instance##slot##_program)       \
       : &pvr_draw_indirect_arrays##slot##_program)

/* Issues the five data segment writes of the program called `name`. */
#define PVR_DRAW_ARRAYS_WRITE(name, prog, buf, dev)                           \
   do {                                                                       \
      pvr_write_draw_indirect_##name##_di_data(buf,                           \
                                               (prog)->arg_buffer & ~0xfull,  \
                                               dev);                          \
      pvr_write_draw_indirect_##name##_write_vdm(                             \
         buf,                                                                 \
         (prog)->index_list_addr_buffer + 4);                                 \
      pvr_write_draw_indirect_##name##_flush_vdm(                             \
         buf,                                                                 \
         (prog)->index_list_addr_buffer);                                     \
      pvr_write_draw_indirect_##name##_num_views(buf, (prog)->num_views);     \
      pvr_write_draw_indirect_##name##_immediates(buf);                       \
   } while (0)

/* Picks the variant for alignment slot `slot` and writes its data segment. */
#define PVR_DRAW_ARRAYS_WRITE_SLOT(slot, prog, buf, dev)                      \
   do {                                                                       \
      if (!(prog)->support_base_instance) {                                   \
         PVR_DRAW_ARRAYS_WRITE(arrays##slot, prog, buf, dev);                 \
      } else if ((prog)->increment_draw_id) {                                 \
         PVR_DRAW_ARRAYS_WRITE(arrays_base_instance_drawid##slot,             \
                               prog,                                          \
                               buf,                                           \
                               dev);                                          \
      } else {                                                                \
         PVR_DRAW_ARRAYS_WRITE(arrays_base_instance##slot, prog, buf, dev);   \
      }                                                                       \
   } while (0)

/**
 * Generates the PDS program used for indirect draw arrays.
 *
 * The program variant is selected from the dword offset of
 * program->arg_buffer within its 128-bit line (four slots) together with
 * program->support_base_instance and program->increment_draw_id.
 *
 * For PDS_GENERATE_CODE_SEGMENT and PDS_GENERATE_SIZES the selected
 * precompiled program description is copied into program->program; in code
 * segment mode its code is additionally copied into buffer. For any other
 * mode the data segment of the selected variant is written into buffer.
 *
 * \param program Pointer to the draw indirect program.
 * \param buffer Pointer to the buffer for the program.
 * \param gen_mode Either code and data can be generated or sizes only updated.
 * \param dev_info PVR device information structure.
 */
void pvr_pds_generate_draw_arrays_indirect(
   struct pvr_pds_drawindirect_program *restrict program,
   uint32_t *restrict buffer,
   enum pvr_pds_generate_mode gen_mode,
   const struct pvr_device_info *dev_info)
{
   const bool wants_code = (gen_mode == PDS_GENERATE_CODE_SEGMENT);
   const bool wants_sizes = (gen_mode == PDS_GENERATE_SIZES);
   /* Dword position of the argument buffer inside its 128-bit line. */
   const uint32_t align_slot = (uint32_t)((program->arg_buffer >> 2) % 4);
   const struct pvr_psc_program_output *psc_program = NULL;

   if (!wants_code && !wants_sizes) {
      /* Data segment: the argument buffer address is rounded down to the
       * 128-bit line, the VDM write targets the index list buffer + 4 and the
       * flush targets the index list buffer itself.
       */
      switch (align_slot) {
      case 0:
         PVR_DRAW_ARRAYS_WRITE_SLOT(0, program, buffer, dev_info);
         break;
      case 1:
         PVR_DRAW_ARRAYS_WRITE_SLOT(1, program, buffer, dev_info);
         break;
      case 2:
         PVR_DRAW_ARRAYS_WRITE_SLOT(2, program, buffer, dev_info);
         break;
      case 3:
         PVR_DRAW_ARRAYS_WRITE_SLOT(3, program, buffer, dev_info);
         break;
      }

      return;
   }

   /* Code segment or sizes: look up the precompiled program. */
   switch (align_slot) {
   case 0:
      psc_program = PVR_DRAW_ARRAYS_PSC(program, 0);
      break;
   case 1:
      psc_program = PVR_DRAW_ARRAYS_PSC(program, 1);
      break;
   case 2:
      psc_program = PVR_DRAW_ARRAYS_PSC(program, 2);
      break;
   case 3:
      psc_program = PVR_DRAW_ARRAYS_PSC(program, 3);
      break;
   }

   if (wants_code) {
      memcpy(buffer,
             psc_program->code,
             psc_program->code_size * sizeof(uint32_t));
#if defined(DUMP_PDS)
      for (uint32_t i = 0; i < psc_program->code_size; i++)
         PVR_PDS_PRINT_INST(buffer[i]);
#endif
   }

   program->program = *psc_program;
}

#undef PVR_DRAW_ARRAYS_WRITE_SLOT
#undef PVR_DRAW_ARRAYS_WRITE
#undef PVR_DRAW_ARRAYS_PSC

#include "pvr_draw_indirect_elements0.h"
#include "pvr_draw_indirect_elements1.h"
#include "pvr_draw_indirect_elements2.h"
#include "pvr_draw_indirect_elements3.h"
#include "pvr_draw_indirect_elements_base_instance0.h"
#include "pvr_draw_indirect_elements_base_instance1.h"
#include "pvr_draw_indirect_elements_base_instance2.h"
#include "pvr_draw_indirect_elements_base_instance3.h"
#include "pvr_draw_indirect_elements_base_instance_drawid0.h"
#include "pvr_draw_indirect_elements_base_instance_drawid1.h"
#include "pvr_draw_indirect_elements_base_instance_drawid2.h"
#include "pvr_draw_indirect_elements_base_instance_drawid3.h"

void pvr_pds_generate_draw_elements_indirect(
   struct pvr_pds_drawindirect_program *restrict program,
   uint32_t *restrict buffer,
   enum pvr_pds_generate_mode gen_mode,
   const struct pvr_device_info *dev_info)
{
   if ((gen_mode == PDS_GENERATE_CODE_SEGMENT) ||
       (gen_mode == PDS_GENERATE_SIZES)) {
      const struct pvr_psc_program_output *psc_program = NULL;
      switch ((program->arg_buffer >> 2) % 4) {
      case 0:
         if (program->support_base_instance) {
            if (program->increment_draw_id) {
               psc_program =
                  &pvr_draw_indirect_elements_base_instance_drawid0_program;
            } else {
               psc_program = &pvr_draw_indirect_elements_base_instance0_program;
            }
         } else {
            psc_program = &pvr_draw_indirect_elements0_program;
         }
         break;
      case 1:
         if (program->support_base_instance) {
            if (program->increment_draw_id) {
               psc_program =
                  &pvr_draw_indirect_elements_base_instance_drawid1_program;
            } else {
               psc_program = &pvr_draw_indirect_elements_base_instance1_program;
            }
         } else {
            psc_program = &pvr_draw_indirect_elements1_program;
         }
         break;
      case 2:
         if (program->support_base_instance) {
            if (program->increment_draw_id) {
               psc_program =
                  &pvr_draw_indirect_elements_base_instance_drawid2_program;
            } else {
               psc_program = &pvr_draw_indirect_elements_base_instance2_program;
            }
         } else {
            psc_program = &pvr_draw_indirect_elements2_program;
         }
         break;
      case 3:
         if (program->support_base_instance) {
            if (program->increment_draw_id) {
               psc_program =
                  &pvr_draw_indirect_elements_base_instance_drawid3_program;
            } else {
               psc_program = &pvr_draw_indirect_elements_base_instance3_program;
            }
         } else {
            psc_program = &pvr_draw_indirect_elements3_program;
         }
         break;
      }

      if (gen_mode == PDS_GENERATE_CODE_SEGMENT) {
         memcpy(buffer,
                psc_program->code,
                psc_program->code_size * sizeof(uint32_t));

#if defined(DUMP_PDS)
         for (uint32_t i = 0; i < psc_program->code_size; i++)
            PVR_PDS_PRINT_INST(buffer[i]);
#endif
      }

      program->program = *psc_program;
   } else {
      switch ((program->arg_buffer >> 2) % 4) {
      case 0:
         if (program->support_base_instance) {
            if (program->increment_draw_id) {
               pvr_write_draw_indirect_elements_base_instance_drawid0_di_data(
                  buffer,
                  program->arg_buffer & ~0xfull,
                  dev_info);
               pvr_write_draw_indirect_elements_base_instance_drawid0_write_vdm(
                  buffer,
                  program->index_list_addr_buffer);
               pvr_write_draw_indirect_elements_base_instance_drawid0_flush_vdm(
                  buffer,
                  program->index_list_addr_buffer);
               pvr_write_draw_indirect_elements_base_instance_drawid0_num_views(
                  buffer,
                  program->num_views);
               pvr_write_draw_indirect_elements_base_instance_drawid0_idx_stride(
                  buffer,
                  program->index_stride);
               pvr_write_draw_indirect_elements_base_instance_drawid0_idx_base(
                  buffer,
                  program->index_buffer);
               pvr_write_draw_indirect_elements_base_instance_drawid0_idx_header(
                  buffer,
                  program->index_block_header);
               pvr_write_draw_indirect_elements_base_instance_drawid0_immediates(
                  buffer);
            } else {
               pvr_write_draw_indirect_elements_base_instance0_di_data(
                  buffer,
                  program->arg_buffer & ~0xfull,
                  dev_info);
               pvr_write_draw_indirect_elements_base_instance0_write_vdm(
                  buffer,
                  program->index_list_addr_buffer);
               pvr_write_draw_indirect_elements_base_instance0_flush_vdm(
                  buffer,
                  program->index_list_addr_buffer);
               pvr_write_draw_indirect_elements_base_instance0_num_views(
                  buffer,
                  program->num_views);
               pvr_write_draw_indirect_elements_base_instance0_idx_stride(
                  buffer,
                  program->index_stride);
               pvr_write_draw_indirect_elements_base_instance0_idx_base(
                  buffer,
                  program->index_buffer);
               pvr_write_draw_indirect_elements_base_instance0_idx_header(
                  buffer,
                  program->index_block_header);
               pvr_write_draw_indirect_elements_base_instance0_immediates(
                  buffer);
            }
         } else {
            pvr_write_draw_indirect_elements0_di_data(buffer,
                                                      program->arg_buffer &
                                                         ~0xfull,
                                                      dev_info);
            pvr_write_draw_indirect_elements0_write_vdm(
               buffer,
               program->index_list_addr_buffer);
            pvr_write_draw_indirect_elements0_flush_vdm(
               buffer,
               program->index_list_addr_buffer);
            pvr_write_draw_indirect_elements0_num_views(buffer,
                                                        program->num_views);
            pvr_write_draw_indirect_elements0_idx_stride(buffer,
                                                         program->index_stride);
            pvr_write_draw_indirect_elements0_idx_base(buffer,
                                                       program->index_buffer);
            pvr_write_draw_indirect_elements0_idx_header(
               buffer,
               program->index_block_header);
            pvr_write_draw_indirect_elements0_immediates(buffer);
         }
         break;
      case 1:
         if (program->support_base_instance) {
            if (program->increment_draw_id) {
               pvr_write_draw_indirect_elements_base_instance_drawid1_di_data(
                  buffer,
                  program->arg_buffer & ~0xfull,
                  dev_info);
               pvr_write_draw_indirect_elements_base_instance_drawid1_write_vdm(
                  buffer,
                  program->index_list_addr_buffer);
               pvr_write_draw_indirect_elements_base_instance_drawid1_flush_vdm(
                  buffer,
                  program->index_list_addr_buffer);
               pvr_write_draw_indirect_elements_base_instance_drawid1_num_views(
                  buffer,
                  program->num_views);
               pvr_write_draw_indirect_elements_base_instance_drawid1_idx_stride(
                  buffer,
                  program->index_stride);
               pvr_write_draw_indirect_elements_base_instance_drawid1_idx_base(
                  buffer,
                  program->index_buffer);
               pvr_write_draw_indirect_elements_base_instance_drawid1_idx_header(
                  buffer,
                  program->index_block_header);
               pvr_write_draw_indirect_elements_base_instance_drawid1_immediates(
                  buffer);
            } else {
               pvr_write_draw_indirect_elements_base_instance1_di_data(
                  buffer,
                  program->arg_buffer & ~0xfull,
                  dev_info);
               pvr_write_draw_indirect_elements_base_instance1_write_vdm(
                  buffer,
                  program->index_list_addr_buffer);
               pvr_write_draw_indirect_elements_base_instance1_flush_vdm(
                  buffer,
                  program->index_list_addr_buffer);
               pvr_write_draw_indirect_elements_base_instance1_num_views(
                  buffer,
                  program->num_views);
               pvr_write_draw_indirect_elements_base_instance1_idx_stride(
                  buffer,
                  program->index_stride);
               pvr_write_draw_indirect_elements_base_instance1_idx_base(
                  buffer,
                  program->index_buffer);
               pvr_write_draw_indirect_elements_base_instance1_idx_header(
                  buffer,
                  program->index_block_header);
               pvr_write_draw_indirect_elements_base_instance1_immediates(
                  buffer);
            }
         } else {
            pvr_write_draw_indirect_elements1_di_data(buffer,
                                                      program->arg_buffer &
                                                         ~0xfull,
                                                      dev_info);
            pvr_write_draw_indirect_elements1_write_vdm(
               buffer,
               program->index_list_addr_buffer);
            pvr_write_draw_indirect_elements1_flush_vdm(
               buffer,
               program->index_list_addr_buffer);
            pvr_write_draw_indirect_elements1_num_views(buffer,
                                                        program->num_views);
            pvr_write_draw_indirect_elements1_idx_stride(buffer,
                                                         program->index_stride);
            pvr_write_draw_indirect_elements1_idx_base(buffer,
                                                       program->index_buffer);
            pvr_write_draw_indirect_elements1_idx_header(
               buffer,
               program->index_block_header);
            pvr_write_draw_indirect_elements1_immediates(buffer);
         }
         break;
      case 2:
         if (program->support_base_instance) {
            if (program->increment_draw_id) {
               pvr_write_draw_indirect_elements_base_instance_drawid2_di_data(
                  buffer,
                  program->arg_buffer & ~0xfull,
                  dev_info);
               pvr_write_draw_indirect_elements_base_instance_drawid2_write_vdm(
                  buffer,
                  program->index_list_addr_buffer);
               pvr_write_draw_indirect_elements_base_instance_drawid2_flush_vdm(
                  buffer,
                  program->index_list_addr_buffer);
               pvr_write_draw_indirect_elements_base_instance_drawid2_num_views(
                  buffer,
                  program->num_views);
               pvr_write_draw_indirect_elements_base_instance_drawid2_idx_stride(
                  buffer,
                  program->index_stride);
               pvr_write_draw_indirect_elements_base_instance_drawid2_idx_base(
                  buffer,
                  program->index_buffer);
               pvr_write_draw_indirect_elements_base_instance_drawid2_idx_header(
                  buffer,
                  program->index_block_header);
               pvr_write_draw_indirect_elements_base_instance_drawid2_immediates(
                  buffer);
            } else {
               pvr_write_draw_indirect_elements_base_instance2_di_data(
                  buffer,
                  program->arg_buffer & ~0xfull,
                  dev_info);
               pvr_write_draw_indirect_elements_base_instance2_write_vdm(
                  buffer,
                  program->index_list_addr_buffer);
               pvr_write_draw_indirect_elements_base_instance2_flush_vdm(
                  buffer,
                  program->index_list_addr_buffer);
               pvr_write_draw_indirect_elements_base_instance2_num_views(
                  buffer,
                  program->num_views);
               pvr_write_draw_indirect_elements_base_instance2_idx_stride(
                  buffer,
                  program->index_stride);
               pvr_write_draw_indirect_elements_base_instance2_idx_base(
                  buffer,
                  program->index_buffer);
               pvr_write_draw_indirect_elements_base_instance2_idx_header(
                  buffer,
                  program->index_block_header);
               pvr_write_draw_indirect_elements_base_instance2_immediates(
                  buffer);
            }
         } else {
            pvr_write_draw_indirect_elements2_di_data(buffer,
                                                      program->arg_buffer &
                                                         ~0xfull,
                                                      dev_info);
            pvr_write_draw_indirect_elements2_write_vdm(
               buffer,
               program->index_list_addr_buffer);
            pvr_write_draw_indirect_elements2_flush_vdm(
               buffer,
               program->index_list_addr_buffer);
            pvr_write_draw_indirect_elements2_num_views(buffer,
                                                        program->num_views);
            pvr_write_draw_indirect_elements2_idx_stride(buffer,
                                                         program->index_stride);
            pvr_write_draw_indirect_elements2_idx_base(buffer,
                                                       program->index_buffer);
            pvr_write_draw_indirect_elements2_idx_header(
               buffer,
               program->index_block_header);
            pvr_write_draw_indirect_elements2_immediates(buffer);
         }
         break;
      case 3:
         if (program->support_base_instance) {
            if (program->increment_draw_id) {
               pvr_write_draw_indirect_elements_base_instance_drawid3_di_data(
                  buffer,
                  program->arg_buffer & ~0xfull,
                  dev_info);
               pvr_write_draw_indirect_elements_base_instance_drawid3_write_vdm(
                  buffer,
                  program->index_list_addr_buffer);
               pvr_write_draw_indirect_elements_base_instance_drawid3_flush_vdm(
                  buffer,
                  program->index_list_addr_buffer);
               pvr_write_draw_indirect_elements_base_instance_drawid3_num_views(
                  buffer,
                  program->num_views);
               pvr_write_draw_indirect_elements_base_instance_drawid3_idx_stride(
                  buffer,
                  program->index_stride);
               pvr_write_draw_indirect_elements_base_instance_drawid3_idx_base(
                  buffer,
                  program->index_buffer);
               pvr_write_draw_indirect_elements_base_instance_drawid3_idx_header(
                  buffer,
                  program->index_block_header);
               pvr_write_draw_indirect_elements_base_instance_drawid3_immediates(
                  buffer);
            } else {
               pvr_write_draw_indirect_elements_base_instance3_di_data(
                  buffer,
                  program->arg_buffer & ~0xfull,
                  dev_info);
               pvr_write_draw_indirect_elements_base_instance3_write_vdm(
                  buffer,
                  program->index_list_addr_buffer);
               pvr_write_draw_indirect_elements_base_instance3_flush_vdm(
                  buffer,
                  program->index_list_addr_buffer);
               pvr_write_draw_indirect_elements_base_instance3_num_views(
                  buffer,
                  program->num_views);
               pvr_write_draw_indirect_elements_base_instance3_idx_stride(
                  buffer,
                  program->index_stride);
               pvr_write_draw_indirect_elements_base_instance3_idx_base(
                  buffer,
                  program->index_buffer);
               pvr_write_draw_indirect_elements_base_instance3_idx_header(
                  buffer,
                  program->index_block_header);
               pvr_write_draw_indirect_elements_base_instance3_immediates(
                  buffer);
            }
         } else {
            pvr_write_draw_indirect_elements3_di_data(buffer,
                                                      program->arg_buffer &
                                                         ~0xfull,
                                                      dev_info);
            pvr_write_draw_indirect_elements3_write_vdm(
               buffer,
               program->index_list_addr_buffer);
            pvr_write_draw_indirect_elements3_flush_vdm(
               buffer,
               program->index_list_addr_buffer);
            pvr_write_draw_indirect_elements3_num_views(buffer,
                                                        program->num_views);
            pvr_write_draw_indirect_elements3_idx_stride(buffer,
                                                         program->index_stride);
            pvr_write_draw_indirect_elements3_idx_base(buffer,
                                                       program->index_buffer);
            pvr_write_draw_indirect_elements3_idx_header(
               buffer,
               program->index_block_header);
            pvr_write_draw_indirect_elements3_immediates(buffer);
         }
         break;
      }
   }
}
