/*
 * Copyright 2022 Alyssa Rosenzweig
 * Copyright 2019-2020 Collabora, Ltd.
 * SPDX-License-Identifier: MIT
 */

#include <xf86drm.h>
#include "asahi/lib/agx_device_virtio.h"
#include "asahi/lib/decode.h"
#include "util/bitset.h"
#include "util/u_dynarray.h"
#include "util/u_range.h"
#include "agx_state.h"
#include "vdrm.h"

#define foreach_active(ctx, idx)                                               \
   BITSET_FOREACH_SET(idx, ctx->batches.active, AGX_MAX_BATCHES)

#define foreach_submitted(ctx, idx)                                            \
   BITSET_FOREACH_SET(idx, ctx->batches.submitted, AGX_MAX_BATCHES)

#define batch_debug(batch, fmt, ...)                                           \
   do {                                                                        \
      if (unlikely(agx_device(batch->ctx->base.screen)->debug &                \
                   AGX_DBG_BATCH))                                             \
         agx_msg("[Queue %u Batch %u] " fmt "\n", batch->ctx->queue_id,        \
                 agx_batch_idx(batch), ##__VA_ARGS__);                         \
   } while (0)
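
/* batch_debug() prefixes its message with the queue and batch index, e.g.
 * "[Queue 3 Batch 5] SUBMIT", and is skipped at runtime unless the
 * AGX_DBG_BATCH debug flag is set.
 */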

bool
agx_batch_is_active(struct agx_batch *batch)
{
   return BITSET_TEST(batch->ctx->batches.active, agx_batch_idx(batch));
}

bool
agx_batch_is_submitted(struct agx_batch *batch)
{
   return BITSET_TEST(batch->ctx->batches.submitted, agx_batch_idx(batch));
}
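
/* A batch slot moves through three states: active (commands being recorded),
 * submitted (handed to the kernel, fence not yet known to have signaled), and
 * free/complete (neither bit set). ctx->batches.active and
 * ctx->batches.submitted track the first two.
 */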

static void
agx_batch_mark_active(struct agx_batch *batch)
{
   unsigned batch_idx = agx_batch_idx(batch);

   batch_debug(batch, "ACTIVE");

   assert(!BITSET_TEST(batch->ctx->batches.submitted, batch_idx));
   assert(!BITSET_TEST(batch->ctx->batches.active, batch_idx));
   BITSET_SET(batch->ctx->batches.active, batch_idx);
}

static void
agx_batch_mark_submitted(struct agx_batch *batch)
{
   unsigned batch_idx = agx_batch_idx(batch);

   batch_debug(batch, "SUBMIT");

   assert(BITSET_TEST(batch->ctx->batches.active, batch_idx));
   assert(!BITSET_TEST(batch->ctx->batches.submitted, batch_idx));
   BITSET_CLEAR(batch->ctx->batches.active, batch_idx);
   BITSET_SET(batch->ctx->batches.submitted, batch_idx);
}

static void
agx_batch_mark_complete(struct agx_batch *batch)
{
   unsigned batch_idx = agx_batch_idx(batch);

   batch_debug(batch, "COMPLETE");

   assert(!BITSET_TEST(batch->ctx->batches.active, batch_idx));
   assert(BITSET_TEST(batch->ctx->batches.submitted, batch_idx));
   BITSET_CLEAR(batch->ctx->batches.submitted, batch_idx);
}
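
/* Allocate a 512 KiB (0x80000 byte) control-stream BO for a batch. `current` is
 * the write cursor and `end` marks the limit of the mapping.
 */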

struct agx_encoder
agx_encoder_allocate(struct agx_batch *batch, struct agx_device *dev)
{
   struct agx_bo *bo = agx_bo_create(dev, 0x80000, 0, 0, "Encoder");

   return (struct agx_encoder){
      .bo = bo,
      .current = bo->map,
      .end = (uint8_t *)bo->map + bo->size,
   };
}

static void
agx_batch_init(struct agx_context *ctx,
               const struct pipe_framebuffer_state *key,
               struct agx_batch *batch)
{
   struct agx_device *dev = agx_device(ctx->base.screen);

   batch->ctx = ctx;
   util_copy_framebuffer_state(&batch->key, key);
   batch->seqnum = ++ctx->batches.seqnum;

   agx_pool_init(&batch->pool, dev, 0, true);
   agx_pool_init(&batch->pipeline_pool, dev, AGX_BO_LOW_VA, true);

   /* This allocation happens at most once per batch slot and is merely zeroed
    * (not freed) during batch cleanup. The memory is owned by the context.
    */
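   /* The initial set of 128 BITSET_WORDs covers 128 * sizeof(BITSET_WORD) * 8
    * BO handles (4096 with 32-bit words).
    */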
   if (!batch->bo_list.set) {
      batch->bo_list.set = rzalloc_array(ctx, BITSET_WORD, 128);
      batch->bo_list.bit_count = 128 * sizeof(BITSET_WORD) * 8;
   } else {
      memset(batch->bo_list.set, 0, batch->bo_list.bit_count / 8);
   }

   if (agx_batch_is_compute(batch)) {
      batch->cdm = agx_encoder_allocate(batch, dev);
      memset(&batch->vdm, 0, sizeof(batch->vdm));
   } else {
      batch->vdm = agx_encoder_allocate(batch, dev);
      memset(&batch->cdm, 0, sizeof(batch->cdm));
   }

   util_dynarray_init(&batch->scissor, ctx);
   util_dynarray_init(&batch->depth_bias, ctx);
   util_dynarray_init(&batch->timestamps, ctx);

   batch->clear = 0;
   batch->draw = 0;
   batch->load = 0;
   batch->resolve = 0;
   memset(batch->uploaded_clear_color, 0, sizeof(batch->uploaded_clear_color));
   batch->clear_depth = 0;
   batch->clear_stencil = 0;
   batch->varyings = 0;
   batch->geometry_state = 0;
   batch->initialized = false;
   batch->draws = 0;
   batch->incoherent_writes = false;
   agx_bo_unreference(dev, batch->sampler_heap.bo);
   batch->sampler_heap.bo = NULL;
   batch->sampler_heap.count = 0;
   batch->vs_scratch = false;
   batch->fs_scratch = false;
   batch->cs_scratch = false;
   batch->vs_preamble_scratch = 0;
   batch->fs_preamble_scratch = 0;
   batch->cs_preamble_scratch = 0;

   /* This may be read before it is written, so initialize it to 0 to avoid
    * GPU-side use-after-free (UAF) conditions.
    */
   batch->uniforms.tables[AGX_SYSVAL_TABLE_PARAMS] = 0;

   /* We need to emit prim state at the start. MESA_PRIM_COUNT matches no real
    * primitive, so the first draw always sees a change.
    */
   batch->reduced_prim = MESA_PRIM_COUNT;

   if (!batch->syncobj) {
      int ret = drmSyncobjCreate(dev->fd, 0, &batch->syncobj);
      assert(!ret && batch->syncobj);
   }
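
   /* Each batch owns two slots in the context-wide result buffer: slot 0 holds
    * compute feedback and slot 1 holds render feedback (see
    * agx_batch_print_stats).
    */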

   batch->result_off =
      (2 * sizeof(union agx_batch_result)) * agx_batch_idx(batch);
   batch->result =
      (void *)(((uint8_t *)ctx->result_buf->map) + batch->result_off);
   memset(batch->result, 0, sizeof(union agx_batch_result) * 2);

   agx_batch_mark_active(batch);
}

static const char *status_str[] = {
   [DRM_ASAHI_STATUS_PENDING] = "(pending)",
   [DRM_ASAHI_STATUS_COMPLETE] = "Complete",
   [DRM_ASAHI_STATUS_UNKNOWN_ERROR] = "UNKNOWN ERROR",
   [DRM_ASAHI_STATUS_TIMEOUT] = "TIMEOUT",
   [DRM_ASAHI_STATUS_FAULT] = "FAULT",
   [DRM_ASAHI_STATUS_KILLED] = "KILLED",
   [DRM_ASAHI_STATUS_NO_DEVICE] = "NO DEVICE",
};

static const char *fault_type_str[] = {
   [DRM_ASAHI_FAULT_NONE] = "(none)",
   [DRM_ASAHI_FAULT_UNKNOWN] = "Unknown",
   [DRM_ASAHI_FAULT_UNMAPPED] = "Unmapped",
   [DRM_ASAHI_FAULT_AF_FAULT] = "AF Fault",
   [DRM_ASAHI_FAULT_WRITE_ONLY] = "Write Only",
   [DRM_ASAHI_FAULT_READ_ONLY] = "Read Only",
   [DRM_ASAHI_FAULT_NO_ACCESS] = "No Access",
};

static const char *low_unit_str[16] = {
   "DCMP", "UL1C", "CMP", "GSL1",    "IAP", "VCE",    "TE",  "RAS",
   "VDM",  "PPP",  "IPF", "IPF_CPF", "VF",  "VF_CPF", "ZLS", "UNK",
};

static const char *mid_unit_str[16] = {
   "UNK",     "dPM",      "dCDM_KS0", "dCDM_KS1", "dCDM_KS2", "dIPP",
   "dIPP_CS", "dVDM_CSD", "dVDM_SSD", "dVDM_ILF", "dVDM_ILD", "dRDE0",
   "dRDE1",   "FC",       "GSL2",     "UNK",
};

static const char *high_unit_str[16] = {
   "gPM_SP",         "gVDM_CSD_SP", "gVDM_SSD_SP",    "gVDM_ILF_SP",
   "gVDM_TFP_SP",    "gVDM_MMB_SP", "gCDM_CS_KS0_SP", "gCDM_CS_KS1_SP",
   "gCDM_CS_KS2_SP", "gCDM_KS0_SP", "gCDM_KS1_SP",    "gCDM_KS2_SP",
   "gIPP_SP",        "gIPP_CS_SP",  "gRDE0_SP",       "gRDE1_SP",
};
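
/* Fault unit decoding (used by agx_print_result below): units 0x00-0x9f index
 * low_unit_str by their low nibble, 0xa0-0xaf index mid_unit_str, 0xb0-0xb7 are
 * GL2CC_META instances, 0xb8 is GL2CC_MB, and 0xe0-0xff index high_unit_str.
 */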

static void
agx_print_result(struct agx_device *dev, struct agx_context *ctx,
                 struct drm_asahi_result_info *info, unsigned batch_idx,
                 bool is_compute)
{
   if (unlikely(info->status != DRM_ASAHI_STATUS_COMPLETE)) {
      ctx->any_faults = true;
   }

   if (likely(info->status == DRM_ASAHI_STATUS_COMPLETE &&
              !((dev)->debug & AGX_DBG_STATS)))
      return;

   if (is_compute) {
      struct drm_asahi_result_compute *r = (void *)info;
      float time =
         (r->ts_end - r->ts_start) / (float)dev->params.timer_frequency_hz;

      mesa_logw(
         "[Batch %d] Compute %s: %.06f\n", batch_idx,
         info->status < ARRAY_SIZE(status_str) ? status_str[info->status] : "?",
         time);
   } else {
      struct drm_asahi_result_render *r = (void *)info;
      float time_vtx = (r->vertex_ts_end - r->vertex_ts_start) /
                       (float)dev->params.timer_frequency_hz;
      float time_frag = (r->fragment_ts_end - r->fragment_ts_start) /
                        (float)dev->params.timer_frequency_hz;
      mesa_logw(
         "[Batch %d] Render %s: TVB %9ld/%9ld bytes (%d ovf) %c%c%c | vtx %.06f frag %.06f\n",
         batch_idx,
         info->status < ARRAY_SIZE(status_str) ? status_str[info->status] : "?",
         (long)r->tvb_usage_bytes, (long)r->tvb_size_bytes,
         (int)r->num_tvb_overflows,
         r->flags & DRM_ASAHI_RESULT_RENDER_TVB_GROW_OVF ? 'G' : ' ',
         r->flags & DRM_ASAHI_RESULT_RENDER_TVB_GROW_MIN ? 'M' : ' ',
         r->flags & DRM_ASAHI_RESULT_RENDER_TVB_OVERFLOWED ? 'O' : ' ',
         time_vtx, time_frag);
   }

   if (info->fault_type != DRM_ASAHI_FAULT_NONE) {
      const char *unit_name;
      int unit_index;

      switch (info->unit) {
      case 0x00 ... 0x9f:
         unit_name = low_unit_str[info->unit & 0xf];
         unit_index = info->unit >> 4;
         break;
      case 0xa0 ... 0xaf:
         unit_name = mid_unit_str[info->unit & 0xf];
         unit_index = 0;
         break;
      case 0xb0 ... 0xb7:
         unit_name = "GL2CC_META";
         unit_index = info->unit & 0x7;
         break;
      case 0xb8:
         unit_name = "GL2CC_MB";
         unit_index = 0;
         break;
      case 0xe0 ... 0xff:
         unit_name = high_unit_str[info->unit & 0xf];
         unit_index = (info->unit >> 4) & 1;
         break;
      default:
         unit_name = "UNK";
         unit_index = 0;
         break;
      }

      mesa_logw(
         "[Batch %d] Fault: %s : Addr 0x%llx %c Unit %02x (%s/%d) SB 0x%02x L%d Extra 0x%x\n",
         batch_idx,
         info->fault_type < ARRAY_SIZE(fault_type_str)
            ? fault_type_str[info->fault_type]
            : "?",
         (long long)info->address, info->is_read ? 'r' : 'W', info->unit,
         unit_name, unit_index, info->sideband, info->level, info->extra);

      agx_debug_fault(dev, info->address);
   }

   /* Obscurely, we need to tolerate faults to pass the robustness parts of the
    * CTS, so we can't assert that we don't fault. But it's helpful for any sort
    * of debugging to crash on fault.
    */
   if (dev->debug) {
      assert(info->status == DRM_ASAHI_STATUS_COMPLETE ||
             info->status == DRM_ASAHI_STATUS_KILLED);
   }
}

static void
agx_batch_print_stats(struct agx_device *dev, struct agx_batch *batch)
{
   unsigned batch_idx = agx_batch_idx(batch);

   if (!batch->result)
      return;

   if (batch->cdm.bo) {
      agx_print_result(dev, batch->ctx, &batch->result[0].compute.info,
                       batch_idx, true);
   }

   if (batch->vdm.bo) {
      agx_print_result(dev, batch->ctx, &batch->result[1].render.info,
                       batch_idx, false);
   }
}

static void
agx_batch_cleanup(struct agx_context *ctx, struct agx_batch *batch, bool reset)
{
   struct agx_device *dev = agx_device(ctx->base.screen);
   assert(batch->ctx == ctx);
   assert(agx_batch_is_submitted(batch));

   assert(ctx->batch != batch);

   uint64_t begin_ts = ~0, end_ts = 0;
   if (batch->result) {
      if (batch->cdm.bo) {
         begin_ts = MIN2(begin_ts, batch->result[0].compute.ts_start);
         end_ts = MAX2(end_ts, batch->result[0].compute.ts_end);
      }

      if (batch->vdm.bo) {
         begin_ts = MIN2(begin_ts, batch->result[1].render.vertex_ts_start);
         end_ts = MAX2(end_ts, batch->result[1].render.fragment_ts_end);
      }
   }

   agx_finish_batch_queries(batch, begin_ts, end_ts);

   if (reset) {
      int handle;
      AGX_BATCH_FOREACH_BO_HANDLE(batch, handle) {
         /* We should write no buffers if this is an empty batch */
         assert(agx_writer_get(ctx, handle) != batch);

         agx_bo_unreference(dev, agx_lookup_bo(dev, handle));
      }
   } else {
      int handle;
      AGX_BATCH_FOREACH_BO_HANDLE(batch, handle) {
         struct agx_bo *bo = agx_lookup_bo(dev, handle);

         /* Nothing this batch wrote has a writer on this context anymore */
         struct agx_batch *writer = agx_writer_get(ctx, handle);

         if (writer == batch)
            agx_writer_remove(ctx, handle);

         p_atomic_cmpxchg(&bo->writer,
                          agx_bo_writer(ctx->queue_id, batch->syncobj), 0);

         agx_bo_unreference(dev, agx_lookup_bo(dev, handle));
      }
   }

   agx_bo_unreference(dev, batch->vdm.bo);
   agx_bo_unreference(dev, batch->cdm.bo);
   agx_pool_cleanup(&batch->pool);
   agx_pool_cleanup(&batch->pipeline_pool);

   util_dynarray_fini(&batch->scissor);
   util_dynarray_fini(&batch->depth_bias);
   util_dynarray_fini(&batch->timestamps);

   if (!(dev->debug & (AGX_DBG_TRACE | AGX_DBG_SYNC))) {
      agx_batch_print_stats(dev, batch);
   }

   util_unreference_framebuffer_state(&batch->key);
   agx_batch_mark_complete(batch);
}

int
agx_cleanup_batches(struct agx_context *ctx)
{
   struct agx_device *dev = agx_device(ctx->base.screen);

   unsigned i;
   unsigned count = 0;
   struct agx_batch *batches[AGX_MAX_BATCHES];
   uint32_t syncobjs[AGX_MAX_BATCHES];
   uint32_t first = 0;

   foreach_submitted(ctx, i) {
      batches[count] = &ctx->batches.slots[i];
      syncobjs[count++] = ctx->batches.slots[i].syncobj;
   }

   if (!count)
      return -1;
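
   /* With a zero timeout and without DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL,
    * drmSyncobjWait() succeeds as soon as any syncobj has signaled and stores
    * its index in `first`; -ETIME means nothing has completed yet.
    */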

   int ret = drmSyncobjWait(dev->fd, syncobjs, count, 0, 0, &first);
   assert(!ret || ret == -ETIME);
   if (ret)
      return -1;

   assert(first < AGX_MAX_BATCHES);
   agx_batch_cleanup(ctx, batches[first], false);
   return agx_batch_idx(batches[first]);
}

static struct agx_batch *
agx_get_batch_for_framebuffer(struct agx_context *ctx,
                              const struct pipe_framebuffer_state *state)
{
   /* See if we already have a matching batch */
   unsigned i;
   foreach_active(ctx, i) {
      struct agx_batch *candidate = &ctx->batches.slots[i];

      if (util_framebuffer_state_equal(&candidate->key, state)) {
         /* We found a match; bump the seqnum for the LRU eviction logic. */
         candidate->seqnum = ++ctx->batches.seqnum;
         return candidate;
      }
   }

   /* Look for a free batch */
   for (i = 0; i < AGX_MAX_BATCHES; ++i) {
      if (!BITSET_TEST(ctx->batches.active, i) &&
          !BITSET_TEST(ctx->batches.submitted, i)) {
         struct agx_batch *batch = &ctx->batches.slots[i];
         agx_batch_init(ctx, state, batch);
         return batch;
      }
   }

   /* Try to clean up one batch */
   int freed = agx_cleanup_batches(ctx);
   if (freed >= 0) {
      struct agx_batch *batch = &ctx->batches.slots[freed];
      agx_batch_init(ctx, state, batch);
      return batch;
   }

   /* Otherwise, evict the least recently used batch */
   struct agx_batch *batch = NULL;
   bool submitted = false;
   for (i = 0; i < AGX_MAX_BATCHES; ++i) {
      struct agx_batch *candidate = &ctx->batches.slots[i];
      bool cand_submitted = BITSET_TEST(ctx->batches.submitted, i);

      /* Prefer submitted batches first */
      if (!cand_submitted && submitted)
         continue;

      if (!batch || batch->seqnum > candidate->seqnum) {
         batch = candidate;
         submitted = cand_submitted;
      }
   }
   assert(batch);

   agx_sync_batch_for_reason(ctx, batch, "Too many batches");

   /* Batch is now free */
   agx_batch_init(ctx, state, batch);
   return batch;
}

struct agx_batch *
agx_get_batch(struct agx_context *ctx)
{
   if (!ctx->batch || agx_batch_is_compute(ctx->batch)) {
      ctx->batch = agx_get_batch_for_framebuffer(ctx, &ctx->framebuffer);
      agx_dirty_all(ctx);
   }

   assert(util_framebuffer_state_equal(&ctx->framebuffer, &ctx->batch->key));
   return ctx->batch;
}

struct agx_batch *
agx_get_compute_batch(struct agx_context *ctx)
{
   agx_dirty_all(ctx);

   struct pipe_framebuffer_state key = {.width = AGX_COMPUTE_BATCH_WIDTH};
   ctx->batch = agx_get_batch_for_framebuffer(ctx, &key);
   return ctx->batch;
}

void
agx_flush_all(struct agx_context *ctx, const char *reason)
{
   unsigned idx;
   foreach_active(ctx, idx) {
      if (reason)
         perf_debug_ctx(ctx, "Flushing due to: %s\n", reason);

      agx_flush_batch(ctx, &ctx->batches.slots[idx]);
   }
}

void
agx_flush_batch_for_reason(struct agx_context *ctx, struct agx_batch *batch,
                           const char *reason)
{
   if (reason)
      perf_debug_ctx(ctx, "Flushing due to: %s\n", reason);

   if (agx_batch_is_active(batch))
      agx_flush_batch(ctx, batch);
}

static void
agx_flush_readers_except(struct agx_context *ctx, struct agx_resource *rsrc,
                         struct agx_batch *except, const char *reason,
                         bool sync)
{
   unsigned idx;

   /* Flush everything to the hardware first */
   foreach_active(ctx, idx) {
      struct agx_batch *batch = &ctx->batches.slots[idx];

      if (batch == except)
         continue;

      if (agx_batch_uses_bo(batch, rsrc->bo)) {
         perf_debug_ctx(ctx, "Flush reader due to: %s\n", reason);
         agx_flush_batch(ctx, batch);
      }
   }

   /* Then wait on everything if necessary */
   if (sync) {
      foreach_submitted(ctx, idx) {
         struct agx_batch *batch = &ctx->batches.slots[idx];

         if (batch == except)
            continue;

         if (agx_batch_uses_bo(batch, rsrc->bo)) {
            perf_debug_ctx(ctx, "Sync reader due to: %s\n", reason);
            agx_sync_batch(ctx, batch);
         }
      }
   }
}

static void
agx_flush_writer_except(struct agx_context *ctx, struct agx_resource *rsrc,
                        struct agx_batch *except, const char *reason, bool sync)
{
   struct agx_batch *writer = agx_writer_get(ctx, rsrc->bo->handle);

   if (writer && writer != except &&
       (agx_batch_is_active(writer) || agx_batch_is_submitted(writer))) {
      if (agx_batch_is_active(writer) || sync) {
         perf_debug_ctx(ctx, "%s writer due to: %s\n", sync ? "Sync" : "Flush",
                        reason);
      }
      if (agx_batch_is_active(writer))
         agx_flush_batch(ctx, writer);
      /* Check for submitted state, because if the batch was a no-op it'll
       * already be cleaned up */
      if (sync && agx_batch_is_submitted(writer))
         agx_sync_batch(ctx, writer);
   }
}

bool
agx_any_batch_uses_resource(struct agx_context *ctx, struct agx_resource *rsrc)
{
   unsigned idx;
   foreach_active(ctx, idx) {
      struct agx_batch *batch = &ctx->batches.slots[idx];

      if (agx_batch_uses_bo(batch, rsrc->bo))
         return true;
   }

   foreach_submitted(ctx, idx) {
      struct agx_batch *batch = &ctx->batches.slots[idx];

      if (agx_batch_uses_bo(batch, rsrc->bo))
         return true;
   }

   return false;
}

void
agx_flush_readers(struct agx_context *ctx, struct agx_resource *rsrc,
                  const char *reason)
{
   agx_flush_readers_except(ctx, rsrc, NULL, reason, false);
}

void
agx_sync_readers(struct agx_context *ctx, struct agx_resource *rsrc,
                 const char *reason)
{
   agx_flush_readers_except(ctx, rsrc, NULL, reason, true);
}

void
agx_flush_writer(struct agx_context *ctx, struct agx_resource *rsrc,
                 const char *reason)
{
   agx_flush_writer_except(ctx, rsrc, NULL, reason, false);
}

void
agx_sync_writer(struct agx_context *ctx, struct agx_resource *rsrc,
                const char *reason)
{
   agx_flush_writer_except(ctx, rsrc, NULL, reason, true);
}

void
agx_batch_reads(struct agx_batch *batch, struct agx_resource *rsrc)
{
   agx_batch_add_bo(batch, rsrc->bo);

   if (rsrc->separate_stencil)
      agx_batch_add_bo(batch, rsrc->separate_stencil->bo);

   /* Don't hazard track fake resources internally created for meta */
   if (!rsrc->base.screen)
      return;

   /* Hazard: read-after-write */
   agx_flush_writer_except(batch->ctx, rsrc, batch, "Read from another batch",
                           false);
}

static void
agx_batch_writes_internal(struct agx_batch *batch, struct agx_resource *rsrc,
                          unsigned level)
{
   struct agx_context *ctx = batch->ctx;
   struct agx_batch *writer = agx_writer_get(ctx, rsrc->bo->handle);

   assert(batch->initialized);

   agx_flush_readers_except(ctx, rsrc, batch, "Write from other batch", false);

   BITSET_SET(rsrc->data_valid, level);

   /* Nothing to do if we're already writing */
   if (writer == batch)
      return;

   /* Hazard: write-after-write, write-after-read */
   if (writer)
      agx_flush_writer(ctx, rsrc, "Multiple writers");

   /* Write is strictly stronger than a read */
   agx_batch_reads(batch, rsrc);

   writer = agx_writer_get(ctx, rsrc->bo->handle);
   assert(!writer || agx_batch_is_submitted(writer));

   /* We are now the new writer. Disregard the previous writer -- anything that
    * needs to wait for the writer going forward needs to wait for us.
    */
   agx_writer_remove(ctx, rsrc->bo->handle);
   agx_writer_add(ctx, agx_batch_idx(batch), rsrc->bo->handle);
   assert(agx_batch_is_active(batch));
}

void
agx_batch_writes(struct agx_batch *batch, struct agx_resource *rsrc,
                 unsigned level)
{
   agx_batch_writes_internal(batch, rsrc, level);

   if (rsrc->base.target == PIPE_BUFFER) {
      /* Assume BOs written by the GPU are fully valid */
      rsrc->valid_buffer_range.start = 0;
      rsrc->valid_buffer_range.end = ~0;
   }
}

void
agx_batch_writes_range(struct agx_batch *batch, struct agx_resource *rsrc,
                       unsigned offset, unsigned size)
{
   assert(rsrc->base.target == PIPE_BUFFER);
   agx_batch_writes_internal(batch, rsrc, 0);
   util_range_add(&rsrc->base, &rsrc->valid_buffer_range, offset,
                  offset + size);
}
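
/* Import the context's pending explicit in-fence (a sync file fd), if any, into
 * the context's reusable syncobj and return that syncobj handle for the submit
 * wait list; returns 0 when there is nothing to wait on. The fd is consumed.
 */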

static int
agx_get_in_sync(struct agx_context *ctx)
{
   struct agx_device *dev = agx_device(ctx->base.screen);

   if (ctx->in_sync_fd >= 0) {
      int ret =
         drmSyncobjImportSyncFile(dev->fd, ctx->in_sync_obj, ctx->in_sync_fd);
      assert(!ret);

      close(ctx->in_sync_fd);
      ctx->in_sync_fd = -1;

      return ctx->in_sync_obj;
   } else {
      return 0;
   }
}

static void
agx_add_sync(struct drm_asahi_sync *syncs, unsigned *count, uint32_t handle)
{
   if (!handle)
      return;

   syncs[(*count)++] = (struct drm_asahi_sync){
      .sync_type = DRM_ASAHI_SYNC_SYNCOBJ,
      .handle = handle,
   };
}

void
agx_batch_submit(struct agx_context *ctx, struct agx_batch *batch,
                 struct drm_asahi_cmd_compute *compute,
                 struct drm_asahi_cmd_render *render)
{
   struct agx_device *dev = agx_device(ctx->base.screen);
   struct agx_screen *screen = agx_screen(ctx->base.screen);

   bool feedback = dev->debug & (AGX_DBG_TRACE | AGX_DBG_SYNC | AGX_DBG_STATS);

#ifndef NDEBUG
   /* Debug builds always get feedback (for fault checks) */
   feedback = true;
#endif

   /* Timer queries use the feedback timestamping */
   feedback |= (batch->timestamps.size > 0);

   if (!feedback)
      batch->result = NULL;

   /* Allocate the worst-case sync array: one entry per possible BO in the batch,
    * plus two for the explicit gallium in-fence and the cross-context flush sync
    * point. This is not excessive for most workloads.
    */
   unsigned max_syncs = batch->bo_list.bit_count + 2;
   unsigned in_sync_count = 0;
   unsigned shared_bo_count = 0;
   struct drm_asahi_sync *in_syncs =
      malloc(max_syncs * sizeof(struct drm_asahi_sync));
   struct agx_bo **shared_bos = malloc(max_syncs * sizeof(struct agx_bo *));

   uint64_t wait_seqid = p_atomic_read(&screen->flush_wait_seqid);

   /* Elide syncing against our own queue */
   if (wait_seqid && wait_seqid == ctx->flush_my_seqid) {
      batch_debug(batch,
                  "Wait sync point %" PRIu64 " is ours, waiting on %" PRIu64
                  " instead",
                  wait_seqid, ctx->flush_other_seqid);
      wait_seqid = ctx->flush_other_seqid;
   }

   uint64_t seqid = p_atomic_inc_return(&screen->flush_cur_seqid);
   assert(seqid > wait_seqid);

   batch_debug(batch, "Sync point is %" PRIu64, seqid);

   /* Subtle concurrency note: Since we assign seqids atomically and do
    * not lock submission across contexts, it is possible for two threads
    * to submit timeline syncobj updates out of order. As far as I can
    * tell, this case is handled in the kernel conservatively: it triggers
    * a fence context bump and effectively "splits" the timeline at the
    * larger point, so future lookups for earlier points return a later point
    * and wait longer than strictly necessary. The signaling code still ensures
    * that all prior fences are signaled before a given point is considered
    * signaled, regardless of order. That's good enough for us.
    *
    * (Note: this case breaks drm_syncobj_query_ioctl and for this reason
    * triggers a DRM_DEBUG message on submission, but we don't use that
    * so we don't care.)
    *
    * This case can be tested by setting seqid = 1 unconditionally here,
    * causing every single syncobj update to reuse the same timeline point.
    * Everything still works (but over-synchronizes because this effectively
    * serializes all submissions once any context flushes once).
    */
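
   /* out_syncs[0] signals this batch's own syncobj; out_syncs[1] advances the
    * screen-wide flush timeline to the sync point we just allocated.
    */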
   struct drm_asahi_sync out_syncs[2] = {
      {
         .sync_type = DRM_ASAHI_SYNC_SYNCOBJ,
         .handle = batch->syncobj,
      },
      {
         .sync_type = DRM_ASAHI_SYNC_TIMELINE_SYNCOBJ,
         .handle = screen->flush_syncobj,
         .timeline_value = seqid,
      },
   };

   /* This lock protects against a subtle race scenario:
    * - Context 1 submits and registers itself as writer for a BO
    * - Context 2 runs the below loop, and finds the writer syncobj
    * - Context 1 is destroyed,
    *     - flushing all batches, unregistering itself as a writer, and
    *     - destroying syncobjs for all batches
    * - Context 2 submits, with a now invalid syncobj ID
    *
    * Since batch syncobjs are only destroyed on context destruction, we can
    * protect against this scenario with a screen-wide rwlock to ensure that
    * the syncobj destroy code cannot run concurrently with any other
    * submission. If a submit runs before the wrlock is taken, the syncobjs
    * must still exist (even if the batch was flushed and no longer a writer).
    * If it runs after the wrlock is released, then by definition the
    * just-destroyed syncobjs cannot be writers for any BO at that point.
    *
    * A screen-wide (not device-wide) rwlock is sufficient because by definition
    * resources can only be implicitly shared within a screen. Any shared
    * resources across screens must have been imported and will go through the
    * AGX_BO_SHARED path instead, which has no race (but is slower).
    */
   u_rwlock_rdlock(&screen->destroy_lock);
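
   /* The walk below pushes entries onto in_syncs and shared_bos in lockstep, so
    * for i < shared_bo_count, in_syncs[i] is the wait syncobj associated with
    * shared_bos[i]. The post-submit loop relies on this pairing to destroy the
    * temporary syncobjs and to attach our out-fence to each shared BO.
    */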

   int handle;
   AGX_BATCH_FOREACH_BO_HANDLE(batch, handle) {
      struct agx_bo *bo = agx_lookup_bo(dev, handle);

      if (bo->flags & AGX_BO_SHARED) {
         batch_debug(batch, "Waits on shared BO @ 0x%" PRIx64, bo->va->addr);

         /* Get a sync file fd from the buffer */
         int in_sync_fd = agx_export_sync_file(dev, bo);
         assert(in_sync_fd >= 0);

         /* Create a new syncobj */
         uint32_t sync_handle;
         int ret = drmSyncobjCreate(dev->fd, 0, &sync_handle);
         assert(ret >= 0);

         /* Import the sync file into it */
         ret = drmSyncobjImportSyncFile(dev->fd, sync_handle, in_sync_fd);
         assert(ret >= 0);
         assert(sync_handle);
         close(in_sync_fd);

         /* Add it to our wait list */
         agx_add_sync(in_syncs, &in_sync_count, sync_handle);

         /* And keep track of the BO for cloning the out_sync */
         shared_bos[shared_bo_count++] = bo;
      } else {
         /* Deal with BOs which are not externally shared, but which have been
          * written from another context within the same screen. We also need to
          * wait on these using their syncobj.
          */
         uint64_t writer = p_atomic_read_relaxed(&bo->writer);
         uint32_t queue_id = agx_bo_writer_queue(writer);
         if (writer && queue_id != ctx->queue_id) {
            batch_debug(
               batch, "Waits on inter-context BO @ 0x%" PRIx64 " from queue %u",
               bo->va->addr, queue_id);

            agx_add_sync(in_syncs, &in_sync_count,
                         agx_bo_writer_syncobj(writer));
            shared_bos[shared_bo_count++] = NULL;
         }
      }
   }

   if (dev->debug & AGX_DBG_SCRATCH) {
      if (compute)
         agx_scratch_debug_pre(&ctx->scratch_cs);
      if (render) {
         agx_scratch_debug_pre(&ctx->scratch_vs);
         agx_scratch_debug_pre(&ctx->scratch_fs);
      }
   }

   /* Add an explicit fence from gallium, if any */
   agx_add_sync(in_syncs, &in_sync_count, agx_get_in_sync(ctx));

   /* Add an implicit cross-context flush sync point, if any */
   if (wait_seqid) {
      batch_debug(batch, "Waits on inter-context sync point %" PRIu64,
                  wait_seqid);
      in_syncs[in_sync_count++] = (struct drm_asahi_sync){
         .sync_type = DRM_ASAHI_SYNC_TIMELINE_SYNCOBJ,
         .handle = screen->flush_syncobj,
         .timeline_value = wait_seqid,
      };
   }

   /* Submit! */
   struct drm_asahi_command commands[2];
   unsigned command_count = 0;

   if (compute) {
      commands[command_count++] = (struct drm_asahi_command){
         .cmd_type = DRM_ASAHI_CMD_COMPUTE,
         .flags = 0,
         .cmd_buffer = (uint64_t)(uintptr_t)compute,
         .cmd_buffer_size = sizeof(struct drm_asahi_cmd_compute),
         .result_offset = feedback ? batch->result_off : 0,
         .result_size = feedback ? sizeof(union agx_batch_result) : 0,
         /* Barrier on previous submission */
         .barriers = {0, 0},
      };
   }

   if (render) {
      commands[command_count++] = (struct drm_asahi_command){
         .cmd_type = DRM_ASAHI_CMD_RENDER,
         .flags = 0,
         .cmd_buffer = (uint64_t)(uintptr_t)render,
         .cmd_buffer_size = sizeof(struct drm_asahi_cmd_render),
         .result_offset =
            feedback ? (batch->result_off + sizeof(union agx_batch_result)) : 0,
         .result_size = feedback ? sizeof(union agx_batch_result) : 0,
         /* Barrier on previous submission */
         .barriers = {compute ? DRM_ASAHI_BARRIER_NONE : 0, compute ? 1 : 0},
      };
   }

   struct drm_asahi_submit submit = {
      .flags = 0,
      .queue_id = ctx->queue_id,
      .result_handle = feedback ? ctx->result_buf->handle : 0,
      .in_sync_count = in_sync_count,
      .out_sync_count = 2,
      .command_count = command_count,
      .in_syncs = (uint64_t)(uintptr_t)(in_syncs),
      .out_syncs = (uint64_t)(uintptr_t)(out_syncs),
      .commands = (uint64_t)(uintptr_t)(&commands[0]),
   };

   int ret = dev->ops.submit(dev, &submit, ctx->result_buf->vbo_res_id);

   u_rwlock_rdunlock(&screen->destroy_lock);

   if (ret) {
      if (compute) {
         fprintf(stderr, "DRM_IOCTL_ASAHI_SUBMIT compute failed: %m\n");
      }

      if (render) {
         struct drm_asahi_cmd_render *c = render;
         fprintf(
            stderr,
            "DRM_IOCTL_ASAHI_SUBMIT render failed: %m (%dx%d tile %dx%d layers %d samples %d)\n",
            c->fb_width, c->fb_height, c->utile_width, c->utile_height,
            c->layers, c->samples);
      }

      assert(0);
   }

   if (ret == ENODEV)
      abort();

   /* Now stash our batch fence into any shared BOs. */
   if (shared_bo_count) {
      /* Convert our handle to a sync file */
      int out_sync_fd = -1;
      int ret = drmSyncobjExportSyncFile(dev->fd, batch->syncobj, &out_sync_fd);
      assert(ret >= 0);
      assert(out_sync_fd >= 0);

      for (unsigned i = 0; i < shared_bo_count; i++) {
         if (!shared_bos[i])
            continue;

         batch_debug(batch, "Signals shared BO @ 0x%" PRIx64,
                     shared_bos[i]->va->addr);

         /* Destroy the temporary wait syncobj we created for this BO */
         ret = drmSyncobjDestroy(dev->fd, in_syncs[i].handle);
         assert(ret >= 0);
         /* And then import the out_sync sync file into it */
         ret = agx_import_sync_file(dev, shared_bos[i], out_sync_fd);
         assert(ret >= 0);
      }

      close(out_sync_fd);
   }

   /* Record the syncobj on each BO we write, so it can be added post-facto as a
    * fence if the BO is exported later...
    */
   AGX_BATCH_FOREACH_BO_HANDLE(batch, handle) {
      struct agx_bo *bo = agx_lookup_bo(dev, handle);
      struct agx_batch *writer = agx_writer_get(ctx, handle);

      if (!writer)
         continue;

      /* Skip BOs written by already-submitted batches; they're not ours */
      if (agx_batch_is_submitted(writer))
         continue;

      /* But any BOs written by active batches are ours */
      assert(writer == batch && "exclusive writer");
      p_atomic_set(&bo->writer, agx_bo_writer(ctx->queue_id, batch->syncobj));
      batch_debug(batch, "Writes to BO @ 0x%" PRIx64, bo->va->addr);
   }

   free(in_syncs);
   free(shared_bos);

   if (dev->debug & (AGX_DBG_TRACE | AGX_DBG_SYNC | AGX_DBG_SCRATCH)) {
      if (dev->debug & AGX_DBG_TRACE) {
         if (compute) {
            agxdecode_drm_cmd_compute(dev->agxdecode, &dev->params, compute,
                                      true);
         }

         if (render) {
            agxdecode_drm_cmd_render(dev->agxdecode, &dev->params, render,
                                     true);
         }

         agxdecode_next_frame();
      }

      /* Wait so we can get errors reported back */
      int ret = drmSyncobjWait(dev->fd, &batch->syncobj, 1, INT64_MAX, 0, NULL);
      assert(!ret);

      agx_batch_print_stats(dev, batch);

      if (dev->debug & AGX_DBG_SCRATCH) {
         if (compute) {
            fprintf(stderr, "CS scratch:\n");
            agx_scratch_debug_post(&ctx->scratch_cs);
         }
         if (render) {
            fprintf(stderr, "VS scratch:\n");
            agx_scratch_debug_post(&ctx->scratch_vs);
            fprintf(stderr, "FS scratch:\n");
            agx_scratch_debug_post(&ctx->scratch_fs);
         }
      }
   }

   agx_batch_mark_submitted(batch);

   /* Record the last syncobj for fence creation */
   ctx->syncobj = batch->syncobj;

   /* Update the last seqid in the context (must only happen if the submit
    * succeeded, otherwise the timeline point would not be valid).
    */
   ctx->flush_last_seqid = seqid;

   if (ctx->batch == batch)
      ctx->batch = NULL;

   /* Try to clean up at most two batches, to keep memory usage down */
   if (agx_cleanup_batches(ctx) >= 0)
      agx_cleanup_batches(ctx);
}

void
agx_sync_batch(struct agx_context *ctx, struct agx_batch *batch)
{
   struct agx_device *dev = agx_device(ctx->base.screen);

   if (agx_batch_is_active(batch))
      agx_flush_batch(ctx, batch);

   /* Empty batch case, already cleaned up */
   if (!agx_batch_is_submitted(batch))
      return;

   assert(batch->syncobj);
   int ret = drmSyncobjWait(dev->fd, &batch->syncobj, 1, INT64_MAX, 0, NULL);
   assert(!ret);
   agx_batch_cleanup(ctx, batch, false);
}

void
agx_sync_batch_for_reason(struct agx_context *ctx, struct agx_batch *batch,
                          const char *reason)
{
   if (reason)
      perf_debug_ctx(ctx, "Syncing due to: %s\n", reason);

   agx_sync_batch(ctx, batch);
}

void
agx_sync_all(struct agx_context *ctx, const char *reason)
{
   if (reason)
      perf_debug_ctx(ctx, "Syncing all due to: %s\n", reason);

   unsigned idx;
   foreach_active(ctx, idx) {
      agx_flush_batch(ctx, &ctx->batches.slots[idx]);
   }

   foreach_submitted(ctx, idx) {
      agx_sync_batch(ctx, &ctx->batches.slots[idx]);
   }
}

void
agx_batch_reset(struct agx_context *ctx, struct agx_batch *batch)
{
   batch_debug(batch, "RESET");

   assert(!batch->initialized);

   /* Reset an empty batch. Like submit, but does nothing. */
   agx_batch_mark_submitted(batch);

   if (ctx->batch == batch)
      ctx->batch = NULL;

   /* Elide printing stats */
   batch->result = NULL;

   agx_batch_cleanup(ctx, batch, true);
}

/*
 * Timestamp queries record the time after all current work is finished,
 * which we handle as the time after all current batches finish (since we're a
 * tiler and would rather not split the batch). So add a query to all active
 * batches.
 */
void
agx_add_timestamp_end_query(struct agx_context *ctx, struct agx_query *q)
{
   unsigned idx;
   foreach_active(ctx, idx) {
      agx_batch_add_timestamp_query(&ctx->batches.slots[idx], q);
   }
}

/*
 * To implement a memory barrier conservatively, flush any batch that contains
 * an incoherent memory write (requiring a memory barrier to synchronize). This
 * could be further optimized.
 */
void
agx_memory_barrier(struct pipe_context *pctx, unsigned flags)
{
   struct agx_context *ctx = agx_context(pctx);

   unsigned i;
   foreach_active(ctx, i) {
      struct agx_batch *batch = &ctx->batches.slots[i];

      if (batch->incoherent_writes)
         agx_flush_batch_for_reason(ctx, batch, "Memory barrier");
   }
}
