/*
 * Copyright © 2022 Igalia S.L.
 * SPDX-License-Identifier: MIT
 */

#include <assert.h>
#include <ctype.h>
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <getopt.h>
#include <inttypes.h>
#include <signal.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <libgen.h>
#if FD_REPLAY_KGSL
#include "../vulkan/msm_kgsl.h"
#elif FD_REPLAY_MSM
#include <xf86drm.h>
#include "drm-uapi/msm_drm.h"
#elif FD_REPLAY_WSL
#define __KERNEL__
#include "drm-uapi/d3dkmthk.h"
#endif

#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/wait.h>

#include "util/os_time.h"
#include "util/rb_tree.h"
#include "util/u_vector.h"
#include "util/vma.h"
#include "buffers.h"
#include "cffdec.h"
#include "io.h"
#include "redump.h"
#include "rdutil.h"

/**
 * Replay command stream obtained from:
 * - /sys/kernel/debug/dri/0/rd
 * - /sys/kernel/debug/dri/0/hangrd
 * !!! Command stream capture should be done with ALL buffers:
 * - echo 1 > /sys/module/msm/parameters/rd_full
 *
 * Requires kernel with MSM_INFO_SET_IOVA support.
 * In case userspace IOVAs are not supported, like on KGSL, we have to
 * pre-allocate a single buffer and hope it is always allocated starting
 * at the same address.
 *
 * TODO: Misrendering, would require marking framebuffer images
 *       at each renderpass in order to fetch and decode them.
 *
 * Code from Freedreno/Turnip is not re-used here since the relevant
 * pieces may introduce additional allocations which cannot be allowed
 * during the replay.
 *
 * For how-to see freedreno.rst
 */

/* Process name given with -e/--exe; stays NULL when the option is not used. */
static const char *exename = NULL;

/* Size in bytes (1 GiB) of the single buffer that is pre-allocated to stand in
 * for the GPU address space when buffers cannot be placed at chosen IOVAs.
 */
static const uint64_t FAKE_ADDRESS_SPACE_SIZE = 1024 * 1024 * 1024;

/* Forward declaration so that main() can call it ahead of its definition. */
static int handle_file(const char *filename, uint32_t first_submit,
                       uint32_t last_submit, uint32_t submit_to_override,
                       uint64_t base_addr, const char *cmdstreamgen);

/**
 * Print the command line help to stderr and terminate the process.
 *
 * @param name           program name shown in the synopsis line
 * @param default_csgen  path reported as the default for --generator
 *
 * Never returns: the process exits with status 2.
 */
static void
print_usage(const char *name, const char *default_csgen)
{
   /* clang-format off */
   static const char usage_fmt[] =
      "Usage:\n\n"
      "\t%s [OPTIONS]... FILE...\n\n"
      "Options:\n"
      "\t-e, --exe=NAME         - only use cmdstream from named process\n"
      "\t-o  --override=submit  - № of the submit to override\n"
      "\t-g  --generator=path   - executable which generate cmdstream for override (default: %s)\n"
      "\t-f  --first=submit     - first submit № to replay\n"
      "\t-l  --last=submit      - last submit № to replay\n"
      "\t-a  --address=address  - base iova address on WSL\n"
      "\t-h, --help             - show this message\n";
   /* clang-format on */

   fprintf(stderr, usage_fmt, name, default_csgen);
   exit(2);
}

/* clang-format off */
/* Long options accepted by main(); every entry maps onto the short option
 * letter handled in main()'s switch.
 *
 * getopt_long() finds the end of the table by looking for an all-zero element,
 * so the terminator entry must stay last.
 */
static const struct option opts[] = {
      { "exe",       required_argument, 0, 'e' },
      { "override",  required_argument, 0, 'o' },
      { "generator", required_argument, 0, 'g' },
      { "first",     required_argument, 0, 'f' },
      { "last",      required_argument, 0, 'l' },
      { "address",   required_argument, 0, 'a' },
      { "help",      no_argument,       0, 'h' },
      { 0,           0,                 0, 0   },
};
/* clang-format on */

/**
 * Entry point: parse the options, then replay every FILE argument in order.
 *
 * The default cmdstream generator is "generate_rd" in the directory of the
 * running executable (derived from argv[0]).
 *
 * @return the result of handle_file() for the last file processed. When no
 *         file was given, or the last file failed, the usage text is printed
 *         and the process exits with status 2 instead.
 */
int
main(int argc, char **argv)
{
   int ret = -1;
   int c;

   uint32_t submit_to_override = -1;
   uint32_t first_submit = 0;
   uint32_t last_submit = -1;
   uint64_t base_addr = 0;

   /* dirname() is allowed to modify the string it is given. Work on a copy so
    * that argv[0] is still the program name when the usage text is printed.
    */
   char exe_path[PATH_MAX];
   snprintf(exe_path, sizeof(exe_path), "%s", argv[0]);

   char *default_csgen = malloc(PATH_MAX);
   if (!default_csgen)
      err(1, "malloc failure");
   snprintf(default_csgen, PATH_MAX, "%s/generate_rd", dirname(exe_path));

   const char *csgen = default_csgen;

   while ((c = getopt_long(argc, argv, "e:o:g:f:l:a:h", opts, NULL)) != -1) {
      switch (c) {
      case 0:
         /* option that set a flag, nothing to do */
         break;
      case 'e':
         exename = optarg;
         break;
      case 'o':
         submit_to_override = strtoul(optarg, NULL, 0);
         break;
      case 'g':
         csgen = optarg;
         break;
      case 'f':
         first_submit = strtoul(optarg, NULL, 0);
         break;
      case 'l':
         last_submit = strtoul(optarg, NULL, 0);
         break;
      case 'a':
         base_addr = strtoull(optarg, NULL, 0);
         break;
      case 'h':
      default:
         /* print_usage() does not return. */
         print_usage(argv[0], default_csgen);
      }
   }

   while (optind < argc) {
      ret = handle_file(argv[optind], first_submit, last_submit,
                        submit_to_override, base_addr, csgen);
      if (ret) {
         fprintf(stderr, "error reading: %s\n", argv[optind]);
         fprintf(stderr, "continuing..\n");
      }
      optind++;
   }

   /* ret is still -1 here when no FILE argument was supplied. */
   if (ret)
      print_usage(argv[0], default_csgen);

   return ret;
}

/* One GPU buffer recreated for the replay, keyed by its GPU address. */
struct buffer {
   /* Tree linkage. Must remain the first member: the rb-tree comparators cast
    * a struct rb_node pointer straight to a struct buffer pointer.
    */
   struct rb_node node;

   /* GEM handle of the backing object (filled in by the MSM backend only). */
   uint32_t gem_handle;
   /* Size of the buffer in bytes. */
   uint64_t size;
   /* GPU virtual address of the first byte. */
   uint64_t iova;
   /* CPU pointer to the first byte of the buffer. */
   void *map;

   /* Set by upload_buffer(), cleared by device_mark_buffers(). */
   bool used;
   /* Extra per-submit BO flags (MSM backend); reset to 0 after each submit. */
   uint32_t flags;
};

/* A command stream queued in device.cmdstreams for the next submission. */
struct cmdstream {
   /* GPU virtual address where the commands start. */
   uint64_t iova;
   /* Length in bytes (the WSL backend divides it by 4 to get dwords). */
   uint64_t size;
};

/* A GPU memory range that is written to a file after the submission. */
struct wrbuf {
   /* GPU virtual address of the first byte to dump. */
   uint64_t iova;
   /* Number of bytes requested; clamped to the backing buffer when dumping. */
   uint64_t size;
   /* File name used for the dump inside the "buffers" directory. */
   char* name;
};

/* State of one replay session: the opened GPU device, the buffers currently
 * uploaded and the work queued for the next submission.
 */
struct device {
   /* File descriptor of the opened GPU device node. */
   int fd;

   /* All live struct buffer objects, ordered by iova. */
   struct rb_tree buffers;
   /* Allocator that tracks which GPU address ranges are in use. */
   struct util_vma_heap vma;

   /* Pending struct cmdstream entries. */
   struct u_vector cmdstreams;

   /* GPU addresses of the logs printed after a submission; 0 means "none". */
   uint64_t shader_log_iova;
   uint64_t cp_log_iova;

   /* True when each buffer gets its own kernel object bound at the recorded
    * iova; false when all buffers live inside the single region below.
    */
   bool has_set_iova;

   /* The single pre-allocated region used when has_set_iova is false:
    * kernel handle/id, CPU mapping and GPU base address.
    */
   uint32_t va_id;
   void *va_map;
   uint64_t va_iova;

   /* struct wrbuf entries to dump after the submission. */
   struct u_vector wrbufs;

#ifdef FD_REPLAY_MSM
   /* Submit queue created in device_create(). */
   uint32_t queue_id;
#endif

#ifdef FD_REPLAY_KGSL
   /* Draw context created in device_create(). */
   uint32_t context_id;
#endif

#ifdef FD_REPLAY_WSL
   struct d3dkmthandle device;
   struct d3dkmthandle context;

   /* No good way to wait for a submission to complete on WSL is known at the
    * moment, so a fence of our own is used: a small IB appended to every
    * submission writes a marker to *fence, which the CPU then polls.
    */
   uint64_t fence_iova;
   uint64_t fence_ib_iova;
   volatile uint32_t *fence;
   uint32_t *fence_ib;
#endif
};

void buffer_mem_free(struct device *dev, struct buffer *buf);

/* Ordering callback used when inserting into device.buffers: orders two
 * buffers by GPU address and returns 1, -1 or 0.
 */
static int
rb_buffer_insert_cmp(const struct rb_node *n1, const struct rb_node *n2)
{
   const uint64_t lhs = ((const struct buffer *)n1)->iova;
   const uint64_t rhs = ((const struct buffer *)n2)->iova;

   /* Compare instead of subtracting: a 64-bit address difference does not
    * fit into the int result.
    */
   return (lhs > rhs) - (lhs < rhs);
}

/* Lookup callback for device.buffers: addrptr points to a uint64_t GPU
 * address. Returns 0 when the address lies inside the buffer, -1 when the
 * buffer ends at or before it and 1 when the buffer starts after it.
 */
static int
rb_buffer_search_cmp(const struct rb_node *node, const void *addrptr)
{
   const struct buffer *candidate = (const struct buffer *)node;
   const uint64_t target = *(const uint64_t *)addrptr;

   if (target >= candidate->iova + candidate->size)
      return -1;

   return (target < candidate->iova) ? 1 : 0;
}

/* Return the buffer whose address range contains iova, or NULL when there is
 * none. Address 0 is never looked up and always yields NULL.
 */
static struct buffer *
device_get_buffer(struct device *dev, uint64_t iova)
{
   return iova ? (struct buffer *)rb_tree_search(&dev->buffers, &iova,
                                                 rb_buffer_search_cmp)
               : NULL;
}

/* Clear the "used" flag of every buffer tracked by the device.
 * upload_buffer() sets the flag again on each buffer it touches.
 */
static void
device_mark_buffers(struct device *dev)
{
   rb_tree_foreach_safe (struct buffer, buf, &dev->buffers, node) {
      buf->used = false;
   }
}

/* Tear down every tracked buffer: unlink it from the tree, release its
 * backing memory through the backend and free the bookkeeping struct.
 */
static void
device_free_buffers(struct device *dev)
{
   rb_tree_foreach_safe (struct buffer, victim, &dev->buffers, node) {
      rb_tree_remove(&dev->buffers, &victim->node);
      buffer_mem_free(dev, victim);
      free(victim);
   }
}

/**
 * Print the shader log, if one was registered (dev->shader_log_iova != 0) and
 * a buffer backs it.
 *
 * The log starts with a 64-bit GPU address (cur_iova) followed by 32-bit
 * entries; the number of entries is derived from the distance between
 * cur_iova and the first entry. Each entry is printed both as hex and as
 * float.
 *
 * cur_iova comes from GPU-visible memory, so it is clamped to the backing
 * buffer before being used; otherwise a stale or corrupted value would make
 * the loop read outside of the mapping.
 */
static void
device_print_shader_log(struct device *dev)
{
   struct shader_log {
      uint64_t cur_iova;
      union {
         uint32_t entries_u32[0];
         float entries_float[0];
      };
   };

   if (dev->shader_log_iova == 0)
      return;

   struct buffer *buf = device_get_buffer(dev, dev->shader_log_iova);
   if (!buf)
      return;

   const uint64_t log_offset = dev->shader_log_iova - buf->iova;
   const uint64_t header_size = offsetof(struct shader_log, entries_u32);

   /* The header itself has to fit into what is left of the buffer. */
   if (buf->size - log_offset < header_size)
      return;

   struct shader_log *log = (void *)((uint8_t *)buf->map + log_offset);

   const uint64_t entries_iova = dev->shader_log_iova + header_size;
   const uint64_t buf_end_iova = buf->iova + buf->size;

   uint64_t end_iova = log->cur_iova;
   if (end_iova < entries_iova || end_iova > buf_end_iova) {
      fprintf(stderr,
              "Warning: shader log write pointer 0x%" PRIx64
              " is outside of its buffer, clamping\n",
              end_iova);
      end_iova = (end_iova < entries_iova) ? entries_iova : buf_end_iova;
   }

   uint32_t count = (end_iova - entries_iova) / 4;

   printf("Shader Log Entries: %u\n", count);

   for (uint32_t i = 0; i < count; i++) {
      printf("[%u] %08x %.4f\n", i, log->entries_u32[i],
             log->entries_float[i]);
   }

   printf("========================================\n");
}

/**
 * Print the CP log, if one was registered (dev->cp_log_iova != 0) and a
 * buffer backs it.
 *
 * The log is a struct cp_log header whose last field doubles as the size of
 * the first entry. Entries are laid out back to back, each one being a 64-bit
 * byte size followed by that many bytes of data; a size of 0 ends the list.
 *
 * Entries are located relative to the log itself, so the log does not need to
 * sit at the very beginning of its buffer, and the walk stops at the end of
 * the buffer even when no terminating entry is found.
 */
static void
device_print_cp_log(struct device *dev)
{
   struct cp_log {
      uint64_t cur_iova;
      uint64_t tmp;
      uint64_t first_entry_size;
   };

   struct cp_log_entry {
      uint64_t size;
      uint32_t data[0];
   };

   if (dev->cp_log_iova == 0)
      return;

   struct buffer *buf = device_get_buffer(dev, dev->cp_log_iova);
   if (!buf)
      return;

   uint8_t *log_base = (uint8_t *)buf->map + (dev->cp_log_iova - buf->iova);
   uint8_t *buf_end = (uint8_t *)buf->map + buf->size;

   if ((uint64_t)(buf_end - log_base) < sizeof(struct cp_log))
      return;

   struct cp_log *log = (void *)log_base;
   if (log->first_entry_size == 0)
      return;

   const uint64_t entry_header_size = offsetof(struct cp_log_entry, data);

   /* The first entry overlaps the header's first_entry_size field. */
   uint8_t *cursor = log_base + offsetof(struct cp_log, first_entry_size);
   uint32_t idx = 0;

   while ((uint64_t)(buf_end - cursor) >= entry_header_size) {
      struct cp_log_entry *log_entry = (void *)cursor;
      const uint64_t entry_size = log_entry->size;

      if (entry_size == 0)
         break;

      if (entry_size > (uint64_t)(buf_end - cursor) - entry_header_size) {
         fprintf(stderr, "Warning: CP log entry %u overruns its buffer\n", idx);
         break;
      }

      printf("\nCP Log [%u]:\n", idx++);
      uint32_t dwords = entry_size / 4;

      for (uint32_t i = 0; i < dwords; i++) {
         if (i % 8 == 0)
            printf("\t");
         printf("%08x ", log_entry->data[i]);
         if (i % 8 == 7)
            printf("\n");
      }
      printf("\n");

      cursor += entry_header_size + entry_size;
   }
}

/**
 * Write every range registered in dev->wrbufs to "<cwd>/buffers/<name>".
 *
 * The directory is removed if it is empty and then created; failures of
 * either step are ignored on purpose (the directory may already exist and
 * hold files). A range that extends past its backing buffer is clamped with a
 * warning. Errors concerning one entry are reported on stderr and the
 * remaining entries are still processed.
 */
static void
device_dump_wrbuf(struct device *dev)
{
   if (!u_vector_length(&dev->wrbufs))
      return;

   char cwd[PATH_MAX];
   if (!getcwd(cwd, sizeof(cwd))) {
      fprintf(stderr, "Error getting current directory: %s\n",
              strerror(errno));
      return;
   }

   char buffer_dir[PATH_MAX];
   int dir_len = snprintf(buffer_dir, sizeof(buffer_dir), "%s/buffers", cwd);
   if (dir_len < 0 || (size_t)dir_len >= sizeof(buffer_dir)) {
      fprintf(stderr, "Error: path of the buffers directory is too long\n");
      return;
   }

   rmdir(buffer_dir);
   mkdir(buffer_dir, 0777);

   struct wrbuf *wrbuf;
   u_vector_foreach(wrbuf, &dev->wrbufs) {
      char buffer_path[PATH_MAX];
      snprintf(buffer_path, sizeof(buffer_path), "%s/%s", buffer_dir, wrbuf->name);

      FILE *f = fopen(buffer_path, "wb");
      if (!f) {
         /* Nothing to close here: fclose(NULL) is undefined behaviour. */
         fprintf(stderr, "Error opening %s\n", buffer_path);
      } else {
         struct buffer *buf = device_get_buffer(dev, wrbuf->iova);
         if (!buf) {
            fprintf(stderr, "Error getting buffer for %s\n", buffer_path);
         } else {
            uint64_t offset = wrbuf->iova - buf->iova;
            uint64_t size = MIN2(wrbuf->size, buf->size - offset);
            if (size != wrbuf->size) {
               fprintf(stderr, "Warning: Clamping buffer %s as it's smaller than expected (0x%" PRIx64 " < 0x%" PRIx64 ")\n", wrbuf->name, size, wrbuf->size);
            }

            printf("Dumping %s (0x%" PRIx64 " - 0x%" PRIx64 ")\n", wrbuf->name, wrbuf->iova, wrbuf->iova + size);

            /* fwrite() reports 0 items for a zero-sized element, which is
             * not an error.
             */
            if (size != 0 &&
                fwrite((uint8_t *)buf->map + offset, size, 1, f) != 1) {
               fprintf(stderr, "Error writing %s\n", buffer_path);
            }
         }

         if (fclose(f) != 0)
            fprintf(stderr, "Error closing %s\n", buffer_path);
      }
   }
}

#if FD_REPLAY_MSM
/**
 * Fill *tv with the absolute CLOCK_MONOTONIC time that lies ns nanoseconds in
 * the future.
 *
 * The nanosecond part is normalised so that tv_nsec always stays below one
 * second; the overflow is carried into tv_sec.
 */
static inline void
get_abs_timeout(struct drm_msm_timespec *tv, uint64_t ns)
{
   struct timespec t;
   clock_gettime(CLOCK_MONOTONIC, &t);

   const uint64_t nsec = (uint64_t)t.tv_nsec + ns % 1000000000;

   tv->tv_sec = t.tv_sec + ns / 1000000000 + nsec / 1000000000;
   tv->tv_nsec = nsec % 1000000000;
}

/**
 * Open the MSM render node and set up the replay state.
 *
 * When the kernel reports both MSM_PARAM_VA_START and MSM_PARAM_VA_SIZE,
 * buffers are later allocated one by one at their recorded addresses
 * (dev->has_set_iova is true). Otherwise a single buffer of
 * FAKE_ADDRESS_SPACE_SIZE bytes is allocated and mapped up front and every
 * replayed buffer is carved out of it.
 *
 * @param base_addr  not used by this backend
 * @return the new device; any failure terminates the process.
 */
static struct device *
device_create(uint64_t base_addr)
{
   struct device *dev = calloc(1, sizeof(*dev));
   if (!dev) {
      err(1, "calloc failure");
   }

   dev->fd = drmOpenWithType("msm", NULL, DRM_NODE_RENDER);
   if (dev->fd < 0) {
      errx(1, "Cannot open MSM fd!");
   }

   uint64_t va_start = 0, va_size = 0;

   struct drm_msm_param req = {
      .pipe = MSM_PIPE_3D0,
      .param = MSM_PARAM_VA_START,
   };

   int ret = drmCommandWriteRead(dev->fd, DRM_MSM_GET_PARAM, &req, sizeof(req));
   va_start = req.value;

   if (!ret) {
      req.param = MSM_PARAM_VA_SIZE;
      ret = drmCommandWriteRead(dev->fd, DRM_MSM_GET_PARAM, &req, sizeof(req));
      va_size = req.value;
   }

   /* Only trust user-chosen IOVAs when both queries succeeded; otherwise the
    * fallback below is used and the flag has to say so.
    */
   dev->has_set_iova = (ret == 0);

   if (!dev->has_set_iova) {
      printf("MSM_INFO_SET_IOVA is not supported!\n");

      struct drm_msm_gem_new req_new = {.size = FAKE_ADDRESS_SPACE_SIZE, .flags = MSM_BO_CACHED_COHERENT};
      ret = drmCommandWriteRead(dev->fd, DRM_MSM_GEM_NEW, &req_new, sizeof(req_new));
      if (ret) {
         err(1, "DRM_MSM_GEM_NEW failure %d", ret);
      }
      dev->va_id = req_new.handle;

      struct drm_msm_gem_info req_info = {
         .handle = req_new.handle,
         .info = MSM_INFO_GET_IOVA,
      };

      ret = drmCommandWriteRead(dev->fd, DRM_MSM_GEM_INFO, &req_info,
                                sizeof(req_info));
      if (ret) {
         err(1, "MSM_INFO_GET_IOVA failure %d", ret);
      }
      dev->va_iova = req_info.value;

      struct drm_msm_gem_info req_offset = {
         .handle = req_new.handle,
         .info = MSM_INFO_GET_OFFSET,
      };

      ret = drmCommandWriteRead(dev->fd, DRM_MSM_GEM_INFO, &req_offset,
                                sizeof(req_offset));
      if (ret) {
         err(1, "MSM_INFO_GET_OFFSET failure %d", ret);
      }

      dev->va_map = mmap(0, FAKE_ADDRESS_SPACE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
                       dev->fd, req_offset.value);
      if (dev->va_map == MAP_FAILED) {
         err(1, "mmap failure");
      }

      va_start = dev->va_iova;
      va_size = FAKE_ADDRESS_SPACE_SIZE;

      printf("Allocated iova %" PRIx64 "\n", dev->va_iova);
   }

   struct drm_msm_submitqueue req_queue = {
      .flags = 0,
      .prio = 0,
   };

   ret = drmCommandWriteRead(dev->fd, DRM_MSM_SUBMITQUEUE_NEW, &req_queue,
                             sizeof(req_queue));
   if (ret) {
      err(1, "DRM_MSM_SUBMITQUEUE_NEW failure");
   }

   dev->queue_id = req_queue.id;

   rb_tree_init(&dev->buffers);
   util_vma_heap_init(&dev->vma, va_start, ROUND_DOWN_TO(va_size, 4096));
   u_vector_init(&dev->cmdstreams, 8, sizeof(struct cmdstream));
   u_vector_init(&dev->wrbufs, 8, sizeof(struct wrbuf));

   return dev;
}

static void
device_submit_cmdstreams(struct device *dev)
{
   if (!u_vector_length(&dev->cmdstreams)) {
      device_free_buffers(dev);
      return;
   }

   struct drm_msm_gem_submit_cmd cmds[u_vector_length(&dev->cmdstreams)];

   uint32_t idx = 0;
   struct cmdstream *cmd;
   u_vector_foreach(cmd, &dev->cmdstreams) {
      struct buffer *cmdstream_buf = device_get_buffer(dev, cmd->iova);

      uint32_t bo_idx = 0;
      rb_tree_foreach (struct buffer, buf, &dev->buffers, node) {
         if (buf == cmdstream_buf)
            break;

         bo_idx++;
      }

      if (cmdstream_buf)
         cmdstream_buf->flags = MSM_SUBMIT_BO_DUMP;

      struct drm_msm_gem_submit_cmd *submit_cmd = &cmds[idx];
      submit_cmd->type = MSM_SUBMIT_CMD_BUF;
      submit_cmd->submit_idx = dev->has_set_iova ? bo_idx : 0;
      if (dev->has_set_iova) {
         submit_cmd->submit_offset = cmd->iova - cmdstream_buf->iova;
      } else {
         submit_cmd->submit_offset = cmd->iova - dev->va_iova;
      }
      submit_cmd->size = cmd->size;
      submit_cmd->pad = 0;
      submit_cmd->nr_relocs = 0;
      submit_cmd->relocs = 0;

      idx++;
   }

   uint32_t bo_count = 0;
   rb_tree_foreach (struct buffer, buf, &dev->buffers, node) {
      if (buf)
         bo_count++;
   }

   if (!dev->has_set_iova) {
      bo_count = 1;
   }

   struct drm_msm_gem_submit_bo *bo_list =
      calloc(sizeof(struct drm_msm_gem_submit_bo), bo_count);

   if (dev->has_set_iova) {
      uint32_t bo_idx = 0;
      rb_tree_foreach (struct buffer, buf, &dev->buffers, node) {
         struct drm_msm_gem_submit_bo *submit_bo = &bo_list[bo_idx++];
         submit_bo->handle = buf->gem_handle;
         submit_bo->flags =
            buf->flags | MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE;
         submit_bo->presumed = buf->iova;

         buf->flags = 0;
      }
   } else {
      bo_list[0].handle = dev->va_id;
      bo_list[0].flags =
         MSM_SUBMIT_BO_DUMP | MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE;
      bo_list[0].presumed = dev->va_iova;
   }

   struct drm_msm_gem_submit submit_req = {
      .flags = MSM_PIPE_3D0,
      .queueid = dev->queue_id,
      .bos = (uint64_t)(uintptr_t)bo_list,
      .nr_bos = bo_count,
      .cmds = (uint64_t)(uintptr_t)cmds,
      .nr_cmds = u_vector_length(&dev->cmdstreams),
      .in_syncobjs = 0,
      .out_syncobjs = 0,
      .nr_in_syncobjs = 0,
      .nr_out_syncobjs = 0,
      .syncobj_stride = sizeof(struct drm_msm_gem_submit_syncobj),
   };

   int ret = drmCommandWriteRead(dev->fd, DRM_MSM_GEM_SUBMIT, &submit_req,
                                 sizeof(submit_req));

   if (ret) {
      err(1, "DRM_MSM_GEM_SUBMIT failure %d", ret);
   }

   /* Wait for submission to complete in order to be sure that
    * freeing buffers would free their VMAs in the kernel.
    * Makes sure that new allocations won't clash with old ones.
    */
   struct drm_msm_wait_fence wait_req = {
      .fence = submit_req.fence,
      .queueid = dev->queue_id,
   };
   get_abs_timeout(&wait_req.timeout, 1000000000);

   ret =
      drmCommandWrite(dev->fd, DRM_MSM_WAIT_FENCE, &wait_req, sizeof(wait_req));
   if (ret && (ret != -ETIMEDOUT)) {
      err(1, "DRM_MSM_WAIT_FENCE failure %d", ret);
   }

   u_vector_finish(&dev->cmdstreams);
   u_vector_init(&dev->cmdstreams, 8, sizeof(struct cmdstream));

   device_print_shader_log(dev);
   device_print_cp_log(dev);

   device_dump_wrbuf(dev);
   u_vector_finish(&dev->wrbufs);
   u_vector_init(&dev->wrbufs, 8, sizeof(struct wrbuf));

   device_free_buffers(dev);
}

static void
buffer_mem_alloc(struct device *dev, struct buffer *buf)
{
   bool success = util_vma_heap_alloc_addr(&dev->vma, buf->iova, buf->size);
   if (!success)
      errx(1, "Failed to allocate buffer");

   if (!dev->has_set_iova) {
      uint64_t offset = buf->iova - dev->va_iova;
      assert(offset < FAKE_ADDRESS_SPACE_SIZE && (offset + buf->size) <= FAKE_ADDRESS_SPACE_SIZE);
      buf->map = ((uint8_t*)dev->va_map) + offset;
      return;
   }

   {
      struct drm_msm_gem_new req = {.size = buf->size, .flags = MSM_BO_WC};

      int ret =
         drmCommandWriteRead(dev->fd, DRM_MSM_GEM_NEW, &req, sizeof(req));
      if (ret) {
         err(1, "DRM_MSM_GEM_NEW failure %d", ret);
      }

      buf->gem_handle = req.handle;
   }

   {
      struct drm_msm_gem_info req = {
         .handle = buf->gem_handle,
         .info = MSM_INFO_SET_IOVA,
         .value = buf->iova,
      };

      int ret =
         drmCommandWriteRead(dev->fd, DRM_MSM_GEM_INFO, &req, sizeof(req));

      if (ret) {
         err(1, "MSM_INFO_SET_IOVA failure %d", ret);
      }
   }

   {
      struct drm_msm_gem_info req = {
         .handle = buf->gem_handle,
         .info = MSM_INFO_GET_OFFSET,
      };

      int ret =
         drmCommandWriteRead(dev->fd, DRM_MSM_GEM_INFO, &req, sizeof(req));
      if (ret) {
         err(1, "MSM_INFO_GET_OFFSET failure %d", ret);
      }

      void *map = mmap(0, buf->size, PROT_READ | PROT_WRITE, MAP_SHARED,
                       dev->fd, req.value);
      if (map == MAP_FAILED) {
         err(1, "mmap failure");
      }

      buf->map = map;
   }
}

void
buffer_mem_free(struct device *dev, struct buffer *buf)
{
   if (dev->has_set_iova) {
      munmap(buf->map, buf->size);

      struct drm_msm_gem_info req_iova = {
         .handle = buf->gem_handle,
         .info = MSM_INFO_SET_IOVA,
         .value = 0,
      };

      int ret = drmCommandWriteRead(dev->fd, DRM_MSM_GEM_INFO, &req_iova,
                                    sizeof(req_iova));
      if (ret < 0) {
         err(1, "MSM_INFO_SET_IOVA(0) failed! %d", ret);
         return;
      }

      struct drm_gem_close req = {
         .handle = buf->gem_handle,
      };
      drmIoctl(dev->fd, DRM_IOCTL_GEM_CLOSE, &req);
   }

   util_vma_heap_free(&dev->vma, buf->iova, buf->size);
}

#elif FD_REPLAY_KGSL
/* ioctl() wrapper that transparently retries while the call fails with EINTR
 * or EAGAIN. Returns whatever the last ioctl() returned.
 */
static int
safe_ioctl(int fd, unsigned long request, void *arg)
{
   for (;;) {
      int rc = ioctl(fd, request, arg);

      if (rc != -1)
         return rc;
      if (errno != EINTR && errno != EAGAIN)
         return rc;
   }
}

/**
 * Open the KGSL device and set up the replay state.
 *
 * A single allocation of FAKE_ADDRESS_SPACE_SIZE bytes is created and mapped;
 * every replayed buffer is later carved out of it. A draw context for the
 * submissions is created as well.
 *
 * @param base_addr  not used by this backend
 * @return the new device; any failure terminates the process.
 */
static struct device *
device_create(uint64_t base_addr)
{
   struct device *dev = calloc(1, sizeof(*dev));
   if (!dev) {
      err(1, "calloc failure");
   }

   static const char path[] = "/dev/kgsl-3d0";

   dev->fd = open(path, O_RDWR | O_CLOEXEC);
   if (dev->fd < 0) {
      errx(1, "Cannot open KGSL fd!");
   }

   struct kgsl_gpumem_alloc_id req = {
      .size = FAKE_ADDRESS_SPACE_SIZE,
      .flags = KGSL_MEMFLAGS_IOCOHERENT,
   };

   int ret = safe_ioctl(dev->fd, IOCTL_KGSL_GPUMEM_ALLOC_ID, &req);
   if (ret) {
      err(1, "IOCTL_KGSL_GPUMEM_ALLOC_ID failure");
   }

   dev->va_id = req.id;
   dev->va_iova = req.gpuaddr;
   /* The allocation is mapped through the device fd, with the id shifted
    * left by 12 bits as the file offset.
    */
   dev->va_map = mmap(0, FAKE_ADDRESS_SPACE_SIZE, PROT_READ | PROT_WRITE,
                      MAP_SHARED, dev->fd, req.id << 12);
   if (dev->va_map == MAP_FAILED) {
      err(1, "mmap failure");
   }

   rb_tree_init(&dev->buffers);
   util_vma_heap_init(&dev->vma, req.gpuaddr, ROUND_DOWN_TO(FAKE_ADDRESS_SPACE_SIZE, 4096));
   u_vector_init(&dev->cmdstreams, 8, sizeof(struct cmdstream));
   u_vector_init(&dev->wrbufs, 8, sizeof(struct wrbuf));

   struct kgsl_drawctxt_create drawctxt_req = {
      .flags = KGSL_CONTEXT_SAVE_GMEM |
              KGSL_CONTEXT_NO_GMEM_ALLOC |
              KGSL_CONTEXT_PREAMBLE,
   };

   ret = safe_ioctl(dev->fd, IOCTL_KGSL_DRAWCTXT_CREATE, &drawctxt_req);
   if (ret) {
      err(1, "IOCTL_KGSL_DRAWCTXT_CREATE failure");
   }

   printf("Allocated iova %" PRIx64 "\n", dev->va_iova);

   dev->context_id = drawctxt_req.drawctxt_id;

   return dev;
}

/**
 * Submit all queued command streams as one KGSL IB list, wait for the
 * resulting timestamp (3000 passed as the timeout), print the logs, dump the
 * requested buffers and finally free every buffer.
 *
 * With no command streams queued the buffers are simply freed. Any ioctl
 * failure terminates the process.
 */
static void
device_submit_cmdstreams(struct device *dev)
{
   if (!u_vector_length(&dev->cmdstreams)) {
      device_free_buffers(dev);
      return;
   }

   struct kgsl_command_object cmds[u_vector_length(&dev->cmdstreams)];

   /* A VLA cannot have an initializer. Clear it explicitly so that any member
    * or padding not assigned below reaches the kernel as zero rather than as
    * stack garbage.
    */
   memset(cmds, 0, sizeof(cmds));

   uint32_t idx = 0;
   struct cmdstream *cmd;
   u_vector_foreach(cmd, &dev->cmdstreams) {
      struct kgsl_command_object *submit_cmd = &cmds[idx++];
      submit_cmd->gpuaddr = cmd->iova;
      submit_cmd->size = cmd->size;
      submit_cmd->flags = KGSL_CMDLIST_IB;
      submit_cmd->id = dev->va_id;
   }

   struct kgsl_gpu_command submit_req = {
      .flags = KGSL_CMDBATCH_SUBMIT_IB_LIST,
      .cmdlist = (uintptr_t) &cmds,
      .cmdsize = sizeof(struct kgsl_command_object),
      .numcmds = u_vector_length(&dev->cmdstreams),
      .numsyncs = 0,
      .context_id = dev->context_id,
   };

   int ret = safe_ioctl(dev->fd, IOCTL_KGSL_GPU_COMMAND, &submit_req);

   if (ret) {
      err(1, "IOCTL_KGSL_GPU_COMMAND failure %d", ret);
   }

   struct kgsl_device_waittimestamp_ctxtid wait = {
      .context_id = dev->context_id,
      .timestamp = submit_req.timestamp,
      .timeout = 3000,
   };

   ret = safe_ioctl(dev->fd, IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID, &wait);

   if (ret) {
      err(1, "IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID failure %d", ret);
   }

   u_vector_finish(&dev->cmdstreams);
   u_vector_init(&dev->cmdstreams, 8, sizeof(struct cmdstream));

   device_print_shader_log(dev);
   device_print_cp_log(dev);

   device_dump_wrbuf(dev);
   u_vector_finish(&dev->wrbufs);
   u_vector_init(&dev->wrbufs, 8, sizeof(struct wrbuf));

   device_free_buffers(dev);
}

/* Reserve the buffer's address range in the allocator and point buf->map at
 * the matching offset inside the pre-allocated region. Terminates the process
 * when the range cannot be reserved.
 */
static void
buffer_mem_alloc(struct device *dev, struct buffer *buf)
{
   if (!util_vma_heap_alloc_addr(&dev->vma, buf->iova, buf->size))
      errx(1, "Failed to allocate buffer");

   const uint64_t region_offset = buf->iova - dev->va_iova;

   buf->map = (uint8_t *)dev->va_map + region_offset;
}

/* Return the buffer's address range to the allocator. There is nothing else
 * to release here: buffer_mem_alloc() of this backend only points buf->map
 * into the pre-allocated region.
 */
void
buffer_mem_free(struct device *dev, struct buffer *buf)
{
   util_vma_heap_free(&dev->vma, buf->iova, buf->size);
}
#else

/* ioctl() wrapper that transparently retries while the call fails with EINTR
 * or EAGAIN. Returns whatever the last ioctl() returned.
 */
static int
safe_ioctl(int fd, unsigned long request, void *arg)
{
   for (;;) {
      int rc = ioctl(fd, request, arg);

      if (rc != -1)
         return rc;
      if (errno != EINTR && errno != EAGAIN)
         return rc;
   }
}

/* Driver-private data passed along with LX_DXCREATEALLOCATION.
 *
 * The layout is only partially understood: "unkN" members have unknown
 * meaning and the trailing "// N" notes give the value device_create() puts
 * there. The _padN arrays keep the known members at their offsets, which the
 * static asserts below pin down.
 */
struct alloc_priv_info {
   __u32 struct_size;
   char _pad0[4];
   __u32 unk0; // 1
   char _pad1[4];
   __u64 size;
   __u32 alignment;
   char _pad2[20];
   /* Compared against the requested size after the ioctl. */
   __u64 allocated_size;
   __u32 unk1;   // 1
   char _pad4[8]; /* offset: 60*/
   __u32 unk2;   // 61
   char _pad5[76];
   __u32 unk3; /* offset: 148 */ // 1
   char _pad6[8];
   __u32 unk4; /* offset: 160 */ // 1
   char _pad7[44];
   __u32 unk5; /* offset: 208 */ // 3
   char _pad8[16];
   /* size_2, size_3 and size_4 all receive the allocation size. */
   __u32 size_2; /* offset: 228 */
   __u32 unk6;   // 1
   __u32 size_3;
   __u32 size_4;
   __u32 unk7; /* offset: 244 */ // 1
   char _pad9[56];
};
static_assert(sizeof(struct alloc_priv_info) == 304);
static_assert(offsetof(struct alloc_priv_info, unk1) == 56);
static_assert(offsetof(struct alloc_priv_info, unk3) == 148);
static_assert(offsetof(struct alloc_priv_info, unk5) == 208);

/* Description of one indirect buffer inside the private submit data:
 * its length in dwords and its GPU address. Packed, 24 bytes.
 */
struct submit_priv_ib_info {
   char _pad5[4];
   __u32 size_dwords;
   __u64 iova;
   char _pad6[8];
} __attribute__((packed));

/* Driver-private data passed along with LX_DXSUBMITCOMMAND.
 *
 * Packed header followed by one data block (data0) that carries the list of
 * indirect buffers to execute. The static asserts below pin the offsets of
 * data0 and of the IB array.
 */
struct submit_priv_data {
   __u32 magic0;
   char _pad0[4];
   /* Total size in bytes, including the trailing IB array. */
   __u32 struct_size;
   char _pad1[4];
   /* It seems that priv data can have several sub-datas
    * cmdbuf is one of them, after it there is another 8 byte struct
    * without anything useful in it. That second data doesn't seem
    * important for replaying.
    */
   __u32 datas_count;
   char _pad2[32];
   struct {
      __u32 magic1;
      /* Size of this data block, including the IB array. */
      __u32 data_size;

      struct {
         __u32 unk1;
         /* Size of the cmdbuf part, including the IB array. */
         __u32 cmdbuf_size;
         char _pad3[32];
         /* Number of elements in ibs[]. */
         __u32 ib_count;
         char _pad4[36];

         struct submit_priv_ib_info ibs[];
      } cmdbuf;
   } data0;

   //    unsigned char magic2[8];
} __attribute__((packed));
static_assert(offsetof(struct submit_priv_data, data0) == 0x34);
static_assert(offsetof(struct submit_priv_data, data0.cmdbuf.ibs) == 0x8c);

/**
 * Open /dev/dxg and set up the replay state for the WSL backend.
 *
 * Steps: pick the first enumerated adapter, create a device, a context and a
 * paging queue on it, create one allocation of FAKE_ADDRESS_SPACE_SIZE bytes,
 * map it into the GPU address space starting the search at base_addr, make it
 * resident and lock it for CPU access. Every replayed buffer is later carved
 * out of this allocation. The top page of the range is reserved for the
 * completion fence and the small IB that writes it.
 *
 * @param base_addr  base GPU address passed to LX_DXMAPGPUVIRTUALADDRESS
 * @return the new device; any failure terminates the process.
 */
static struct device *
device_create(uint64_t base_addr)
{
   struct device *dev = calloc(sizeof(struct device), 1);

   static const char path[] = "/dev/dxg";

   dev->fd = open(path, O_RDWR | O_CLOEXEC);
   if (dev->fd < 0) {
      errx(1, "Cannot open /dev/dxg fd");
   }

   /* Only room for a single adapter is provided; the first one is used. */
   struct d3dkmt_adapterinfo adapters[1];
   struct d3dkmt_enumadapters3 enum_adapters = {
      .adapter_count = 1,
      .adapters = adapters,
   };
   int ret = safe_ioctl(dev->fd, LX_DXENUMADAPTERS3, &enum_adapters);
   if (ret) {
      errx(1, "LX_DXENUMADAPTERS3 failure");
   }

   if (enum_adapters.adapter_count == 0) {
      errx(1, "No adapters found");
   }

   struct winluid adapter_luid = enum_adapters.adapters[0].adapter_luid;

   struct d3dkmt_openadapterfromluid open_adapter = {
      .adapter_luid = adapter_luid,
   };
   ret = safe_ioctl(dev->fd, LX_DXOPENADAPTERFROMLUID, &open_adapter);
   if (ret) {
      errx(1, "LX_DXOPENADAPTERFROMLUID failure");
   }

   struct d3dkmthandle adapter = open_adapter.adapter_handle;

   struct d3dkmt_createdevice create_device = {
      .adapter = adapter,
   };
   ret = safe_ioctl(dev->fd, LX_DXCREATEDEVICE, &create_device);
   if (ret) {
      errx(1, "LX_DXCREATEDEVICE failure");
   }

   struct d3dkmthandle device = create_device.device;
   dev->device = device;

   /* Opaque driver-private blob for context creation; its contents are not
    * interpreted anywhere in this file.
    */
   unsigned char create_context_priv_data[] = {
      0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00,
      0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1c, 0x0c, 0x00, 0x00, 0x00,
      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   };

   struct d3dkmt_createcontextvirtual create_context = {
      .device = device,
      .node_ordinal = 0,
      .engine_affinity = 1,
      .priv_drv_data = create_context_priv_data,
      .priv_drv_data_size = sizeof(create_context_priv_data),
      .client_hint = 16,
   };
   ret = safe_ioctl(dev->fd, LX_DXCREATECONTEXTVIRTUAL, &create_context);
   if (ret) {
      errx(1, "LX_DXCREATECONTEXTVIRTUAL failure");
   }

   dev->context = create_context.context;

   struct d3dkmt_createpagingqueue create_paging_queue = {
      .device = device,
      .priority = _D3DDDI_PAGINGQUEUE_PRIORITY_NORMAL,
      .physical_adapter_index = 0,
   };
   ret = safe_ioctl(dev->fd, LX_DXCREATEPAGINGQUEUE, &create_paging_queue);
   if (ret) {
      errx(1, "LX_DXCREATEPAGINGQUEUE failure");
   }
   struct d3dkmthandle paging_queue = create_paging_queue.paging_queue;


   /* Create the one allocation that stands in for the GPU address space. */
   uint32_t alloc_size = FAKE_ADDRESS_SPACE_SIZE;
   struct alloc_priv_info priv_alloc_info = {
      .struct_size = sizeof(struct alloc_priv_info),
      .unk0 = 1,
      .size = alloc_size,
      .alignment = 4096,
      .unk1 = 1,
      .unk2 = 61,
      .unk3 = 1,
      .unk4 = 1,
      .unk5 = 3,
      .size_2 = alloc_size,
      .unk6 = 1,
      .size_3 = alloc_size,
      .size_4 = alloc_size,
      .unk7 = 1,
   };

   struct d3dddi_allocationinfo2 alloc_info = {
      .priv_drv_data = &priv_alloc_info,
      .priv_drv_data_size = sizeof(struct alloc_priv_info),
   };

   struct d3dkmt_createallocation create_allocation = {
      .device = device,
      .alloc_count = 1,
      .allocation_info = &alloc_info,
   };
   ret = safe_ioctl(dev->fd, LX_DXCREATEALLOCATION, &create_allocation);
   if (ret) {
      errx(1, "LX_DXCREATEALLOCATION failure");
   }

   /* allocated_size is expected to have been filled in by the ioctl. */
   assert(priv_alloc_info.allocated_size == alloc_size);

   struct d3dddi_mapgpuvirtualaddress map_virtual_address = {
      .paging_queue = paging_queue,
      .base_address = base_addr,
      .maximum_address = 18446744073709551615ull,
      .allocation = create_allocation.allocation_info[0].allocation,
      .size_in_pages = MAX2(alloc_size / 4096, 1),
      .protection = {
         .write = 1,
         .execute = 1,
      },
   };
   ret = safe_ioctl(dev->fd, LX_DXMAPGPUVIRTUALADDRESS, &map_virtual_address);
   /* NOTE(review): 259 is the only return value accepted here and for
    * LX_DXMAKERESIDENT below; presumably it is STATUS_PENDING (0x103) -
    * confirm against the dxgkrnl documentation.
    */
   if (ret != 259) {
      errx(1, "LX_DXMAPGPUVIRTUALADDRESS failure");
   }

   __u32 priority = 0;
   struct d3dddi_makeresident make_resident = {
      .paging_queue = paging_queue,
      .alloc_count = 1,
      .allocation_list = &create_allocation.allocation_info[0].allocation,
      .priority_list = &priority,
   };
   ret = safe_ioctl(dev->fd, LX_DXMAKERESIDENT, &make_resident);
   if (ret != 259) {
      errx(1, "LX_DXMAKERESIDENT failure");
   }

   /* Lock the allocation to obtain a CPU pointer to it. */
   struct d3dkmt_lock2 lock = {
      .device = device,
      .allocation = create_allocation.allocation_info[0].allocation,
   };
   ret = safe_ioctl(dev->fd, LX_DXLOCK2, &lock);
   if (ret) {
      errx(1, "LX_DXLOCK2 failure");
   }

   dev->va_iova = map_virtual_address.virtual_address;
   dev->va_map = lock.data;

   rb_tree_init(&dev->buffers);
   util_vma_heap_init(&dev->vma, dev->va_iova, ROUND_DOWN_TO(alloc_size, 4096));
   u_vector_init(&dev->cmdstreams, 8, sizeof(struct cmdstream));
   u_vector_init(&dev->wrbufs, 8, sizeof(struct wrbuf));

   printf("Allocated iova at 0x%" PRIx64 "\n", dev->va_iova);

   /* Reserve one page from the top of the range for the fence: the fence
    * value sits at the start of the page and the IB writing it 8 bytes later.
    */
   uint64_t hole_size = 4096;
   dev->vma.alloc_high = true;
   dev->fence_iova = util_vma_heap_alloc(&dev->vma, hole_size, 4096);
   dev->fence_ib_iova = dev->fence_iova + 8;
   dev->fence = (uint32_t *) ((uint8_t*)dev->va_map + (dev->fence_iova - dev->va_iova));
   dev->fence_ib = (uint32_t *) ((uint8_t*)dev->va_map + (dev->fence_ib_iova - dev->va_iova));
   dev->vma.alloc_high = false;

   return dev;
}

/**
 * Submit all queued command streams through LX_DXSUBMITCOMMAND, wait for
 * completion, print the logs, dump the requested buffers and finally free
 * every buffer.
 *
 * With no command streams queued the buffers are simply freed.
 *
 * Completion is detected with a fence of our own: one extra IB is appended
 * that makes the GPU write a marker to *dev->fence, which is polled for about
 * a second. A failed submission or a fence that never changes terminates the
 * process.
 */
static void
device_submit_cmdstreams(struct device *dev)
{
   if (!u_vector_length(&dev->cmdstreams)) {
      device_free_buffers(dev);
      return;
   }

   /* One extra slot for the fence IB. */
   uint32_t cmdstream_count = u_vector_length(&dev->cmdstreams) + 1;

   uint32_t priv_data_size =
      sizeof(struct submit_priv_data) +
      cmdstream_count * sizeof(struct submit_priv_ib_info);

   struct submit_priv_data *priv_data = calloc(1, priv_data_size);
   priv_data->magic0 = 0xccaabbee;
   priv_data->struct_size = priv_data_size;
   priv_data->datas_count = 1;

   /* Every nested size field includes the trailing IB array. */
   priv_data->data0.magic1 = 0xfadcab02;
   priv_data->data0.data_size =
      sizeof(priv_data->data0) +
      cmdstream_count * sizeof(struct submit_priv_ib_info);
   priv_data->data0.cmdbuf.unk1 = 0xcccc0001;
   priv_data->data0.cmdbuf.cmdbuf_size = sizeof(priv_data->data0.cmdbuf) +
      cmdstream_count * sizeof(struct submit_priv_ib_info);
   priv_data->data0.cmdbuf.ib_count = cmdstream_count;

   struct cmdstream *cmd;
   uint32_t idx = 0;
   u_vector_foreach(cmd, &dev->cmdstreams) {
      priv_data->data0.cmdbuf.ibs[idx].size_dwords = cmd->size / 4;
      priv_data->data0.cmdbuf.ibs[idx].iova = cmd->iova;
      idx++;
   }

   /* Last IB: the 4-dword fence write built below. */
   priv_data->data0.cmdbuf.ibs[idx].size_dwords = 4;
   priv_data->data0.cmdbuf.ibs[idx].iova = dev->fence_ib_iova;

   /* Reset the fence, then build the packet that sets it to a non-zero
    * marker: header, 64-bit destination address (low, high), value.
    */
   *dev->fence = 0x00000000;
   dev->fence_ib[0] = pm4_pkt7_hdr(0x3d, 3); // CP_MEM_WRITE
   dev->fence_ib[1] = dev->fence_iova;
   dev->fence_ib[2] = dev->fence_iova >> 32;
   dev->fence_ib[3] = 0xababfcfc;

   // Fill second (empty) data block
   // uint32_t *magic_end = (uint32_t *)(((char *) priv_data) + priv_data_size - 8);
   // magic_end[0] = 0xfadcab00;
   // magic_end[1] = 0x00000008;

   /* command_buffer/command_length describe the first IB only; the complete
    * list travels in the private data.
    */
   struct d3dkmt_submitcommand submission = {
      .command_buffer = priv_data->data0.cmdbuf.ibs[0].iova,
      .command_length = priv_data->data0.cmdbuf.ibs[0].size_dwords * sizeof(uint32_t),
      .broadcast_context_count = 1,
      .broadcast_context[0] = dev->context,
      .priv_drv_data_size = priv_data_size,
      .priv_drv_data = priv_data,
   };

   int ret = safe_ioctl(dev->fd, LX_DXSUBMITCOMMAND, &submission);
   if (ret) {
      errx(1, "LX_DXSUBMITCOMMAND failure");
   }

   free(priv_data);

   u_vector_finish(&dev->cmdstreams);
   u_vector_init(&dev->cmdstreams, 8, sizeof(struct cmdstream));

   // TODO: better way to wait
   /* Poll the fence: up to 1000 iterations with a 1 ms sleep each. */
   for (unsigned i = 0; i < 1000; i++) {
      usleep(1000);
      if (*dev->fence != 0)
         break;
   }
   if (*dev->fence == 0) {
      errx(1, "Waiting for submission failed! GPU faulted or kernel did not execute this submission.");
   }

   device_print_shader_log(dev);
   device_print_cp_log(dev);

   device_dump_wrbuf(dev);
   u_vector_finish(&dev->wrbufs);
   u_vector_init(&dev->wrbufs, 8, sizeof(struct wrbuf));

   device_free_buffers(dev);
}

/* Reserve the buffer's [iova, iova + size) range in the device VMA heap and
 * point buf->map at the matching location inside the device's va_map
 * mapping (va_map corresponds to va_iova).
 *
 * Terminates the process through errx() when the range cannot be reserved.
 */
static void
buffer_mem_alloc(struct device *dev, struct buffer *buf)
{
   if (!util_vma_heap_alloc_addr(&dev->vma, buf->iova, buf->size))
      errx(1, "Failed to allocate buffer");

   uint8_t *va_base = (uint8_t *)dev->va_map;

   buf->map = va_base + (buf->iova - dev->va_iova);
}

/* Give the buffer's [iova, iova + size) range back to the device VMA heap.
 * Only the address range is released here; buf itself and buf->map are left
 * untouched.
 */
void
buffer_mem_free(struct device *dev, struct buffer *buf)
{
   util_vma_heap_free(&dev->vma,
                      buf->iova,
                      buf->size);
}

#endif

/* Copy `size` bytes from `hostptr` into the device buffer found at `iova`.
 *
 * If device_get_buffer() knows no buffer for `iova`, a new descriptor is
 * created, inserted into dev->buffers and backed with memory. If a buffer is
 * found but its size differs, its backing memory is released and allocated
 * again with the new size. In every case the buffer is flagged as used.
 *
 * `hostptr` is only read from; the caller keeps ownership of it.
 *
 * Terminates the process through errx() if the descriptor cannot be
 * allocated.
 */
static void
upload_buffer(struct device *dev, uint64_t iova, unsigned int size,
              void *hostptr)
{
   struct buffer *buf = device_get_buffer(dev, iova);

   if (!buf) {
      buf = calloc(1, sizeof(struct buffer));
      if (!buf)
         errx(1, "Failed to allocate buffer descriptor");

      buf->iova = iova;
      buf->size = size;

      rb_tree_insert(&dev->buffers, &buf->node, rb_buffer_insert_cmp);

      buffer_mem_alloc(dev, buf);
   } else if (buf->size != size) {
      /* Same address, different size: re-create the backing memory. */
      buffer_mem_free(dev, buf);
      buf->size = size;
      buffer_mem_alloc(dev, buf);
   }

   memcpy(buf->map, hostptr, size);

   buf->used = true;
}

/* Replace a recorded cmdstream with one produced by an external generator.
 *
 * The generator command `cmdstreamgen` is run through system() with
 * --vastart/--vasize describing the largest free range of the device VMA
 * heap, plus the path of a temporary .rd file it is expected to write.
 * That file is then parsed:
 *  - RD_GPUADDR + RD_BUFFER_CONTENTS pairs are uploaded with upload_buffer(),
 *  - RD_CMDSTREAM_ADDR fills in `cs` (iova and size in bytes),
 *  - RD_SHADER_LOG_BUFFER / RD_CP_LOG_BUFFER set the device's log iovas,
 *  - RD_WRBUFFER registers a buffer in dev->wrbufs and, when its clear flag
 *    is set, pre-fills the range with a 0xdeadbeef pattern.
 *
 * `cmdstreamgen` is interpolated into a shell command line unescaped, so it
 * must come from a trusted source.
 *
 * Returns a negative value when the command line does not fit, the generator
 * fails, the file cannot be opened, or an RD_WRBUFFER section is malformed;
 * otherwise returns the parser's final ps.ret (negative for a corrupt file).
 * On a negative return `cs` may not have been filled in.
 */
static int
override_cmdstream(struct device *dev, struct cmdstream *cs,
                   const char *cmdstreamgen)
{
#if FD_REPLAY_KGSL
   static const char *tmpfilename = "/sdcard/Download/cmdstream_override.rd";
#elif FD_REPLAY_MSM || FD_REPLAY_WSL
   static const char *tmpfilename = "/tmp/cmdstream_override.rd";
#endif


   /* Find a free space for the new cmdstreams and resources we will use
    * when overriding existing cmdstream. The range is only probed here: it
    * is released right away and merely advertised to the generator.
    */
   uint64_t hole_size = util_vma_heap_get_max_free_continuous_size(&dev->vma);
   uint64_t hole_iova = util_vma_heap_alloc(&dev->vma, hole_size, 1);
   util_vma_heap_free(&dev->vma, hole_iova, hole_size);

   char cmd[2048];
   int cmd_len = snprintf(cmd, sizeof(cmd),
                          "%s --vastart=%" PRIu64 " --vasize=%" PRIu64 " %s",
                          cmdstreamgen, hole_iova, hole_size, tmpfilename);
   /* Never hand a silently truncated command line to the shell. */
   if (cmd_len < 0 || (size_t)cmd_len >= sizeof(cmd)) {
      fprintf(stderr, "Command line for %s is too long\n", cmdstreamgen);
      return -1;
   }

   printf("generating cmdstream '%s'\n", cmd);

   int ret = system(cmd);
   if (ret) {
      fprintf(stderr, "Error executing %s\n", cmd);
      return -1;
   }

   struct io *io;
   struct rd_parsed_section ps = {0};

   io = io_open(tmpfilename);
   if (!io) {
      fprintf(stderr, "could not open: %s\n", tmpfilename);
      return -1;
   }

   /* Address/length announced by the most recent RD_GPUADDR section; the
    * following RD_BUFFER_CONTENTS section carries the data for it.
    */
   struct {
      unsigned int len;
      uint64_t gpuaddr;
   } gpuaddr = {0};

   /* Set when a section is malformed; stops parsing and forces a -1 return. */
   bool failed = false;

   while (!failed && parse_rd_section(io, &ps)) {
      switch (ps.type) {
      case RD_GPUADDR:
         parse_addr(ps.buf, ps.sz, &gpuaddr.len, &gpuaddr.gpuaddr);
         break;
      case RD_BUFFER_CONTENTS:
         /* upload_buffer() copies the data and does not keep ps.buf, so the
          * section buffer is left in ps exactly as handle_file() does.
          */
         upload_buffer(dev, gpuaddr.gpuaddr, gpuaddr.len, ps.buf);
         break;
      case RD_CMDSTREAM_ADDR: {
         unsigned int sizedwords;
         uint64_t cs_iova;
         parse_addr(ps.buf, ps.sz, &sizedwords, &cs_iova);
         printf("override cmdstream: %u dwords\n", sizedwords);

         cs->iova = cs_iova;
         cs->size = sizedwords * sizeof(uint32_t);
         break;
      }
      case RD_SHADER_LOG_BUFFER: {
         unsigned int sizedwords;
         parse_addr(ps.buf, ps.sz, &sizedwords, &dev->shader_log_iova);
         break;
      }
      case RD_CP_LOG_BUFFER: {
         unsigned int sizedwords;
         parse_addr(ps.buf, ps.sz, &sizedwords, &dev->cp_log_iova);
         break;
      }
      case RD_WRBUFFER: {
         /* Payload: three uint64_t (iova, size, clear flag), then the name. */
         const size_t hdr_sz = 3 * sizeof(uint64_t);
         if ((size_t)ps.sz < hdr_sz) {
            fprintf(stderr, "truncated RD_WRBUFFER section in %s\n",
                    tmpfilename);
            failed = true;
            break;
         }

         uint64_t *p = (uint64_t *)ps.buf;
         uint64_t wr_iova = p[0];
         uint64_t wr_size = p[1];
         bool clear = p[2];

         /* Resolve the target before registering anything, so that a bad
          * section does not leave a half-valid entry in dev->wrbufs.
          */
         struct buffer *buf = NULL;
         if (clear) {
            buf = device_get_buffer(dev, wr_iova);
            if (!buf) {
               fprintf(stderr,
                       "RD_WRBUFFER at 0x%" PRIx64 " has no backing buffer\n",
                       wr_iova);
               failed = true;
               break;
            }
         }

         /* One extra zeroed byte keeps the name NUL-terminated even if the
          * file omitted the terminator.
          */
         size_t name_len = (size_t)ps.sz - hdr_sz;
         char *name = calloc(name_len + 1, sizeof(char));
         if (!name) {
            fprintf(stderr, "out of memory reading RD_WRBUFFER name\n");
            failed = true;
            break;
         }
         memcpy(name, (char*)(p + 3), name_len);

         struct wrbuf *wrbuf = u_vector_add(&dev->wrbufs);
         wrbuf->iova = wr_iova;
         wrbuf->size = wr_size;
         wrbuf->name = name;

         if (clear) {
            /* Fill the part of the range that lies inside buf with the
             * marker pattern, 8 bytes at a time (the tail may be shorter).
             */
            uint64_t offset = wrbuf->iova - buf->iova;
            uint64_t end = MIN2(offset + wrbuf->size, buf->size);
            while (offset < end) {
               static const uint64_t clear_value = 0xdeadbeefdeadbeef;
               memcpy(buf->map + offset, &clear_value,
                      MIN2(sizeof(clear_value), end - offset));
               offset += sizeof(clear_value);
            }
         }

         break;
      }
      default:
         break;
      }
   }

   io_close(io);

   if (failed)
      return -1;

   if (ps.ret < 0) {
      fprintf(stderr, "corrupt file %s\n", tmpfilename);
   }

   return ps.ret;
}

/* Replay the command streams recorded in one .rd file.
 *
 * `filename` is opened with io_open(), or read from fd 0 when it is "-".
 * Sections are processed in file order:
 *  - RD_CMD decides whether the following submits are skipped: with
 *    `exename` set, every command not starting with it is skipped; otherwise
 *    a few well-known processes are skipped.
 *  - RD_GPUADDR first flushes any cmdstreams queued so far, then records the
 *    address/length for the next RD_BUFFER_CONTENTS, which is uploaded.
 *  - RD_CMDSTREAM_ADDR queues the cmdstream if its index lies within
 *    [first_submit, last_submit] and it is not skipped. The submit whose
 *    index equals `submit_to_override` is replaced by the output of
 *    `cmdstreamgen` (see override_cmdstream()).
 *
 * `base_addr` is handed to device_create().
 *
 * Returns -1 if the file cannot be opened or the cmdstream override fails,
 * 0 otherwise. A corrupt file is only reported on stdout and still yields 0.
 */
static int
handle_file(const char *filename, uint32_t first_submit, uint32_t last_submit,
            uint32_t submit_to_override, uint64_t base_addr, const char *cmdstreamgen)
{
   struct io *io;
   int submit = 0;
   int result = 0;
   bool skip = false;
   bool need_submit = false;
   struct rd_parsed_section ps = {0};

   printf("Reading %s...\n", filename);

   if (!strcmp(filename, "-"))
      io = io_openfd(0);
   else
      io = io_open(filename);

   if (!io) {
      fprintf(stderr, "could not open: %s\n", filename);
      return -1;
   }

   struct device *dev = device_create(base_addr);

   /* Address/length announced by the most recent RD_GPUADDR section. */
   struct {
      unsigned int len;
      uint64_t gpuaddr;
   } gpuaddr = {0};

   while (parse_rd_section(io, &ps)) {
      switch (ps.type) {
      case RD_TEST:
      case RD_VERT_SHADER:
      case RD_FRAG_SHADER:
         /* no-op */
         break;
      case RD_CMD:
         skip = false;
         if (exename) {
            skip |= (strstr(ps.buf, exename) != ps.buf);
         } else {
            skip |= (strstr(ps.buf, "fdperf") == ps.buf);
            skip |= (strstr(ps.buf, "chrome") == ps.buf);
            skip |= (strstr(ps.buf, "surfaceflinger") == ps.buf);
            skip |= ((char *)ps.buf)[0] == 'X';
         }
         break;

      case RD_GPUADDR:
         /* A new buffer list starts: flush what has been queued first. */
         if (need_submit) {
            need_submit = false;
            device_submit_cmdstreams(dev);
         }

         parse_addr(ps.buf, ps.sz, &gpuaddr.len, &gpuaddr.gpuaddr);
         break;
      case RD_BUFFER_CONTENTS:
         /* TODO: skip buffer uploading and even reading if this buffer
          * is used for submit outside of [first_submit, last_submit]
          * range. A set of buffers is shared between several cmdstreams,
          * so we'd have to find starting from which RD_CMD to upload
          * the buffers.
          */
         upload_buffer(dev, gpuaddr.gpuaddr, gpuaddr.len, ps.buf);
         break;
      case RD_CMDSTREAM_ADDR: {
         unsigned int sizedwords;
         uint64_t cs_iova;
         parse_addr(ps.buf, ps.sz, &sizedwords, &cs_iova);

         bool add_submit = !skip && (submit >= first_submit) && (submit <= last_submit);
         printf("%scmdstream %d: %u dwords\n", add_submit ? "" : "skipped ",
                submit, sizedwords);

         if (add_submit) {
            struct cmdstream *cs = u_vector_add(&dev->cmdstreams);

            if (submit == submit_to_override) {
               if (override_cmdstream(dev, cs, cmdstreamgen) < 0) {
                  /* cs may not have been filled in, so the queued entry
                   * must never reach the GPU: abandon the replay instead
                   * of carrying on with it.
                   */
                  fprintf(stderr, "failed to override cmdstream %d\n",
                          submit);
                  result = -1;
                  goto out;
               }
            } else {
               cs->iova = cs_iova;
               cs->size = sizedwords * sizeof(uint32_t);
            }
         }

         need_submit = true;

         submit++;
         break;
      }
      case RD_GPU_ID: {
         uint32_t gpu_id = parse_gpu_id(ps.buf);
         if (gpu_id)
            printf("gpuid: %d\n", gpu_id);
         break;
      }
      case RD_CHIP_ID: {
         uint64_t chip_id = parse_chip_id(ps.buf);
         printf("chip_id: 0x%" PRIx64 "\n", chip_id);
         break;
      }
      default:
         break;
      }
   }

   /* Flush whatever is still queued once the file ends. */
   if (need_submit)
      device_submit_cmdstreams(dev);

out:
   close(dev->fd);

   io_close(io);
   fflush(stdout);

   if (result == 0 && ps.ret < 0) {
      printf("corrupt file\n");
   }
   return result;
}
