/*
 * Copyright © 2021 Igalia S.L.
 * SPDX-License-Identifier: MIT
 */

#ifndef TU_AUTOTUNE_H
#define TU_AUTOTUNE_H

#include "tu_common.h"

#include "util/hash_table.h"
#include "util/rwlock.h"

#include "tu_suballoc.h"

struct tu_renderpass_history;

/**
 * "autotune" our decisions about bypass vs GMEM rendering, based on historical
 * data about a given render target.
 *
 * In deciding which path to take there are tradeoffs, including some that
 * are not reasonably estimable without having some additional information:
 *
 *  (1) If you know you are touching every pixel (ie. there is a clear),
 *      then the GMEM path will at least not cost more memory bandwidth than
 *      sysmem[1]
 *
 *  (2) If there is no clear, GMEM could potentially cost *more* bandwidth
 *      if there is sysmem->GMEM restore pass.
 *
 *  (3) If you see a high draw count, that is an indication that there will be
 *      enough pixels accessed multiple times to benefit from the reduced
 *      memory bandwidth that GMEM brings
 *
 *  (4) But high draw count where there is not much overdraw can actually be
 *      faster in bypass mode if it is pushing a lot of state change, due to
 *      not having to go thru the state changes per-tile[1]
 *
 * The approach taken is to measure the samples-passed for the batch to estimate
 * the amount of overdraw to detect cases where the number of pixels touched is
 * low.
 *
 * [1] ignoring early-tile-exit optimizations, but any draw that touches all/
 *     most of the tiles late in the tile-pass can defeat that
 */
struct tu_autotune {

   /* We may have to disable autotuner if there are too many
    * renderpasses in-flight.
    */
   bool enabled;

   /* Device that owns this autotuner instance. */
   struct tu_device *device;

   /**
    * Cache to map renderpass key to historical information about
    * rendering to that particular render target.
    */
   struct hash_table *ht;
   /* Protects `ht` (named for it) against concurrent access. */
   struct u_rwlock ht_lock;

   /**
    * List of per-renderpass results (tu_renderpass_result::node) that we
    * are waiting for the GPU to finish with before reading back the
    * results.
    */
   struct list_head pending_results;

   /**
    * List of per-submission data that we may want to free after we
    * processed submission results.
    * This could happen after command buffers which were in the submission
    * are destroyed.
    */
   struct list_head pending_submission_data;

   /**
    * List of per-submission data that has been finished and can be reused.
    */
   struct list_head submission_data_pool;

   /* Source of fence values attached to results
    * (see tu_renderpass_result::fence). */
   uint32_t fence_counter;
   /* NOTE(review): presumably hands out unique indices for submissions or
    * results — confirm against tu_autotune.c. */
   uint32_t idx_counter;
};

/**
 * From the cmdstream, the captured samples-passed values are recorded
 * at the start and end of the batch.
 *
 * Note that we do the math (samples_end - samples_start) on the CPU to
 * avoid a WFI.  But pre-emption may force us to revisit that.
 *
 * Layout is GPU-visible and PACKED; do not reorder or resize fields
 * without revisiting the alignment requirements below.
 */
struct PACKED tu_renderpass_samples {
   uint64_t samples_start;
   /* hw requires the sample start/stop locations to be 128b aligned. */
   uint64_t __pad0;
   uint64_t samples_end;
   uint64_t __pad1;
};

/* Necessary when writing sample counts using CP_EVENT_WRITE7::ZPASS_DONE. */
static_assert(offsetof(struct tu_renderpass_samples, samples_end) == 16);

/**
 * Tracks the results from an individual renderpass. Initially created
 * per renderpass, and appended to the tail of at->pending_results. At a later
 * time, when the GPU has finished writing the results, we fill samples_passed.
 */
struct tu_renderpass_result {
   /* Points into GPU memory (presumably the suballocation in `bo` below —
    * confirm against tu_autotune.c). */
   struct tu_renderpass_samples *samples;

   /* GPU buffer suballocation backing the sample counters. */
   struct tu_suballoc_bo bo;

   /*
    * Below here, only used internally within autotune
    */

   /* Key identifying the render target (see tu_autotune::ht). */
   uint64_t rp_key;
   /* Historical data for rp_key; presumably owned by the hash-table entry —
    * confirm against tu_autotune.c. */
   struct tu_renderpass_history *history;
   /* Link in tu_autotune::pending_results while awaiting the GPU. */
   struct list_head node;
   /* Fence value that must be reached before `samples` is read back
    * (see tu_autotune::fence_counter). */
   uint32_t fence;
   /* CPU-computed result, filled once the GPU has written `samples`. */
   uint64_t samples_passed;
};

/* Initialize/teardown of the per-device autotuner state. */
VkResult tu_autotune_init(struct tu_autotune *at, struct tu_device *dev);
void tu_autotune_fini(struct tu_autotune *at, struct tu_device *dev);

/**
 * A magic 8-ball that tells the gmem code whether we should do bypass mode
 * for moar fps.
 *
 * On return, *autotune_result presumably points to the tracking entry that
 * will receive this renderpass's sample counts — confirm against the
 * definition.
 */
bool tu_autotune_use_bypass(struct tu_autotune *at,
                            struct tu_cmd_buffer *cmd_buffer,
                            struct tu_renderpass_result **autotune_result);
/* Free a list of tu_renderpass_result entries (linked via their `node`). */
void tu_autotune_free_results(struct tu_device *dev, struct list_head *results);

/* Whether this set of command buffers carries autotune results that need a
 * fence at submit time — NOTE(review): inferred from the name; confirm
 * against the definition. */
bool tu_autotune_submit_requires_fence(struct tu_cmd_buffer **cmd_buffers,
                                       uint32_t cmd_buffer_count);

/**
 * Hook called when command buffers are submitted; returns an additional
 * command stream to include in the submission (presumably for fencing the
 * pending results — confirm against the definition).
 */
struct tu_cs *tu_autotune_on_submit(struct tu_device *dev,
                                    struct tu_autotune *at,
                                    struct tu_cmd_buffer **cmd_buffers,
                                    uint32_t cmd_buffer_count);

struct tu_autotune_results_buffer;

/* Emit the cmdstream that records the batch's starting sample count
 * (tu_renderpass_samples::samples_start). */
template <chip CHIP>
void tu_autotune_begin_renderpass(struct tu_cmd_buffer *cmd,
                                  struct tu_cs *cs,
                                  struct tu_renderpass_result *autotune_result);

/* Emit the cmdstream that records the batch's ending sample count
 * (tu_renderpass_samples::samples_end). */
template <chip CHIP>
void tu_autotune_end_renderpass(struct tu_cmd_buffer *cmd,
                                struct tu_cs *cs,
                                struct tu_renderpass_result *autotune_result);

#endif /* TU_AUTOTUNE_H */
