/*
 * Copyright © 2020 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#ifndef _U_TRACE_H
#define _U_TRACE_H

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#include "util/macros.h"
#include "util/u_atomic.h"
#include "util/u_queue.h"

#ifdef __cplusplus
extern "C" {
#endif

/* A trace mechanism (very) loosely inspired by the linux kernel tracepoint
 * mechanism, in that it allows for defining driver specific (or common)
 * tracepoints, which generate 'trace_$name()' functions that can be
 * called at various points in commandstream emit.
 *
 * Currently a printf backend is implemented, but the expectation is to
 * also implement a perfetto backend for shipping out traces to a tool like
 * AGI.
 *
 * Notable differences:
 *
 *  - GPU timestamps!  A driver provided callback is used to emit timestamps
 *    to a buffer.  At a later point in time (when stalling to wait for the
 *    GPU is not required), the timestamps are re-united with the trace
 *    payload.  This makes the trace mechanism suitable for profiling.
 *
 *  - Instead of a systemwide trace ringbuffer, buffering of un-retired
 *    tracepoints is split into two stages.  Traces are emitted to a
 *    'u_trace' instance, and at a later time flushed to a 'u_trace_context'
 *    instance.  This avoids the requirement that commandstream containing
 *    tracepoints is emitted in the same order as it is generated.
 *
 *    If the hw has multiple parallel "engines" (for example, 3d/blit/compute)
 *    then a `u_trace_context` per-engine should be used.
 *
 *  - Unlike kernel tracepoints, u_trace tracepoints are defined in py
 *    from which header and src files are generated.  Since we already have
 *    a build dependency on python+mako, this gives more flexibility than
 *    clunky preprocessor macro magic.
 *
 */

struct u_trace_context;
struct u_trace;
struct u_trace_chunk;
struct u_trace_printer;

/**
 * Special reserved value to indicate that no timestamp was captured,
 * and that the timestamp of the previous trace should be reused.
 */
#define U_TRACE_NO_TIMESTAMP ((uint64_t) 0)

/**
 * Address representation
 */
struct u_trace_address {
   /**
    * Pointer to a buffer object
    */
   void *bo;
   /**
    * Offset inside the buffer object or address of bo is NULL
    */
   uint64_t offset;
};

/**
 * Driver provided callback to create a buffer which will be read by
 * u_trace_read_ts function.
 */
typedef void *(*u_trace_create_buffer)(struct u_trace_context *utctx,
                                       uint64_t size_B);

/**
 * Driver provided callback to delete a buffer.
 */
typedef void (*u_trace_delete_buffer)(struct u_trace_context *utctx,
                                      void *buffer);

/**
 * Driver provided callback to emit commands into the soecified command
 * stream to capture a 64b timestamp into the specified timestamps buffer,
 * at the specified index.
 *
 * The hw counter that the driver records should be something that runs at
 * a fixed rate, even as the GPU freq changes.  The same source used for
 * GL_TIMESTAMP queries should be appropriate.
 */
typedef void (*u_trace_record_ts)(struct u_trace *ut,
                                  void *cs,
                                  void *timestamps,
                                  uint64_t offset_B,
                                  uint32_t flags);

/**
 * Driver provided callback to capture indirect data.
 */
typedef void (*u_trace_capture_data)(struct u_trace *ut,
                                     void *cs,
                                     void *dst_buffer,
                                     uint64_t dst_offset_B,
                                     void *src_buffer,
                                     uint64_t src_offset_B,
                                     uint32_t size_B);

/**
 * Driver provided callback to read back previously recorded indirect data.
 */
typedef const void *(*u_trace_get_data)(struct u_trace_context *utctx,
                                        void *buffer,
                                        uint64_t offset_B,
                                        uint32_t size_B);
/**
 * Driver provided callback to read back a previously recorded timestamp.
 * If necessary, this should block until the GPU has finished writing back
 * the timestamps.  (The timestamps will be read back in order, so it is
 * safe to only synchronize on idx==0.)
 *
 * flush_data is data provided by the driver via u_trace_flush.
 *
 * The returned timestamp should be in units of nanoseconds.  The same
 * timebase as GL_TIMESTAMP queries should be used.
 *
 * The driver can return the special U_TRACE_NO_TIMESTAMP value to indicate
 * that no timestamp was captured and the timestamp from the previous trace
 * will be re-used.  (The first trace in the u_trace buf may not do this.)
 * This allows the driver to detect cases where multiple tracepoints are
 * emitted with no other intervening cmdstream, to avoid pointlessly
 * capturing the same timestamp multiple times in a row.
 */
typedef uint64_t (*u_trace_read_ts)(struct u_trace_context *utctx,
                                    void *timestamps,
                                    uint64_t offset_B,
                                    void *flush_data);

/**
 * Driver provided callback to create a buffer which will be read by
 * u_trace_read_ts function.
 */
typedef void *(*u_trace_copy_data)(struct u_trace *ut,
                                   void *cs,
                                   void *dst,
                                   uint64_t dst_offset_B,
                                   void *src,
                                   uint64_t src_offset_B,
                                   uint64_t size_B);

/**
 * Driver provided callback to delete flush data.
 */
typedef void (*u_trace_delete_flush_data)(struct u_trace_context *utctx,
                                          void *flush_data);

enum u_trace_type {
   U_TRACE_TYPE_PRINT = 1u << 0,
   U_TRACE_TYPE_JSON = 1u << 1,
   U_TRACE_TYPE_PERFETTO_ACTIVE = 1u << 2,
   U_TRACE_TYPE_PERFETTO_ENV = 1u << 3,
   U_TRACE_TYPE_MARKERS = 1u << 4,
   U_TRACE_TYPE_INDIRECTS = 1u << 5,
   U_TRACE_TYPE_CSV = 1u << 6,

   U_TRACE_TYPE_PRINT_CSV = U_TRACE_TYPE_PRINT | U_TRACE_TYPE_CSV,
   U_TRACE_TYPE_PRINT_JSON = U_TRACE_TYPE_PRINT | U_TRACE_TYPE_JSON,
   U_TRACE_TYPE_PERFETTO =
      U_TRACE_TYPE_PERFETTO_ACTIVE | U_TRACE_TYPE_PERFETTO_ENV,

   /*
    * A mask of traces that require appending to the tracepoint chunk list.
    */
   U_TRACE_TYPE_REQUIRE_QUEUING = U_TRACE_TYPE_PRINT | U_TRACE_TYPE_PERFETTO,
   /*
    * A mask of traces that require processing the tracepoint chunk list.
    */
   U_TRACE_TYPE_REQUIRE_PROCESSING =
      U_TRACE_TYPE_PRINT | U_TRACE_TYPE_PERFETTO_ACTIVE,
};

/**
 * The trace context provides tracking for "in-flight" traces, once the
 * cmdstream that records timestamps has been flushed.
 */
struct u_trace_context {
   /* All traces enabled in this context */
   enum u_trace_type enabled_traces;

   void *pctx;

   u_trace_create_buffer create_buffer;
   u_trace_delete_buffer delete_buffer;
   u_trace_capture_data capture_data;
   u_trace_get_data get_data;
   u_trace_record_ts record_timestamp;
   u_trace_read_ts read_timestamp;
   u_trace_delete_flush_data delete_flush_data;

   uint64_t timestamp_size_bytes;
   uint64_t max_indirect_size_bytes;

   FILE *out;
   struct u_trace_printer *out_printer;

   /* Once u_trace_flush() is called u_trace_chunk's are queued up to
    * render tracepoints on a queue.  The per-chunk queue jobs block until
    * timestamps are available.
    */
   struct util_queue queue;

#ifdef HAVE_PERFETTO
   /* node in global list of trace contexts. */
   struct list_head node;
#endif

   /* State to accumulate time across N chunks associated with a single
    * batch (u_trace).
    */
   uint64_t last_time_ns;
   uint64_t first_time_ns;

   uint32_t frame_nr;
   uint32_t batch_nr;
   uint32_t event_nr;
   bool start_of_frame;

   void *dummy_indirect_data;

   /* list of unprocessed trace chunks in fifo order: */
   struct list_head flushed_trace_chunks;
};

/**
 * The u_trace ptr is passed as the first arg to generated tracepoints.
 * It provides buffering for tracepoint payload until the corresponding
 * driver cmdstream containing the emitted commands to capture is
 * flushed.
 *
 * Individual tracepoints emitted to u_trace are expected to be "executed"
 * (ie. timestamp captured) in FIFO order with respect to other tracepoints
 * emitted to the same u_trace.  But the order WRT other u_trace instances
 * is undefined util u_trace_flush().
 */
struct u_trace {
   struct u_trace_context *utctx;

   uint32_t num_traces;

   struct list_head
      trace_chunks; /* list of unflushed trace chunks in fifo order */
};

void u_trace_context_init(struct u_trace_context *utctx,
                          void *pctx,
                          uint32_t timestamp_size_bytes,
                          uint32_t max_indirect_size_bytes,
                          u_trace_create_buffer create_buffer,
                          u_trace_delete_buffer delete_buffer,
                          u_trace_record_ts record_timestamp,
                          u_trace_read_ts read_timestamp,
                          u_trace_capture_data capture_data,
                          u_trace_get_data get_data,
                          u_trace_delete_flush_data delete_flush_data);
void u_trace_context_fini(struct u_trace_context *utctx);

/**
 * Flush (trigger processing) of traces previously flushed to the
 * trace-context by u_trace_flush().
 *
 * This should typically be called in the driver's pctx->flush().
 */
void u_trace_context_process(struct u_trace_context *utctx, bool eof);

void u_trace_init(struct u_trace *ut, struct u_trace_context *utctx);
void u_trace_fini(struct u_trace *ut);

void u_trace_state_init(void);
bool u_trace_is_enabled(enum u_trace_type type);

bool u_trace_has_points(struct u_trace *ut);

struct u_trace_iterator {
   struct u_trace *ut;
   struct u_trace_chunk *chunk;
   uint32_t event_idx;
};

struct u_trace_iterator u_trace_begin_iterator(struct u_trace *ut);

struct u_trace_iterator u_trace_end_iterator(struct u_trace *ut);

bool u_trace_iterator_equal(struct u_trace_iterator a,
                            struct u_trace_iterator b);

typedef void (*u_trace_copy_buffer)(struct u_trace_context *utctx,
                                    void *cmdstream,
                                    void *ts_from,
                                    uint64_t from_offset,
                                    void *ts_to,
                                    uint64_t to_offset,
                                    uint64_t size_B);

/**
 * Clones tracepoints range into target u_trace.
 * Provides callback for driver to copy timestamps on GPU from
 * one buffer to another.
 *
 * It allows:
 * - Tracing re-usable command buffer in Vulkan, by copying tracepoints
 *   each time it is submitted.
 * - Per-tile tracing for tiling GPUs, by copying a range of tracepoints
 *   corresponding to a tile.
 */
void u_trace_clone_append(struct u_trace_iterator begin_it,
                          struct u_trace_iterator end_it,
                          struct u_trace *into,
                          void *cmdstream,
                          u_trace_copy_buffer copy_buffer);

void u_trace_disable_event_range(struct u_trace_iterator begin_it,
                                 struct u_trace_iterator end_it);

#define U_TRACE_FRAME_UNKNOWN -1
/**
 * Flush traces to the parent trace-context.  At this point, the expectation
 * is that all the tracepoints are "executed" by the GPU following any
 * previously flushed u_trace batch.
 *
 * flush_data is a way for driver to pass additional data, which becomes
 * available only at the point of flush, to the u_trace_read_ts callback and
 * perfetto. The typical example of such data would be a fence to wait on in
 * u_trace_read_ts, and a submission_id to pass into perfetto. The destruction
 * of the data is done via u_trace_delete_flush_data.
 *
 * This should typically be called when the corresponding cmdstream
 * (containing the timestamp reads) is flushed to the kernel.
 */
void u_trace_flush(struct u_trace *ut,
                   void *flush_data,
                   uint32_t frame_nr,
                   bool free_data);

#ifdef HAVE_PERFETTO
static ALWAYS_INLINE bool
u_trace_perfetto_active(struct u_trace_context *utctx)
{
   return p_atomic_read_relaxed(&utctx->enabled_traces) &
          U_TRACE_TYPE_PERFETTO_ACTIVE;
}

void u_trace_perfetto_start(void);
void u_trace_perfetto_stop(void);
#else
static ALWAYS_INLINE bool
u_trace_perfetto_active(UNUSED struct u_trace_context *utctx)
{
   return false;
}
#endif

/**
 * Return whether utrace is enabled at all or not, this can be used to
 * gate any expensive traces.
 */
static ALWAYS_INLINE bool
u_trace_enabled(struct u_trace_context *utctx)
{
   return p_atomic_read_relaxed(&utctx->enabled_traces) != 0;
}

/**
 * Return whether chunks should be processed or not.
 */
static ALWAYS_INLINE bool
u_trace_should_process(struct u_trace_context *utctx)
{
   return p_atomic_read_relaxed(&utctx->enabled_traces) &
          U_TRACE_TYPE_REQUIRE_PROCESSING;
}

/**
 * Return whether to emit markers into the command stream even if the queue
 * isn't active.
 */
static ALWAYS_INLINE bool
u_trace_markers_enabled(struct u_trace_context *utctx)
{
   return p_atomic_read_relaxed(&utctx->enabled_traces) &
          U_TRACE_TYPE_MARKERS;
}

#ifdef __cplusplus
}
#endif

#endif /* _U_TRACE_H */
