// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#pragma once

#include <stddef.h>
#include <stdint.h>

#include <xnnpack.h>
#include <xnnpack/common.h>
#include <xnnpack/cache.h>
#include <xnnpack/node-type.h>

#if defined(EMSCRIPTEN)
#include <emscripten/emscripten.h>
#elif XNN_PLATFORM_WINDOWS
#include <windows.h>
#else
#include <time.h>
#endif

#define XNN_MAX_INPUTS 4
#define XNN_MAX_OUTPUTS 4

#define XNN_MAX_RUNTIME_INPUTS 4
#define XNN_MAX_RUNTIME_OUTPUTS 4

#define XNN_INVALID_NODE_ID UINT32_MAX

#define XNN_MAX_OPERATOR_OBJECTS 4

/// Disable fusion of nodes in subgraph. Fusion is enabled by default, set this flag to turn it off.
#define XNN_FLAG_NO_OPERATOR_FUSION 0x80000000

#ifdef __cplusplus
extern "C" {
#endif

struct xnn_shape {
  size_t num_dims;
  size_t dim[XNN_MAX_TENSOR_DIMS];
};

enum xnn_value_type {
  xnn_value_type_invalid = 0,
  xnn_value_type_dense_tensor = 1,
};

enum xnn_layout_type {
  xnn_layout_type_nhwc = 0,
  xnn_layout_type_nchw = 1,
};

/// Abstraction for a collections of elements produced and consumed by nodes.
struct xnn_value {
  /// Unique ID for the value.
  uint32_t id;
  /// Type of the collection of elements.
  ///
  /// Currently only dense tensors are supported.
  /// Other types (e.g. sparse tensors) might be supported in the future.
  enum xnn_value_type type;
  /// Type of elements in the collection.
  enum xnn_datatype datatype;
  /// Per-value quantization parameters.
  struct {
    /// Offset from zero of the quantized elements.
    int32_t zero_point;
    union {
      /// Multiplication factor to convert quantized elements to real representation.
      float scale;
      struct {
        /// Per-channel multiplication factor to convert quantized elements to real representation.
        const float* channelwise_scale;
        /// Index of the channel dimension with per-channel quantization parameters.
        size_t channel_dimension;
      };
    };
  } quantization;
  /// Tensor shape.
  struct xnn_shape shape;
  /// Binary features of the tensor. Supported values are any combination of:
  /// - XNN_VALUE_FLAG_EXTERNAL_INPUT
  /// - XNN_VALUE_FLAG_EXTERNAL_OUTPUT
  uint32_t flags;
  /// Static initialization data. Must be null for non-static values.
  const void* data;
  /// Index of the Subgraph node that produced the value, or XNN_INVALID_NODE_ID is the Value is an external input.
  uint32_t producer;
  /// Index of the first Node that consume the value, or XNN_INVALID_NODE_ID if the Value has no consumers within the
  /// graph (e.g. Value is an external output).
  uint32_t first_consumer;
  /// Number of Nodes that consume the value.
  /// If multiple inputs in a Node refer to this Value as input, the Node is counted as consumer multiple times.
  /// If the Value is an external output, it counts as having an extra consumer.
  uint32_t num_consumers;
  uint32_t num_nchw_compatible_consumers;
  enum xnn_layout_type layout;
  /// Set during analysis in xnn_subgraph_rewrite_for_fp16.
  /// Indicates that this value should be converted to FP16.
  bool fp16_compatible;
  /// Set during analysis in xnn_subgraph_rewrite_for_fp16.
  /// Indicates Value ID of the FP16 variant of this Value.
  uint32_t fp16_id;
  /// Set during analysis in xnn_subgraph_rewrite_for_fp16.
  /// Indicates Value ID of the FP32 variant of this Value.
  uint32_t fp32_id;
};


XNN_INLINE bool xnn_value_is_external(const struct xnn_value* value) {
  return (value->flags & (XNN_VALUE_FLAG_EXTERNAL_INPUT | XNN_VALUE_FLAG_EXTERNAL_OUTPUT)) != 0;
}

XNN_INLINE bool xnn_value_is_external_output(const struct xnn_value* value) {
  return (value->flags & XNN_VALUE_FLAG_EXTERNAL_OUTPUT) != 0;
}

XNN_INLINE bool xnn_value_is_external_input(const struct xnn_value* value) {
  return (value->flags & XNN_VALUE_FLAG_EXTERNAL_INPUT) != 0;
}

enum xnn_allocation_type {
  xnn_allocation_type_invalid = 0,
  /// Static data that is provided by caller, needs to outlive the xnn_runtime.
  xnn_allocation_type_static,
  /// Lives in XNNPACK-managed internal workspace.
  xnn_allocation_type_workspace,
  /// Non-static data that is external to the runtime, provided by caller, specified in xnn_setup_runtime.
  xnn_allocation_type_external,
};

struct xnn_blob {
  /// Size in bytes.
  size_t size;
  /// Data pointer.
  void* data;
  enum xnn_allocation_type allocation_type;
};

struct xnn_node;
struct xnn_operator_data;

typedef enum xnn_status (*xnn_create_operator_fn)(
  const struct xnn_node* node,
  const struct xnn_value* values,
  size_t num_values,
  struct xnn_operator_data* opdata,
  const struct xnn_caches* caches);

typedef enum xnn_status (*xnn_setup_operator_fn)(
  const struct xnn_operator_data* opdata,
  const struct xnn_blob* blobs,
  size_t num_blobs,
  pthreadpool_t threadpool);

enum xnn_compute_type {
  xnn_compute_type_invalid = 0,
  xnn_compute_type_fp32,
  xnn_compute_type_fp16,
  xnn_compute_type_qc8,
  xnn_compute_type_qs8,
  xnn_compute_type_qu8,
  xnn_compute_type_fp32_to_fp16,
  xnn_compute_type_fp32_to_qs8,
  xnn_compute_type_fp32_to_qu8,
  xnn_compute_type_fp16_to_fp32,
  xnn_compute_type_qs8_to_fp32,
  xnn_compute_type_qu8_to_fp32,
};

struct xnn_node {
  enum xnn_node_type type;
  uint32_t id;
  enum xnn_compute_type compute_type;
  /// Static parameters of the operator node.
  union {
    struct {
      uint32_t input_padding_top;
      uint32_t input_padding_right;
      uint32_t input_padding_bottom;
      uint32_t input_padding_left;
      uint32_t kernel_height;
      uint32_t kernel_width;
      uint32_t subsampling_height;
      uint32_t subsampling_width;
      uint32_t dilation_height;
      uint32_t dilation_width;
      uint32_t groups;
      size_t group_input_channels;
      size_t group_output_channels;
    } convolution_2d;
    struct {
      uint32_t padding_top;
      uint32_t padding_right;
      uint32_t padding_bottom;
      uint32_t padding_left;
      uint32_t adjustment_height;
      uint32_t adjustment_width;
      uint32_t kernel_height;
      uint32_t kernel_width;
      uint32_t upsampling_height;
      uint32_t upsampling_width;
      uint32_t dilation_height;
      uint32_t dilation_width;
      uint32_t groups;
      size_t group_input_channels;
      size_t group_output_channels;
    } deconvolution_2d;
    struct {
      uint32_t input_padding_top;
      uint32_t input_padding_right;
      uint32_t input_padding_bottom;
      uint32_t input_padding_left;
      uint32_t kernel_height;
      uint32_t kernel_width;
      uint32_t subsampling_height;
      uint32_t subsampling_width;
      uint32_t dilation_height;
      uint32_t dilation_width;
      uint32_t depth_multiplier;
      size_t input_channels;
    } depthwise_convolution_2d;
    struct {
      uint32_t block_size;
    } depth_to_space;
    struct {
      uint32_t padding_top;
      uint32_t padding_right;
      uint32_t padding_bottom;
      uint32_t padding_left;
      uint32_t pooling_height;
      uint32_t pooling_width;
      uint32_t stride_height;
      uint32_t stride_width;
      uint32_t dilation_height;
      uint32_t dilation_width;
    } pooling_2d;
    struct {
      float alpha;
    } elu;
    struct {
      float negative_slope;
    } leaky_relu;
    struct {
      size_t pre_paddings[XNN_MAX_TENSOR_DIMS];
      size_t post_paddings[XNN_MAX_TENSOR_DIMS];
      uint32_t padding_value;
    } static_pad;
    struct {
      struct xnn_shape new_shape;
    } static_reshape;
    struct {
      size_t new_height;
      size_t new_width;
    } static_resize;
    struct {
      size_t axis;
    } concatenate;
    struct {
      size_t axis;
    } even_split;
    struct {
      size_t perm[XNN_MAX_TENSOR_DIMS];
      size_t num_dims;
    } transpose;
  } params;
  struct {
    float output_min;
    float output_max;
  } activation;
  /// Value IDs for node inputs.
  uint32_t inputs[XNN_MAX_INPUTS];
  uint32_t num_inputs;
  /// Value IDs for node outputs.
  uint32_t outputs[XNN_MAX_OUTPUTS];
  uint32_t num_outputs;
  uint32_t flags;
  uint32_t layout_flags;
  uint32_t cluster_leader;
  // Number of filter parameters in all 1x1 Convolutions of the sparse cluster.
  // This value is properly initialized only in sparse inference analysis of 1x1 Convolutions.
  size_t num_params;
  // Number of zero filter parameters in all 1x1 Convolutions of the sparse cluster.
  // This value is properly initialized only in sparse inference analysis of 1x1 Convolutions.
  size_t num_zeroes;
  // Factory function to create an operator object from the node.
  xnn_create_operator_fn create;
  // Function to setup an operator using opdata.
  xnn_setup_operator_fn setup;
};

#ifdef __MACH__
typedef uint64_t xnn_timestamp;
#elif __EMSCRIPTEN__
typedef double xnn_timestamp;
#elif XNN_PLATFORM_WINDOWS
typedef LARGE_INTEGER xnn_timestamp;
#else
typedef struct timespec xnn_timestamp;
#endif

struct xnn_operator_data {
  xnn_operator_t operator_objects[XNN_MAX_OPERATOR_OBJECTS];
  xnn_setup_operator_fn setup;
  size_t batch_size;
  size_t input_height;
  size_t input_width;
  size_t output_height;
  size_t output_width;
  struct xnn_shape shape1;
  struct xnn_shape shape2;
  size_t pre_paddings[XNN_MAX_TENSOR_DIMS];
  size_t post_paddings[XNN_MAX_TENSOR_DIMS];
  uint32_t adjustment_height;
  uint32_t adjustment_width;
  uint32_t inputs[XNN_MAX_RUNTIME_INPUTS];
  uint32_t outputs[XNN_MAX_RUNTIME_OUTPUTS];
  xnn_timestamp end_ts[XNN_MAX_OPERATOR_OBJECTS];
};

struct xnn_subgraph {
  /// Number of Value IDs reserved for communication with external graph representation.
  /// Values created during subgraph transformation avoid using IDs in [0, reserved_value_ids-1] range.
  uint32_t external_value_ids;

  uint32_t num_reserved_values;
  uint32_t num_values;
  struct xnn_value* values;

  uint32_t num_reserved_nodes;
  uint32_t num_nodes;
  struct xnn_node* nodes;
};

/// Runtime is a combination of an execution plan for subgraph Nodes and a memory manager for subgraph Values.
struct xnn_runtime {
  uint32_t num_external_values;

  /// List of operators in the execution plan, in execution order.
  struct xnn_operator_data* opdata;
  /// Number of operators in the execution plan.
  size_t num_ops;

  struct xnn_blob* blobs;
  size_t num_blobs;

  struct xnn_workspace* workspace;
  struct xnn_runtime* next_workspace_user;

#if XNN_PLATFORM_JIT
  struct xnn_code_cache code_cache;
#endif // XNN_PLATFORM_JIT

  pthreadpool_t threadpool;

  bool profiling;
  // The start timestamp of the first operator in the subgraph. This is set when profiling is true.
  xnn_timestamp start_ts;
};

struct xnn_value* xnn_subgraph_new_internal_value(xnn_subgraph_t subgraph);

struct xnn_node* xnn_subgraph_new_node(xnn_subgraph_t subgraph);

void xnn_subgraph_add_nodes(xnn_subgraph_t subgraph, size_t num_nodes);

size_t xnn_tensor_get_size(
  xnn_subgraph_t subgraph,
  uint32_t value_id);

// Product of all shape dimensions
size_t xnn_shape_multiply_all_dims(
  const struct xnn_shape shape[1]);

// Product of all shape dimensions, except for the specified number of the last dimensions
size_t xnn_shape_multiply_batch_dims(
  const struct xnn_shape shape[1], size_t num_nonbatch_dims);

// Product of all shape dimensions, except for the last (channel) one
size_t xnn_shape_multiply_non_channel_dims(
  const struct xnn_shape shape[1]);

enum xnn_status xnn_subgraph_optimize(xnn_subgraph_t subgraph, uint32_t flags);

void xnn_subgraph_rewrite_for_nchw(xnn_subgraph_t subgraph);
// Rewrites subgraph for FP16, returns true if success, false if rewrite failed.
bool xnn_subgraph_rewrite_for_fp16(xnn_subgraph_t subgraph);

void xnn_node_clear(struct xnn_node* node);
void xnn_value_clear(struct xnn_value* value);

void xnn_value_copy(struct xnn_value* dst_value, const struct xnn_value* src_value);

void xnn_init_convert_node(
  struct xnn_node* node,
  enum xnn_compute_type compute_type,
  uint32_t input_id,
  uint32_t output_id,
  uint32_t flags);

struct xnn_workspace {
  void* data;
  size_t size;
  struct xnn_runtime* first_user;
  // Workspace will be destroyed in xnn_delete_runtime or xnn_delete_workspace if num_users reaches 0.
  size_t ref_count;
};

#ifdef __cplusplus
}  // extern "C"
#endif
