// Copyright 2022 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#pragma once

#include <stddef.h>
#include <stdint.h>

#include <xnnpack/common.h>


// Default: serves to differentiate pointer types for micro-kernels without fused activation.

// Parameter block for F16 micro-kernels without fused activation.
// It carries no data: the single one-byte member exists only so that the
// union is non-empty, which lets it act as a distinct pointer type.
union xnn_f16_default_params {
  char _; // Dummy member variable to comply with the C standard
};

// Parameter block for F32 micro-kernels without fused activation.
// On non-x86 builds it holds only the placeholder byte; on x86/x86-64 it
// additionally exposes an `avx` variant.
union xnn_f32_default_params {
  char _; // Dummy member variable to comply with the C standard
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // AVX variant: a table of 14 32-bit integers.
  // NOTE(review): presumably a mask table for handling partial (remainder)
  // vectors in AVX kernels - confirm against the kernels and init code.
  struct {
    int32_t mask_table[14];
  } avx;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
};


// ReLU: serves to differentiate pointer types for micro-kernels with fused ReLU activation.

// Parameter block for F32 micro-kernels with fused ReLU activation.
// It carries no data: the single one-byte member only keeps the union
// non-empty so it can serve as a distinct pointer type.
union xnn_f32_relu_params {
  char _; // Dummy member variable to comply with the C standard
};


// Scale+Min+Max: used by AVGPOOL/GAVGPOOL microkernels.

// Scale + clamp (min/max) parameters for F16 micro-kernels.
// Each architecture-specific variant is compiled in only when the matching
// XNN_ARCH_* macro is set; otherwise only the placeholder byte remains.
union xnn_f16_scaleminmax_params {
  char _; // Dummy member variable to comply with the C standard
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // NEON variant: one 16-bit value per parameter.
  // NOTE(review): presumably raw IEEE half-precision bit patterns - confirm.
  struct {
    uint16_t scale;
    uint16_t min;
    uint16_t max;
  } neon;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // AVX variant: each parameter is stored as 8 single-precision floats
  // (not 16-bit values), declared with XNN_ALIGN(32).
  struct {
    XNN_ALIGN(32) float scale[8];
    XNN_ALIGN(32) float min[8];
    XNN_ALIGN(32) float max[8];
  } avx;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
};

// Scale + clamp (min/max) parameters for F32 micro-kernels.
// The `scalar` variant is always available; the `sse` variant exists only
// on x86/x86-64 builds.
union xnn_f32_scaleminmax_params {
  // Portable variant: one float per parameter.
  struct {
    float scale;
    float min;
    float max;
  } scalar;
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // SSE variant: each parameter is stored as 4 floats, declared with
  // XNN_ALIGN(16).
  struct {
    XNN_ALIGN(16) float scale[4];
    XNN_ALIGN(16) float min[4];
    XNN_ALIGN(16) float max[4];
  } sse;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
};


// Min+Max: used by VCLAMP and GEMM/IGEMM/DWCONV/MAXPOOL/etc with MINMAX activation.

// Clamp (min/max) parameters for BF16 micro-kernels.
// Only a scalar variant is declared, and its bounds are stored as
// single-precision floats rather than 16-bit values.
union xnn_bf16_minmax_params {
  struct {
    float min;
    float max;
  } scalar;
};

// Clamp (min/max) parameters for F16 micro-kernels.
// There is no portable variant: without ARM or x86 support the union holds
// only the placeholder byte.
union xnn_f16_minmax_params {
  char _; // Dummy member variable to comply with the C standard
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // NEON variant: one 16-bit value per bound.
  // NOTE(review): presumably raw IEEE half-precision bit patterns - confirm.
  struct {
    uint16_t min;
    uint16_t max;
  } neon;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // AVX variant: each bound is stored as 8 single-precision floats,
  // declared with XNN_ALIGN(32).
  struct {
    XNN_ALIGN(32) float min[8];
    XNN_ALIGN(32) float max[8];
  } avx;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
};

// Clamp (min/max) parameters for F32 micro-kernels.
// `scalar` is always available; the remaining variants are compiled in per
// target architecture and differ only in how many copies of each bound they
// hold and in the declared alignment.
union xnn_f32_minmax_params {
  // Portable variant: one float per bound.
  struct {
    float min;
    float max;
  } scalar;
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // SSE variant: 4 floats per bound, XNN_ALIGN(16).
  struct {
    XNN_ALIGN(16) float min[4];
    XNN_ALIGN(16) float max[4];
  } sse;
  // AVX variant: 8 floats per bound, XNN_ALIGN(32), followed by a table of
  // 14 32-bit integers.
  // NOTE(review): mask_table presumably supports masked handling of partial
  // vectors - confirm against the AVX kernels.
  struct {
    XNN_ALIGN(32) float min[8];
    XNN_ALIGN(32) float max[8];
    int32_t mask_table[14];
  } avx;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  // WAsm SIMD variant: 2 floats per bound, XNN_ALIGN(8).
  struct {
    XNN_ALIGN(8) float min[2];
    XNN_ALIGN(8) float max[2];
  } wasmsimd;
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
};

// Clamp (min/max) parameters for signed 8-bit micro-kernels.
// `scalar` is always available; other variants are compiled in per target
// architecture.
union xnn_s8_minmax_params {
  // Portable variant: bounds widened to 32-bit signed integers.
  struct {
    int32_t min;
    int32_t max;
  } scalar;
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // SSE2 variant: 16 unsigned bytes each for a bias and for the biased
  // bounds, XNN_ALIGN(16).
  // NOTE(review): presumably the bias maps signed values into the unsigned
  // range so unsigned byte min/max instructions can be used - confirm.
  struct {
    XNN_ALIGN(16) uint8_t bias[16];
    XNN_ALIGN(16) uint8_t min_with_bias[16];
    XNN_ALIGN(16) uint8_t max_with_bias[16];
  } sse2;
  // SSE4 variant: 16 signed bytes per bound, XNN_ALIGN(16).
  struct {
    XNN_ALIGN(16) int8_t min[16];
    XNN_ALIGN(16) int8_t max[16];
  } sse4;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // NEON variant: a single signed byte per bound.
  struct {
    int8_t min;
    int8_t max;
  } neon;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  // WAsm SIMD variant: 8 signed bytes per bound, XNN_ALIGN(8).
  struct {
    XNN_ALIGN(8) int8_t min[8];
    XNN_ALIGN(8) int8_t max[8];
  } wasmsimd;
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
};

// Clamp (min/max) parameters for unsigned 8-bit micro-kernels.
// `scalar` is always available; other variants are compiled in per target
// architecture.
union xnn_u8_minmax_params {
  // Portable variant: bounds widened to 32-bit unsigned integers.
  struct {
    uint32_t min;
    uint32_t max;
  } scalar;
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // SSE2 variant: 16 unsigned bytes per bound, XNN_ALIGN(16).
  struct {
    XNN_ALIGN(16) uint8_t min[16];
    XNN_ALIGN(16) uint8_t max[16];
  } sse2;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // NEON variant: a single unsigned byte per bound.
  struct {
    uint8_t min;
    uint8_t max;
  } neon;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  // WAsm SIMD variant: 8 unsigned bytes per bound, XNN_ALIGN(8).
  struct {
    XNN_ALIGN(8) uint8_t min[8];
    XNN_ALIGN(8) uint8_t max[8];
  } wasmsimd;
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
};


// Conv w. Min+Max: used by quantized GEMM/IGEMM/DWCONV microkernels with MINMAX activation.

// Requantization + clamp parameters for QC8 convolution-style micro-kernels
// with MINMAX activation.
// Unlike xnn_qs8_conv_minmax_params, none of the variants here carries a
// `scale` field.
// NOTE(review): presumably because QC8 scales are per-channel and supplied
// outside this block - confirm against the kernels.
// NOTE(review): the variant prefixes/suffixes (fp32, fmagic, imagic, lrintf)
// presumably name the requantization/rounding strategy each kernel family
// uses - confirm against the parameter-initialization code.
union xnn_qc8_conv_minmax_params {
  // Scalar variant with clamp bounds stored as integers (magic_min/magic_max).
  struct {
    float magic_bias;
    int32_t magic_min;
    int32_t magic_max;
    int32_t magic_bias_less_zero_point;
  } fp32_scalar_imagic;
  // Scalar variant with clamp bounds stored as floats relative to the
  // output zero point.
  struct {
    float output_min_less_zero_point;
    float output_max_less_zero_point;
    float magic_bias;
    int32_t magic_bias_less_output_zero_point;
  } fp32_scalar_fmagic;
  // Scalar variant that stores the output zero point directly instead of a
  // magic bias.
  struct {
    float output_min_less_zero_point;
    float output_max_less_zero_point;
    int32_t output_zero_point;
  } fp32_scalar_lrintf;
#if XNN_ARCH_ARM
  // 32-bit ARM only: clamp bounds are held in 32-bit unsigned words.
  // NOTE(review): presumably several byte lanes packed per word - confirm.
  struct {
    float magic_bias;
    int32_t magic_bias_less_zero_point;
    uint32_t output_min;
    uint32_t output_max;
  } fp32_armsimd32;
#endif  // XNN_ARCH_ARM
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // NEON variant using a magic bias; clamp bounds are single signed bytes.
  struct {
    float magic_bias;
    int32_t magic_bias_less_output_zero_point;
    int8_t output_min;
    int8_t output_max;
  } fp32_neon;
  // NEONv8 variant storing a 16-bit output zero point.
  // NOTE(review): output_min/output_max are uint8_t here, whereas fp32_neon
  // above and xnn_qs8_conv_minmax_params.fp32_neonv8 declare them as int8_t.
  // The size is the same, but the signedness looks inconsistent for a signed
  // 8-bit kernel family - verify against the kernels before changing.
  struct {
    int16_t output_zero_point;
    uint8_t output_min;
    uint8_t output_max;
  } fp32_neonv8;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // SSE2 variant: XNN_ALIGN(16) arrays; output_min is held as 8 x int16_t.
  struct {
    XNN_ALIGN(16) float output_max_less_zero_point[4];
    XNN_ALIGN(16) int16_t output_zero_point[8];
    XNN_ALIGN(16) int16_t output_min[8];
  } fp32_sse2;
  // SSE4 variant: same as SSE2 except output_min is held as 16 x int8_t.
  struct {
    XNN_ALIGN(16) float output_max_less_zero_point[4];
    XNN_ALIGN(16) int16_t output_zero_point[8];
    XNN_ALIGN(16) int8_t output_min[16];
  } fp32_sse4;
  // AVX2 variant: the SSE4 layout widened to 32-byte arrays, XNN_ALIGN(32).
  struct {
    XNN_ALIGN(32) float output_max_less_zero_point[8];
    XNN_ALIGN(32) int16_t output_zero_point[16];
    XNN_ALIGN(32) int8_t output_min[32];
  } fp32_avx2;
  // AVX512 variant: the SSE4 layout widened to 64-byte arrays, XNN_ALIGN(64).
  struct {
    XNN_ALIGN(64) float output_max_less_zero_point[16];
    XNN_ALIGN(64) int16_t output_zero_point[32];
    XNN_ALIGN(64) int8_t output_min[64];
  } fp32_avx512;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  // WAsm SIMD variant: 8-byte arrays, XNN_ALIGN(8). The lower bound is an
  // int32 magic_min while the upper bound is an int8 output_max.
  struct {
    XNN_ALIGN(8) float magic_bias[2];
    XNN_ALIGN(8) int32_t magic_min[2];
    XNN_ALIGN(8) int32_t magic_bias_less_output_zero_point[2];
    XNN_ALIGN(8) int8_t output_max[8];
  } fp32_wasmsimd;
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
};

// Requantization + clamp parameters for QS8 convolution-style micro-kernels
// with MINMAX activation.
// Every fp32_* variant carries a `scale`; the rndnu_neon variant instead
// carries an integer multiplier with pre/post shifts.
// NOTE(review): the variant prefixes/suffixes (fp32, rndnu, fmagic, imagic,
// lrintf) presumably name the requantization/rounding strategy each kernel
// family uses - confirm against the parameter-initialization code.
union xnn_qs8_conv_minmax_params {
  // Scalar variant with clamp bounds stored as floats relative to the
  // output zero point.
  struct {
    float scale;
    float output_min_less_zero_point;
    float output_max_less_zero_point;
    float magic_bias;
    int32_t magic_bias_less_output_zero_point;
  } fp32_scalar_fmagic;
  // Scalar variant with clamp bounds stored as integers (magic_min/magic_max).
  struct {
    float scale;
    float magic_bias;
    int32_t magic_min;
    int32_t magic_max;
    int32_t magic_bias_less_zero_point;
  } fp32_scalar_imagic;
  // Scalar variant that stores the output zero point directly instead of a
  // magic bias.
  struct {
    float scale;
    float output_min_less_zero_point;
    float output_max_less_zero_point;
    int32_t output_zero_point;
  } fp32_scalar_lrintf;
#if XNN_ARCH_ARM
  // 32-bit ARM only: clamp bounds are held in 32-bit unsigned words.
  // NOTE(review): presumably several byte lanes packed per word - confirm.
  struct {
    float scale;
    float magic_bias;
    int32_t magic_bias_less_zero_point;
    uint32_t output_min;
    uint32_t output_max;
  } fp32_armsimd32;
#endif  // XNN_ARCH_ARM
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // NEON variant using a magic bias; clamp bounds are single signed bytes.
  struct {
    float scale;
    float magic_bias;
    int32_t magic_bias_less_output_zero_point;
    int8_t output_min;
    int8_t output_max;
  } fp32_neon;
  // NEONv8 variant storing a 16-bit output zero point.
  struct {
    float scale;
    int16_t output_zero_point;
    int8_t output_min;
    int8_t output_max;
  } fp32_neonv8;
  // Fixed-point NEON variant: shift / multiply / shift instead of a float
  // scale.
  struct {
    int32_t right_pre_shift;
    int32_t multiplier;
    int32_t right_post_shift;
    int16_t output_zero_point;
    int8_t output_min;
    int8_t output_max;
  } rndnu_neon;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // SSE2 variant: XNN_ALIGN(16) arrays; output_min is held as 8 x int16_t.
  struct {
    XNN_ALIGN(16) float scale[4];
    XNN_ALIGN(16) float output_max_less_zero_point[4];
    XNN_ALIGN(16) int16_t output_zero_point[8];
    XNN_ALIGN(16) int16_t output_min[8];
  } fp32_sse2;
  // SSE4 variant: same as SSE2 except output_min is held as 16 x int8_t.
  struct {
    XNN_ALIGN(16) float scale[4];
    XNN_ALIGN(16) float output_max_less_zero_point[4];
    XNN_ALIGN(16) int16_t output_zero_point[8];
    XNN_ALIGN(16) int8_t output_min[16];
  } fp32_sse4;
  // AVX2 variant: the SSE4 layout widened to 32-byte arrays, XNN_ALIGN(32).
  struct {
    XNN_ALIGN(32) float scale[8];
    XNN_ALIGN(32) float output_max_less_zero_point[8];
    XNN_ALIGN(32) int16_t output_zero_point[16];
    XNN_ALIGN(32) int8_t output_min[32];
  } fp32_avx2;
  // AVX512 variant: the SSE4 layout widened to 64-byte arrays, XNN_ALIGN(64).
  struct {
    XNN_ALIGN(64) float scale[16];
    XNN_ALIGN(64) float output_max_less_zero_point[16];
    XNN_ALIGN(64) int16_t output_zero_point[32];
    XNN_ALIGN(64) int8_t output_min[64];
  } fp32_avx512;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  // WAsm SIMD variant: 8-byte arrays, XNN_ALIGN(8). The lower bound is an
  // int32 magic_min while the upper bound is an int8 output_max.
  struct {
    XNN_ALIGN(8) float scale[2];
    XNN_ALIGN(8) float magic_bias[2];
    XNN_ALIGN(8) int32_t magic_min[2];
    XNN_ALIGN(8) int32_t magic_bias_less_output_zero_point[2];
    XNN_ALIGN(8) int8_t output_max[8];
  } fp32_wasmsimd;
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
};

// Requantization + clamp parameters for QU8 convolution-style micro-kernels
// with MINMAX activation.
// Compared with xnn_qs8_conv_minmax_params, every variant additionally
// carries the kernel (weights) zero point, in a representation that differs
// per architecture.
// NOTE(review): the variant prefixes/suffixes (fp32, rndnu, fmagic, imagic,
// lrintf) presumably name the requantization/rounding strategy each kernel
// family uses - confirm against the parameter-initialization code.
union xnn_qu8_conv_minmax_params {
  // Scalar variant with clamp bounds stored as floats relative to the
  // output zero point.
  struct {
    int32_t kernel_zero_point;
    float scale;
    float output_min_less_zero_point;
    float output_max_less_zero_point;
    float magic_bias;
    int32_t magic_bias_less_output_zero_point;
  } fp32_scalar_fmagic;
  // Scalar variant with clamp bounds stored as integers (magic_min/magic_max).
  struct {
    int32_t kernel_zero_point;
    float scale;
    float magic_bias;
    int32_t magic_min;
    int32_t magic_max;
    int32_t magic_bias_less_zero_point;
  } fp32_scalar_imagic;
  // Scalar variant that stores the output zero point directly instead of a
  // magic bias.
  struct {
    int32_t kernel_zero_point;
    float scale;
    float output_min_less_zero_point;
    float output_max_less_zero_point;
    int32_t output_zero_point;
  } fp32_scalar_lrintf;
#if XNN_ARCH_ARM
  // 32-bit ARM only. Here the kernel zero point is stored as
  // minus_kernel_zero_point (per the field name, negated) and is placed
  // after scale/magic_bias rather than first.
  // NOTE(review): output_min/output_max presumably pack several byte lanes
  // per 32-bit word - confirm.
  struct {
    float scale;
    float magic_bias;
    uint32_t minus_kernel_zero_point;
    int32_t magic_bias_less_zero_point;
    uint32_t output_min;
    uint32_t output_max;
  } fp32_armsimd32;
#endif  // XNN_ARCH_ARM
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // NEON variant using a magic bias; kernel zero point held as 4 bytes.
  struct {
    uint8_t kernel_zero_point[4];
    float scale;
    float magic_bias;
    int32_t magic_bias_less_output_zero_point;
    uint8_t output_min;
    uint8_t output_max;
  } fp32_neon;
  // NEONv8 variant storing a 16-bit output zero point.
  struct {
    uint8_t kernel_zero_point[4];
    float scale;
    int16_t output_zero_point;
    uint8_t output_min;
    uint8_t output_max;
  } fp32_neonv8;
  // Fixed-point NEON variant: shift / multiply / shift instead of a float
  // scale.
  struct {
    uint8_t kernel_zero_point[4];
    int32_t right_pre_shift;
    int32_t multiplier;
    int32_t right_post_shift;
    int16_t output_zero_point;
    uint8_t output_min;
    uint8_t output_max;
  } rndnu_neon;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // SSE2 variant: XNN_ALIGN(16) arrays; kernel zero point widened to int16.
  struct {
    XNN_ALIGN(16) int16_t kernel_zero_point[8];
    XNN_ALIGN(16) float scale[4];
    XNN_ALIGN(16) float output_max_less_zero_point[4];
    XNN_ALIGN(16) int16_t output_zero_point[8];
    XNN_ALIGN(16) uint8_t output_min[16];
  } fp32_sse2;
  // AVX2 variant: the SSE2 layout widened to 32-byte arrays, XNN_ALIGN(32).
  struct {
    XNN_ALIGN(32) int16_t kernel_zero_point[16];
    XNN_ALIGN(32) float scale[8];
    XNN_ALIGN(32) float output_max_less_zero_point[8];
    XNN_ALIGN(32) int16_t output_zero_point[16];
    XNN_ALIGN(32) uint8_t output_min[32];
  } fp32_avx2;
  // AVX512 variant: the SSE2 layout widened to 64-byte arrays, XNN_ALIGN(64).
  struct {
    XNN_ALIGN(64) int16_t kernel_zero_point[32];
    XNN_ALIGN(64) float scale[16];
    XNN_ALIGN(64) float output_max_less_zero_point[16];
    XNN_ALIGN(64) int16_t output_zero_point[32];
    XNN_ALIGN(64) uint8_t output_min[64];
  } fp32_avx512;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  // WAsm SIMD variant: 8-byte arrays, XNN_ALIGN(8).
  // NOTE(review): output_max is declared int8_t although this is the
  // unsigned (QU8) family and every other clamp field in this union is
  // unsigned; xnn_qu8_mul_minmax_params.fp32_wasmsimd uses uint8_t for the
  // same field. Same size, but verify the intended signedness.
  struct {
    XNN_ALIGN(8) int16_t kernel_zero_point[4];
    XNN_ALIGN(8) float scale[2];
    XNN_ALIGN(8) float magic_bias[2];
    XNN_ALIGN(8) int32_t magic_min[2];
    XNN_ALIGN(8) int32_t magic_bias_less_output_zero_point[2];
    XNN_ALIGN(8) int8_t output_max[8];
  } fp32_wasmsimd;
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
};


// Add w. Min+Max: used by quantized VADD[C] microkernels with MINMAX activation.

// Parameters for QS8 element-wise addition micro-kernels with MINMAX
// activation.
// Each variant carries per-input multipliers (a/b), a shift, the output
// zero point and the clamp bounds, in an architecture-specific layout.
union xnn_qs8_add_minmax_params {
  // Portable variant: all quantities as 32-bit integers; clamp bounds are
  // stored relative to the output zero point.
  struct {
    int32_t bias;
    int32_t a_multiplier;
    int32_t b_multiplier;
    uint32_t shift;
    int32_t output_min_less_zero_point;
    int32_t output_max_less_zero_point;
    int32_t output_zero_point;
  } scalar;
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // NEON variant: stores the input zero points individually instead of a
  // combined bias.
  struct {
    int8_t a_zero_point;
    int8_t b_zero_point;
    int16_t output_zero_point;
    int32_t a_multiplier;
    int32_t b_multiplier;
    int32_t right_shift;
    int8_t output_min;
    int8_t output_max;
  } neon;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // SSE2 variant: each multiplier is split into 16-bit lo/hi halves.
  // `shift` and the scalar `b_multiplier` carry no XNN_ALIGN, unlike the
  // surrounding arrays. Clamp bounds are 8 x int16_t.
  struct {
    XNN_ALIGN(16) int32_t bias[4];
    XNN_ALIGN(16) uint16_t a_multiplier_lo[8];
    XNN_ALIGN(16) uint16_t a_multiplier_hi[8];
    XNN_ALIGN(16) uint16_t b_multiplier_lo[8];
    XNN_ALIGN(16) uint16_t b_multiplier_hi[8];
    uint32_t shift;
    uint32_t b_multiplier;
    XNN_ALIGN(16) int16_t output_zero_point[8];
    XNN_ALIGN(16) int16_t output_min[8];
    XNN_ALIGN(16) int16_t output_max[8];
  } sse2;
  // SSE4 (16-bit multiply) variant: same as sse2 except the clamp bounds
  // are 16 x int8_t.
  struct {
    XNN_ALIGN(16) int32_t bias[4];
    XNN_ALIGN(16) uint16_t a_multiplier_lo[8];
    XNN_ALIGN(16) uint16_t a_multiplier_hi[8];
    XNN_ALIGN(16) uint16_t b_multiplier_lo[8];
    XNN_ALIGN(16) uint16_t b_multiplier_hi[8];
    uint32_t shift;
    uint32_t b_multiplier;
    XNN_ALIGN(16) int16_t output_zero_point[8];
    XNN_ALIGN(16) int8_t output_min[16];
    XNN_ALIGN(16) int8_t output_max[16];
  } sse4_mul16;
  // SSE4 (32-bit multiply) variant: full 32-bit multipliers and a 64-bit
  // shift per lane pair.
  struct {
    XNN_ALIGN(16) int32_t bias[4];
    XNN_ALIGN(16) int32_t a_multiplier[4];
    XNN_ALIGN(16) int32_t b_multiplier[4];
    XNN_ALIGN(16) uint64_t shift[2];
    XNN_ALIGN(16) int16_t output_zero_point[8];
    XNN_ALIGN(16) int8_t output_min[16];
    XNN_ALIGN(16) int8_t output_max[16];
  } sse4_mul32;
  // AVX2 variant: 32-byte arrays for bias/multipliers/shift/zero point, but
  // the clamp bounds stay 16 bytes wide with XNN_ALIGN(16).
  struct {
    XNN_ALIGN(32) int32_t bias[8];
    XNN_ALIGN(32) int32_t a_multiplier[8];
    XNN_ALIGN(32) int32_t b_multiplier[8];
    XNN_ALIGN(32) uint64_t shift[4];
    XNN_ALIGN(32) int16_t output_zero_point[16];
    XNN_ALIGN(16) int8_t output_min[16];
    XNN_ALIGN(16) int8_t output_max[16];
  } avx2;
  // AVX512 variant: 64-byte arrays for bias/multipliers/shift/zero point,
  // with clamp bounds 32 bytes wide and XNN_ALIGN(32).
  struct {
    XNN_ALIGN(64) int32_t bias[16];
    XNN_ALIGN(64) int32_t a_multiplier[16];
    XNN_ALIGN(64) int32_t b_multiplier[16];
    XNN_ALIGN(64) uint64_t shift[8];
    XNN_ALIGN(64) int16_t output_zero_point[32];
    XNN_ALIGN(32) int8_t output_min[32];
    XNN_ALIGN(32) int8_t output_max[32];
  } avx512;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  // WAsm SIMD variant: 8-byte arrays, XNN_ALIGN(8), with a single
  // unaligned-declared scalar `shift`.
  struct {
    XNN_ALIGN(8) int32_t bias[2];
    XNN_ALIGN(8) int32_t a_multiplier[2];
    XNN_ALIGN(8) int32_t b_multiplier[2];
    uint32_t shift;
    XNN_ALIGN(8) int16_t output_zero_point[4];
    XNN_ALIGN(8) int8_t output_min[8];
    XNN_ALIGN(8) int8_t output_max[8];
  } wasmsimd;
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
};

// Parameters for QU8 element-wise addition micro-kernels with MINMAX
// activation.
// Mirrors xnn_qs8_add_minmax_params with unsigned 8-bit clamp bounds; the
// differences visible here are an extra `rounding` field in the scalar
// variant and a single `sse4` variant instead of sse4_mul16/sse4_mul32.
union xnn_qu8_add_minmax_params {
  // Portable variant: all quantities as 32-bit integers; clamp bounds are
  // stored relative to the output zero point.
  struct {
    int32_t bias;
    int32_t a_multiplier;
    int32_t b_multiplier;
    int32_t rounding;
    uint32_t shift;
    int32_t output_min_less_zero_point;
    int32_t output_max_less_zero_point;
    int32_t output_zero_point;
  } scalar;
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // NEON variant: stores the input zero points individually instead of a
  // combined bias.
  struct {
    uint8_t a_zero_point;
    uint8_t b_zero_point;
    int16_t output_zero_point;
    int32_t a_multiplier;
    int32_t b_multiplier;
    int32_t right_shift;
    uint8_t output_min;
    uint8_t output_max;
  } neon;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // SSE2 variant: each multiplier is split into 16-bit lo/hi halves.
  // `shift` and the scalar `b_multiplier` carry no XNN_ALIGN, unlike the
  // surrounding arrays.
  struct {
    XNN_ALIGN(16) int32_t bias[4];
    XNN_ALIGN(16) uint16_t a_multiplier_lo[8];
    XNN_ALIGN(16) uint16_t a_multiplier_hi[8];
    XNN_ALIGN(16) uint16_t b_multiplier_lo[8];
    XNN_ALIGN(16) uint16_t b_multiplier_hi[8];
    uint32_t shift;
    uint32_t b_multiplier;
    XNN_ALIGN(16) int16_t output_zero_point[8];
    XNN_ALIGN(16) uint8_t output_min[16];
    XNN_ALIGN(16) uint8_t output_max[16];
  } sse2;
  // SSE4 variant: full 32-bit multipliers and a 64-bit shift per lane pair.
  struct {
    XNN_ALIGN(16) int32_t bias[4];
    XNN_ALIGN(16) int32_t a_multiplier[4];
    XNN_ALIGN(16) int32_t b_multiplier[4];
    XNN_ALIGN(16) uint64_t shift[2];
    XNN_ALIGN(16) int16_t output_zero_point[8];
    XNN_ALIGN(16) uint8_t output_min[16];
    XNN_ALIGN(16) uint8_t output_max[16];
  } sse4;
  // AVX2 variant: 32-byte arrays for bias/multipliers/shift/zero point, but
  // the clamp bounds stay 16 bytes wide with XNN_ALIGN(16).
  struct {
    XNN_ALIGN(32) int32_t bias[8];
    XNN_ALIGN(32) int32_t a_multiplier[8];
    XNN_ALIGN(32) int32_t b_multiplier[8];
    XNN_ALIGN(32) uint64_t shift[4];
    XNN_ALIGN(32) int16_t output_zero_point[16];
    XNN_ALIGN(16) uint8_t output_min[16];
    XNN_ALIGN(16) uint8_t output_max[16];
  } avx2;
  // AVX512 variant: 64-byte arrays for bias/multipliers/shift/zero point,
  // with clamp bounds 32 bytes wide and XNN_ALIGN(32).
  struct {
    XNN_ALIGN(64) int32_t bias[16];
    XNN_ALIGN(64) int32_t a_multiplier[16];
    XNN_ALIGN(64) int32_t b_multiplier[16];
    XNN_ALIGN(64) uint64_t shift[8];
    XNN_ALIGN(64) int16_t output_zero_point[32];
    XNN_ALIGN(32) uint8_t output_min[32];
    XNN_ALIGN(32) uint8_t output_max[32];
  } avx512;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  // WAsm SIMD variant: 8-byte arrays, XNN_ALIGN(8), with a single
  // unaligned-declared scalar `shift`.
  struct {
    XNN_ALIGN(8) int32_t bias[2];
    XNN_ALIGN(8) int32_t a_multiplier[2];
    XNN_ALIGN(8) int32_t b_multiplier[2];
    uint32_t shift;
    XNN_ALIGN(8) int16_t output_zero_point[4];
    XNN_ALIGN(8) uint8_t output_min[8];
    XNN_ALIGN(8) uint8_t output_max[8];
  } wasmsimd;
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
};


// Mul w. Min+Max: used by quantized VMUL[C] microkernels with MINMAX activation.

// Parameters for QS8 element-wise multiplication micro-kernels with MINMAX
// activation.
// Each variant carries both input zero points, a requantization factor
// (float scale, or integer multiplier with shifts for rndnu_neon), and the
// output zero point / clamp bounds in an architecture-specific layout.
union xnn_qs8_mul_minmax_params {
  // Portable variant using a magic bias; clamp bounds are floats relative
  // to the output zero point.
  struct {
    int32_t a_zero_point;
    int32_t b_zero_point;
    float scale;
    float output_min_less_zero_point;
    float output_max_less_zero_point;
    float magic_bias;
    int32_t magic_bias_less_output_zero_point;
  } fp32_scalar;
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // NEON variant using a magic bias; each input zero point is held as two
  // signed bytes.
  struct {
    int8_t a_zero_point[2];
    int8_t b_zero_point[2];
    float scale;
    float magic_bias;
    int32_t magic_bias_less_output_zero_point;
    int8_t output_min;
    int8_t output_max;
  } fp32_neon;
  // NEONv8 variant storing a 16-bit output zero point.
  struct {
    int8_t a_zero_point[2];
    int8_t b_zero_point[2];
    float scale;
    int16_t output_zero_point;
    int8_t output_min;
    int8_t output_max;
  } fp32_neonv8;
  // Fixed-point NEON variant. Note the shift fields are named left_* here,
  // whereas xnn_qs8_conv_minmax_params.rndnu_neon names them right_*.
  // NOTE(review): presumably the sign convention of the stored shift
  // differs - confirm against the init code.
  struct {
    int8_t a_zero_point[2];
    int8_t b_zero_point[2];
    int32_t left_pre_shift;
    int32_t multiplier;
    int32_t left_post_shift;
    int16_t output_zero_point;
    int8_t output_min;
    int8_t output_max;
  } rndnu_neon;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // SSE2 variant: XNN_ALIGN(16) arrays; clamp bounds are 8 x int16_t.
  struct {
    XNN_ALIGN(16) int16_t a_zero_point[8];
    XNN_ALIGN(16) int16_t b_zero_point[8];
    XNN_ALIGN(16) float scale[4];
    XNN_ALIGN(16) int16_t output_zero_point[8];
    XNN_ALIGN(16) int16_t output_min[8];
    XNN_ALIGN(16) int16_t output_max[8];
  } fp32_sse2;
  // SSE4 variant: same as SSE2 except clamp bounds are 16 x int8_t.
  struct {
    XNN_ALIGN(16) int16_t a_zero_point[8];
    XNN_ALIGN(16) int16_t b_zero_point[8];
    XNN_ALIGN(16) float scale[4];
    XNN_ALIGN(16) int16_t output_zero_point[8];
    XNN_ALIGN(16) int8_t output_min[16];
    XNN_ALIGN(16) int8_t output_max[16];
  } fp32_sse4;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  // WAsm SIMD variant: 8-byte arrays, XNN_ALIGN(8). The lower bound is an
  // int32 magic_min while the upper bound is an int8 output_max.
  struct {
    XNN_ALIGN(8) int16_t a_zero_point[4];
    XNN_ALIGN(8) int16_t b_zero_point[4];
    XNN_ALIGN(8) float scale[2];
    XNN_ALIGN(8) float magic_bias[2];
    XNN_ALIGN(8) int32_t magic_min[2];
    XNN_ALIGN(8) int32_t magic_bias_less_output_zero_point[2];
    XNN_ALIGN(8) int8_t output_max[8];
  } fp32_wasmsimd;
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
};

// Parameters for QU8 element-wise multiplication micro-kernels with MINMAX
// activation.
// Mirrors xnn_qs8_mul_minmax_params with unsigned 8-bit zero points and
// clamp bounds; on x86 only an SSE2 variant is declared.
union xnn_qu8_mul_minmax_params {
  // Portable variant using a magic bias; clamp bounds are floats relative
  // to the output zero point.
  struct {
    int32_t a_zero_point;
    int32_t b_zero_point;
    float scale;
    float output_min_less_zero_point;
    float output_max_less_zero_point;
    float magic_bias;
    int32_t magic_bias_less_output_zero_point;
  } fp32_scalar;
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // NEON variant using a magic bias; each input zero point is held as two
  // unsigned bytes.
  struct {
    uint8_t a_zero_point[2];
    uint8_t b_zero_point[2];
    float scale;
    float magic_bias;
    int32_t magic_bias_less_output_zero_point;
    uint8_t output_min;
    uint8_t output_max;
  } fp32_neon;
  // NEONv8 variant storing a 16-bit output zero point.
  struct {
    uint8_t a_zero_point[2];
    uint8_t b_zero_point[2];
    float scale;
    int16_t output_zero_point;
    uint8_t output_min;
    uint8_t output_max;
  } fp32_neonv8;
  // Fixed-point NEON variant: shift / multiply / shift instead of a float
  // scale. The shift fields are named left_*.
  struct {
    uint8_t a_zero_point[2];
    uint8_t b_zero_point[2];
    int32_t left_pre_shift;
    int32_t multiplier;
    int32_t left_post_shift;
    int16_t output_zero_point;
    uint8_t output_min;
    uint8_t output_max;
  } rndnu_neon;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // SSE2 variant: XNN_ALIGN(16) arrays; zero points widened to int16,
  // clamp bounds held as 16 x uint8_t.
  struct {
    XNN_ALIGN(16) int16_t a_zero_point[8];
    XNN_ALIGN(16) int16_t b_zero_point[8];
    XNN_ALIGN(16) float scale[4];
    XNN_ALIGN(16) int16_t output_zero_point[8];
    XNN_ALIGN(16) uint8_t output_min[16];
    XNN_ALIGN(16) uint8_t output_max[16];
  } fp32_sse2;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  // WAsm SIMD variant: 8-byte arrays, XNN_ALIGN(8). The lower bound is an
  // int32 magic_min while the upper bound is a uint8 output_max.
  struct {
    XNN_ALIGN(8) int16_t a_zero_point[4];
    XNN_ALIGN(8) int16_t b_zero_point[4];
    XNN_ALIGN(8) float scale[2];
    XNN_ALIGN(8) float magic_bias[2];
    XNN_ALIGN(8) int32_t magic_min[2];
    XNN_ALIGN(8) int32_t magic_bias_less_output_zero_point[2];
    XNN_ALIGN(8) uint8_t output_max[8];
  } fp32_wasmsimd;
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
};


// AvgPool w. Min+Max: used by quantized GAVGPOOL microkernels with MINMAX activation.

// Parameters for QS8 average-pooling micro-kernels with MINMAX activation.
// Every variant starts with `init_bias`, followed by a requantization
// factor (float scale, or integer multiplier with shifts for rndnu_neon) and
// the output zero point / clamp bounds.
// NOTE(review): init_bias is presumably the initial accumulator value -
// confirm against the kernels.
union xnn_qs8_avgpool_minmax_params {
  // Scalar variant with clamp bounds stored as floats relative to the
  // output zero point.
  struct {
    int32_t init_bias;
    float scale;
    float output_min_less_zero_point;
    float output_max_less_zero_point;
    float magic_bias;
    int32_t magic_bias_less_output_zero_point;
  } fp32_scalar_fmagic;
  // Scalar variant with clamp bounds stored as integers (magic_min/magic_max).
  struct {
    int32_t init_bias;
    float scale;
    float magic_bias;
    int32_t magic_min;
    int32_t magic_max;
    int32_t magic_bias_less_zero_point;
  } fp32_scalar_imagic;
  // Scalar variant that stores the output zero point directly instead of a
  // magic bias.
  struct {
    int32_t init_bias;
    float scale;
    float output_min_less_zero_point;
    float output_max_less_zero_point;
    int32_t output_zero_point;
  } fp32_scalar_lrintf;
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // NEON variant using a magic bias; clamp bounds are single signed bytes.
  struct {
    int32_t init_bias;
    float scale;
    float magic_bias;
    int32_t magic_bias_less_output_zero_point;
    int8_t output_min;
    int8_t output_max;
  } fp32_neon;
  // NEONv8 variant storing a 16-bit output zero point.
  struct {
    int32_t init_bias;
    float scale;
    int16_t output_zero_point;
    int8_t output_min;
    int8_t output_max;
  } fp32_neonv8;
  // Fixed-point NEON variant: shift / multiply / shift instead of a float
  // scale. The shift fields are named left_*.
  struct {
    int32_t init_bias;
    int32_t left_pre_shift;
    int32_t multiplier;
    int32_t left_post_shift;
    int16_t output_zero_point;
    int8_t output_min;
    int8_t output_max;
  } rndnu_neon;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // SSE2 variant: XNN_ALIGN(16) arrays; output_min is held as 8 x int16_t.
  struct {
    XNN_ALIGN(16) int32_t init_bias[4];
    XNN_ALIGN(16) float scale[4];
    XNN_ALIGN(16) float output_max_less_zero_point[4];
    XNN_ALIGN(16) int16_t output_zero_point[8];
    XNN_ALIGN(16) int16_t output_min[8];
  } fp32_sse2;
  // SSE4 variant: same as SSE2 except output_min is held as 16 x int8_t.
  struct {
    XNN_ALIGN(16) int32_t init_bias[4];
    XNN_ALIGN(16) float scale[4];
    XNN_ALIGN(16) float output_max_less_zero_point[4];
    XNN_ALIGN(16) int16_t output_zero_point[8];
    XNN_ALIGN(16) int8_t output_min[16];
  } fp32_sse4;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  // WAsm SIMD variant: 8-byte arrays, XNN_ALIGN(8). The lower bound is an
  // int32 magic_min while the upper bound is an int8 output_max.
  struct {
    XNN_ALIGN(8) int32_t init_bias[2];
    XNN_ALIGN(8) float scale[2];
    XNN_ALIGN(8) float magic_bias[2];
    XNN_ALIGN(8) int32_t magic_min[2];
    XNN_ALIGN(8) int32_t magic_bias_less_output_zero_point[2];
    XNN_ALIGN(8) int8_t output_max[8];
  } fp32_wasmsimd;
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
};

// Parameters for QU8 average-pooling micro-kernels with MINMAX activation.
// The union has two groups of variants:
//   - fp32_* / rndnu_neon layouts that mirror
//     xnn_qs8_avgpool_minmax_params with unsigned clamp bounds;
//   - legacy `scalar` / `neon` / `sse2` layouts (see the comment further
//     down) built around a bias, an integer multiplier and a shift.
// NOTE(review): init_bias is presumably the initial accumulator value -
// confirm against the kernels.
union xnn_qu8_avgpool_minmax_params {
  // Scalar variant with clamp bounds stored as floats relative to the
  // output zero point.
  struct {
    int32_t init_bias;
    float scale;
    float output_min_less_zero_point;
    float output_max_less_zero_point;
    float magic_bias;
    int32_t magic_bias_less_output_zero_point;
  } fp32_scalar_fmagic;
  // Scalar variant with clamp bounds stored as integers (magic_min/magic_max).
  struct {
    int32_t init_bias;
    float scale;
    float magic_bias;
    int32_t magic_min;
    int32_t magic_max;
    int32_t magic_bias_less_zero_point;
  } fp32_scalar_imagic;
  // Scalar variant that stores the output zero point directly instead of a
  // magic bias.
  struct {
    int32_t init_bias;
    float scale;
    float output_min_less_zero_point;
    float output_max_less_zero_point;
    int32_t output_zero_point;
  } fp32_scalar_lrintf;
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // NEON variant using a magic bias; clamp bounds are single unsigned bytes.
  struct {
    int32_t init_bias;
    float scale;
    float magic_bias;
    int32_t magic_bias_less_output_zero_point;
    uint8_t output_min;
    uint8_t output_max;
  } fp32_neon;
  // NEONv8 variant storing a 16-bit output zero point.
  struct {
    int32_t init_bias;
    float scale;
    int16_t output_zero_point;
    uint8_t output_min;
    uint8_t output_max;
  } fp32_neonv8;
  // Fixed-point NEON variant: shift / multiply / shift instead of a float
  // scale. The shift fields are named left_*.
  struct {
    int32_t init_bias;
    int32_t left_pre_shift;
    int32_t multiplier;
    int32_t left_post_shift;
    int16_t output_zero_point;
    uint8_t output_min;
    uint8_t output_max;
  } rndnu_neon;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // SSE2 variant: XNN_ALIGN(16) arrays; output_min is held as 16 x uint8_t.
  struct {
    XNN_ALIGN(16) int32_t init_bias[4];
    XNN_ALIGN(16) float scale[4];
    XNN_ALIGN(16) float output_max_less_zero_point[4];
    XNN_ALIGN(16) int16_t output_zero_point[8];
    XNN_ALIGN(16) uint8_t output_min[16];
  } fp32_sse2;
  // SSE4 variant: field-for-field identical to fp32_sse2 above.
  struct {
    XNN_ALIGN(16) int32_t init_bias[4];
    XNN_ALIGN(16) float scale[4];
    XNN_ALIGN(16) float output_max_less_zero_point[4];
    XNN_ALIGN(16) int16_t output_zero_point[8];
    XNN_ALIGN(16) uint8_t output_min[16];
  } fp32_sse4;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  // WAsm SIMD variant: 8-byte arrays, XNN_ALIGN(8). The lower bound is an
  // int32 magic_min while the upper bound is a uint8 output_max.
  struct {
    XNN_ALIGN(8) int32_t init_bias[2];
    XNN_ALIGN(8) float scale[2];
    XNN_ALIGN(8) float magic_bias[2];
    XNN_ALIGN(8) int32_t magic_min[2];
    XNN_ALIGN(8) int32_t magic_bias_less_output_zero_point[2];
    XNN_ALIGN(8) uint8_t output_max[8];
  } fp32_wasmsimd;
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD

  // Legacy parameters used by QU8 AVGPOOL microkernels
  // Portable legacy variant: 32-bit bias/multiplier with a 64-bit rounding
  // term and a right shift; clamp bounds are relative to the output zero
  // point.
  struct {
    int32_t bias;
    int32_t multiplier;
    int64_t rounding;
    uint32_t right_shift;
    int32_t output_min_less_zero_point;
    int32_t output_max_less_zero_point;
    int32_t output_zero_point;
  } scalar;
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // Legacy NEON variant: the shift is a 64-bit signed `left_shift` and
  // there is no separate rounding field.
  struct {
    int32_t bias;
    int32_t multiplier;
    int64_t left_shift;
    int16_t output_zero_point;
    uint8_t output_min;
    uint8_t output_max;
  } neon;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // Legacy SSE2 variant: XNN_ALIGN(16) arrays; rounding and shift are held
  // as pairs of 64-bit values.
  struct {
    XNN_ALIGN(16) int32_t bias[4];
    XNN_ALIGN(16) uint32_t multiplier[4];
    XNN_ALIGN(16) uint64_t rounding[2];
    XNN_ALIGN(16) uint64_t right_shift[2];
    XNN_ALIGN(16) int16_t output_zero_point[8];
    XNN_ALIGN(16) uint8_t output_min[16];
    XNN_ALIGN(16) uint8_t output_max[16];
  } sse2;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
};


// Abs: used by VABS microkernels.

// Parameters for F16 absolute-value micro-kernels.
// Only x86/x86-64 builds carry data; elsewhere the union holds just the
// placeholder byte.
union xnn_f16_abs_params {
  char _; // Dummy member variable to comply with the C standard
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // SSE variant: 8 16-bit mask values, XNN_ALIGN(16).
  // NOTE(review): per the name, presumably a mask with every bit except the
  // sign bit set, ANDed with the input to clear the sign - confirm.
  struct {
    XNN_ALIGN(16) uint16_t nonsign_mask[8];
  } sse;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
};

// Parameters for F32 absolute-value micro-kernels.
// Architecture-specific variants each hold a `nonsign_mask`; on other
// targets the union holds just the placeholder byte.
// NOTE(review): per the name, nonsign_mask presumably has every bit except
// the sign bit set and is ANDed with the input - confirm against kernels.
union xnn_f32_abs_params {
  char _; // Dummy member variable to comply with the C standard
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // SSE variant: mask stored as 4 floats, XNN_ALIGN(16).
  struct {
    XNN_ALIGN(16) float nonsign_mask[4];
  } sse;
  // AVX variant: mask stored as 8 floats, XNN_ALIGN(32), plus a table of
  // 14 32-bit integers.
  // NOTE(review): mask_table presumably supports masked handling of partial
  // vectors - confirm.
  struct {
    XNN_ALIGN(32) float nonsign_mask[8];
    int32_t mask_table[14];
  } avx;
  // AVX512 variant: a single 32-bit integer mask (not an array).
  struct {
    uint32_t nonsign_mask;
  } avx512;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  // WAsm SIMD variant: mask stored as 2 floats, XNN_ALIGN(8).
  struct {
    XNN_ALIGN(8) float nonsign_mask[2];
  } wasmsimd;
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
};


// Cvt (Convert): used by VCVT microkernels.

// Constants for F16 -> F32 conversion micro-kernels.
// The variants hold the same kind of constants (sign mask, exponent
// offset/scale, magic mask/bias, denormal cutoff) at different element
// widths and replication counts.
// NOTE(review): the *_int16 / *_int32 suffixes presumably refer to the
// integer lane width the corresponding kernels operate on - confirm.
union xnn_f16_f32_cvt_params {
  // Portable variant: one 32-bit value per constant.
  struct {
    uint32_t sign_mask;
    uint32_t exp_offset;
    float exp_scale;
    uint32_t magic_mask;
    float magic_bias;
    uint32_t denorm_cutoff;
  } scalar;
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // NEON variant: only the exponent scale is stored.
  struct {
    float exp_scale;
  } neon;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // SSE variant with 16-bit integer constants (float constants stay 32-bit),
  // XNN_ALIGN(16).
  struct {
    XNN_ALIGN(16) uint16_t sign_mask[8];
    XNN_ALIGN(16) uint16_t exp_offset[8];
    XNN_ALIGN(16) float exp_scale[4];
    XNN_ALIGN(16) uint16_t magic_mask[8];
    XNN_ALIGN(16) float magic_bias[4];
    XNN_ALIGN(16) int16_t denorm_cutoff[8];
  } sse_int16;
  // SSE variant with 32-bit integer constants, XNN_ALIGN(16). It has no
  // magic_mask, and magic_bias is declared as uint32_t rather than float.
  struct {
    XNN_ALIGN(16) uint32_t sign_mask[4];
    XNN_ALIGN(16) uint32_t exp_offset[4];
    XNN_ALIGN(16) float exp_scale[4];
    XNN_ALIGN(16) uint32_t magic_bias[4];
    XNN_ALIGN(16) int32_t denorm_cutoff[4];
  } sse_int32;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  // WAsm SIMD counterpart of sse_int16 with 8-byte arrays, XNN_ALIGN(8).
  struct {
    XNN_ALIGN(8) uint16_t sign_mask[4];
    XNN_ALIGN(8) uint16_t exp_offset[4];
    XNN_ALIGN(8) float exp_scale[2];
    XNN_ALIGN(8) uint16_t magic_mask[4];
    XNN_ALIGN(8) float magic_bias[2];
    XNN_ALIGN(8) int16_t denorm_cutoff[4];
  } wasmsimd_int16;
  // WAsm SIMD counterpart of sse_int32 with 8-byte arrays, XNN_ALIGN(8).
  struct {
    XNN_ALIGN(8) uint32_t sign_mask[2];
    XNN_ALIGN(8) uint32_t exp_offset[2];
    XNN_ALIGN(8) float exp_scale[2];
    XNN_ALIGN(8) uint32_t magic_bias[2];
    XNN_ALIGN(8) int32_t denorm_cutoff[2];
  } wasmsimd_int32;
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
};

// Constants for F32 -> F16 conversion micro-kernels.
// The variants hold the same kind of constants (exponent bias, two scale
// factors, exponent/mantissa masks, a NaN pattern) at different element
// widths and replication counts.
// NOTE(review): the "w"/"h" infixes (expw_max, exph_mask, manth_mask, nanh)
// presumably distinguish 32-bit word values from 16-bit half values -
// confirm against the kernels.
union xnn_f32_f16_cvt_params {
  // Portable variant that includes a nonsign_mask as its first field.
  struct {
    uint32_t nonsign_mask;
    uint32_t exp_bias;
    float scale_to_inf;
    uint32_t expw_max;
    float scale_to_zero;
    uint32_t bias_min;
    uint16_t exph_mask;
    uint16_t manth_mask;
    uint16_t nanh;
  } scalar_bitcast;
  // Portable variant without nonsign_mask; note the field order also
  // differs from scalar_bitcast (scale_to_inf comes first).
  struct {
    float scale_to_inf;
    uint32_t exp_bias;
    float scale_to_zero;
    uint32_t expw_max;
    uint32_t bias_min;
    uint16_t exph_mask;
    uint16_t manth_mask;
    uint16_t nanh;
  } scalar_fabsf;
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // NEON variant: only the exponent bias/max and the two scale factors.
  struct {
    uint32_t exp_bias;
    float scale_to_inf;
    uint32_t expw_max;
    float scale_to_zero;
  } neon;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // SSE2 variant: XNN_ALIGN(16) arrays. bias_min and nanh use 16-bit
  // elements; the masks here are 32-bit, unlike the 16-bit scalar fields.
  struct {
    XNN_ALIGN(16) uint32_t nonsign_mask[4];
    XNN_ALIGN(16) uint32_t exp_bias[4];
    XNN_ALIGN(16) float scale_to_inf[4];
    XNN_ALIGN(16) uint32_t expw_max[4];
    XNN_ALIGN(16) float scale_to_zero[4];
    XNN_ALIGN(16) int16_t bias_min[8];
    XNN_ALIGN(16) uint32_t manth_mask[4];
    XNN_ALIGN(16) uint32_t exph_mask[4];
    XNN_ALIGN(16) uint16_t nanh[8];
  } sse2;
  // F16C variant: only a table of 14 32-bit integers.
  // NOTE(review): presumably for masked handling of partial vectors, with
  // the conversion itself done by hardware instructions - confirm.
  struct {
    int32_t mask_table[14];
  } f16c;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  // WAsm SIMD variant: the sse2 layout minus nonsign_mask, with 8-byte
  // arrays, XNN_ALIGN(8).
  struct {
    XNN_ALIGN(8) uint32_t exp_bias[2];
    XNN_ALIGN(8) float scale_to_inf[2];
    XNN_ALIGN(8) uint32_t expw_max[2];
    XNN_ALIGN(8) float scale_to_zero[2];
    XNN_ALIGN(8) int16_t bias_min[4];
    XNN_ALIGN(8) uint32_t manth_mask[2];
    XNN_ALIGN(8) uint32_t exph_mask[2];
    XNN_ALIGN(8) uint16_t nanh[4];
  } wasmsimd;
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
};

// Parameters for F32 -> QS8 conversion (quantization) micro-kernels.
// Every variant carries a `scale`, plus the output zero point and clamp
// bounds in a representation suited to the target.
// NOTE(review): the fmagic / imagic / lrintf / cvt / magic suffixes
// presumably name the float-to-integer rounding strategy each kernel family
// uses - confirm against the parameter-initialization code.
union xnn_f32_qs8_cvt_params {
  // Scalar variant with clamp bounds stored as floats relative to the
  // output zero point.
  struct {
    float scale;
    float output_min_less_zero_point;
    float output_max_less_zero_point;
    float magic_bias;
    int32_t magic_bias_less_zero_point;
  } scalar_fmagic;
  // Scalar variant with clamp bounds stored as integers (magic_min/magic_max).
  struct {
    float scale;
    float magic_bias;
    int32_t magic_min;
    int32_t magic_max;
    int32_t magic_bias_less_zero_point;
  } scalar_imagic;
  // Scalar variant that stores the output zero point directly instead of a
  // magic bias.
  struct {
    float scale;
    float output_min_less_zero_point;
    float output_max_less_zero_point;
    int32_t output_zero_point;
  } scalar_lrintf;
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // NEON variant using a magic bias; clamp bounds are single signed bytes.
  struct {
    float scale;
    float magic_bias;
    int32_t magic_bias_less_zero_point;
    int8_t output_min;
    int8_t output_max;
  } neon;
  // NEONv8 variant storing a 16-bit output zero point.
  struct {
    float scale;
    int16_t output_zero_point;
    int8_t output_min;
    int8_t output_max;
  } neonv8;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // SSE2 variant: XNN_ALIGN(16) arrays; output_min is held as 8 x int16_t.
  struct {
    XNN_ALIGN(16) float scale[4];
    XNN_ALIGN(16) float output_max_less_zero_point[4];
    XNN_ALIGN(16) int16_t output_zero_point[8];
    XNN_ALIGN(16) int16_t output_min[8];
  } sse2;
  // SSE4 variant: same as SSE2 except output_min is held as 16 x int8_t.
  struct {
    XNN_ALIGN(16) float scale[4];
    XNN_ALIGN(16) float output_max_less_zero_point[4];
    XNN_ALIGN(16) int16_t output_zero_point[8];
    XNN_ALIGN(16) int8_t output_min[16];
  } sse4;
  // AVX variant: float fields are 32 bytes wide with XNN_ALIGN(32), while
  // the integer fields stay 16 bytes wide with XNN_ALIGN(16). Ends with a
  // table of 14 32-bit integers.
  // NOTE(review): mask_table presumably supports masked handling of partial
  // vectors - confirm.
  struct {
    XNN_ALIGN(32) float scale[8];
    XNN_ALIGN(32) float output_max_less_zero_point[8];
    XNN_ALIGN(16) int16_t output_zero_point[8];
    XNN_ALIGN(16) int8_t output_min[16];
    int32_t mask_table[14];
  } avx;
  // AVX2 variant: all arrays 32 bytes wide, XNN_ALIGN(32), plus a shuffle
  // mask of 8 32-bit indices and the mask table.
  // NOTE(review): shuffle_mask presumably reorders lanes after packing -
  // confirm.
  struct {
    XNN_ALIGN(32) float scale[8];
    XNN_ALIGN(32) float output_max_less_zero_point[8];
    XNN_ALIGN(32) int16_t output_zero_point[16];
    XNN_ALIGN(32) uint32_t shuffle_mask[8];
    XNN_ALIGN(32) int8_t output_min[32];
    int32_t mask_table[14];
  } avx2;
  // AVX512 variant: 64-byte arrays, XNN_ALIGN(64), with separate 512-bit
  // and 256-bit shuffle masks and no mask table.
  struct {
    XNN_ALIGN(64) float scale[16];
    XNN_ALIGN(64) float output_max_less_zero_point[16];
    XNN_ALIGN(64) int16_t output_zero_point[32];
    XNN_ALIGN(64) int8_t output_min[64];
    XNN_ALIGN(64) uint32_t shuffle512_mask[16];
    XNN_ALIGN(32) uint32_t shuffle256_mask[8];
  } avx512;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  // WAsm SIMD variant storing the output zero point and both clamp bounds
  // as small integer arrays, XNN_ALIGN(8).
  struct {
    XNN_ALIGN(8) float scale[2];
    XNN_ALIGN(8) int16_t output_zero_point[4];
    XNN_ALIGN(8) int8_t output_min[8];
    XNN_ALIGN(8) int8_t output_max[8];
  } wasmsimd_cvt;
  // WAsm SIMD variant using a magic bias; the lower bound is an int32
  // magic_min while the upper bound is an int8 output_max.
  struct {
    XNN_ALIGN(8) float scale[2];
    XNN_ALIGN(8) float magic_bias[2];
    XNN_ALIGN(8) int32_t magic_min[2];
    XNN_ALIGN(8) int32_t magic_bias_less_zero_point[2];
    XNN_ALIGN(8) int8_t output_max[8];
  } wasmsimd_magic;
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
};

// Parameter block for the microkernels that take xnn_f32_qu8_cvt_params
// (presumably F32 -> QU8 conversion, going by the name -- confirm against the
// kernel sources). Every member struct holds the constants for one
// implementation variant. The three scalar_* variants are always present; the
// remaining variants exist only when the matching XNN_ARCH_* macro is set.
union xnn_f32_qu8_cvt_params {
  // Scalar variant "fmagic": clamping bounds are kept as floats.
  struct {
    float scale;
    float output_min_less_zero_point;
    float output_max_less_zero_point;
    float magic_bias;
    int32_t magic_bias_less_zero_point;
  } scalar_fmagic;
  // Scalar variant "imagic": clamping bounds are kept as 32-bit integers.
  struct {
    float scale;
    float magic_bias;
    int32_t magic_min;
    int32_t magic_max;
    int32_t magic_bias_less_zero_point;
  } scalar_imagic;
  // Scalar variant "lrintf": no magic-bias constants, explicit zero point.
  struct {
    float scale;
    float output_min_less_zero_point;
    float output_max_less_zero_point;
    int32_t output_zero_point;
  } scalar_lrintf;
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // ARM variants store one scalar per constant; output bounds are uint8_t.
  struct {
    float scale;
    float magic_bias;
    int32_t magic_bias_less_zero_point;
    uint8_t output_min;
    uint8_t output_max;
  } neon;
  struct {
    float scale;
    int16_t output_zero_point;
    uint8_t output_min;
    uint8_t output_max;
  } neonv8;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // x86 variants store each constant as an aligned array (presumably one copy
  // per vector lane -- TODO confirm against the init functions).
  struct {
    XNN_ALIGN(16) float scale[4];
    XNN_ALIGN(16) float output_max_less_zero_point[4];
    XNN_ALIGN(16) int16_t output_zero_point[8];
    XNN_ALIGN(16) uint8_t output_min[16];
  } sse2;
  struct {
    XNN_ALIGN(32) float scale[8];
    XNN_ALIGN(32) float output_max_less_zero_point[8];
    XNN_ALIGN(16) int16_t output_zero_point[8];
    XNN_ALIGN(16) uint8_t output_min[16];
    // NOTE(review): presumably a table for building partial-vector masks --
    // confirm against the AVX kernels.
    int32_t mask_table[14];
  } avx;
  struct {
    XNN_ALIGN(32) float scale[8];
    XNN_ALIGN(32) float output_max_less_zero_point[8];
    XNN_ALIGN(32) int16_t output_zero_point[16];
    XNN_ALIGN(32) uint32_t shuffle_mask[8];
    XNN_ALIGN(32) uint8_t output_min[32];
    int32_t mask_table[14];
  } avx2;
  struct {
    XNN_ALIGN(64) float scale[16];
    XNN_ALIGN(64) float output_max_less_zero_point[16];
    XNN_ALIGN(64) int16_t output_zero_point[32];
    XNN_ALIGN(64) uint8_t output_min[64];
    XNN_ALIGN(64) uint32_t shuffle512_mask[16];
    XNN_ALIGN(32) uint32_t shuffle256_mask[8];
  } avx512;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  // WAsm SIMD variants store each constant as an 8-byte aligned, 8-byte array.
  struct {
    XNN_ALIGN(8) float scale[2];
    XNN_ALIGN(8) int16_t output_zero_point[4];
    XNN_ALIGN(8) uint8_t output_min[8];
    XNN_ALIGN(8) uint8_t output_max[8];
  } wasmsimd_cvt;
  struct {
    XNN_ALIGN(8) float scale[2];
    XNN_ALIGN(8) float magic_bias[2];
    XNN_ALIGN(8) int32_t magic_min[2];
    XNN_ALIGN(8) int32_t magic_bias_less_zero_point[2];
    XNN_ALIGN(8) uint8_t output_max[8];
  } wasmsimd_magic;
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
};

// Parameter block for the microkernels that take xnn_qs8_cvt_params
// (presumably QS8 -> QS8 requantizing conversion -- confirm against the kernel
// sources). The scalar member is always present; the others are compiled in
// only for the matching architecture.
union xnn_qs8_cvt_params {
  // Portable variant: 32-bit bias and multiplier.
  struct {
    int32_t bias;
    int32_t multiplier;
  } scalar;
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // armsimd32 stores the negated input zero point in an unsigned 32-bit field.
  struct {
    uint32_t minus_input_zero_point;
    int32_t multiplier;
    int32_t bias;
  } armsimd32;
  // NEON variant: all three constants are 16-bit scalars.
  struct {
    int16_t input_zero_point;
    int16_t multiplier;
    int16_t output_zero_point;
  } neon;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // x86 variants store each constant as an aligned array (presumably one copy
  // per vector lane -- TODO confirm).
  struct {
    XNN_ALIGN(16) int16_t multiplier[8];
    XNN_ALIGN(16) int32_t bias[4];
  } sse2;
  struct {
    XNN_ALIGN(16) int16_t input_zero_point[8];
    XNN_ALIGN(16) int16_t multiplier[8];
    XNN_ALIGN(16) int16_t output_zero_point[8];
  } ssse3;
  struct {
    XNN_ALIGN(32) int16_t input_zero_point[16];
    XNN_ALIGN(32) int16_t multiplier[16];
    XNN_ALIGN(32) int16_t output_zero_point[16];
  } avx2;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  // WAsm SIMD variant: 8-byte aligned arrays of four 16-bit values.
  struct {
    XNN_ALIGN(8) int16_t input_zero_point[4];
    XNN_ALIGN(8) int16_t multiplier[4];
    XNN_ALIGN(8) int16_t output_zero_point[4];
  } wasmsimd;
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
};

// Parameter block for the microkernels that take xnn_qs8_f32_cvt_params
// (presumably QS8 -> F32 conversion -- confirm against the kernel sources).
// The scalar member is always present; the others are compiled in only for
// the matching architecture.
union xnn_qs8_f32_cvt_params {
  // Portable variant: stores the zero point itself (not negated).
  struct {
    int32_t zero_point;
    float scale;
  } scalar;
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // NEON variant: negated zero point as two 16-bit values, followed by the scale.
  struct {
    int16_t minus_zero_point[2];
    float scale;
  } neon;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // SSE2 variant: uses sign_mask/magic_exp/magic_bias constants instead of an
  // explicit zero-point field.
  struct {
    XNN_ALIGN(16) uint8_t sign_mask[16];
    XNN_ALIGN(16) uint16_t magic_exp[8];
    XNN_ALIGN(16) float magic_bias[4];
    XNN_ALIGN(16) float scale[4];
  } sse2;
  // SSE4/AVX/AVX512 variants: negated zero point and scale, stored as aligned
  // arrays of 16, 32 and 64 bytes respectively.
  struct {
    XNN_ALIGN(16) int32_t minus_zero_point[4];
    XNN_ALIGN(16) float scale[4];
  } sse4;
  struct {
    XNN_ALIGN(32) int32_t minus_zero_point[8];
    XNN_ALIGN(32) float scale[8];
  } avx;
  struct {
    XNN_ALIGN(64) int32_t minus_zero_point[16];
    XNN_ALIGN(64) float scale[16];
  } avx512;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  // WAsm SIMD variant: 8-byte aligned arrays.
  struct {
    XNN_ALIGN(8) int16_t minus_zero_point[4];
    XNN_ALIGN(8) float scale[2];
  } wasmsimd;
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
};

// Parameter block for the microkernels that take xnn_qu8_cvt_params
// (presumably QU8 -> QU8 requantizing conversion -- confirm against the kernel
// sources). The scalar member is always present; the others are compiled in
// only for the matching architecture. Input zero points are stored in
// unsigned 16-bit fields in the NEON, SSSE3, AVX2 and WAsm SIMD variants.
union xnn_qu8_cvt_params {
  // Portable variant: 32-bit bias and multiplier.
  struct {
    int32_t bias;
    int32_t multiplier;
  } scalar;
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // armsimd32 stores the negated input zero point in an unsigned 32-bit field.
  struct {
    uint32_t minus_input_zero_point;
    int32_t multiplier;
    int32_t bias;
  } armsimd32;
  struct {
    uint16_t input_zero_point;
    int16_t multiplier;
    int16_t output_zero_point;
  } neon;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // SSE2 variant: the multiplier is unsigned here, unlike the other variants.
  struct {
    XNN_ALIGN(16) uint16_t multiplier[8];
    XNN_ALIGN(16) int32_t bias[4];
  } sse2;
  struct {
    XNN_ALIGN(16) uint16_t input_zero_point[8];
    XNN_ALIGN(16) int16_t multiplier[8];
    XNN_ALIGN(16) int16_t output_zero_point[8];
  } ssse3;
  struct {
    XNN_ALIGN(32) uint16_t input_zero_point[16];
    XNN_ALIGN(32) int16_t multiplier[16];
    XNN_ALIGN(32) int16_t output_zero_point[16];
  } avx2;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  // WAsm SIMD variant: 8-byte aligned arrays of four 16-bit values.
  struct {
    XNN_ALIGN(8) uint16_t input_zero_point[4];
    XNN_ALIGN(8) int16_t multiplier[4];
    XNN_ALIGN(8) int16_t output_zero_point[4];
  } wasmsimd;
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
};

// Parameter block for the microkernels that take xnn_qu8_f32_cvt_params
// (presumably QU8 -> F32 conversion -- confirm against the kernel sources).
// The scalar member is always present; the others are compiled in only for
// the matching architecture.
union xnn_qu8_f32_cvt_params {
  // Portable variant: stores the zero point itself (not negated).
  struct {
    int32_t zero_point;
    float scale;
  } scalar;
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // NEON variant: negated zero point as two 16-bit values, followed by the scale.
  struct {
    int16_t minus_zero_point[2];
    float scale;
  } neon;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // SSE2 variant: magic_exp/magic_bias constants; there is no sign_mask field
  // in this unsigned-input union.
  struct {
    XNN_ALIGN(16) uint16_t magic_exp[8];
    XNN_ALIGN(16) float magic_bias[4];
    XNN_ALIGN(16) float scale[4];
  } sse2;
  // SSE4/AVX/AVX512 variants: negated zero point and scale, stored as aligned
  // arrays of 16, 32 and 64 bytes respectively.
  struct {
    XNN_ALIGN(16) int32_t minus_zero_point[4];
    XNN_ALIGN(16) float scale[4];
  } sse4;
  struct {
    XNN_ALIGN(32) int32_t minus_zero_point[8];
    XNN_ALIGN(32) float scale[8];
  } avx;
  struct {
    XNN_ALIGN(64) int32_t minus_zero_point[16];
    XNN_ALIGN(64) float scale[16];
  } avx512;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  // WAsm SIMD variant: 8-byte aligned arrays.
  struct {
    XNN_ALIGN(8) int16_t minus_zero_point[4];
    XNN_ALIGN(8) float scale[2];
  } wasmsimd;
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
};


// ELU: used by VELU microkernels.

// Constants for the F16 ELU microkernels. The dummy member keeps the union
// non-empty on targets where neither architecture-specific variant is
// compiled in.
union xnn_f16_elu_params {
  char _; // Dummy member variable to comply with the C standard
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // Constants are stored in uint16_t fields (presumably raw IEEE half-precision
  // bit patterns -- TODO confirm against the init function). This variant
  // stores minus_alpha and has no c1 field.
  struct {
    uint16_t prescale;
    uint16_t sat_cutoff;
    uint16_t magic_bias;
    uint16_t log2e;
    uint16_t minus_ln2;
    uint16_t c3;
    uint16_t c2;
    uint16_t minus_alpha;
    uint16_t beta;
  } neonfp16arith_rr1_p3;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // Constants are stored as 32-byte aligned arrays of eight single-precision
  // floats. This variant stores alpha (not negated) and includes c1.
  struct {
    XNN_ALIGN(32) float prescale[8];
    XNN_ALIGN(32) float sat_cutoff[8];
    XNN_ALIGN(32) float magic_bias[8];
    XNN_ALIGN(32) float log2e[8];
    XNN_ALIGN(32) float minus_ln2[8];
    XNN_ALIGN(32) float c3[8];
    XNN_ALIGN(32) float c2[8];
    XNN_ALIGN(32) float c1[8];
    XNN_ALIGN(32) float alpha[8];
    XNN_ALIGN(32) float beta[8];
  } avx2_rr1_p3;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
};

// Constants for the F32 ELU microkernels, one member struct per implementation
// variant. All variants carry prescale, alpha, beta, sat_cutoff, magic_bias
// and log2e, and differ in the remaining constants:
//   - "rr2" variants split ln2 into minus_ln2_hi / minus_ln2_lo, while "rr1"
//     variants carry a single minus_ln2;
//   - "p6" variants carry coefficients c6..c2, "p4" variants c4..c2 and "p3"
//     variants c3..c2;
//   - only some "lut" variants carry an index_mask and/or an in-struct table.
// NOTE(review): the suffixes presumably encode the range-reduction scheme,
// lookup-table size and polynomial degree -- confirm against the kernel
// sources.
union xnn_f32_elu_params {
  // Portable scalar variants (always present).
  struct {
    float prescale;
    float alpha;
    float beta;
    float sat_cutoff;
    float magic_bias;
    float log2e;
    float minus_ln2_hi;
    float minus_ln2_lo;
    float c3;
    float c2;
    float one;
  } scalar_rr2_lut16_p3;
  struct {
    float prescale;
    float alpha;
    float beta;
    float sat_cutoff;
    float magic_bias;
    float log2e;
    float minus_ln2_hi;
    float minus_ln2_lo;
    float c6;
    float c5;
    float c4;
    float c3;
    float c2;
    float one;
  } scalar_rr2_p6;
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // ARM variants store one scalar per constant and have no "one" field.
  struct {
    float prescale;
    float alpha;
    float beta;
    float sat_cutoff;
    float magic_bias;
    float log2e;
    float minus_ln2_hi;
    float minus_ln2_lo;
    float c6;
    float c5;
    float c4;
    float c3;
    float c2;
  } neon_rr2_p6;
  struct {
    float prescale;
    float alpha;
    float beta;
    float sat_cutoff;
    float magic_bias;
    float log2e;
    float minus_ln2_hi;
    float minus_ln2_lo;
    float c3;
    float c2;
  } neon_rr2_lut16_p3;
  struct {
    float prescale;
    float alpha;
    float beta;
    float sat_cutoff;
    float magic_bias;
    float log2e;
    float minus_ln2;
    float c6;
    float c5;
    float c4;
    float c3;
    float c2;
  } neonfma_rr1_p6;
  struct {
    float prescale;
    float alpha;
    float beta;
    float sat_cutoff;
    float magic_bias;
    float log2e;
    float minus_ln2;
    float c3;
    float c2;
  } neonfma_rr1_lut16_p3;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // SSE2 variants: every constant is a 16-byte aligned array of four values
  // (presumably one copy per vector lane -- TODO confirm).
  struct {
    XNN_ALIGN(16) float prescale[4];
    XNN_ALIGN(16) float alpha[4];
    XNN_ALIGN(16) float beta[4];
    XNN_ALIGN(16) float sat_cutoff[4];
    XNN_ALIGN(16) float magic_bias[4];
    XNN_ALIGN(16) float log2e[4];
    XNN_ALIGN(16) uint32_t index_mask[4];
    XNN_ALIGN(16) float minus_ln2_hi[4];
    XNN_ALIGN(16) float minus_ln2_lo[4];
    XNN_ALIGN(16) float c3[4];
    XNN_ALIGN(16) float c2[4];
    XNN_ALIGN(16) float one[4];
  } sse2_rr2_lut16_p3;
  struct {
    XNN_ALIGN(16) float prescale[4];
    XNN_ALIGN(16) float alpha[4];
    XNN_ALIGN(16) float beta[4];
    XNN_ALIGN(16) float sat_cutoff[4];
    XNN_ALIGN(16) float magic_bias[4];
    XNN_ALIGN(16) float log2e[4];
    XNN_ALIGN(16) float minus_ln2_hi[4];
    XNN_ALIGN(16) float minus_ln2_lo[4];
    XNN_ALIGN(16) float c6[4];
    XNN_ALIGN(16) float c5[4];
    XNN_ALIGN(16) float c4[4];
    XNN_ALIGN(16) float c3[4];
    XNN_ALIGN(16) float c2[4];
    XNN_ALIGN(16) float one[4];
  } sse2_rr2_p6;
  // AVX / AVX2 variants: every constant is a 32-byte aligned array of eight
  // values, followed by an unaligned 14-entry mask_table.
  // NOTE(review): mask_table is presumably used to build partial-vector masks
  // -- confirm against the AVX kernels.
  struct {
    XNN_ALIGN(32) float prescale[8];
    XNN_ALIGN(32) float alpha[8];
    XNN_ALIGN(32) float beta[8];
    XNN_ALIGN(32) float sat_cutoff[8];
    XNN_ALIGN(32) float magic_bias[8];
    XNN_ALIGN(32) float log2e[8];
    XNN_ALIGN(32) uint32_t index_mask[8];
    XNN_ALIGN(32) float minus_ln2_hi[8];
    XNN_ALIGN(32) float minus_ln2_lo[8];
    XNN_ALIGN(32) float c3[8];
    XNN_ALIGN(32) float c2[8];
    XNN_ALIGN(32) float one[8];
    int32_t mask_table[14];
  } avx_rr2_lut16_p3;
  struct {
    XNN_ALIGN(32) float prescale[8];
    XNN_ALIGN(32) float alpha[8];
    XNN_ALIGN(32) float beta[8];
    XNN_ALIGN(32) float sat_cutoff[8];
    XNN_ALIGN(32) float magic_bias[8];
    XNN_ALIGN(32) float log2e[8];
    XNN_ALIGN(32) uint32_t index_mask[8];
    // Lookup table embedded in the parameter block as floats.
    XNN_ALIGN(32) float table[8];
    XNN_ALIGN(32) float minus_ln2_hi[8];
    XNN_ALIGN(32) float minus_ln2_lo[8];
    XNN_ALIGN(32) float c4[8];
    XNN_ALIGN(32) float c3[8];
    XNN_ALIGN(32) float c2[8];
    XNN_ALIGN(32) float one[8];
    int32_t mask_table[14];
  } avx_rr2_lut4_p4;
  struct {
    XNN_ALIGN(32) float prescale[8];
    XNN_ALIGN(32) float alpha[8];
    XNN_ALIGN(32) float beta[8];
    XNN_ALIGN(32) float sat_cutoff[8];
    XNN_ALIGN(32) float magic_bias[8];
    XNN_ALIGN(32) float log2e[8];
    XNN_ALIGN(32) float minus_ln2_hi[8];
    XNN_ALIGN(32) float minus_ln2_lo[8];
    XNN_ALIGN(32) float c6[8];
    XNN_ALIGN(32) float c5[8];
    XNN_ALIGN(32) float c4[8];
    XNN_ALIGN(32) float c3[8];
    XNN_ALIGN(32) float c2[8];
    XNN_ALIGN(32) float one[8];
    int32_t mask_table[14];
  } avx_rr2_p6;
  // The AVX2 variants below have no "one" field.
  struct {
    XNN_ALIGN(32) float prescale[8];
    XNN_ALIGN(32) float alpha[8];
    XNN_ALIGN(32) float beta[8];
    XNN_ALIGN(32) float sat_cutoff[8];
    XNN_ALIGN(32) float magic_bias[8];
    XNN_ALIGN(32) float log2e[8];
    XNN_ALIGN(32) uint32_t index_mask[8];
    XNN_ALIGN(32) float minus_ln2[8];
    XNN_ALIGN(32) float c3[8];
    XNN_ALIGN(32) float c2[8];
    int32_t mask_table[14];
  } avx2_rr1_lut16_p3;
  struct {
    XNN_ALIGN(32) float prescale[8];
    XNN_ALIGN(32) float alpha[8];
    XNN_ALIGN(32) float beta[8];
    XNN_ALIGN(32) float sat_cutoff[8];
    XNN_ALIGN(32) float magic_bias[8];
    XNN_ALIGN(32) float log2e[8];
    // Lookup table embedded as uint32_t entries (the lut4 variant uses float).
    XNN_ALIGN(32) uint32_t table[8];
    XNN_ALIGN(32) float minus_ln2[8];
    XNN_ALIGN(32) float c4[8];
    XNN_ALIGN(32) float c3[8];
    XNN_ALIGN(32) float c2[8];
    int32_t mask_table[14];
  } avx2_rr1_lut8_p4;
  struct {
    XNN_ALIGN(32) float prescale[8];
    XNN_ALIGN(32) float alpha[8];
    XNN_ALIGN(32) float beta[8];
    XNN_ALIGN(32) float sat_cutoff[8];
    XNN_ALIGN(32) float magic_bias[8];
    XNN_ALIGN(32) float log2e[8];
    XNN_ALIGN(32) float table[8];
    XNN_ALIGN(32) float minus_ln2[8];
    XNN_ALIGN(32) float c4[8];
    XNN_ALIGN(32) float c3[8];
    XNN_ALIGN(32) float c2[8];
    int32_t mask_table[14];
  } avx2_rr1_lut4_p4;
  struct {
    XNN_ALIGN(32) float prescale[8];
    XNN_ALIGN(32) float alpha[8];
    XNN_ALIGN(32) float beta[8];
    XNN_ALIGN(32) float sat_cutoff[8];
    XNN_ALIGN(32) float magic_bias[8];
    XNN_ALIGN(32) float log2e[8];
    XNN_ALIGN(32) float minus_ln2[8];
    XNN_ALIGN(32) float c6[8];
    XNN_ALIGN(32) float c5[8];
    XNN_ALIGN(32) float c4[8];
    XNN_ALIGN(32) float c3[8];
    XNN_ALIGN(32) float c2[8];
    int32_t mask_table[14];
  } avx2_rr1_p6;
  // AVX512 variants: constants are plain scalars rather than per-lane arrays;
  // only the 16-entry lookup table is a 64-byte aligned array.
  struct {
    float prescale;
    float alpha;
    float beta;
    float sat_cutoff;
    float magic_bias;
    float log2e;
    float minus_ln2;
    float c3;
    float c2;
    XNN_ALIGN(64) uint32_t table[16];
  } avx512_rr1_lut16_p3;
  struct {
    float prescale;
    float alpha;
    float beta;
    float sat_cutoff;
    float magic_bias;
    float log2e;
    float minus_ln2;
    float c6;
    float c5;
    float c4;
    float c3;
    float c2;
  } avx512_rr1_p6;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  // WAsm SIMD variants: every constant is an 8-byte aligned array of two values.
  struct {
    XNN_ALIGN(8) float prescale[2];
    XNN_ALIGN(8) float alpha[2];
    XNN_ALIGN(8) float beta[2];
    XNN_ALIGN(8) float sat_cutoff[2];
    XNN_ALIGN(8) float magic_bias[2];
    XNN_ALIGN(8) float log2e[2];
    XNN_ALIGN(8) uint32_t index_mask[2];
    XNN_ALIGN(8) float minus_ln2_hi[2];
    XNN_ALIGN(8) float minus_ln2_lo[2];
    XNN_ALIGN(8) float c3[2];
    XNN_ALIGN(8) float c2[2];
    XNN_ALIGN(8) float one[2];
  } wasmsimd_rr2_lut16_p3;
  struct {
    XNN_ALIGN(8) float prescale[2];
    XNN_ALIGN(8) float alpha[2];
    XNN_ALIGN(8) float beta[2];
    XNN_ALIGN(8) float sat_cutoff[2];
    XNN_ALIGN(8) float magic_bias[2];
    XNN_ALIGN(8) float log2e[2];
    XNN_ALIGN(8) float minus_ln2_hi[2];
    XNN_ALIGN(8) float minus_ln2_lo[2];
    XNN_ALIGN(8) float c6[2];
    XNN_ALIGN(8) float c5[2];
    XNN_ALIGN(8) float c4[2];
    XNN_ALIGN(8) float c3[2];
    XNN_ALIGN(8) float c2[2];
    XNN_ALIGN(8) float one[2];
  } wasmsimd_rr2_p6;
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
};


// ExpMinus: used by RADDEXPMINUSMAX microkernels.

// Constants for the F16 ExpMinus microkernels. The dummy member keeps the
// union non-empty on targets where neither architecture-specific variant is
// compiled in.
union xnn_f16_expminus_params {
  char _; // Dummy member variable to comply with the C standard
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // Constants are stored in uint16_t fields (presumably raw IEEE half-precision
  // bit patterns -- TODO confirm). ln2 is split into hi/lo parts here.
  struct {
    uint16_t magic_bias;
    uint16_t log2e;
    uint16_t minus_ln2_hi;
    uint16_t minus_ln2_lo;
    uint16_t c2;
    uint16_t c1;
    uint16_t denorm_cutoff;
  } neonfp16arith_rr2_p2;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // Constants are stored as 32-byte aligned arrays of eight single-precision
  // floats, with a single minus_ln2 constant.
  struct {
    XNN_ALIGN(32) float magic_bias[8];
    XNN_ALIGN(32) float log2e[8];
    XNN_ALIGN(32) float minus_ln2[8];
    XNN_ALIGN(32) float c2[8];
    XNN_ALIGN(32) float c1[8];
    XNN_ALIGN(32) float denorm_cutoff[8];
  } avx2_rr1_p2;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
};

// Constants for the F32 ExpMinus microkernels, one member struct per
// implementation variant. "rr2" variants split ln2 into minus_ln2_hi /
// minus_ln2_lo, "rr1" variants carry a single minus_ln2; "p5" variants carry
// coefficients c5..c1, "lut64_p2" variants carry only c2.
// NOTE(review): the suffixes presumably encode the range-reduction scheme,
// lookup-table size and polynomial degree -- confirm against the kernel
// sources.
union xnn_f32_expminus_params {
  // Portable scalar variants (always present).
  struct {
    float log2e;
    float magic_bias;
    float minus_ln2_hi;
    float minus_ln2_lo;
    float c5;
    float c4;
    float c3;
    float c2;
    float c1;
    float denorm_cutoff;
  } scalar_rr2_p5;
  struct {
    float log2e;
    float magic_bias;
    float minus_ln2_hi;
    float minus_ln2_lo;
    float c2;
    float denorm_cutoff;
  } scalar_rr2_lut64_p2;
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // ARM variants store one scalar per constant.
  struct {
    float log2e;
    float magic_bias;
    float minus_ln2_hi;
    float minus_ln2_lo;
    float c5;
    float c4;
    float c3;
    float c2;
    float c1;
    float denorm_cutoff;
  } neon_rr2_p5;
  struct {
    float log2e;
    float magic_bias;
    float minus_ln2_hi;
    float minus_ln2_lo;
    float c2;
    float denorm_cutoff;
  } neon_rr2_lut64_p2;
  struct {
    float log2e;
    float magic_bias;
    float minus_ln2;
    float c5;
    float c4;
    float c3;
    float c2;
    float c1;
    float denorm_cutoff;
  } neonfma_rr1_p5;
  struct {
    float log2e;
    float magic_bias;
    float minus_ln2;
    float c2;
    float denorm_cutoff;
  } neonfma_rr1_lut64_p2;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // SSE2 variant: 16-byte aligned arrays of four floats per constant.
  struct {
    XNN_ALIGN(16) float log2e[4];
    XNN_ALIGN(16) float magic_bias[4];
    XNN_ALIGN(16) float minus_ln2_hi[4];
    XNN_ALIGN(16) float minus_ln2_lo[4];
    XNN_ALIGN(16) float c5[4];
    XNN_ALIGN(16) float c4[4];
    XNN_ALIGN(16) float c3[4];
    XNN_ALIGN(16) float c2[4];
    XNN_ALIGN(16) float c1[4];
    XNN_ALIGN(16) float denorm_cutoff[4];
  } sse2_rr2_p5;
  // AVX2 variant: 32-byte aligned arrays of eight floats, plus a mask_table.
  // NOTE(review): mask_table is presumably used to build partial-vector masks
  // -- confirm against the AVX2 kernels.
  struct {
    XNN_ALIGN(32) float log2e[8];
    XNN_ALIGN(32) float magic_bias[8];
    XNN_ALIGN(32) float minus_ln2[8];
    XNN_ALIGN(32) float c5[8];
    XNN_ALIGN(32) float c4[8];
    XNN_ALIGN(32) float c3[8];
    XNN_ALIGN(32) float c2[8];
    XNN_ALIGN(32) float c1[8];
    XNN_ALIGN(32) float denorm_cutoff[8];
    int32_t mask_table[14];
  } avx2_rr1_p5;
  // AVX512 variant: plain scalars; it has a c0 coefficient and no magic_bias
  // or denorm_cutoff fields.
  struct {
    float log2e;
    float minus_ln2;
    float c5;
    float c4;
    float c3;
    float c2;
    float c1;
    float c0;
  } avx512_rr1_p5;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  // WAsm SIMD variant: 8-byte aligned arrays of two floats per constant.
  struct {
    XNN_ALIGN(8) float log2e[2];
    XNN_ALIGN(8) float magic_bias[2];
    XNN_ALIGN(8) float minus_ln2_hi[2];
    XNN_ALIGN(8) float minus_ln2_lo[2];
    XNN_ALIGN(8) float c5[2];
    XNN_ALIGN(8) float c4[2];
    XNN_ALIGN(8) float c3[2];
    XNN_ALIGN(8) float c2[2];
    XNN_ALIGN(8) float c1[2];
    XNN_ALIGN(8) float denorm_cutoff[2];
  } wasmsimd_rr2_p5;
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
};


// HSwish: used by VHSWISH microkernels.

// Constants for the F16 HSwish microkernels. Both variants are
// architecture-specific, so the dummy member is required to keep the union
// non-empty (an empty union is not valid ISO C) on every other target.
union xnn_f16_hswish_params {
  char _; // Dummy member variable to comply with the C standard
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // Constants are stored in uint16_t fields (presumably raw IEEE half-precision
  // bit patterns -- TODO confirm against the init function).
  struct {
    uint16_t sixth;
    uint16_t three;
    uint16_t six;
    uint16_t pad;  // pad to 8 bytes for neonfp16arith assembly.
  } neon;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // sixth and three are single-precision arrays, while six is kept as eight
  // 16-bit values.
  struct {
    XNN_ALIGN(32) float sixth[8];
    XNN_ALIGN(32) float three[8];
    XNN_ALIGN(16) uint16_t six[8];
  } avx;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
};

// Constants for the F32 HSwish microkernels. The scalar member is always
// present. Note that the variants do not all store the same constants: scalar
// and WAsm SIMD keep sixth/three/six, while the x86 variants keep
// sixth/half/one.
union xnn_f32_hswish_params {
  struct {
    float sixth;
    float three;
    float six;
  } scalar;
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // SSE variant: 16-byte aligned arrays of four floats per constant.
  struct {
    XNN_ALIGN(16) float sixth[4];
    XNN_ALIGN(16) float half[4];
    XNN_ALIGN(16) float one[4];
  } sse;
  // AVX variant: 32-byte aligned arrays of eight floats, plus a mask_table.
  // NOTE(review): mask_table is presumably used to build partial-vector masks
  // -- confirm against the AVX kernels.
  struct {
    XNN_ALIGN(32) float sixth[8];
    XNN_ALIGN(32) float half[8];
    XNN_ALIGN(32) float one[8];
    int32_t mask_table[14];
  } avx;
  // AVX512 variant: plain scalars rather than per-lane arrays.
  struct {
    float sixth;
    float half;
    float one;
  } avx512;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  // WAsm SIMD variant: 8-byte aligned arrays of two floats per constant.
  struct {
    XNN_ALIGN(8) float sixth[2];
    XNN_ALIGN(8) float three[2];
    XNN_ALIGN(8) float six[2];
  } wasmsimd;
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
};


// LReLU (Leaky ReLU): used by VLRELU microkernels.

// Constants for the F16 Leaky ReLU microkernels. Both variants are
// architecture-specific, so the dummy member is required to keep the union
// non-empty (an empty union is not valid ISO C) on every other target.
union xnn_f16_lrelu_params {
  char _; // Dummy member variable to comply with the C standard
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // Slope is stored in a uint16_t field (presumably a raw IEEE half-precision
  // bit pattern -- TODO confirm against the init function).
  struct {
    uint16_t slope;
  } neon;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // Slope is stored as a 32-byte aligned array of eight single-precision floats.
  struct {
    XNN_ALIGN(32) float slope[8];
  } avx;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
};

// Constants for the F32 Leaky ReLU microkernels: a single slope value, stored
// in the form each implementation variant expects. The scalar member is always
// present.
union xnn_f32_lrelu_params {
  struct {
    float slope;
  } scalar;
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // SSE variant: 16-byte aligned array of four floats.
  struct {
    XNN_ALIGN(16) float slope[4];
  } sse;
  // AVX variant: 32-byte aligned array of eight floats, plus a mask_table.
  // NOTE(review): mask_table is presumably used to build partial-vector masks
  // -- confirm against the AVX kernels.
  struct {
    XNN_ALIGN(32) float slope[8];
    int32_t mask_table[14];
  } avx;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  // WAsm SIMD variant: 8-byte aligned array of two floats.
  struct {
    XNN_ALIGN(8) float slope[2];
  } wasmsimd;
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
};

// Constants for the QS8 Leaky ReLU microkernels. Two parameterizations appear
// across the variants: a (positive_multiplier, negative_multiplier) pair, or a
// (multiplier_diff, multiplier_base) pair. The two scalar_* members are always
// present.
union xnn_qs8_lrelu_params {
  // Scalar variant using the positive/negative multiplier pair.
  struct {
    int32_t input_zero_point;
    int32_t positive_multiplier;
    int32_t negative_multiplier;
    int32_t bias;
  } scalar_select;
  // Scalar variant using the diff/base multiplier pair.
  struct {
    int32_t input_zero_point;
    int32_t multiplier_diff;
    int32_t multiplier_base;
    int32_t bias;
  } scalar_andxor;
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // armsimd32 keeps the zero point and multipliers in unsigned 32-bit fields.
  struct {
    uint32_t input_zero_point;
    uint32_t positive_multiplier;
    uint32_t negative_multiplier;
    int32_t bias;
  } armsimd32;
  // NEON variant: 16-bit scalars, with an output zero point instead of a bias.
  struct {
    int16_t input_zero_point;
    int16_t positive_multiplier;
    int16_t negative_multiplier;
    int16_t output_zero_point;
  } neon;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // x86 variants store each constant as an aligned array of 16-bit values.
  // SSE2 uses the diff/base pair; AVX and AVX2 use the positive/negative pair.
  struct {
    XNN_ALIGN(16) int16_t input_zero_point[8];
    XNN_ALIGN(16) int16_t multiplier_diff[8];
    XNN_ALIGN(16) int16_t multiplier_base[8];
    XNN_ALIGN(16) int16_t output_zero_point[8];
  } sse2;
  struct {
    XNN_ALIGN(16) int16_t input_zero_point[8];
    XNN_ALIGN(16) int16_t positive_multiplier[8];
    XNN_ALIGN(16) int16_t negative_multiplier[8];
    XNN_ALIGN(16) int16_t output_zero_point[8];
  } avx;
  struct {
    XNN_ALIGN(32) int16_t input_zero_point[16];
    XNN_ALIGN(32) int16_t positive_multiplier[16];
    XNN_ALIGN(32) int16_t negative_multiplier[16];
    XNN_ALIGN(32) int16_t output_zero_point[16];
  } avx2;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  // WAsm SIMD variants: wasmsimd_arm uses the positive/negative pair,
  // wasmsimd_x86 uses the diff/base pair.
  struct {
    XNN_ALIGN(8) int16_t input_zero_point[4];
    XNN_ALIGN(8) int16_t positive_multiplier[4];
    XNN_ALIGN(8) int16_t negative_multiplier[4];
    XNN_ALIGN(8) int16_t output_zero_point[4];
  } wasmsimd_arm;
  struct {
    XNN_ALIGN(8) int16_t input_zero_point[4];
    XNN_ALIGN(8) int16_t multiplier_diff[4];
    XNN_ALIGN(8) int16_t multiplier_base[4];
    XNN_ALIGN(8) int16_t output_zero_point[4];
  } wasmsimd_x86;
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
};

// Constants for the QU8 Leaky ReLU microkernels. Two parameterizations appear
// across the variants: a (positive_multiplier, negative_multiplier) pair, or a
// (multiplier_diff, multiplier_base) pair. The two scalar_* members are always
// present.
union xnn_qu8_lrelu_params {
  // Scalar variant using the positive/negative multiplier pair.
  struct {
    int32_t input_zero_point;
    int32_t positive_multiplier;
    int32_t negative_multiplier;
    int32_t bias;
  } scalar_select;
  // Scalar variant using the base/diff multiplier pair.
  // NOTE(review): multiplier_base precedes multiplier_diff here, the opposite
  // of the order used by xnn_qs8_lrelu_params.scalar_andxor and by the sse2 /
  // wasmsimd_x86 members below. Harmless if fields are only accessed by name,
  // but confirm the order is intentional.
  struct {
    int32_t input_zero_point;
    int32_t multiplier_base;
    int32_t multiplier_diff;
    int32_t bias;
  } scalar_andxor;
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // armsimd32 keeps the zero point and multipliers in unsigned 32-bit fields.
  struct {
    uint32_t input_zero_point;
    uint32_t positive_multiplier;
    uint32_t negative_multiplier;
    int32_t bias;
  } armsimd32;
  // NEON variant: unsigned 16-bit input zero point, signed 16-bit multipliers
  // and output zero point.
  struct {
    uint16_t input_zero_point;
    int16_t positive_multiplier;
    int16_t negative_multiplier;
    int16_t output_zero_point;
  } neon;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // x86 variants store each constant as an aligned array of 16-bit values.
  // SSE2 uses the diff/base pair; AVX and AVX2 use the positive/negative pair.
  struct {
    XNN_ALIGN(16) int16_t input_zero_point[8];
    XNN_ALIGN(16) int16_t multiplier_diff[8];
    XNN_ALIGN(16) int16_t multiplier_base[8];
    XNN_ALIGN(16) int16_t output_zero_point[8];
  } sse2;
  struct {
    XNN_ALIGN(16) int16_t input_zero_point[8];
    XNN_ALIGN(16) int16_t positive_multiplier[8];
    XNN_ALIGN(16) int16_t negative_multiplier[8];
    XNN_ALIGN(16) int16_t output_zero_point[8];
  } avx;
  struct {
    XNN_ALIGN(32) int16_t input_zero_point[16];
    XNN_ALIGN(32) int16_t positive_multiplier[16];
    XNN_ALIGN(32) int16_t negative_multiplier[16];
    XNN_ALIGN(32) int16_t output_zero_point[16];
  } avx2;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  // WAsm SIMD variants: wasmsimd_arm uses the positive/negative pair,
  // wasmsimd_x86 uses the diff/base pair.
  struct {
    XNN_ALIGN(8) int16_t input_zero_point[4];
    XNN_ALIGN(8) int16_t positive_multiplier[4];
    XNN_ALIGN(8) int16_t negative_multiplier[4];
    XNN_ALIGN(8) int16_t output_zero_point[4];
  } wasmsimd_arm;
  struct {
    XNN_ALIGN(8) int16_t input_zero_point[4];
    XNN_ALIGN(8) int16_t multiplier_diff[4];
    XNN_ALIGN(8) int16_t multiplier_base[4];
    XNN_ALIGN(8) int16_t output_zero_point[4];
  } wasmsimd_x86;
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
};


// Neg: used by VNEG microkernels.

// Constants for the F16 negation microkernels. Only the x86 variant carries
// data; the dummy member keeps the union non-empty elsewhere.
union xnn_f16_neg_params {
  char _; // Dummy member variable to comply with the C standard
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // 16-byte aligned array of eight 16-bit sign masks (presumably the
  // half-precision sign bit in each lane -- TODO confirm).
  struct {
    XNN_ALIGN(16) uint16_t sign_mask[8];
  } sse;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
};

// Constants for the F32 negation microkernels. Every data-carrying variant is
// architecture-specific; the dummy member keeps the union non-empty elsewhere.
union xnn_f32_neg_params {
  char _; // Dummy member variable to comply with the C standard
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // SSE variant: the sign mask is typed as float, 16-byte aligned.
  struct {
    XNN_ALIGN(16) float sign_mask[4];
  } sse;
  // AVX variant: 32-byte aligned sign mask plus a mask_table.
  // NOTE(review): mask_table is presumably used to build partial-vector masks
  // -- confirm against the AVX kernels.
  struct {
    XNN_ALIGN(32) float sign_mask[8];
    int32_t mask_table[14];
  } avx;
  // AVX512 variant: a single sign mask, typed as uint32_t here.
  struct {
    uint32_t sign_mask;
  } avx512;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  // WAsm SIMD variant: 8-byte aligned array of two floats.
  struct {
    XNN_ALIGN(8) float sign_mask[2];
  } wasmsimd;
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
};


// Rnd (Round): used by VRNDNE/VRNDU/VRNDD/VRNDZ microkernels.

// Parameter type for the F16 rounding microkernels. It carries no constants;
// the dummy member exists only so the union is not empty.
union xnn_f16_rnd_params {
  char _; // Dummy member variable to comply with the C standard
};

// Constants for the F32 rounding microkernels. Only x86 variants carry data;
// the dummy member keeps the union non-empty elsewhere.
union xnn_f32_rnd_params {
  char _; // Dummy member variable to comply with the C standard
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // SSE2 variant: 16-byte aligned sign mask and the constant "one".
  struct {
    XNN_ALIGN(16) float sign_mask[4];
    XNN_ALIGN(16) float one[4];
  } sse2;
  // AVX variant: only a mask_table.
  // NOTE(review): presumably used to build partial-vector masks -- confirm
  // against the AVX kernels.
  struct {
    int32_t mask_table[14];
  } avx;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
};


// Sigmoid: used by VSIGMOID microkernels.

// Constants for the F16 Sigmoid microkernels. The dummy member keeps the union
// non-empty on targets where neither architecture-specific variant is
// compiled in.
union xnn_f16_sigmoid_params {
  char _; // Dummy member variable to comply with the C standard
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // Constants are stored in uint16_t fields (presumably raw IEEE half-precision
  // bit patterns -- TODO confirm). This variant stores minus_log2e and a
  // positive ln2 split into hi/lo parts.
  struct {
    uint16_t magic_bias;
    uint16_t minus_log2e;
    uint16_t ln2_hi;
    uint16_t ln2_lo;
    uint16_t c2;
    uint16_t c1;
    uint16_t denorm_cutoff;
  } neonfp16arith_rr2_p2;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // Constants are stored as 32-byte aligned arrays of eight single-precision
  // floats. This variant stores sign_mask, log2e and a single minus_ln2.
  struct {
    XNN_ALIGN(32) float sign_mask[8];
    XNN_ALIGN(32) float magic_bias[8];
    XNN_ALIGN(32) float log2e[8];
    XNN_ALIGN(32) float minus_ln2[8];
    XNN_ALIGN(32) float c2[8];
    XNN_ALIGN(32) float c1[8];
    XNN_ALIGN(32) float one[8];
    XNN_ALIGN(32) float denorm_cutoff[8];
  } avx2_rr1_p2;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
};

// Constants for the F32 Sigmoid microkernels, one member struct per
// implementation variant. Two conventions are visible in the fields:
//   - scalar, ARM and WAsm SIMD variants store minus_log2e together with a
//     positive ln2 (split into ln2_hi / ln2_lo for "rr2", single ln2 for
//     "rr1");
//   - x86 variants store sign_mask and log2e together with a negated ln2
//     (minus_ln2_hi / minus_ln2_lo for "rr2", single minus_ln2 for "rr1").
// NOTE(review): the suffixes presumably encode the range-reduction scheme,
// lookup-table size and polynomial degree -- confirm against the kernel
// sources.
union xnn_f32_sigmoid_params {
  // Portable scalar variants (always present).
  struct {
    float magic_bias;
    float minus_log2e;
    float ln2_hi;
    float ln2_lo;
    float c1;
    float one;
    float denorm_cutoff;
  } scalar_rr2_lut2048_p1;
  struct {
    float magic_bias;
    float minus_log2e;
    float ln2_hi;
    float ln2_lo;
    float c2;
    float one;
    float denorm_cutoff;
  } scalar_rr2_lut64_p2;
  struct {
    float magic_bias;
    float minus_log2e;
    float ln2_hi;
    float ln2_lo;
    float c5;
    float c4;
    float c3;
    float c2;
    float c1;
    float one;
    float denorm_cutoff;
  } scalar_rr2_p5;
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // ARM variants store one scalar per constant and have no "one" field.
  struct {
    float magic_bias;
    float minus_log2e;
    float ln2_hi;
    float ln2_lo;
    float c1;
    float denorm_cutoff;
  } neon_rr2_lut2048_p1;
  struct {
    float magic_bias;
    float minus_log2e;
    float ln2_hi;
    float ln2_lo;
    float c2;
    float denorm_cutoff;
  } neon_rr2_lut64_p2;
  struct {
    float magic_bias;
    float minus_log2e;
    float ln2_hi;
    float ln2_lo;
    float c5;
    float c4;
    float c3;
    float c2;
    float c1;
    float denorm_cutoff;
  } neon_rr2_p5;
  struct {
    float magic_bias;
    float minus_log2e;
    float ln2;
    float c1;
    float denorm_cutoff;
  } neonfma_rr1_lut2048_p1;
  struct {
    float magic_bias;
    float minus_log2e;
    float ln2;
    float c2;
    float denorm_cutoff;
  } neonfma_rr1_lut64_p2;
  struct {
    float magic_bias;
    float minus_log2e;
    float ln2;
    float c5;
    float c4;
    float c3;
    float c2;
    float c1;
    float denorm_cutoff;
  } neonfma_rr1_p5;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // SSE2 variants: 16-byte aligned arrays of four values per constant.
  struct {
    XNN_ALIGN(16) float sign_mask[4];
    XNN_ALIGN(16) float magic_bias[4];
    XNN_ALIGN(16) float log2e[4];
    XNN_ALIGN(16) uint32_t index_mask[4];
    XNN_ALIGN(16) float minus_ln2_hi[4];
    XNN_ALIGN(16) float minus_ln2_lo[4];
    XNN_ALIGN(16) float c2[4];
    XNN_ALIGN(16) float one[4];
    XNN_ALIGN(16) float denorm_cutoff[4];
  } sse2_rr2_lut64_p2;
  struct {
    XNN_ALIGN(16) float sign_mask[4];
    XNN_ALIGN(16) float magic_bias[4];
    XNN_ALIGN(16) float log2e[4];
    XNN_ALIGN(16) float minus_ln2_hi[4];
    XNN_ALIGN(16) float minus_ln2_lo[4];
    XNN_ALIGN(16) float c5[4];
    XNN_ALIGN(16) float c4[4];
    XNN_ALIGN(16) float c3[4];
    XNN_ALIGN(16) float c2[4];
    XNN_ALIGN(16) float c1[4];
    XNN_ALIGN(16) float one[4];
    XNN_ALIGN(16) float denorm_cutoff[4];
  } sse2_rr2_p5;
  // AVX / AVX2 variants: 32-byte aligned arrays of eight values per constant,
  // followed by an unaligned 14-entry mask_table. The AVX variant additionally
  // carries the constant "two".
  // NOTE(review): mask_table is presumably used to build partial-vector masks
  // -- confirm against the AVX kernels.
  struct {
    XNN_ALIGN(32) float sign_mask[8];
    XNN_ALIGN(32) float magic_bias[8];
    XNN_ALIGN(32) float log2e[8];
    XNN_ALIGN(32) float minus_ln2_hi[8];
    XNN_ALIGN(32) float minus_ln2_lo[8];
    XNN_ALIGN(32) float c5[8];
    XNN_ALIGN(32) float c4[8];
    XNN_ALIGN(32) float c3[8];
    XNN_ALIGN(32) float c2[8];
    XNN_ALIGN(32) float c1[8];
    XNN_ALIGN(32) float one[8];
    XNN_ALIGN(32) float two[8];
    XNN_ALIGN(32) float denorm_cutoff[8];
    int32_t mask_table[14];
  } avx_rr2_p5;
  struct {
    XNN_ALIGN(32) float sign_mask[8];
    XNN_ALIGN(32) float magic_bias[8];
    XNN_ALIGN(32) float log2e[8];
    XNN_ALIGN(32) float minus_ln2[8];
    XNN_ALIGN(32) float c5[8];
    XNN_ALIGN(32) float c4[8];
    XNN_ALIGN(32) float c3[8];
    XNN_ALIGN(32) float c2[8];
    XNN_ALIGN(32) float c1[8];
    XNN_ALIGN(32) float one[8];
    XNN_ALIGN(32) float denorm_cutoff[8];
    int32_t mask_table[14];
  } avx2_rr1_p5;
  // AVX512 variants: constants are plain scalars (sign_mask is a uint32_t);
  // lookup tables are 64-byte aligned arrays of 16 floats. None of them has a
  // denorm_cutoff field.
  struct {
    uint32_t sign_mask;
    float magic_bias;
    float log2e;
    float minus_ln2;
    float c3;
    float c2;
    float one;
    XNN_ALIGN(64) float table[16];
  } avx512_rr1_lut16_p3;
  struct {
    uint32_t sign_mask;
    float magic_bias;
    float log2e;
    float minus_ln2_hi;
    float minus_ln2_lo;
    float c2;
    float c1;
    float one;
    // The 32-entry table is split into two 16-entry halves.
    XNN_ALIGN(64) float table_lo[16];
    XNN_ALIGN(64) float table_hi[16];
  } avx512_rr2_lut32_p2;
  // This variant has no magic_bias field.
  struct {
    uint32_t sign_mask;
    float log2e;
    float minus_ln2;
    float c5;
    float c4;
    float c3;
    float c2;
    float c1;
    float one;
  } avx512_rr1_p5;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  // WAsm SIMD variants: 8-byte aligned arrays of two values per constant.
  struct {
    XNN_ALIGN(8) float magic_bias[2];
    XNN_ALIGN(8) float minus_log2e[2];
    XNN_ALIGN(8) uint32_t index_mask[2];
    XNN_ALIGN(8) float ln2_hi[2];
    XNN_ALIGN(8) float ln2_lo[2];
    XNN_ALIGN(8) float c2[2];
    XNN_ALIGN(8) float one[2];
    XNN_ALIGN(8) float denorm_cutoff[2];
  } wasmsimd_rr2_lut64_p2;
  struct {
    XNN_ALIGN(8) float magic_bias[2];
    XNN_ALIGN(8) float minus_log2e[2];
    XNN_ALIGN(8) float ln2_hi[2];
    XNN_ALIGN(8) float ln2_lo[2];
    XNN_ALIGN(8) float c5[2];
    XNN_ALIGN(8) float c4[2];
    XNN_ALIGN(8) float c3[2];
    XNN_ALIGN(8) float c2[2];
    XNN_ALIGN(8) float c1[2];
    XNN_ALIGN(8) float one[2];
    XNN_ALIGN(8) float denorm_cutoff[2];
  } wasmsimd_rr2_p5;
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
};


// Sqrt (Square Root): used by VSQRT microkernels.

// Parameter block for F16 square-root microkernels.
// It carries no data: the single placeholder member only exists so that the
// union is not empty.
union xnn_f16_sqrt_params {
  char _; // Dummy member variable to comply with the C standard
};

// Parameter block for F32 square-root microkernels.
// All members overlay the same storage. Which variant is valid is decided by
// the code that fills the union in, which is not visible in this header.
// Apart from the placeholder, the variants are only compiled for x86 / x86-64.
union xnn_f32_sqrt_params {
  char _; // Dummy member variable to comply with the C standard
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // "avx" variant: holds only a mask table.
  struct {
    // 14 signed 32-bit entries.
    // NOTE(review): presumably a window of per-lane masks used to handle a
    // partial final vector - confirm against the code that initializes it.
    int32_t mask_table[14];
  } avx;
  // "fma" variant: an 8-float constant declared with XNN_ALIGN(32), followed
  // by the same kind of mask table as the "avx" variant.
  struct {
    // NOTE(review): presumably 0.5f replicated across all 8 lanes - confirm.
    XNN_ALIGN(32) float half[8];
    int32_t mask_table[14];
  } fma;
  // "avx512" variant: a single scalar float and no mask table.
  struct {
    // NOTE(review): presumably 0.5f, broadcast by the kernel - confirm.
    float half;
  } avx512;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
};


// SqrtShift (Square Root + Shift): used by VSQRTSHIFT microkernels.

// Parameter block for the square-root-plus-shift microkernels.
// Only a scalar variant exists, so the union has a single member.
// NOTE(review): the "u64_u32" in the name suggests 64-bit unsigned inputs and
// 32-bit unsigned outputs - confirm against the kernel prototypes.
union xnn_u64_u32_sqrtshift_params {
  struct {
    // Shift amount consumed by the kernel. The direction of the shift and the
    // point at which it is applied are defined by the kernels, not here.
    uint32_t shift;
  } scalar;
};

// CHW: used by CONV/DWCONV microkernels in CHW layout with Min+Max parameters.

// Parameter block for F16 microkernels operating in CHW layout.
// The only real variant is compiled for ARM / ARM64; on other targets the
// union contains just the placeholder member.
union xnn_f16_chw_params {
  char _; // Dummy member variable to comply with the C standard
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  struct {
    // Output clamping bounds, stored as 16-bit integers.
    // NOTE(review): presumably the raw bit patterns of half-precision values -
    // confirm against the code that initializes them.
    uint16_t min;
    uint16_t max;
    // Lane masks. Member order and the XNN_ALIGN values are part of the layout
    // the kernels rely on, so they must not be rearranged.
    XNN_ALIGN(8) uint16_t mask_even[4]; // used by stride 2 kernels
    XNN_ALIGN(8) uint16_t mask_odd[4];  // used by stride 2 kernels
    XNN_ALIGN(8) uint16_t mask[4]; // used by stride 1 kernels
    XNN_ALIGN(16) uint16_t maskx8[8]; // used by stride 1 x8 kernels
  } neonfp16arith;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
};

// Parameter block for F32 microkernels operating in CHW layout.
// All variants overlay the same storage and describe the same kind of data
// (clamping bounds plus lane masks), but they are NOT layout-compatible with
// each other: member order, mask signedness and the shape of min/max differ.
union xnn_f32_chw_params {
  // Always-available variant: masks come first and are signed; min/max are
  // single floats placed after the masks.
  struct {
    XNN_ALIGN(16) int32_t mask_even[4]; // used by stride 2 kernels
    XNN_ALIGN(16) int32_t mask_odd[4];  // used by stride 2 kernels
    XNN_ALIGN(16) int32_t mask[4]; // used by stride 1 kernels
    float min;
    float max;
  } scalar;
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // ARM / ARM64 variant: scalar min/max come first, followed by unsigned masks.
  struct {
    float min;
    float max;
    XNN_ALIGN(16) uint32_t mask_even[4]; // used by stride 2 kernels
    XNN_ALIGN(16) uint32_t mask_odd[4];  // used by stride 2 kernels
    XNN_ALIGN(16) uint32_t mask[4]; // used by stride 1 kernels
  } neon;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // x86 / x86-64 variant: min/max are 4-float arrays, followed by unsigned
  // masks.
  // NOTE(review): presumably min/max hold one value replicated in every lane -
  // confirm against the code that initializes them.
  struct {
    XNN_ALIGN(16) float min[4];
    XNN_ALIGN(16) float max[4];
    XNN_ALIGN(16) uint32_t mask_even[4]; // used by stride 2 kernels
    XNN_ALIGN(16) uint32_t mask_odd[4];  // used by stride 2 kernels
    XNN_ALIGN(16) uint32_t mask[4]; // used by stride 1 kernels
  } sse;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
};


// GAvgPool (Global Average Pool): used by GAVGPOOL microkernels in CHW layout with Scale+Min+Max parameters.

// Parameter block for F16 global-average-pooling microkernels.
// The only real variant is compiled for ARM / ARM64; on other targets the
// union contains just the placeholder member.
union xnn_f16_gavgpool_params {
  char _; // Dummy member variable to comply with the C standard
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  struct {
    // 8-entry lane mask, declared with XNN_ALIGN(16).
    XNN_ALIGN(16) uint16_t mask[8];
    // Scale factor and output clamping bounds, stored as 16-bit integers.
    // NOTE(review): presumably the raw bit patterns of half-precision values -
    // confirm against the code that initializes them.
    uint16_t multiplier;
    uint16_t output_min;
    uint16_t output_max;
  } neonfp16arith;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
};

// Parameter block for F32 global-average-pooling microkernels.
// All variants overlay the same storage and carry a lane mask, a multiplier
// and output clamping bounds, but their layouts differ (see each variant).
union xnn_f32_gavgpool_params {
  // Always-available variant: signed mask first, then scalar floats.
  struct {
    XNN_ALIGN(16) int32_t mask[4];
    float multiplier;
    float output_min;
    float output_max;
  } scalar;
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // x86 / x86-64 variant: multiplier and bounds are 4-float arrays and the
  // unsigned mask comes last.
  // NOTE(review): presumably each float array holds one value replicated in
  // every lane - confirm against the code that initializes them.
  struct {
    XNN_ALIGN(16) float multiplier[4];
    XNN_ALIGN(16) float output_min[4];
    XNN_ALIGN(16) float output_max[4];
    XNN_ALIGN(16) uint32_t mask[4];
  } sse;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // ARM / ARM64 variant: same member order as the scalar variant, but the mask
  // is unsigned.
  struct {
    XNN_ALIGN(16) uint32_t mask[4];
    float multiplier;
    float output_min;
    float output_max;
  } neon;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
};

// Forward declaration for use in microkernel headers that declare JIT
// generator functions. The struct stays an incomplete type here, so this
// header can only be used to form pointers to it.
struct xnn_code_buffer;

// Plain-int alias used for status values.
// NOTE(review): presumably stands in for the library's status enum so that
// this header does not need to include its definition - confirm.
typedef int xnn_status_t;