#version 450 core
// clang-format off
#define PRECISION ${PRECISION}
#define FORMAT ${FORMAT}

#define OP(X, Y, A) ${OPERATOR}
// clang-format on

#include "texel_access.h"

layout(std430) buffer;

// clang-format off
$if not INPLACE:
  layout(set = 0, binding = 0, FORMAT) uniform PRECISION restrict writeonly image3D uOutput;
  layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput;
  layout(set = 0, binding = 2) uniform PRECISION sampler3D uOther;
  layout(set = 0, binding = 3) uniform PRECISION restrict Block {
    ivec4 output_sizes;
    ivec4 input_sizes;
    ivec4 other_sizes;
    float alpha;
  }
  uArgs;
$else:
  layout(set = 0, binding = 0, FORMAT) uniform PRECISION restrict image3D uOutput;
  layout(set = 0, binding = 1) uniform PRECISION sampler3D uOther;
  layout(set = 0, binding = 2) uniform PRECISION restrict Block {
    ivec4 output_sizes;
    ivec4 other_sizes;
    float alpha;
  }
  uArgs;
// clang-format on

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

void main() {
  const ivec3 pos = ivec3(gl_GlobalInvocationID);

  ivec3 output_extents;
  output_extents.xy = uArgs.output_sizes.xy;
  output_extents.z =
      uArgs.output_sizes.w * int(ceil(uArgs.output_sizes.z / 4.0));

  if (any(greaterThanEqual(pos, output_extents.xyz))) {
    return;
  }

  ivec3 other_pos =
      map_output_pos_to_input_pos(pos, uArgs.output_sizes, uArgs.other_sizes);
  vec4 other_texel =
      load_texel(other_pos, uArgs.output_sizes, uArgs.other_sizes, uOther);

  // Zero padding is added to the channels dimension when tensors are stored as
  // image textures. This will cause a divide-by-zero when performing division.
  // For division, apply an extra step of detecting which elements are zero
  // padding to avoid division by zero.
  // clang-format off
  $if IS_DIV:
    const int c_index = (pos.z % ((uArgs.output_sizes.z + 3) / 4)) * 4;
    if (uArgs.other_sizes.z != 1 && c_index + 3 >= uArgs.output_sizes.z) {
      ivec4 c_ind = ivec4(c_index) + ivec4(0, 1, 2, 3);
      vec4 mask = vec4(lessThan(c_ind, ivec4(uArgs.output_sizes.z)));
      other_texel = other_texel * mask + vec4(1, 1, 1, 1) - mask;
    }

  $if not INPLACE:
    ivec3 input_pos =
        map_output_pos_to_input_pos(pos, uArgs.output_sizes, uArgs.input_sizes);
    const vec4 in_texel =
        load_texel(input_pos, uArgs.output_sizes, uArgs.input_sizes, uInput);

    imageStore(uOutput, pos, OP(in_texel, other_texel, uArgs.alpha));
  $else:
    const vec4 in_texel = imageLoad(uOutput, pos);
    imageStore(uOutput, pos, OP(in_texel, other_texel, uArgs.alpha));
  // clang-format on
}