#version 450 core

#define PRECISION ${PRECISION}
#define FORMAT ${FORMAT}

#include "indexing.h"

/*
 * Make std430 the default memory layout for every `buffer` block declared
 * below, so the individual declarations do not have to repeat it.
 */
layout(std430) buffer;

/*
 * Output Buffer
 *
 * Write-only storage buffer (set 0, binding 0) holding the destination
 * elements as a flat, runtime-sized array of floats. main() stores exactly one
 * element per invocation into data[], indexed by the invocation's x ID.
 *
 * PRECISION is substituted by the shader template (see the #define at the top
 * of the file); presumably it expands to a precision qualifier - confirm
 * against the code generator.
 */
layout(set = 0, binding = 0) buffer PRECISION restrict writeonly OutBuffer {
  float data[];
}
uOutput;

/*
 * Output Buffer Metadata
 *
 * Uniform block (set 0, binding 1) describing the tensor stored in uOutput.
 * The member order defines the block's memory layout, so it has to match
 * whatever the host side uploads.
 */
layout(set = 0, binding = 1) uniform PRECISION restrict OutMeta {
  // Passed to idx_to_coord() together with `strides` to turn a flat buffer
  // index into a 4D coordinate.
  uvec4 sizes;
  // Passed to idx_to_coord(); presumably the per-dimension element strides of
  // the output tensor - confirm against indexing.h.
  uvec4 strides;
  // Not read by main() in this shader.
  uint ndim;
  // Upper bound on valid write indices: invocations whose index is
  // >= buf_length return without writing.
  uint buf_length;
}
uOutMeta;

/*
 * Input Buffer
 *
 * Read-only storage buffer (set 0, binding 2) holding the source elements as a
 * flat, runtime-sized array of floats. main() reads at most one element per
 * invocation, at the index returned by coord_to_idx().
 */
layout(set = 0, binding = 2) buffer PRECISION restrict readonly InBuffer {
  float data[];
}
uInput;

/*
 * Input Buffer Metadata
 *
 * Uniform block (set 0, binding 3) describing the tensor stored in uInput.
 * It declares the same members, in the same order, as OutMeta.
 */
layout(set = 0, binding = 3) uniform PRECISION restrict InMeta {
  // Exclusive upper bound, per component, on coordinates that may be read from
  // uInput; main() compares the write coordinate against it.
  uvec4 sizes;
  // Passed to coord_to_idx() to turn a 4D coordinate into a flat index into
  // uInput.data.
  uvec4 strides;
  // Not read by main() in this shader.
  uint ndim;
  // Not read by main() in this shader.
  uint buf_length;
}
uInMeta;

/*
 * Local Work Group Size
 *
 * The x/y/z local sizes are not hard-coded; they are taken from specialization
 * constants 0, 1 and 2 respectively, which are set when the pipeline is
 * created.
 */
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

/*
 * Copies data from the tensor at uInput to the tensor at uOutput based on 4D
 * coordinate. Each element at (x,y,c,n) in uInput will be copied to uOutput at
 * (x,y,c,n). If (x,y,c,n) is outside the bounds of uInput then 0 will be
 * written.
 *
 * Each shader invocation is responsible for one element of the output buffer:
 * the x component of the global invocation ID is used directly as the flat
 * index into uOutput.data. The y and z components are ignored.
 */
void main() {
  // gl_GlobalInvocationID is already unsigned; read it directly instead of
  // round-tripping through a signed vector.
  const uint write_idx = gl_GlobalInvocationID.x;

  // The dispatch may cover more invocations than there are output elements;
  // the surplus ones must not touch the buffer.
  if (write_idx >= uOutMeta.buf_length) {
    return;
  }

  // Flat output index -> 4D coordinate in the output tensor.
  uvec4 write_coord =
      idx_to_coord(write_idx, uOutMeta.strides, uOutMeta.sizes);

  // Default for coordinates that have no counterpart in the input tensor.
  float outval = 0.0;
  // Only read the input when every coordinate component is inside the input's
  // sizes; otherwise the default above is written.
  if (all(lessThan(write_coord, uInMeta.sizes))) {
    const uint read_idx = coord_to_idx(write_coord, uInMeta.strides);
    outval = uInput.data[read_idx];
  }

  uOutput.data[write_idx] = outval;
}