// Implementation of the parallel prefix sum algorithm

layout(local_size_x = 256) in;
const int SIZE = 256;

layout(set=0, binding=0) readonly buffer inputs {
    float[] in_data;
};
layout(set=0, binding=1) writeonly buffer outputs {
    float[] out_data;
};

workgroup float[SIZE * 2] shared_data;

// Test that workgroup-shared variables are passed to user-defined functions
// correctly.
noinline void store(uint i, float value) {
    shared_data[i] = value;
}

void main() {
    uint id = sk_GlobalInvocationID.x;
    uint rd_id;
    uint wr_id;
    uint mask;

    // Each thread is responsible for two elements of the output array
    shared_data[id * 2] = in_data[id * 2];
    shared_data[id * 2 + 1] = in_data[id * 2 + 1];

    workgroupBarrier();

    const uint steps = uint(log2(float(SIZE))) + 1;
    for (uint step = 0; step < steps; step++) {
        // Calculate the read and write index in the shared array
        mask = (1 << step) - 1;
        rd_id = ((id >> step) << (step + 1)) + mask;
        wr_id = rd_id + 1 + (id & mask);

        // Accumulate the read data into our element
        store(wr_id, shared_data[wr_id] + shared_data[rd_id]);

        workgroupBarrier();
    }

    // Write the final result out
    out_data[id * 2] = shared_data[id * 2];
    out_data[id * 2 + 1] = shared_data[id * 2 + 1];
}