// Parallel prefix sum (scan) computed in workgroup-shared memory.
//
// Each invocation loads two input elements into `shared_data`, the workgroup
// then combines partial sums over log2(2 * SIZE) levels, and finally each
// invocation writes its two elements of the inclusive running sum back out.

layout(local_size_x = 256) in;

// Number of invocations per workgroup; the scan covers 2 * SIZE elements.
const int SIZE = 256;

layout(set=0, binding=0) readonly buffer inputs {
    float[] in_data;
};
layout(set=0, binding=1) writeonly buffer outputs {
    float[] out_data;
};

// Scratch storage shared by every invocation in the workgroup.
workgroup float[SIZE * 2] shared_data;

// Writes `value` into slot `i` of the shared scratch array.
//
// Deliberately kept as a separate, non-inlined function: this exercises
// access to workgroup-shared variables from inside user-defined functions.
noinline void store(uint i, float value) {
    shared_data[i] = value;
}

void main() {
    // NOTE(review): the shared array is indexed with the *global* invocation
    // id, so this presumably expects a single-workgroup dispatch -- confirm
    // against the caller.
    uint tid = sk_GlobalInvocationID.x;

    // Every invocation owns two adjacent elements: 2 * tid and 2 * tid + 1.
    uint first = tid * 2;
    shared_data[first]     = in_data[first];
    shared_data[first + 1] = in_data[first + 1];

    // All loads must be visible before any invocation starts combining.
    workgroupBarrier();

    // log2(SIZE) + 1 == log2(2 * SIZE): one level per bit of the element index.
    const uint levels = uint(log2(float(SIZE))) + 1;
    for (uint level = 0; level < levels; level++) {
        // At this level the array is viewed as blocks of 2^(level + 1)
        // elements. `src` is the last element of the left half of this
        // invocation's block; `dst` is this invocation's element in the
        // right half.
        uint low_bits = (1 << level) - 1;
        uint src = ((tid >> level) << (level + 1)) + low_bits;
        uint dst = src + 1 + (tid & low_bits);

        // Fold the left-half total into our right-half element. Source slots
        // have bit `level` clear and destination slots have it set, so no
        // invocation reads a slot that another one writes during this level.
        float sum = shared_data[dst] + shared_data[src];
        store(dst, sum);

        // Finish this level everywhere before the next one reads its results.
        workgroupBarrier();
    }

    // Publish this invocation's two elements of the finished scan.
    out_data[first]     = shared_data[first];
    out_data[first + 1] = shared_data[first + 1];
}