/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#version 450 core

#define PRECISION ${PRECISION}

layout(std430) buffer;

$if MEMTYPE == "ubo":
  ${layout_declare_ubo(0, "vec4", "A")}
$elif MEMTYPE == "buffer":
  ${layout_declare_buffer(0, "r", "A", DTYPE, "PRECISION", False)}
$else:
  ${layout_declare_buffer(0, "r", "_", DTYPE, "PRECISION", False)}

${layout_declare_buffer(1, "w", "B", DTYPE, "PRECISION", False)}

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

layout(constant_id = 3) const int niter = 1;
layout(constant_id = 4) const int nvec = 1;
layout(constant_id = 5) const int local_group_size = 1;
// The address mask acts as a modulo because x % 2^n == x & (2^n - 1) for
// non-negative x. It lets us confine accesses to a specific set of unique
// addresses, sized according to the access range we want to measure.
layout(constant_id = 6) const int addr_mask = 1;
layout(constant_id = 7) const int workgroup_width = 1;

$if MEMTYPE == "shared":
  shared vec4 A[nvec];

void main() {
  $if MEMTYPE == "shared":
    // Populate shared memory before the timed reads begin.
    A[gl_LocalInvocationID[0]][0] = gl_LocalInvocationID[0];
    memoryBarrierShared();

  vec4 sum = vec4(0);
  uint offset = (gl_WorkGroupID[0] * workgroup_width + gl_LocalInvocationID[0]) & addr_mask;

  int i = 0;
  for (; i < niter; ++i) {
    $for j in range(int(NUNROLL)):
      sum *= A[offset];
      // On each unroll, a new unique address is accessed through the offset,
      // which the address mask wraps back into the fixed set of unique
      // addresses.
      offset = (offset + local_group_size) & addr_mask;
  }

  // Make the output depend on the loop counter so the compiler cannot
  // optimize the loop away; i >> 31 is always 0 here because niter > 0.
  vec4 zero = vec4(i >> 31);

  B[gl_LocalInvocationID[0]] = sum + zero;
}
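The masking trick relies on the identity x % 2^n == x & (2^n - 1) for non-negative x. Below is a minimal standalone C sketch (the values chosen for addr_mask and local_group_size are illustrative, not taken from any real benchmark configuration) that walks the same offset sequence the unrolled loop produces and checks the identity:

#include <assert.h>
#include <stdio.h>

int main(void) {
    /* Illustrative values: 16 unique addresses (mask = 2^4 - 1), stride 4. */
    const unsigned addr_mask = 16 - 1;
    const unsigned local_group_size = 4;

    unsigned offset = 0;
    for (int j = 0; j < 8; ++j) {
        /* x & (2^n - 1) == x % 2^n for non-negative x, so the AND acts as
           a cheap modulo that wraps the offset into 16 unique slots. */
        assert(((offset + local_group_size) & addr_mask) ==
               ((offset + local_group_size) % 16));
        offset = (offset + local_group_size) & addr_mask;
        printf("offset = %u\n", offset); /* prints 4, 8, 12, 0, 4, 8, 12, 0 */
    }
    return 0;
}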
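On the host side, the constant_id values above would typically be supplied through Vulkan specialization constants at pipeline-creation time. The sketch below (plain C against the standard Vulkan headers; the numeric values are hypothetical, and the local-size entries for constant IDs 0-2 are omitted for brevity) shows one way that wiring could look:

#include <vulkan/vulkan.h>

/* Hypothetical benchmark parameters; a real harness would derive these from
   the access size under test. */
static const int spec_data[] = {
    256, /* constant_id 3: niter */
    64,  /* constant_id 4: nvec */
    64,  /* constant_id 5: local_group_size */
    63,  /* constant_id 6: addr_mask (2^6 - 1) */
    64,  /* constant_id 7: workgroup_width */
};

static const VkSpecializationMapEntry spec_entries[] = {
    {3, 0 * sizeof(int), sizeof(int)},
    {4, 1 * sizeof(int), sizeof(int)},
    {5, 2 * sizeof(int), sizeof(int)},
    {6, 3 * sizeof(int), sizeof(int)},
    {7, 4 * sizeof(int), sizeof(int)},
};

static const VkSpecializationInfo spec_info = {
    .mapEntryCount = 5,
    .pMapEntries   = spec_entries,
    .dataSize      = sizeof(spec_data),
    .pData         = spec_data,
};

/* spec_info is then passed as
   VkPipelineShaderStageCreateInfo::pSpecializationInfo when creating the
   compute pipeline that runs this shader. */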