// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#version 460

//
// Produce multiple radix size histograms from the keyvals.
//

// clang-format off
#extension GL_GOOGLE_include_directive    : require
#extension GL_EXT_control_flow_attributes : require
#extension GL_KHR_shader_subgroup_basic   : require
// clang-format on

//
//
//

#include "config.h"

//
// Optional switches:
//
//   #define RS_HISTOGRAM_ENABLE_BITFIELD_EXTRACT
//   #define RS_HISTOGRAM_DISABLE_SMEM_HISTOGRAM
//

//
// Buffer reference macros and push constants
//

#include "bufref.h"
#include "push.h"

//
// Push constants for histogram shader
//

RS_STRUCT_PUSH_HISTOGRAM();

layout(push_constant) uniform block_push
{
  rs_push_histogram push;
};

//
// Subgroup uniform support
//
// FIX(review): the extension behavior token must be one of `require`,
// `enable`, `warn` or `disable` -- the original said `required`, which is
// not a valid GLSL behavior and fails to compile whenever this branch of
// the #if is taken.
//
#if defined(RS_HISTOGRAM_SUBGROUP_UNIFORM_DISABLE) && defined(GL_EXT_subgroupuniform_qualifier)
#extension GL_EXT_subgroupuniform_qualifier : require
#define RS_SUBGROUP_UNIFORM subgroupuniformEXT
#else
#define RS_SUBGROUP_UNIFORM
#endif

//
// Check all switches are defined
//

// What's the size of the keyval?
#ifndef RS_KEYVAL_DWORDS
#error "Undefined: RS_KEYVAL_DWORDS"
#endif

// How many rows of keyvals does each workgroup process?
#ifndef RS_HISTOGRAM_BLOCK_ROWS
#error "Undefined: RS_HISTOGRAM_BLOCK_ROWS"
#endif

// Workgroup size (log2)?
#ifndef RS_HISTOGRAM_WORKGROUP_SIZE_LOG2
#error "Undefined: RS_HISTOGRAM_WORKGROUP_SIZE_LOG2"
#endif

// Subgroup size (log2)?
#ifndef RS_HISTOGRAM_SUBGROUP_SIZE_LOG2
#error "Undefined: RS_HISTOGRAM_SUBGROUP_SIZE_LOG2"
#endif

//
// Local macros
//

// clang-format off
#define RS_WORKGROUP_SIZE      (1 << RS_HISTOGRAM_WORKGROUP_SIZE_LOG2)
#define RS_SUBGROUP_SIZE       (1 << RS_HISTOGRAM_SUBGROUP_SIZE_LOG2)
#define RS_WORKGROUP_SUBGROUPS (RS_WORKGROUP_SIZE / RS_SUBGROUP_SIZE)
#define RS_BLOCK_KEYVALS       (RS_HISTOGRAM_BLOCK_ROWS * RS_WORKGROUP_SIZE)
#define RS_KEYVAL_SIZE         (RS_KEYVAL_DWORDS * 4)   // keyval size in bytes
#define RS_RADIX_MASK          ((1 << RS_RADIX_LOG2) - 1)
// clang-format on

//
// Keyval type
//
#if (RS_KEYVAL_DWORDS == 1)
#define RS_KEYVAL_TYPE uint32_t
#elif (RS_KEYVAL_DWORDS == 2)
#define RS_KEYVAL_TYPE u32vec2
#else
#error "Unsupported RS_KEYVAL_DWORDS"
#endif

//
// Histogram offset depends on number of workgroups.
//
// Byte offset of the histogram for radix pass `pass_`: each pass owns
// RS_RADIX_SIZE uint32 bins.
//
// FIX(review): parenthesize the macro parameter so a compound argument
// expression (e.g. `a + b`) cannot misbind against the multiplication.
//
#define RS_HISTOGRAM_BASE(pass_) ((RS_RADIX_SIZE * 4) * (pass_))

#if (RS_WORKGROUP_SUBGROUPS == 1)
#define RS_HISTOGRAM_OFFSET(pass_) (RS_HISTOGRAM_BASE(pass_) + gl_SubgroupInvocationID * 4)
#else
#define RS_HISTOGRAM_OFFSET(pass_) (RS_HISTOGRAM_BASE(pass_) + gl_LocalInvocationID.x * 4)
#endif

//
// Assumes (RS_RADIX_LOG2 == 8)
//
// Error if this ever changes
//
#if (RS_RADIX_LOG2 != 8)
#error "(RS_RADIX_LOG2 != 8)"
#endif

//
// Is bitfield extract faster?
//
#ifdef RS_HISTOGRAM_ENABLE_BITFIELD_EXTRACT
//----------------------------------------------------------------------
//
// Extract a keyval digit
//
// Selects the RS_RADIX_LOG2-bit (8-bit) digit of keyval `kv_` for radix
// pass `pass_` using bitfieldExtract().  For multi-dword keyvals,
// `pass_ / 4` selects the dword and `pass_ & 3` the byte within it.
//
#if (RS_KEYVAL_DWORDS == 1)
#define RS_KV_EXTRACT_DIGIT(kv_, pass_) bitfieldExtract(kv_, pass_ * RS_RADIX_LOG2, RS_RADIX_LOG2)
#else
#define RS_KV_EXTRACT_DIGIT(kv_, pass_)                                                            \
  bitfieldExtract(kv_[pass_ / 4], (pass_ & 3) * RS_RADIX_LOG2, RS_RADIX_LOG2)
#endif
//----------------------------------------------------------------------
#else
//----------------------------------------------------------------------
//
// Extract a keyval digit
//
// Same digit selection as above, implemented with shift-and-mask.
//
#if (RS_KEYVAL_DWORDS == 1)
#define RS_KV_EXTRACT_DIGIT(kv_, pass_) ((kv_ >> (pass_ * RS_RADIX_LOG2)) & RS_RADIX_MASK)
#else
#define RS_KV_EXTRACT_DIGIT(kv_, pass_)                                                            \
  ((kv_[pass_ / 4] >> ((pass_ & 3) * RS_RADIX_LOG2)) & RS_RADIX_MASK)
#endif
//----------------------------------------------------------------------
#endif

//
// Workgroup-shared histogram: one uint32 counter per radix bin.
//
#ifndef RS_HISTOGRAM_DISABLE_SMEM_HISTOGRAM

struct rs_histogram_smem
{
  uint32_t histogram[RS_RADIX_SIZE];
};

shared rs_histogram_smem smem;

#endif

//
//
//
layout(local_size_x = RS_WORKGROUP_SIZE) in;

//
// Buffer references: the input keyval extent and the output histograms
// buffer (devaddr-based, see bufref.h).
//
layout(buffer_reference, std430) buffer buffer_rs_kv
{
  RS_KEYVAL_TYPE extent[];
};

layout(buffer_reference, std430) buffer buffer_rs_histograms
{
  uint32_t extent[];
};

//
// Shared memory functions
//
#ifndef RS_HISTOGRAM_DISABLE_SMEM_HISTOGRAM

//
// Zero the shared memory histogram.
//
// NOTE: Must use same access pattern as rs_histogram_global_store().
// (The original comment self-referenced rs_histogram_zero() -- the
// pairing is with the store function below.)
//
void
rs_histogram_zero()
{
  //
  // Zero SMEM histogram
  //
#if (RS_WORKGROUP_SUBGROUPS == 1)
  // Single-subgroup workgroup: stride by the subgroup size.
  const uint32_t smem_offset = gl_SubgroupInvocationID;

  [[unroll]] for (RS_SUBGROUP_UNIFORM uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE)
  {
    smem.histogram[smem_offset + ii] = 0;
  }
#elif (RS_WORKGROUP_SIZE < RS_RADIX_SIZE)
  // Workgroup smaller than the radix: stride by the workgroup size...
  const uint32_t smem_offset = gl_LocalInvocationID.x;

  [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE)
  {
    smem.histogram[smem_offset + ii] = 0;
  }

  // ...then a remainder step -- the guard makes this a no-op when
  // RS_WORKGROUP_SIZE evenly divides RS_RADIX_SIZE (both are powers of 2
  // here, so this is defensive).
  const uint32_t smem_idx = smem_offset + ((RS_RADIX_SIZE / RS_WORKGROUP_SIZE) * RS_WORKGROUP_SIZE);

  if (smem_idx < RS_RADIX_SIZE)
  {
    smem.histogram[smem_idx] = 0;
  }
#elif (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE)
  // One invocation per bin; guard when there are more invocations than bins.
#if (RS_WORKGROUP_SIZE > RS_RADIX_SIZE)
  if (gl_LocalInvocationID.x < RS_RADIX_SIZE)
#endif
  {
    smem.histogram[gl_LocalInvocationID.x] = 0;
  }
#endif
}

//
// Atomically accumulate the shared memory histogram into the global
// histograms buffer.
//
// The `rs_histograms` bufref is already offset per-invocation (see
// RS_HISTOGRAM_OFFSET), so indexing `extent[ii]` lands on the global bin
// matching shared bin `smem_offset + ii`.
//
// NOTE: Must use same access pattern as rs_histogram_zero()
//
void
rs_histogram_global_store(restrict buffer_rs_histograms rs_histograms)
{
  //
  // Store to GMEM
  //
#if (RS_WORKGROUP_SUBGROUPS == 1)
  const uint32_t smem_offset = gl_SubgroupInvocationID;

  [[unroll]] for (RS_SUBGROUP_UNIFORM uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_SUBGROUP_SIZE)
  {
    const uint32_t count = smem.histogram[smem_offset + ii];

    atomicAdd(rs_histograms.extent[ii], count);
  }
#elif (RS_WORKGROUP_SIZE < RS_RADIX_SIZE)
  const uint32_t smem_offset = gl_LocalInvocationID.x;

  [[unroll]] for (uint32_t ii = 0; ii < RS_RADIX_SIZE; ii += RS_WORKGROUP_SIZE)
  {
    const uint32_t count = smem.histogram[smem_offset + ii];

    atomicAdd(rs_histograms.extent[ii], count);
  }

  // Remainder step, mirroring rs_histogram_zero(); the extent index is the
  // loop-covered span because the bufref is already invocation-offset.
  const uint32_t smem_idx = smem_offset + ((RS_RADIX_SIZE / RS_WORKGROUP_SIZE) * RS_WORKGROUP_SIZE);

  if (smem_idx < RS_RADIX_SIZE)
  {
    const uint32_t count = smem.histogram[smem_idx];

    atomicAdd(rs_histograms.extent[((RS_RADIX_SIZE / RS_WORKGROUP_SIZE) * RS_WORKGROUP_SIZE)],
              count);
  }
#elif (RS_WORKGROUP_SIZE >= RS_RADIX_SIZE)
#if (RS_WORKGROUP_SIZE > RS_RADIX_SIZE)
  if (gl_LocalInvocationID.x < RS_RADIX_SIZE)
#endif
  {
    const uint32_t count = smem.histogram[gl_LocalInvocationID.x];

    // extent[0] is this invocation's own bin (bufref is invocation-offset).
    atomicAdd(rs_histograms.extent[0], count);
  }
#endif
}

#endif

//
// Barriers around the shared memory histogram.
//
// A workgroup that is a single subgroup only needs a subgroup-scope shared
// memory barrier; otherwise a full workgroup barrier() is required.
//
#ifndef RS_HISTOGRAM_DISABLE_SMEM_HISTOGRAM

// Make the zeroing writes visible before the atomic accumulation begins.
void
rs_histogram_atomic_after_write()
{
#if (RS_WORKGROUP_SUBGROUPS == 1)
  subgroupMemoryBarrierShared();
#else
  barrier();
#endif
}

// Make the atomic accumulation visible before reading the histogram back.
void
rs_histogram_read_after_atomic()
{
#if (RS_WORKGROUP_SUBGROUPS == 1)
  subgroupMemoryBarrierShared();
#else
  barrier();
#endif
}

#endif

//
// Entry point: load a block of keyvals, then accumulate and store one
// histogram per radix pass, most significant digit first, early-exiting
// once `push.passes` passes have been produced.
//
void
main()
{
  //
  // Which subgroups have work?
  //
  RS_KEYVAL_TYPE kv[RS_HISTOGRAM_BLOCK_ROWS];

  //
  // Define kv_in bufref
  //
  // Assumes less than 2^30-1 keys and then extended multiplies it
  // by the keyval size.
  //
  u32vec2 kv_in_offset;

  umulExtended(gl_WorkGroupID.x * RS_BLOCK_KEYVALS + gl_LocalInvocationID.x,
               RS_KEYVAL_SIZE,
               kv_in_offset.y,   // msb
               kv_in_offset.x);  // lsb

  readonly RS_BUFREF_DEFINE_AT_OFFSET_U32VEC2(buffer_rs_kv,
                                              rs_kv_in,
                                              push.devaddr_keyvals,
                                              kv_in_offset);

  //
  // Load keyvals: each invocation loads RS_HISTOGRAM_BLOCK_ROWS keyvals
  // strided by the workgroup size.
  //
  [[unroll]] for (RS_SUBGROUP_UNIFORM uint32_t ii = 0; ii < RS_HISTOGRAM_BLOCK_ROWS; ii++)
  {
    kv[ii] = rs_kv_in.extent[ii * RS_WORKGROUP_SIZE];
  }

  ////////////////////////////////////////////////////////////////////////////
  //
  // Accumulate and store histograms for passes
  //
  ////////////////////////////////////////////////////////////////////////////

  ////////////////////////////////////////////////////////////////////////////
  //
  // MACRO EXPANSION VARIANT
  //
  // NOTE: THIS ALSO SERVES AS A MALI R24+ WORKAROUND: EXPLICITLY
  // EXPAND THE FOR/LOOP PASSES
  //
#ifndef RS_HISTOGRAM_DISABLE_SMEM_HISTOGRAM

  //
  // Shared memory variant: zero smem, rank digits into smem with atomics,
  // then flush smem to the global histogram for this pass.  Returns early
  // once (RS_KEYVAL_SIZE - pass_) passes have been emitted.
  //
#define RS_HISTOGRAM_PASS(pass_)                                                                   \
  rs_histogram_zero();                                                                             \
                                                                                                   \
  rs_histogram_atomic_after_write();                                                               \
                                                                                                   \
  [[unroll]] for (RS_SUBGROUP_UNIFORM uint32_t jj = 0; jj < RS_HISTOGRAM_BLOCK_ROWS; jj++)         \
  {                                                                                                \
    const uint32_t digit = RS_KV_EXTRACT_DIGIT(kv[jj], pass_);                                     \
                                                                                                   \
    atomicAdd(smem.histogram[digit], 1);                                                           \
  }                                                                                                \
                                                                                                   \
  rs_histogram_read_after_atomic();                                                                \
                                                                                                   \
  {                                                                                                \
    const uint32_t rs_histogram_offset = RS_HISTOGRAM_OFFSET(pass_);                               \
                                                                                                   \
    RS_BUFREF_DEFINE_AT_OFFSET_UINT32(buffer_rs_histograms,                                        \
                                      rs_histograms,                                               \
                                      push.devaddr_histograms,                                     \
                                      rs_histogram_offset);                                        \
                                                                                                   \
    rs_histogram_global_store(rs_histograms);                                                      \
  }                                                                                                \
                                                                                                   \
  if (push.passes == (RS_KEYVAL_SIZE - pass_))                                                     \
    return;

#else  // NO SHARED MEMORY

  //
  // No shared memory variant: atomically rank digits straight into the
  // global histogram for this pass.
  //
#define RS_HISTOGRAM_PASS(pass_)                                                                   \
  {                                                                                                \
    const uint32_t rs_histogram_base = RS_HISTOGRAM_BASE(pass_);                                   \
                                                                                                   \
    RS_BUFREF_DEFINE_AT_OFFSET_UINT32(buffer_rs_histograms,                                        \
                                      rs_histograms,                                               \
                                      push.devaddr_histograms,                                     \
                                      rs_histogram_base);                                          \
                                                                                                   \
    [[unroll]] for (RS_SUBGROUP_UNIFORM uint32_t jj = 0; jj < RS_HISTOGRAM_BLOCK_ROWS; jj++)       \
    {                                                                                              \
      const uint32_t digit = RS_KV_EXTRACT_DIGIT(kv[jj], pass_);                                   \
                                                                                                   \
      atomicAdd(rs_histograms.extent[digit], 1);                                                   \
    }                                                                                              \
  }                                                                                                \
                                                                                                   \
  if (push.passes == (RS_KEYVAL_SIZE - pass_))                                                     \
    return;

#endif

  // Explicitly expanded passes, most significant byte first.
#if (RS_KEYVAL_DWORDS == 1)
  RS_HISTOGRAM_PASS(3)
  RS_HISTOGRAM_PASS(2)
  RS_HISTOGRAM_PASS(1)
  RS_HISTOGRAM_PASS(0)
#elif (RS_KEYVAL_DWORDS == 2)
  RS_HISTOGRAM_PASS(7)
  RS_HISTOGRAM_PASS(6)
  RS_HISTOGRAM_PASS(5)
  RS_HISTOGRAM_PASS(4)
  RS_HISTOGRAM_PASS(3)
  RS_HISTOGRAM_PASS(2)
  RS_HISTOGRAM_PASS(1)
  RS_HISTOGRAM_PASS(0)
#else
#error "Error: (RS_KEYVAL_DWORDS >= 3) not implemented."
#endif
}

//
//
//