/*
 * Copyright © 2022 Bas Nieuwenhuizen
 *
 * SPDX-License-Identifier: MIT
 */

#version 460

#extension GL_GOOGLE_include_directive : require

#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int64 : require
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
#extension GL_EXT_scalar_block_layout : require
#extension GL_EXT_buffer_reference : require
#extension GL_EXT_buffer_reference2 : require
#extension GL_KHR_memory_scope_semantics : require
#extension GL_KHR_shader_subgroup_vote : require
#extension GL_KHR_shader_subgroup_arithmetic : require
#extension GL_KHR_shader_subgroup_ballot : require

layout(local_size_x = 1024, local_size_y = 1, local_size_z = 1) in;

#define USE_GLOBAL_SYNC
#include "build_interface.h"

TYPE(ploc_prefix_scan_partition, 4);

layout(push_constant) uniform CONSTS
{
   ploc_args args;
};

shared uint32_t exclusive_prefix_sum;
shared uint32_t aggregate_sums[PLOC_WORKGROUP_SIZE / 64];

/*
 * Global prefix scan over all workgroups to find the index of the collapsed node to write.
 * See https://research.nvidia.com/sites/default/files/publications/nvr-2016-002.pdf
 * One partition = one workgroup in this case.
 */
uint32_t
prefix_scan(uvec4 ballot, REF(ploc_prefix_scan_partition) partitions, uint32_t task_index)
{
   if (gl_LocalInvocationIndex == 0) {
      /* Temporary copy of exclusive_prefix_sum to avoid reading+writing LDS on each addition */
      uint32_t local_exclusive_prefix_sum = 0;

      if (task_index >= gl_WorkGroupSize.x) {
         REF(ploc_prefix_scan_partition) current_partition = REF(ploc_prefix_scan_partition)(
            INDEX(ploc_prefix_scan_partition, partitions, task_index / gl_WorkGroupSize.x));
         REF(ploc_prefix_scan_partition) previous_partition = current_partition - 1;

         while (true) {
            /* See if the previous workgroup has already set its inclusive sum */
            if (atomicLoad(DEREF(previous_partition).inclusive_sum, gl_ScopeDevice, gl_StorageSemanticsBuffer,
                           gl_SemanticsAcquire | gl_SemanticsMakeVisible) != 0xFFFFFFFF) {
               local_exclusive_prefix_sum += DEREF(previous_partition).inclusive_sum;
               break;
            } else {
               local_exclusive_prefix_sum += DEREF(previous_partition).aggregate;
               previous_partition -= 1;
            }
         }

         /* Set the inclusive sum for the next workgroups */
         atomicStore(DEREF(current_partition).inclusive_sum,
                     DEREF(current_partition).aggregate + local_exclusive_prefix_sum, gl_ScopeDevice,
                     gl_StorageSemanticsBuffer, gl_SemanticsRelease | gl_SemanticsMakeAvailable);
      }
      exclusive_prefix_sum = local_exclusive_prefix_sum;
   }

   if (subgroupElect())
      aggregate_sums[gl_SubgroupID] = subgroupBallotBitCount(ballot);

   barrier();

   if (gl_LocalInvocationID.x < PLOC_WORKGROUP_SIZE / 64) {
      aggregate_sums[gl_LocalInvocationID.x] =
         exclusive_prefix_sum + subgroupExclusiveAdd(aggregate_sums[gl_LocalInvocationID.x]);
   }

   barrier();

   return aggregate_sums[gl_SubgroupID] + subgroupBallotExclusiveBitCount(ballot);
}
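
/*
 * Look-back example for prefix_scan() above: the workgroup that owns partition 3 starts its
 * look-back at partition 2. If partition 2 has not yet published its inclusive_sum (it still
 * reads 0xFFFFFFFF), partition 2's aggregate is added and the look-back moves on to partition 1,
 * and so on, until a partition with a published inclusive_sum is found. The visited aggregates
 * plus that inclusive_sum form this workgroup's exclusive prefix, which is then added to its own
 * aggregate and published as its inclusive_sum for later partitions.
 */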

/* Relative cost of increasing the BVH depth. Deep BVHs will require more backtracking. */
#define BVH_LEVEL_COST 0.2

uint32_t
push_node(uint32_t children[2], radv_aabb bounds[2])
{
   uint32_t internal_node_index = atomicAdd(DEREF(args.header).ir_internal_node_count, 1);
   uint32_t dst_offset = args.internal_node_offset + internal_node_index * SIZEOF(radv_ir_box_node);
   uint32_t dst_id = pack_ir_node_id(dst_offset, radv_ir_node_internal);
   REF(radv_ir_box_node) dst_node = REF(radv_ir_box_node)(OFFSET(args.bvh, dst_offset));

   radv_aabb total_bounds;
   total_bounds.min = vec3(INFINITY);
   total_bounds.max = vec3(-INFINITY);

   for (uint i = 0; i < 2; ++i) {
      VOID_REF node = OFFSET(args.bvh, ir_id_to_offset(children[i]));
      REF(radv_ir_node) child = REF(radv_ir_node)(node);

      total_bounds.min = min(total_bounds.min, bounds[i].min);
      total_bounds.max = max(total_bounds.max, bounds[i].max);
      DEREF(dst_node).children[i] = children[i];
   }

   DEREF(dst_node).base.aabb = total_bounds;
   DEREF(dst_node).bvh_offset = RADV_UNKNOWN_BVH_OFFSET;

   return dst_id;
}

#define PLOC_NEIGHBOURHOOD 16
#define PLOC_OFFSET_MASK   ((1 << 5) - 1)

/* A candidate pairing is encoded as the SAH cost in the upper 27 bits and a biased offset to the
 * neighbour in the lower 5 bits. Because the SAH is non-negative, its float bits compare like the
 * float value when treated as a uint, so atomicMin on encoded values selects the lowest-cost
 * pairing (the low 5 mantissa bits are sacrificed for the offset). Offsets lie in
 * [-PLOC_NEIGHBOURHOOD, PLOC_NEIGHBOURHOOD] with 0 excluded, which fits into 5 bits. */
uint32_t
encode_neighbour_offset(float sah, uint32_t i, uint32_t j)
{
   int32_t offset = int32_t(j) - int32_t(i);
   uint32_t encoded_offset = offset + PLOC_NEIGHBOURHOOD - (offset > 0 ? 1 : 0);
   return (floatBitsToUint(sah) & (~PLOC_OFFSET_MASK)) | encoded_offset;
}

int32_t
decode_neighbour_offset(uint32_t encoded_offset)
{
   int32_t offset = int32_t(encoded_offset & PLOC_OFFSET_MASK) - PLOC_NEIGHBOURHOOD;
   return offset + (offset >= 0 ? 1 : 0);
}

#define NUM_PLOC_LDS_ITEMS (PLOC_WORKGROUP_SIZE + 4 * PLOC_NEIGHBOURHOOD)

shared radv_aabb shared_bounds[NUM_PLOC_LDS_ITEMS];
shared uint32_t nearest_neighbour_indices[NUM_PLOC_LDS_ITEMS];

uint32_t
load_id(VOID_REF ids, uint32_t iter, uint32_t index)
{
   if (iter == 0)
      return DEREF(REF(key_id_pair)(INDEX(key_id_pair, ids, index))).id;
   else
      return DEREF(REF(uint32_t)(INDEX(uint32_t, ids, index)));
}

void
load_bounds(VOID_REF ids, uint32_t iter, uint32_t task_index, uint32_t lds_base,
            uint32_t neighbourhood_overlap, uint32_t search_bound)
{
   for (uint32_t i = task_index - 2 * neighbourhood_overlap; i < search_bound; i += gl_WorkGroupSize.x) {
      uint32_t id = load_id(ids, iter, i);
      if (id == RADV_BVH_INVALID_NODE)
         continue;

      VOID_REF addr = OFFSET(args.bvh, ir_id_to_offset(id));
      REF(radv_ir_node) node = REF(radv_ir_node)(addr);
      shared_bounds[i - lds_base] = DEREF(node).aabb;
   }
}

float
combined_node_cost(uint32_t lds_base, uint32_t i, uint32_t j)
{
   radv_aabb combined_bounds;
   combined_bounds.min = min(shared_bounds[i - lds_base].min, shared_bounds[j - lds_base].min);
   combined_bounds.max = max(shared_bounds[i - lds_base].max, shared_bounds[j - lds_base].max);
   return aabb_surface_area(combined_bounds);
}

shared uint32_t shared_aggregate_sum;

void
main(void)
{
   VOID_REF src_ids = args.ids_0;
   VOID_REF dst_ids = args.ids_1;

   /* We try to use LBVH for BVHs where we know there will be fewer than 5 leaves,
    * but sometimes all leaves might be inactive. */
   if (DEREF(args.header).active_leaf_count <= 2) {
      if (gl_GlobalInvocationID.x == 0) {
         uint32_t internal_node_index = atomicAdd(DEREF(args.header).ir_internal_node_count, 1);
         uint32_t dst_offset = args.internal_node_offset + internal_node_index * SIZEOF(radv_ir_box_node);
         REF(radv_ir_box_node) dst_node = REF(radv_ir_box_node)(OFFSET(args.bvh, dst_offset));

         radv_aabb total_bounds;
         total_bounds.min = vec3(INFINITY);
         total_bounds.max = vec3(-INFINITY);

         uint32_t i = 0;
         for (; i < DEREF(args.header).active_leaf_count; i++) {
            uint32_t child_id = DEREF(INDEX(key_id_pair, src_ids, i)).id;
            if (child_id != RADV_BVH_INVALID_NODE) {
               VOID_REF node = OFFSET(args.bvh, ir_id_to_offset(child_id));
               REF(radv_ir_node) child = REF(radv_ir_node)(node);

               radv_aabb bounds = DEREF(child).aabb;
               total_bounds.min = min(total_bounds.min, bounds.min);
               total_bounds.max = max(total_bounds.max, bounds.max);
            }
            DEREF(dst_node).children[i] = child_id;
         }
         for (; i < 2; i++)
            DEREF(dst_node).children[i] = RADV_BVH_INVALID_NODE;

         DEREF(dst_node).base.aabb = total_bounds;
         DEREF(dst_node).bvh_offset = RADV_UNKNOWN_BVH_OFFSET;
      }
      return;
   }

   /* Only initialize sync_data once per workgroup. For intra-workgroup synchronization,
    * fetch_task contains a workgroup-scoped control+memory barrier. */
   if (gl_LocalInvocationIndex == 0) {
      atomicCompSwap(DEREF(args.header).sync_data.task_counts[0], 0xFFFFFFFF, DEREF(args.header).active_leaf_count);
      atomicCompSwap(DEREF(args.header).sync_data.current_phase_end_counter, 0xFFFFFFFF,
                     DIV_ROUND_UP(DEREF(args.header).active_leaf_count, gl_WorkGroupSize.x));
   }

   REF(ploc_prefix_scan_partition) partitions = REF(ploc_prefix_scan_partition)(args.prefix_scan_partitions);

   uint32_t task_index = fetch_task(args.header, false);

   for (uint iter = 0;; ++iter) {
      uint32_t current_task_count = task_count(args.header);

      if (task_index == TASK_INDEX_INVALID)
         break;

      /* Find preferred partners and merge them */
      PHASE (args.header) {
         uint32_t base_index = task_index - gl_LocalInvocationID.x;
         uint32_t neighbourhood_overlap = min(PLOC_NEIGHBOURHOOD, base_index);
         uint32_t double_neighbourhood_overlap = min(2 * PLOC_NEIGHBOURHOOD, base_index);
         /* Upper bound up to which valid nearest-neighbour indices are written. */
         uint32_t write_bound = min(current_task_count, base_index + gl_WorkGroupSize.x + PLOC_NEIGHBOURHOOD);
         /* Upper bound up to which nearest-neighbour candidates are searched. */
         uint32_t search_bound = min(current_task_count, base_index + gl_WorkGroupSize.x + 2 * PLOC_NEIGHBOURHOOD);
         uint32_t lds_base = base_index - double_neighbourhood_overlap;

         load_bounds(src_ids, iter, task_index, lds_base, neighbourhood_overlap, search_bound);

         for (uint32_t i = gl_LocalInvocationID.x; i < NUM_PLOC_LDS_ITEMS; i += gl_WorkGroupSize.x)
            nearest_neighbour_indices[i] = 0xFFFFFFFF;

         barrier();

         for (uint32_t i = task_index - double_neighbourhood_overlap; i < write_bound; i += gl_WorkGroupSize.x) {
            uint32_t right_bound = min(search_bound - 1 - i, PLOC_NEIGHBOURHOOD);
            uint32_t fallback_pair = i == 0 ? (i + 1) : (i - 1);
            uint32_t min_offset = encode_neighbour_offset(INFINITY, i, fallback_pair);

            for (uint32_t j = max(i + 1, base_index - neighbourhood_overlap); j <= i + right_bound; ++j) {
               float sah = combined_node_cost(lds_base, i, j);
               uint32_t i_encoded_offset = encode_neighbour_offset(sah, i, j);
               uint32_t j_encoded_offset = encode_neighbour_offset(sah, j, i);
               min_offset = min(min_offset, i_encoded_offset);
               atomicMin(nearest_neighbour_indices[j - lds_base], j_encoded_offset);
            }
            if (i >= base_index - neighbourhood_overlap)
               atomicMin(nearest_neighbour_indices[i - lds_base], min_offset);
         }

         if (gl_LocalInvocationID.x == 0)
            shared_aggregate_sum = 0;

         barrier();

         for (uint32_t i = task_index - neighbourhood_overlap; i < write_bound; i += gl_WorkGroupSize.x) {
            uint32_t left_bound = min(i, PLOC_NEIGHBOURHOOD);
            uint32_t right_bound = min(search_bound - 1 - i, PLOC_NEIGHBOURHOOD);
            /*
             * Workaround for a worst-case scenario in PLOC: If the combined area of
             * all nodes (in the neighbourhood) is the same, then the chosen nearest
             * neighbour is the first neighbour. However, this means that no nodes
             * except the first two will find each other as nearest neighbour.
             * Therefore, only one node is contained in each BVH level. By first testing if the
             * immediate neighbour on one side is the nearest, all immediate neighbours will be
             * merged on every step.
             */
            uint32_t preferred_pair;
            if ((i & 1) != 0)
               preferred_pair = i - min(left_bound, 1);
            else
               preferred_pair = i + min(right_bound, 1);

            if (preferred_pair != i) {
               uint32_t encoded_min_sah = nearest_neighbour_indices[i - lds_base] & (~PLOC_OFFSET_MASK);
               float sah = combined_node_cost(lds_base, i, preferred_pair);
               uint32_t encoded_sah = floatBitsToUint(sah) & (~PLOC_OFFSET_MASK);
               uint32_t encoded_offset = encode_neighbour_offset(sah, i, preferred_pair);

               if (encoded_sah <= encoded_min_sah) {
                  nearest_neighbour_indices[i - lds_base] = encoded_offset;
               }
            }
         }

         barrier();

         bool has_valid_node = true;
         if (task_index < current_task_count) {
            uint32_t base_index = task_index - gl_LocalInvocationID.x;
            uint32_t neighbour_index =
               task_index + decode_neighbour_offset(nearest_neighbour_indices[task_index - lds_base]);
            uint32_t other_neighbour_index =
               neighbour_index + decode_neighbour_offset(nearest_neighbour_indices[neighbour_index - lds_base]);
            uint32_t id = load_id(src_ids, iter, task_index);

            if (other_neighbour_index == task_index) {
               if (task_index < neighbour_index) {
                  uint32_t neighbour_id = load_id(src_ids, iter, neighbour_index);

                  uint32_t children[2] = {id, neighbour_id};
                  radv_aabb bounds[2] = {shared_bounds[task_index - lds_base],
                                         shared_bounds[neighbour_index - lds_base]};

                  DEREF(REF(uint32_t)(INDEX(uint32_t, dst_ids, task_index))) = push_node(children, bounds);
                  DEREF(REF(uint32_t)(INDEX(uint32_t, dst_ids, neighbour_index))) = RADV_BVH_INVALID_NODE;
               } else {
                  /* We still store in the other case so we don't destroy the node id needed to
                   * create the internal node */
                  has_valid_node = false;
               }
            } else {
               DEREF(REF(uint32_t)(INDEX(uint32_t, dst_ids, task_index))) = id;
            }

            /* Compact - prepare prefix scan */
            uvec4 ballot = subgroupBallot(has_valid_node);
            uint32_t aggregate_sum = subgroupBallotBitCount(ballot);
            if (subgroupElect())
               atomicAdd(shared_aggregate_sum, aggregate_sum);
         }

         barrier();

         /*
          * The paper proposes initializing all partitions to an invalid state
          * and only computing aggregates afterwards. We skip that step and
          * initialize the partitions to a valid state. This also simplifies
          * the look-back, as there will never be any blocking due to invalid
          * partitions.
          */
         if (gl_LocalInvocationIndex == 0) {
            REF(ploc_prefix_scan_partition) current_partition = REF(ploc_prefix_scan_partition)(
               INDEX(ploc_prefix_scan_partition, partitions, task_index / gl_WorkGroupSize.x));
            DEREF(current_partition).aggregate = shared_aggregate_sum;
            if (task_index < gl_WorkGroupSize.x) {
               DEREF(current_partition).inclusive_sum = shared_aggregate_sum;
            } else {
               DEREF(current_partition).inclusive_sum = 0xFFFFFFFF;
            }
         }
         if (task_index == 0)
            set_next_task_count(args.header, task_count(args.header));
      }

      /* Compact - prefix scan and update */
      PHASE (args.header) {
         uint32_t current_task_count = task_count(args.header);
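         /* Each id that is still valid after merging is written back to src_ids at its compacted
          * position, which is obtained from the global prefix scan over all workgroups. Once only
          * a single id remains, the hierarchy is complete and next_phase_exit_flag signals the end
          * of the build. */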
         uint32_t id = task_index < current_task_count ?
            DEREF(REF(uint32_t)(INDEX(uint32_t, dst_ids, task_index))) : RADV_BVH_INVALID_NODE;
         uvec4 ballot = subgroupBallot(id != RADV_BVH_INVALID_NODE);
         uint32_t new_offset = prefix_scan(ballot, partitions, task_index);

         if (task_index >= current_task_count)
            continue;

         if (id != RADV_BVH_INVALID_NODE) {
            DEREF(REF(uint32_t)(INDEX(uint32_t, src_ids, new_offset))) = id;
            ++new_offset;
         }

         if (task_index == current_task_count - 1) {
            set_next_task_count(args.header, new_offset);
            if (new_offset == 1)
               DEREF(args.header).sync_data.next_phase_exit_flag = 1;
         }
      }
   }
}