//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//

// Metakernels driving the binned-SAH BVH build.
//
// Two entry points are provided:
//   new_sah_build           - builds a single BVH, picking a trivial, single-WG
//                             or full BFS/DFS path from the primitive count.
//   new_sah_build_batchable - processes 'num_builds' builds through one shared
//                             Scheduler structure.
// A third metakernel, single_pass_binsah, launches one workgroup of the "DFS"
// kernel from bvh_build_DFS.cl.

module new_sah_builder;

// Kernels compiled from bvh_build_BFS.cl.
kernel_module bfs_kernels ("bvh_build_BFS.cl")
{
    links lsc_intrinsics;

    // single-build BFS passes
    kernel opencl_build_kernel_BinnedSAH_BFS_pass1_initial < kernelFunction="BFS_pass1_initial" > ;
    kernel opencl_build_kernel_BinnedSAH_BFS_pass1_indexed < kernelFunction="BFS_pass1_indexed" > ;
    kernel opencl_build_kernel_BinnedSAH_BFS_pass2_initial < kernelFunction="BFS_pass2_initial" > ;
    kernel opencl_build_kernel_BinnedSAH_BFS_pass2_indexed < kernelFunction="BFS_pass2_indexed" > ;

    // DFS, qnode build, and scheduler kernels
    kernel opencl_build_kernel_BinnedSAH_DFS < kernelFunction="DFS" >;
    // kernel opencl_build_kernel_BinnedSAH_BuildQNodes < kernelFunction="build_qnodes" >;
    kernel opencl_build_kernel_BinnedSAH_BuildQNodes_Kickoff < kernelFunction="build_qnodes_pc_kickoff" >;
    kernel opencl_build_kernel_BinnedSAH_BuildQNodes_Amplify < kernelFunction="build_qnodes_pc_amplify" >;
    kernel opencl_build_kernel_BinnedSAH_begin < kernelFunction = "begin" >;
    kernel opencl_build_kernel_BinnedSAH_scheduler < kernelFunction = "scheduler" >;

    // batched BFS passes
    kernel opencl_build_kernel_BinnedSAH_BFS_pass1_initial_batch < kernelFunction="BFS_pass1_initial_batchable" >;
    kernel opencl_build_kernel_BinnedSAH_BFS_pass1_indexed_batch < kernelFunction="BFS_pass1_indexed_batchable" >;
    kernel opencl_build_kernel_BinnedSAH_BFS_pass2_initial_batch < kernelFunction="BFS_pass2_initial_batchable" >;
    kernel opencl_build_kernel_BinnedSAH_BFS_pass2_indexed_batch < kernelFunction="BFS_pass2_indexed_batchable" >;

    // batched setup kernels
    kernel opencl_build_kernel_BinnedSAH_categorize_builds_and_init_scheduler < kernelFunction="categorize_builds_and_init_scheduler" >;
    kernel opencl_build_kernel_BinnedSAH_begin_batched < kernelFunction="begin_batchable" >;

    // batched qnode build kernels
    kernel opencl_build_kernel_BinnedSAH_qnode_init_scheduler_batched < kernelFunction="build_qnodes_init_scheduler_batched" >;
    kernel opencl_build_kernel_BinnedSAH_qnode_begin_batched < kernelFunction="build_qnodes_begin_batchable" >;
    kernel opencl_build_kernel_BinnedSAH_qnode_scheduler < kernelFunction="build_qnodes_scheduler" >;
    kernel opencl_build_kernel_BinnedSAH_BuildQNodes_Amplify_batch < kernelFunction="build_qnodes_pc_amplify_batched" >;

    kernel opencl_build_kernel_BinnedSAH_BuildQnodes_TryToFillGRB_batched < kernelFunction="build_qnodes_try_to_fill_grb_batched" >;
}

// Stand-alone kernels compiled from bvh_build_DFS.cl.
kernel opencl_build_kernel_DFS_single_wg < source="bvh_build_DFS.cl", kernelFunction="DFS_single_wg" >
kernel opencl_build_kernel_DFS_trivial < source="bvh_build_DFS.cl", kernelFunction="DFS_trivial" >
kernel opencl_build_kernel_DFS_single_wg_batch < source="bvh_build_DFS.cl", kernelFunction="DFS_single_wg_batchable" >
kernel opencl_build_kernel_DFS_trivial_batch < source="bvh_build_DFS.cl", kernelFunction="DFS_trivial_batchable" >

kernel single_pass_binsah < source="bvh_build_DFS.cl", kernelFunction="DFS" >

// new_sah_build uses the trivial path when the primitive count is <= DFS_MIN_PRIMREFS
// and the single-workgroup path when it is <= DFS_MAX_PRIMREFS.
const DFS_MIN_PRIMREFS = 6;
const DFS_MAX_PRIMREFS = 256;

// 2^9 == 512, the BFS workgroup size hardcoded in new_sah_build.
// NOTE(review): this constant is not referenced anywhere in this file; the shift there
// is spelled out as ">> 8" followed by ">> 1". Keep both in sync if either changes.
const BFS_WG_SIZE_SHIFT = 9;

// Scheduler state shared between the metakernels below and the build kernels.
// The metakernels read pairs of adjacent dwords with a single load_qword and use
// .lo for the first field of the pair and .hi for the second.
struct Scheduler
{
    dword num_bfs_wgs;              // .lo of the qword loaded from p_scheduler
    dword num_dfs_wgs;              // .hi of the same qword; also polled until it reads 0
    dword scheduler_postsync;       // semaphore dword written by postsync stores and polled by semaphore_wait
    dword _pad1;                    // cleared together with scheduler_postsync by the store_qword in new_sah_build
    dword num_trivial_builds;       // batched path: X dimension of the trivial-build dispatch
    dword num_single_builds;        // batched path: X dimension of the single-WG dispatch
    dword batched_build_wg_count;   // batched path: .lo of the qword read after the begin kernels
    dword batched_build_loop_mask;  // batched path: .hi of that qword
};

// Arguments of new_sah_build.
// NOTE(review): max_internal_nodes is not read by any metakernel in this file.
struct SAHBuildArgs
{
    qword p_num_primitives;
    qword p_qnode_child_buffer;
    qword p_scheduler;
    qword p_sah_globals;
    qword p_globals;
    qword p_primref_buffer;
    qword p_primref_index_buffers;
    qword p_bvh_base;
    qword p_bvh2;
    qword p_root_buffer_counters;
    dword sah_build_flags;
    dword leaf_size;
    dword leaf_type;
    dword max_internal_nodes;
};

// Launches a single (1,1,1) dispatch of the single_pass_binsah kernel, forwarding
// all five arguments unchanged.
metakernel single_pass_binsah(
    qword build_globals,
    qword bvh_buffer,
    qword build_primref_buffer,
    qword build_primref_index_buffers,
    dword alloc_backpointers )
{
    dispatch single_pass_binsah(1, 1, 1)
        args( build_globals,
              bvh_buffer,
              build_primref_buffer,
              build_primref_index_buffers,
              alloc_backpointers );
}

// Builds one BVH.
//
// The primitive count is loaded from build_args.p_num_primitives and selects a path:
//   count <= DFS_MIN_PRIMREFS : one workgroup of DFS_trivial, then done
//   count <= DFS_MAX_PRIMREFS : one workgroup of DFS_single_wg, then done
//   otherwise                 : begin + initial BFS passes, a scheduler-driven
//                               BFS/DFS loop, then the P/C qnode build loop
//
// Registers are reused between phases through 'define' aliases (for example REG0 is
// REG_num_prims, later REG_wg_counts, later REG_addr), so statement order matters.
metakernel new_sah_build( SAHBuildArgs build_args )
{
    define REG_num_prims REG0;

    // Path selection.
    {
        define C_MIN_PRIMREFS REG1;
        define C_MAX_PRIMREFS REG2;
        define REG_dispatch_trivial REG3;
        define REG_dispatch_single_wg REG4;

        REG_num_prims = load_dword( build_args.p_num_primitives );
        C_MIN_PRIMREFS = DFS_MIN_PRIMREFS;
        C_MAX_PRIMREFS = DFS_MAX_PRIMREFS;

        REG_dispatch_trivial = REG_num_prims <= C_MIN_PRIMREFS;
        REG_dispatch_single_wg = REG_num_prims <= C_MAX_PRIMREFS;

        // The trivial test comes first, so the single-WG branch only sees counts above DFS_MIN_PRIMREFS.
        goto l_dispatch_trivial if(REG_dispatch_trivial.lo);
        goto l_dispatch_single_wg if(REG_dispatch_single_wg.lo);
        goto l_full_build;
    }

l_dispatch_trivial:
    {
        dispatch opencl_build_kernel_DFS_trivial (1,1,1)
            args( build_args.p_globals,
                  build_args.p_bvh_base,
                  build_args.p_primref_buffer,
                  build_args.p_primref_index_buffers,
                  build_args.sah_build_flags );

        control( wait_idle );
        goto l_done;
    }

l_dispatch_single_wg:
    {
        dispatch opencl_build_kernel_DFS_single_wg (1,1,1)
            args( build_args.p_globals,
                  build_args.p_bvh_base,
                  build_args.p_primref_buffer,
                  build_args.p_primref_index_buffers,
                  build_args.sah_build_flags );

        control( wait_idle );
        goto l_done;
    }

l_full_build:
    {
        define p_scheduler build_args.p_scheduler;
        // Parenthesized like p_scheduler_postsync below and like the same alias in
        // new_sah_build_batchable, so that '*p_num_dfs_wgs' always refers to the whole sum.
        define p_num_dfs_wgs (build_args.p_scheduler + offsetof(Scheduler.num_dfs_wgs));
        define p_scheduler_postsync (build_args.p_scheduler + offsetof(Scheduler.scheduler_postsync) );
        define C_0 REG1;
        define C_8 REG2;

        C_8 = 8;
        C_0 = 0;

        //
        // Init pass
        //
        store_dword( p_scheduler_postsync, C_0.lo );

        // compute number of BFS WGs from prim-count
        //   NOTE: This code uses a hardcoded WG size of 512 for BFS
        //     If the BFS wg size ever changes, it needs to be touched
        //     This is necessary because DG2 shifter only supports POW2 shifts
        {
            define REG_scheduler_postsync REG3;
            define C_511 REG4;
            define C_1 REG5;

            REG_scheduler_postsync = p_scheduler_postsync;
            C_511 = 511;
            C_1 = 1;

            store_qword( REG_scheduler_postsync, C_0 ); // initialize scheduler semaphore

            // (num_prims + 511) >> 9, i.e. num_prims / 512 rounded up; the shift by 9 is
            // issued as a shift by 8 followed by a shift by 1.
            REG_num_prims = REG_num_prims + C_511;
            REG_num_prims = REG_num_prims >> C_8;
            REG_num_prims = REG_num_prims >> C_1;

            // These dimensions stay in place across the direct dispatch below; the next
            // dispatch_indirect (BFS pass1 initial) presumably consumes them.
            DISPATCHDIM_X = REG_num_prims.lo;
            DISPATCHDIM_Y = 1;
            DISPATCHDIM_Z = 1;

            control( cs_store_fence ); // commit the semaphore write

            // launch scheduler init kernel
            dispatch opencl_build_kernel_BinnedSAH_begin (1,1,1)
                args( build_args.p_scheduler,
                      build_args.leaf_size,
                      build_args.leaf_type,
                      build_args.p_primref_index_buffers,
                      build_args.p_primref_buffer,
                      build_args.p_bvh2,
                      build_args.p_bvh_base,
                      build_args.p_globals,
                      build_args.p_sah_globals,
                      build_args.p_qnode_child_buffer,
                      build_args.sah_build_flags )
                postsync store_dword( p_scheduler_postsync, 1 );

            // wait on init kernel
            semaphore_wait while( *p_scheduler_postsync != 1 );

            // launch BFS1 pass1
            dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass1_initial
                args( build_args.p_scheduler,
                      build_args.p_sah_globals)
                postsync store_dword( p_scheduler_postsync, 0 );

            // wait on BFS pass1
            semaphore_wait while( *p_scheduler_postsync != 0 );

            // launch BFS pass2
            dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass2_initial
                args( build_args.p_scheduler,
                      build_args.p_sah_globals )
                postsync store_dword( p_scheduler_postsync, 1 );
        }

        // after BFS pass 2 we drop into a scheduling loop
        //
        // Semaphore values on this path: 1 == a BFS pass2 has finished (loop entry),
        // 0 == scheduler kernel finished, 2 == indexed BFS pass1 finished.
        l_build_loop:
        {
            semaphore_wait while( *p_scheduler_postsync != 1 );

            {
                dispatch opencl_build_kernel_BinnedSAH_scheduler(1,1,1)
                    args( build_args.p_scheduler,
                          build_args.p_sah_globals )
                    postsync store_dword( p_scheduler_postsync, 0 );

                // wait on the scheduler
                semaphore_wait while( *p_scheduler_postsync != 0 );
            }

            // load and process the scheduler results
            define REG_wg_counts REG0;
            define REG_num_bfs_wgs REG0.lo;
            define REG_num_dfs_wgs REG0.hi;
            define REG_loop_break REG1;
            define REG_p_scheduler REG2;
            {
                REG_p_scheduler = p_scheduler;
                REG_wg_counts = load_qword( REG_p_scheduler );

                define C_MASK_LO REG3 ;
                C_MASK_LO = 0xffffffff;

                // keep only the low dword (the BFS count) and test it against zero
                REG_loop_break = REG_wg_counts & C_MASK_LO;
                REG_loop_break = REG_loop_break == 0;
            }

            // dispatch new DFS WGs
            // (no postsync: the DFS launch is not tied to the scheduler semaphore)
            DISPATCHDIM_X = REG_num_dfs_wgs;
            dispatch_indirect opencl_build_kernel_BinnedSAH_DFS
                args( p_scheduler,
                      build_args.p_sah_globals );

            // jump out if there are no bfs WGs
            goto l_build_qnodes if (REG_loop_break);

            // dispatch new BFS1 WGs
            DISPATCHDIM_X = REG_num_bfs_wgs;
            dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass1_indexed
                args( p_scheduler,
                      build_args.p_sah_globals )
                postsync store_dword( p_scheduler_postsync, 2 );

            semaphore_wait while( *p_scheduler_postsync != 2 );

            // dispatch new BFS2 WGs
            dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass2_indexed
                args( p_scheduler,
                      build_args.p_sah_globals )
                postsync store_dword( p_scheduler_postsync, 1 );

            //goto l_build_loop if not(REG_num_dfs_wgs); //TODO: maybe add some logic to do "bool have_dfs_work" which will add some cycles but may be faster than checking semaphore

            // wait until all upcoming DFS WGs have finished launching
            // so that the scheduler can refill the launch array
            // TODO_OPT: Look at replacing this with a ring buffer so that scheduler stalls instead (and only rarely)
            semaphore_wait while( *p_num_dfs_wgs != 0 );

            goto l_build_loop;
        }
    }

l_build_qnodes:

    // all outstanding BFS/DFS work must retire before the qnode build starts
    control( wait_idle );

    // P/C qnode build
    dispatch opencl_build_kernel_BinnedSAH_BuildQNodes_Kickoff (1,1,1)
        args( build_args.p_sah_globals,
              build_args.p_qnode_child_buffer,
              build_args.sah_build_flags );

    // Producer/consumer loop over three consecutive qwords at p_root_buffer_counters:
    //   +0  : read each iteration as the 'produced' counter
    //   +8  : written with REG_consumed (start position for the next pass)
    //   +16 : written with REG_produced (end position for the next pass)
    {
        // NOTE(review): p_pc_counters is defined but never used; REG_addr is loaded
        // from build_args.p_root_buffer_counters directly.
        define p_pc_counters ( build_args.p_root_buffer_counters );
        define REG_addr REG0;
        define REG_produced REG1;
        define REG_consumed REG2;
        define REG_have_work REG3;
        define REG_wg_count REG4;
        define C_8 REG5;
        define C_16 REG6;
        define C_1 REG7;

        C_1 = 1;
        C_8 = 8;
        C_16 = 16;
        REG_addr = build_args.p_root_buffer_counters; // HINT: should we use REG_addr or just pass separate arguments to metakernel to avoid add/sub from address

        REG_consumed = 0;

    l_qnode_loop:

        control( wait_idle ); // wait for previous pass

        // load counters and compute number of wgs to respawn
        REG_produced = load_qword( REG_addr );
        REG_addr = REG_addr + C_8;
        REG_wg_count = REG_produced - REG_consumed;
        REG_have_work = REG_wg_count > 0;

        goto l_done if not(REG_have_work.lo);

        // save REG_consumed as a starting position in p_qnode_child_buffer
        store_qword(REG_addr, REG_consumed);
        REG_addr = REG_addr + C_8;

        // save REG_produced as ending position in p_qnode_child_buffer
        store_qword(REG_addr, REG_produced);
        REG_addr = REG_addr - C_16; // rewind to the 'produced' counter for the next iteration

        REG_consumed = REG_consumed + REG_wg_count; // update consumed for next iteration

        // calculate amount of workgroups to schedule
        // (new entries + 1) >> 1, i.e. half the new entries rounded up
        REG_wg_count = REG_wg_count + C_1;
        REG_wg_count = REG_wg_count >> C_1;

        DISPATCHDIM_X = REG_wg_count.lo;

        control( cs_store_fence ); // commit the stores

        dispatch_indirect opencl_build_kernel_BinnedSAH_BuildQNodes_Amplify
            args( build_args.p_sah_globals,
                  build_args.p_qnode_child_buffer,
                  build_args.sah_build_flags);

        goto l_qnode_loop;
    }

l_done:
}

// Arguments of new_sah_build_batchable.
// NOTE(review): p_globals_ptrs and p_buffers_info are only forwarded to the
// categorization kernel; their layout is not visible in this file.
struct SAHBuildArgsBatchable
{
    qword p_globals_ptrs;
    qword p_scheduler;
    qword p_buffers_info;
    qword p_sah_globals;
    dword num_max_qnode_global_root_buffer_entries;
    dword num_builds;
};

// Runs 'num_builds' builds through one shared Scheduler.
//
// Phases:
//   1. categorization kernel, then the trivial and single-WG batch dispatches
//   2. unless those covered every build: outer loop over blocks of builds, each
//      running begin + initial BFS passes and the scheduler-driven BFS/DFS loop
//   3. qnode build phase driven by the qnode scheduler kernel
//
// Semaphore values in phase 2 are the mirror image of new_sah_build: 0 == begin or
// a BFS pass2 finished, 1 == scheduler (or initial BFS pass1) finished,
// 2 == indexed BFS pass1 finished.
metakernel new_sah_build_batchable( SAHBuildArgsBatchable build_args )
{
    define p_scheduler build_args.p_scheduler;
    define p_scheduler_postsync (build_args.p_scheduler + offsetof(Scheduler.scheduler_postsync) );
    define p_num_dfs_wgs (build_args.p_scheduler + offsetof(Scheduler.num_dfs_wgs));

    // initialize scheduler semaphore
    REG0.lo = 0;
    store_dword( p_scheduler_postsync, REG0.lo );

    // dispatch categorization pass
    dispatch opencl_build_kernel_BinnedSAH_categorize_builds_and_init_scheduler(2,1,1)
        args( build_args.p_scheduler,
              build_args.p_globals_ptrs,
              build_args.p_buffers_info,
              build_args.p_sah_globals,
              build_args.num_builds )
        postsync store_dword( p_scheduler_postsync, 1 );

    // wait on the categorization pass
    semaphore_wait while( *p_scheduler_postsync != 1 );

    // dispatch the trivial and single-WG passes
    {
        // one qword covers both counters: lo == num_trivial_builds, hi == num_single_builds
        REG0 = load_qword( build_args.p_scheduler + offsetof(Scheduler.num_trivial_builds) );
        DISPATCHDIM_X = REG0.lo;
        DISPATCHDIM_Y = 1;
        DISPATCHDIM_Z = 1;

        // dispatch trivial builds
        dispatch_indirect opencl_build_kernel_DFS_trivial_batch
            args( build_args.p_sah_globals );

        control( wait_idle );

        // dispatch single-wg builds
        DISPATCHDIM_X = REG0.hi;
        dispatch_indirect opencl_build_kernel_DFS_single_wg_batch
            args( build_args.p_sah_globals,
                  build_args.p_scheduler );
    }

    // compute the number of builds not covered by the trivial passes
    // skip the builder loop if all builds are satisfied by trivial passes
    {
        REG1 = REG0.lo;
        REG2 = REG0.hi;
        REG3 = build_args.num_builds;
        REG5 = REG2 + REG1;     // builds handled by the trivial + single-WG passes
        REG5 = REG3 - REG5;     // builds still to do
        REG4 = REG5 == 0 ;

        goto l_done if (REG4.lo);
    }

    // REG5 holds the number of non-trivial builds at this point. Only the commented-out
    // "old implementation" at the end of this metakernel consumes it; the live qnode
    // phase overwrites REG5 with a value loaded from the scheduler.
    define REG_num_nontrivial REG5;

l_build_outer_loop:
    {
        // configure the scheduler to initiate a new block of builds
        dispatch opencl_build_kernel_BinnedSAH_begin_batched (1,1,1)
            args( build_args.p_scheduler,
                  build_args.p_sah_globals )
            postsync store_dword( p_scheduler_postsync, 0 );

        // wait on init kernel
        semaphore_wait while( *p_scheduler_postsync != 0 );

        // read results produced by scheduler init kernel
        //   lo == BFS wg count.  hi == all ones if we need to loop again
        //
        // REG4 must survive the inner loop (which uses REG0..REG3) because its .hi half
        // is tested at l_continue_outer_loop.
        REG0 = build_args.p_scheduler + offsetof(Scheduler.batched_build_wg_count);
        REG4 = load_qword( REG0 );

        // launch BFS1 pass1
        DISPATCHDIM_X = REG4.lo;
        dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass1_initial_batch
            args( build_args.p_scheduler,
                  build_args.p_sah_globals)
            postsync store_dword( p_scheduler_postsync, 1 );

        // wait on BFS pass1
        semaphore_wait while( *p_scheduler_postsync != 1 );

        // launch BFS pass2
        dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass2_initial_batch
            args( build_args.p_scheduler,
                  build_args.p_sah_globals )
            postsync store_dword( p_scheduler_postsync, 0 );

    l_build_loop:
        {
            semaphore_wait while( *p_scheduler_postsync != 0 );

            {
                dispatch opencl_build_kernel_BinnedSAH_scheduler(1,1,1)
                    args( build_args.p_scheduler,
                          build_args.p_sah_globals )
                    postsync store_dword( p_scheduler_postsync, 1 );

                // wait on the scheduler
                semaphore_wait while( *p_scheduler_postsync != 1 );
            }

            // load and process the scheduler results
            define REG_wg_counts REG0;
            define REG_num_bfs_wgs REG0.lo;
            define REG_num_dfs_wgs REG0.hi;
            define REG_loop_break REG1;
            define REG_p_scheduler REG2;
            {
                REG_p_scheduler = p_scheduler;
                REG_wg_counts = load_qword( REG_p_scheduler );

                define C_MASK_LO REG3 ;
                C_MASK_LO = 0xffffffff;

                // keep only the low dword (the BFS count) and test it against zero
                REG_loop_break = REG_wg_counts & C_MASK_LO;
                REG_loop_break = REG_loop_break == 0;
            }

            // dispatch new DFS WGs
            DISPATCHDIM_X = REG_num_dfs_wgs;
            dispatch_indirect opencl_build_kernel_BinnedSAH_DFS
                args( p_scheduler,
                      build_args.p_sah_globals );

            // jump out if there are no bfs WGs
            goto l_continue_outer_loop if (REG_loop_break);

            // dispatch new BFS1 WGs
            DISPATCHDIM_X = REG_num_bfs_wgs;
            dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass1_indexed_batch
                args( p_scheduler,
                      build_args.p_sah_globals )
                postsync store_dword( p_scheduler_postsync, 2 );

            semaphore_wait while( *p_scheduler_postsync != 2 );

            // dispatch new BFS2 WGs
            dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass2_indexed_batch
                args( p_scheduler,
                      build_args.p_sah_globals )
                postsync store_dword( p_scheduler_postsync, 0 );

            //goto l_build_loop if not(REG_num_dfs_wgs); //TODO: maybe add some logic to do "bool have_dfs_work" which will add some cycles but may be faster than checking semaphore

            // wait until all upcoming DFS WGs have finished launching
            // so that the scheduler can refill the launch array
            // TODO_OPT: Look at replacing this with a ring buffer so that scheduler stalls instead (and only rarely)
            semaphore_wait while( *p_num_dfs_wgs != 0 );

            goto l_build_loop;
        }

    l_continue_outer_loop:

        // start another block of builds if the begin kernel asked for one
        goto l_build_outer_loop if(REG4.hi);
    }

    ////////
    //
    // Qnode build phase
    //
    ////////

    // Wait for all outstanding DFS dispatches to complete, then build the QNodes
    control( wait_idle );

    // Note: these aliases rebind names used in the build loop above
    // (REG_wg_counts moves from REG0 to REG1).
    define REG_wg_counts REG1;
    define REG_p_scheduler REG2;
    define REG_have_work REG3;
    define REG_GRB_NUM_MAX_ENTRIES REG4;

    // init scheduler for qnode phase
    dispatch opencl_build_kernel_BinnedSAH_qnode_init_scheduler_batched(1,1,1)
        args( build_args.p_scheduler,
              build_args.num_builds,
              build_args.num_max_qnode_global_root_buffer_entries);

    REG_p_scheduler = p_scheduler;

    control( wait_idle );

    REG_wg_counts = load_qword( REG_p_scheduler );

    DISPATCHDIM_X = REG_wg_counts.lo;

    // configure the scheduler to initiate a new block of builds
    dispatch_indirect opencl_build_kernel_BinnedSAH_qnode_begin_batched
        args( build_args.p_scheduler,
              build_args.p_sah_globals);

    // read results produced by init scheduler kernel
    //   lo == num of builds processed.  hi == num of maximum global root buffer entries
    //
    REG0 = build_args.p_scheduler + offsetof(Scheduler.batched_build_wg_count);
    REG5 = load_qword( REG0 );

    // widen the entry limit to a full qword so it can be compared against REG_wg_counts
    REG_GRB_NUM_MAX_ENTRIES.lo = REG5.hi;
    REG_GRB_NUM_MAX_ENTRIES.hi = 0;

l_qnode_loop:
    {
        control( wait_idle ); // wait for previous pass

        dispatch opencl_build_kernel_BinnedSAH_qnode_scheduler(1,1,1)
            args( build_args.p_scheduler );

        control( wait_idle );

        REG_wg_counts = load_qword( REG_p_scheduler );

        // finished once the qnode scheduler reports nothing left to launch
        REG_have_work = REG_wg_counts > 0;
        goto l_done if not(REG_have_work.lo);

        DISPATCHDIM_X = REG_wg_counts.lo;

        dispatch_indirect opencl_build_kernel_BinnedSAH_BuildQNodes_Amplify_batch
            args( build_args.p_sah_globals,
                  build_args.p_scheduler );

        control( wait_idle );

        REG_wg_counts = load_qword( REG_p_scheduler ); // reload values
        // move the high dword into a zero-extended qword for the comparison below
        REG_wg_counts.lo = REG_wg_counts.hi;
        REG_wg_counts.hi = 0;

        // run the fill pass only while that value is below the entry limit
        REG_have_work = REG_wg_counts < REG_GRB_NUM_MAX_ENTRIES;
        goto l_qnode_loop if not(REG_have_work.lo);

        DISPATCHDIM_X = REG5.lo; // dispatch single workgroup for each build scheduled

        dispatch_indirect opencl_build_kernel_BinnedSAH_BuildQnodes_TryToFillGRB_batched
            args( build_args.p_sah_globals,
                  build_args.p_scheduler );

        goto l_qnode_loop;
    }

    ////////
    //
    // Old implementation - TODO: maybe add switch between two implementations?
    //
    ////////

    // Wait for all outstanding DFS dispatches to complete, then build the QNodes

    //DISPATCHDIM_X = REG5.lo;

    //dispatch_indirect opencl_build_kernel_BinnedSAH_BuildQNodes
    //    args( build_args.p_sah_globals, build_args.p_scheduler );

l_done:

    control( wait_idle );
}