//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//

module build_refit;

// Kernels from bvh_build_refit.cl that the metakernels in this file dispatch.
kernel_module morton_kernels ("bvh_build_refit.cl")
{
    links lsc_intrinsics;

    kernel update_instance_leaves          < kernelFunction="update_instance_leaves" >;
    kernel refit_indirect_sg               < kernelFunction="Refit_indirect_sg" >;
    kernel update_instance_leaves_indirect < kernelFunction="update_instance_leaves_indirect" >;
}

// Divisor used to turn an instance count into a group count for the
// update_instance_leaves dispatches.
const INSTANCE_LEAF_GROUP_SIZE = 16;
// Only referenced by the commented-out refit metakernel further down.
const REFIT_GROUP_SIZE = 8;

// Dispatches update_instance_leaves with
//   ceil(num_instances / INSTANCE_LEAF_GROUP_SIZE) x 1 x 1 groups.
// num_instances only sizes the dispatch; it is not forwarded to the kernel.
metakernel update_instance_leaves(
    qword bvh,
    qword dxrInstancesArray,
    qword dxrInstancesPtrArray,
    qword instance_leaf_aabbs,
    dword num_instances )
{
    define num_groups (num_instances + INSTANCE_LEAF_GROUP_SIZE - 1) / INSTANCE_LEAF_GROUP_SIZE;
    dispatch update_instance_leaves(num_groups, 1, 1) args(
        bvh,
        dxrInstancesArray,
        dxrInstancesPtrArray,
        instance_leaf_aabbs);
}

// Indirect variant: the element count is read from the first dword at
// indirectBuildRangeInfo and the group count
//   (count + 15) >> 4  ==  ceil(count / INSTANCE_LEAF_GROUP_SIZE)
// is computed in registers before the indirect dispatch.
// indirectBuildRangeInfo is also forwarded to the kernel.
metakernel update_instance_leaves_indirect(
    qword bvh,
    qword dxrInstancesArray,
    qword dxrInstancesPtrArray,
    qword instance_leaf_aabbs,
    qword indirectBuildRangeInfo)
{
    define num_groups  REG0;
    define groupsize_1 REG1; // groupsize - 1
    define C_4         REG2;

    // init with primitiveCount
    num_groups = load_dword(indirectBuildRangeInfo);
    // Only 32 bits are loaded. Clear the upper half of REG0 (aliased by
    // num_groups) so the 64-bit shift below cannot pull stale upper bits into
    // .lo. This matches what refit_indirect_sg does after its load_dword.
    REG0.hi = 0;

    groupsize_1 = 15; // INSTANCE_LEAF_GROUP_SIZE - 1
    C_4 = 4;          // log_2(INSTANCE_LEAF_GROUP_SIZE)

    num_groups = num_groups + groupsize_1;
    num_groups = num_groups >> C_4; // num_groups / INSTANCE_LEAF_GROUP_SIZE;

    DISPATCHDIM_X = num_groups.lo;
    DISPATCHDIM_Y = 1;
    DISPATCHDIM_Z = 1;

    dispatch_indirect update_instance_leaves_indirect args(
        bvh,
        dxrInstancesArray,
        dxrInstancesPtrArray,
        instance_leaf_aabbs,
        indirectBuildRangeInfo);
}

/*
metakernel refit(
    qword bvh,
    qword geomDesc,
    qword instance_aabbs,
    dword dispatchSize )
{
    define num_groups (dispatchSize + REFIT_GROUP_SIZE - 1) / REFIT_GROUP_SIZE;
    dispatch refit(num_groups, 1, 1) args(
        bvh,
        geomDesc,
        instance_aabbs);
}

const REFIT_SIMD_SIZE = 8;
const REFIT_SIMD_SIZE_SHIFT = 3;

metakernel refit_indirect(
    qword bvh,
    qword bvh_inner_nodes_start_value,
    qword
bvh_inner_nodes_end,
    qword geomDesc,
    qword instance_aabbs )
{
    define cRoundingSIMD REG4;
    define TWO REG3;
    define ONE REG5;
    cRoundingSIMD = (REFIT_SIMD_SIZE - 1);
    TWO = 2;
    ONE = 1;

    REG0 = bvh_inner_nodes_start_value;
    REG1 = load_dword(bvh_inner_nodes_end);
    REG1.hi = 0;
    REG2 = REG1 - REG0;
    REG2 = REG2 + cRoundingSIMD;
    REG2 = REG2 >> TWO; // JDB: >>3 must be implemented as >>2 then >>1 because command streamer
    REG2 = REG2 >> ONE; //    only supports pow2 shifts because somebody wanted to save area.

    DISPATCHDIM_X = REG2.lo;
    DISPATCHDIM_Y = 1;
    DISPATCHDIM_Z = 1;

    dispatch_indirect refit_indirect args(
        bvh,
        geomDesc,
        instance_aabbs);
}
*/

// Indirect dispatch of refit_indirect_sg.
//   DISPATCHDIM_X = (dword read from bvh_inner_nodes_end) - bvh_inner_nodes_start_value
//   DISPATCHDIM_Y = DISPATCHDIM_Z = 1
// Only bvh, geomDesc and instance_aabbs are forwarded to the kernel; the two
// inner-node arguments are used solely to size the dispatch.
metakernel refit_indirect_sg(
    qword bvh,
    qword bvh_inner_nodes_start_value,
    qword bvh_inner_nodes_end,
    qword geomDesc,
    qword instance_aabbs )
{
    // Y and Z extents are constant; only X is derived from the node range.
    DISPATCHDIM_Y = 1;
    DISPATCHDIM_Z = 1;

    // End of the range: a 32-bit value fetched from memory, with the upper
    // half of the register cleared before the 64-bit subtraction.
    REG1.lo = load_dword(bvh_inner_nodes_end);
    REG1.hi = 0;

    // Start of the range is passed by value.
    REG0 = bvh_inner_nodes_start_value;

    REG2 = REG1 - REG0;
    DISPATCHDIM_X = REG2.lo;

    dispatch_indirect refit_indirect_sg args(
        bvh,
        geomDesc,
        instance_aabbs);
}

/*
////////////////////////////////////////////////////////////////
// constructing treelets
// phase 1: mark nodes that will be roots of bottom treelets
//          also for each node leave a number of startpoints that are under it and max depth of the path from the node
metakernel find_refit_treelets(
    qword bvh,
    qword treelet_node_data,
    qword scratch_startpoints,
    qword startpointAlloc,
    qword bvh_inner_nodes_start_value,
    qword bvh_inner_nodes_end )
{
    define cRoundingSIMD REG4;
    define TWO REG3;
    define ONE REG5;
    cRoundingSIMD = (REFIT_SIMD_SIZE - 1);
    TWO = 2;
    ONE = 1;

    REG0 = bvh_inner_nodes_start_value;
    REG1.lo = load_dword(bvh_inner_nodes_end);
    REG1.hi = 0;
    REG2 = REG1 - REG0;
    REG2 = REG2 + cRoundingSIMD;
    REG2 = REG2 >> TWO; // JDB: >>3 must be implemented as >>2 then >>1 because command streamer
    REG2 = REG2 >> ONE; //    only supports pow2 shifts because somebody wanted to save area.
DISPATCHDIM_X = REG2.lo; DISPATCHDIM_Y = 1; DISPATCHDIM_Z = 1; dispatch_indirect find_refit_treelets args( bvh, treelet_node_data, scratch_startpoints, startpointAlloc); } //////////////////////////////////////////////////////////////// // constructing treelets // phase 2 totally parallel, run threads up to assign startpoints to given treelet // metakernel assign_refit_startpoints_to_treelets( qword bvh, qword treelet_node_data, qword scratch_startpoints, qword bvh_inner_nodes_start_value, qword bvh_inner_nodes_end ) { define cRoundingSIMD REG4; define TWO REG3; define ONE REG5; cRoundingSIMD = (REFIT_SIMD_SIZE - 1); TWO = 2; ONE = 1; REG0 = bvh_inner_nodes_start_value; REG1.lo = load_dword(bvh_inner_nodes_end); REG1.hi = 0; REG2 = REG1 - REG0; REG2 = REG2 + cRoundingSIMD; REG2 = REG2 >> TWO; // JDB: >>3 must be implemented as >>2 then >>1 because command streamer REG2 = REG2 >> ONE; // only supports pow2 shifts because somebody wanted to save area. DISPATCHDIM_X = REG2.lo; DISPATCHDIM_Y = 1; DISPATCHDIM_Z = 1; dispatch_indirect assign_refit_startpoints_to_treelets args( bvh, treelet_node_data, scratch_startpoints); } //////////////////////////////////////////////////////////////// // constructing treelets // phase 3 local work: group per treelet, sort the startpoints in treelets // by length of the path metakernel finalize_treelets_in_groups( qword bvh, qword scratch_startpoints, qword ptrNumTreelets ) { REG0 = load_qword(ptrNumTreelets); DISPATCHDIM_X = REG0.lo; DISPATCHDIM_Y = 1; DISPATCHDIM_Z = 1; dispatch_indirect finalize_treelets_in_groups args( bvh, scratch_startpoints); } //////////////////////////////////////////////////////////////// // Updating treelets // phase 1 update vertex and generate boxes for vertices // const PER_GROUP_ELEMENTS_ROUNDING = 15; const PER_GROUP_ELEMENTS_SHIFT = 4; metakernel init_treelets_refit(qword pSquashGroupsCountToReset) { REG1 = 0; store_qword(pSquashGroupsCountToReset, REG1); DISPATCHDIM_Y = 1; DISPATCHDIM_Z = 1; //REG4 = 
PER_GROUP_ELEMENTS_SHIFT; //REG5.hi = PER_GROUP_ELEMENTS_ROUNDING; //REG5.lo = 0; } metakernel update_quads( qword scratch_box, qword bvh, qword input, dword numPrimsDividedBy32, qword bigSquashInput) { //REG0 = load_qword(quads_nodes_begin_end_pair); //REG1.hi = REG0.lo; // this holds inner nodes begin //REG2 = REG0 - REG1; //REG2 = REG2 + REG5; //REG2 = REG2 >> REG4; //DISPATCHDIM_X = REG2.hi; dispatch refit_quads(numPrimsDividedBy32, 1, 1) args( bvh, input, scratch_box, numPrimsDividedBy32, bigSquashInput ); } // //////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////// // // phase 1 or 2 - update primitives as well as bottom up refit internal nodes // in single dispatch (in single group per tree) metakernel refit_tree_by_group_including_quads( qword squashed_inputs, dword numBuilds ) { dispatch refit_tree_per_group(numBuilds, 1, 1) args( squashed_inputs); } // //////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////// // // phase 2 bottom up refit internal nodes // metakernel refit_treelet_per_group( qword bigSquashInput, qword ptrNumTreelets) { DISPATCHDIM_X = load_dword(ptrNumTreelets); dispatch_indirect refit_treelet_per_group args( bigSquashInput); } // //////////////////////////////////////////////////////////////// #endif */