//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//

module atomic_update;

// Kernel module built from "atomic_update.cl". Each `kernel` line below binds
// a name usable in dispatch statements to the kernel function of the same
// name; every kernel declared here is dispatched by a metakernel in this file.
kernel_module atomic_update ("atomic_update.cl")
{    
    links lsc_intrinsics;

    // Refit kernels.
    kernel init_refit_scratch   < kernelFunction = "init_refit_scratch"  >;
    kernel traverse_aabbs_quad  < kernelFunction = "traverse_aabbs_quad" >;
    kernel write_inner_nodes    < kernelFunction = "write_inner_nodes"   >;
    kernel build_fatleaf_table  < kernelFunction = "build_fatleaf_table" >;
    kernel build_innernode_table < kernelFunction = "build_innernode_table" >;

    // Dispatched with fixed dimensions (1,1,1) by the metakernel of the same name.
    kernel update_single_group_quads < kernelFunction = "update_single_group_quads" >;

    // Kernels used by the "*_new_update" metakernels and by fixup_quad_table.
    kernel build_fatleaf_table_new_update  < kernelFunction = "build_fatleaf_table_new_update" >;
    kernel fixup_quad_table  < kernelFunction = "fixup_quad_table" >;
    kernel traverse_aabbs_new_update  < kernelFunction = "traverse_aabbs_new_update" >;
    kernel traverse_aabbs_new_update_single_geo  < kernelFunction = "traverse_aabbs_new_update_single_geo" >;
}

import struct MKBuilderState "structs.grl";

// This metakernel only initializes registers for use in a batching loop by
// "init_refit_scratch". That metakernel writes only REG0.lo and DISPATCHDIM_X
// and reads REG1..REG4 without setting them, so everything else it depends on
// is seeded here.
metakernel init_refit_scratch_metakernel_registers()
{
    REG0.hi = 0;   // init_refit_scratch loads only REG0.lo but does arithmetic on all of REG0
    REG1 = 3;      // aliased as C_3 in init_refit_scratch: the fixed offset that is subtracted
    REG2 = 63;     // aliased as C_63: rounding term for the divide by 64
    REG3 = 4;      // aliased as C_4: first shift amount
    REG4 = 2;      // aliased as C_2: second shift amount (4 + 2 == 6, i.e. /64)

    // init_refit_scratch sets only the X dimension before its indirect dispatch.
    DISPATCHDIM_Y = 1;
    DISPATCHDIM_Z = 1;
}

// Indirectly dispatches the "init_refit_scratch" kernel with
//     DISPATCHDIM_X = ( load_dword(bvh_base + 12) - 3 + 63 ) / 64
//
// This metakernel does not load its constants: REG1..REG4, REG0.hi and
// DISPATCHDIM_Y/Z are used with whatever values they hold on entry. In this
// file init_refit_scratch_metakernel_registers() sets them to 3, 63, 4, 2, 0
// and 1/1 respectively.
metakernel init_refit_scratch( qword bvh_base, qword scratch )
{
    // Aliases for the pre-loaded constant registers.
    define FIXED_OFFSET_3  REG1;
    define ROUND_UP_63     REG2;
    define SHIFT_BY_4      REG3;
    define SHIFT_BY_2      REG4;

    // TODO: the byte offset 12 is hard-coded.
    REG0.lo = load_dword( bvh_base + 12 );

    REG0 = REG0 - FIXED_OFFSET_3;   // nodedataCurr - fixed offset
    REG0 = REG0 + ROUND_UP_63;      // round up to the next multiple of 64

    // Divide by 64, applied as two successive shifts (4, then 2).
    REG0 = REG0 >> SHIFT_BY_4;
    REG0 = REG0 >> SHIFT_BY_2;

    DISPATCHDIM_X = REG0.lo;

    dispatch_indirect init_refit_scratch
        args(bvh_base,scratch);

}

// Indirectly dispatches "build_fatleaf_table" and then "build_innernode_table",
// both with dimensions
//     ( ( load_dword(bvh_base + 12) - 3 + 63 ) / 64, 1, 1 )
//
// Self-contained: every register read below is written here first.
metakernel build_node_tables( qword bvh_base )
{
    // Clear the upper dword explicitly and load only the lower one, the same
    // way the batched metakernels in this file prepare REG0. The arithmetic
    // below operates on the whole register and the right shifts move bits of
    // REG0.hi into REG0.lo, so the result must not depend on how a dword load
    // into a full-width register treats the upper half.
    REG0.hi = 0;
    REG0.lo = load_dword( bvh_base + 12 ); // TODO: DON'T HARDCODE!!
    REG1 = 2;
    REG2 = 63;
    REG3 = 4;
    REG4 = 3;  // fixed offset... TODO: DON'T HARDCODE!!

    REG0 = REG0 - REG4; // nodedataCurr - fixed offset
    REG0 = REG0 + REG2; // + 63, rounds the divide below upwards
    REG0 = REG0 >> REG3; // >> 4
    REG0 = REG0 >> REG1; // >> 2; together with the line above: >> 6 == /64

    DISPATCHDIM_X = REG0.lo;
    DISPATCHDIM_Y = 1;
    DISPATCHDIM_Z = 1;

    // Both dispatches reuse the same DISPATCHDIM_* values.
    dispatch_indirect build_fatleaf_table
        args(bvh_base);
    dispatch_indirect build_innernode_table
        args(bvh_base);
}

// Indirectly dispatches "build_fatleaf_table_new_update" and then
// "build_innernode_table", both with dimensions
//     ( ( load_dword(bvh_base + 12) - 3 + 63 ) / 64, 1, 1 )
//
// Only state.build_globals is read from `state`; it is passed to the first
// kernel. Self-contained: every register read below is written here first.
metakernel build_node_tables_new_update( MKBuilderState state, qword bvh_base )
{
    // Clear the upper dword explicitly and load only the lower one, the same
    // way the batched metakernels in this file prepare REG0. The arithmetic
    // below operates on the whole register and the right shifts move bits of
    // REG0.hi into REG0.lo, so the result must not depend on how a dword load
    // into a full-width register treats the upper half.
    REG0.hi = 0;
    REG0.lo = load_dword( bvh_base + 12 ); // TODO: DON'T HARDCODE!!
    REG1 = 2;
    REG2 = 63;
    REG3 = 4;
    REG4 = 3;  // fixed offset... TODO: DON'T HARDCODE!!

    REG0 = REG0 - REG4; // nodedataCurr - fixed offset
    REG0 = REG0 + REG2; // + 63, rounds the divide below upwards
    REG0 = REG0 >> REG3; // >> 4
    REG0 = REG0 >> REG1; // >> 2; together with the line above: >> 6 == /64

    DISPATCHDIM_X = REG0.lo;
    DISPATCHDIM_Y = 1;
    DISPATCHDIM_Z = 1;

    // Both dispatches reuse the same DISPATCHDIM_* values.
    dispatch_indirect build_fatleaf_table_new_update
        args(state.build_globals, bvh_base);
    dispatch_indirect build_innernode_table
        args(bvh_base);
}

// Dispatches the "fixup_quad_table" kernel with fixed dimensions (2,1,1),
// passing bvh_base as its only argument. No registers are read or written.
metakernel fixup_quad_table( qword bvh_base )
{
    dispatch  fixup_quad_table(2,1,1)
        args(bvh_base);
}

// This metakernel only initializes registers for use in a batching loop by
// "traverse_aabbs_quad" and "write_inner_nodes".
// NOTE(review): "traverse_aabbs_new_update" and
// "traverse_aabbs_new_update_single_geo" also read REG3 and REG6 (and rely on
// REG0.hi and DISPATCHDIM_Y/Z) without setting them, so they presumably need
// this metakernel to have run as well - confirm against the callers.
metakernel init_traverse_aabbs_quad_and_write_inner_nodes()
{
    REG0.hi = 0;   // the consumers load only REG0.lo but do arithmetic on all of REG0
    REG1 = 1;      // C_1 in traverse_aabbs_quad and write_inner_nodes: final shift amount
    REG2 = 31;     // C_31 in traverse_aabbs_quad: rounding term for /32
    REG3 = 4;      // C_4 in traverse_aabbs_quad and both traverse_aabbs_new_update* metakernels
    REG4 = 2;      // C_2 in write_inner_nodes: first shift amount
    REG5 = 7;      // C_7 in write_inner_nodes: rounding term for /8
    REG6 = 255;    // C_255 in both traverse_aabbs_new_update* metakernels: rounding term for /256
    // The consumers set only DISPATCHDIM_X before their indirect dispatch.
    DISPATCHDIM_Y = 1;
    DISPATCHDIM_Z = 1;
}

// Indirectly dispatches the "traverse_aabbs_quad" kernel with
//     DISPATCHDIM_X = ( load_dword(bvh_base + 64) + 31 ) / 32
//
// REG1, REG2, REG3, REG0.hi and DISPATCHDIM_Y/Z are not written here; in this
// file init_traverse_aabbs_quad_and_write_inner_nodes() sets them to 1, 31, 4,
// 0 and 1/1 respectively.
metakernel traverse_aabbs_quad( qword bvh_base, qword scratch, qword geos )
{
    // Aliases for the pre-loaded constant registers.
    define SHIFT_BY_1   REG1;
    define ROUND_UP_31  REG2;
    define SHIFT_BY_4   REG3;

    // TODO: the byte offset 64 is hard-coded.
    REG0.lo = load_dword( bvh_base + 64 );

    REG0 = REG0 + ROUND_UP_31;   // round up to the next multiple of 32

    // Divide by 32, applied as two successive shifts (4, then 1).
    REG0 = REG0 >> SHIFT_BY_4;
    REG0 = REG0 >> SHIFT_BY_1;

    DISPATCHDIM_X = REG0.lo;

    dispatch_indirect traverse_aabbs_quad
        args(bvh_base,scratch,geos);
}

// Indirectly dispatches the "write_inner_nodes" kernel with
//     DISPATCHDIM_X = ( load_dword(bvh_base + 68) + 7 ) / 8
//
// REG1, REG4, REG5, REG0.hi and DISPATCHDIM_Y/Z are not written here; in this
// file init_traverse_aabbs_quad_and_write_inner_nodes() sets them to 1, 2, 7,
// 0 and 1/1 respectively.
metakernel write_inner_nodes( qword bvh_base, qword scratch )
{
    // Aliases for the pre-loaded constant registers.
    define SHIFT_BY_1  REG1;
    define SHIFT_BY_2  REG4;
    define ROUND_UP_7  REG5;

    // TODO: the byte offset 68 is hard-coded.
    REG0.lo = load_dword( bvh_base + 68 );

    REG0 = REG0 + ROUND_UP_7;   // round up to the next multiple of 8

    // Divide by 8, applied as two successive shifts (2, then 1).
    REG0 = REG0 >> SHIFT_BY_2;
    REG0 = REG0 >> SHIFT_BY_1;

    DISPATCHDIM_X = REG0.lo;

    dispatch_indirect write_inner_nodes
        args(bvh_base,scratch);
}

// Dispatches the "update_single_group_quads" kernel with fixed dimensions
// (1,1,1), passing bvh_base, geos and aabbs. No registers are read or written.
metakernel update_single_group_quads( qword bvh_base, qword geos, qword aabbs  )
{
    // The expression in the trailing comment is commented out and not used;
    // the dispatch size is the literal (1,1,1).
    dispatch  update_single_group_quads(1,1,1) //( (max_inner_nodes+1)/2, 1, 1 )
        args(bvh_base,geos,aabbs);
}

// Indirectly dispatches the "traverse_aabbs_new_update" kernel with
//     DISPATCHDIM_X = ( load_dword(bvh_base + 84) + 255 ) / 256
//
// REG3, REG6, REG0.hi and DISPATCHDIM_Y/Z are not written here; in this file
// init_traverse_aabbs_quad_and_write_inner_nodes() sets them to 4, 255, 0 and
// 1/1 respectively.
metakernel traverse_aabbs_new_update( qword bvh_base, qword geos, qword scratch )
{
    // Aliases for the pre-loaded constant registers.
    define SHIFT_BY_4    REG3;
    define ROUND_UP_255  REG6;

    // TODO: the byte offset 84 is hard-coded.
    REG0.lo = load_dword( bvh_base + 84 );

    REG0 = REG0 + ROUND_UP_255;   // round up to the next multiple of 256

    // Divide by 256, applied as two successive shifts by 4.
    REG0 = REG0 >> SHIFT_BY_4;
    REG0 = REG0 >> SHIFT_BY_4;

    DISPATCHDIM_X = REG0.lo;

    dispatch_indirect traverse_aabbs_new_update
        args(bvh_base, geos, scratch);
}

// Indirectly dispatches the "traverse_aabbs_new_update_single_geo" kernel with
//     DISPATCHDIM_X = ( load_dword(bvh_base + 84) + 255 ) / 256
// All five metakernel arguments are forwarded to the kernel unchanged.
//
// REG3, REG6, REG0.hi and DISPATCHDIM_Y/Z are not written here; in this file
// init_traverse_aabbs_quad_and_write_inner_nodes() sets them to 4, 255, 0 and
// 1/1 respectively.
metakernel traverse_aabbs_new_update_single_geo( qword bvh_base, qword vertices, qword geos, qword scratch, dword vertex_format )
{
    // Aliases for the pre-loaded constant registers.
    define SHIFT_BY_4    REG3;
    define ROUND_UP_255  REG6;

    // TODO: the byte offset 84 is hard-coded.
    REG0.lo = load_dword( bvh_base + 84 );

    REG0 = REG0 + ROUND_UP_255;   // round up to the next multiple of 256

    // Divide by 256, applied as two successive shifts by 4.
    REG0 = REG0 >> SHIFT_BY_4;
    REG0 = REG0 >> SHIFT_BY_4;

    DISPATCHDIM_X = REG0.lo;

    dispatch_indirect traverse_aabbs_new_update_single_geo
        args(bvh_base, vertices, geos, scratch, vertex_format);
}