//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//

module new_sah_builder;

// Kernels built from bvh_build_BFS.cl.  Each declaration binds a GRL kernel
// name (the name used by the dispatch statements in this module) to the
// OpenCL entry point given by kernelFunction.
kernel_module bfs_kernels ("bvh_build_BFS.cl")
{
    links lsc_intrinsics;

    // BFS pass 1 / pass 2, used by new_sah_build
    kernel opencl_build_kernel_BinnedSAH_BFS_pass1_initial < kernelFunction="BFS_pass1_initial" >;
    kernel opencl_build_kernel_BinnedSAH_BFS_pass1_indexed < kernelFunction="BFS_pass1_indexed" >;
    kernel opencl_build_kernel_BinnedSAH_BFS_pass2_initial < kernelFunction="BFS_pass2_initial" >;
    kernel opencl_build_kernel_BinnedSAH_BFS_pass2_indexed < kernelFunction="BFS_pass2_indexed" >;

    // DFS, qnode construction and scheduling
    kernel opencl_build_kernel_BinnedSAH_DFS < kernelFunction="DFS" >;
    // disabled: kernel opencl_build_kernel_BinnedSAH_BuildQNodes < kernelFunction="build_qnodes" >;
    kernel opencl_build_kernel_BinnedSAH_BuildQNodes_Kickoff < kernelFunction="build_qnodes_pc_kickoff" >;
    kernel opencl_build_kernel_BinnedSAH_BuildQNodes_Amplify < kernelFunction="build_qnodes_pc_amplify" >;
    kernel opencl_build_kernel_BinnedSAH_begin < kernelFunction="begin" >;
    kernel opencl_build_kernel_BinnedSAH_scheduler < kernelFunction="scheduler" >;

    // BFS pass 1 / pass 2, used by new_sah_build_batchable
    kernel opencl_build_kernel_BinnedSAH_BFS_pass1_initial_batch < kernelFunction="BFS_pass1_initial_batchable" >;
    kernel opencl_build_kernel_BinnedSAH_BFS_pass1_indexed_batch < kernelFunction="BFS_pass1_indexed_batchable" >;
    kernel opencl_build_kernel_BinnedSAH_BFS_pass2_initial_batch < kernelFunction="BFS_pass2_initial_batchable" >;
    kernel opencl_build_kernel_BinnedSAH_BFS_pass2_indexed_batch < kernelFunction="BFS_pass2_indexed_batchable" >;

    // batched build setup
    kernel opencl_build_kernel_BinnedSAH_categorize_builds_and_init_scheduler < kernelFunction="categorize_builds_and_init_scheduler" >;
    kernel opencl_build_kernel_BinnedSAH_begin_batched < kernelFunction="begin_batchable" >;

    // batched qnode phase
    kernel opencl_build_kernel_BinnedSAH_qnode_init_scheduler_batched < kernelFunction="build_qnodes_init_scheduler_batched" >;
    kernel opencl_build_kernel_BinnedSAH_qnode_begin_batched < kernelFunction="build_qnodes_begin_batchable" >;
    kernel opencl_build_kernel_BinnedSAH_qnode_scheduler < kernelFunction="build_qnodes_scheduler" >;
    kernel opencl_build_kernel_BinnedSAH_BuildQNodes_Amplify_batch < kernelFunction="build_qnodes_pc_amplify_batched" >;
    kernel opencl_build_kernel_BinnedSAH_BuildQnodes_TryToFillGRB_batched < kernelFunction="build_qnodes_try_to_fill_grb_batched" >;
}

// Stand-alone kernels taken from bvh_build_DFS.cl.
// The first two are dispatched by new_sah_build for small builds; the
// *_batch variants are dispatched by new_sah_build_batchable.
kernel opencl_build_kernel_DFS_single_wg < source="bvh_build_DFS.cl", kernelFunction="DFS_single_wg" >
kernel opencl_build_kernel_DFS_trivial < source="bvh_build_DFS.cl", kernelFunction="DFS_trivial" >
kernel opencl_build_kernel_DFS_single_wg_batch < source="bvh_build_DFS.cl", kernelFunction="DFS_single_wg_batchable" >
kernel opencl_build_kernel_DFS_trivial_batch < source="bvh_build_DFS.cl", kernelFunction="DFS_trivial_batchable" >

// Kernel dispatched by the single_pass_binsah metakernel, which has the same name.
kernel single_pass_binsah < source="bvh_build_DFS.cl", kernelFunction="DFS" >


// new_sah_build: primitive counts <= DFS_MIN_PRIMREFS take the DFS_trivial path.
const DFS_MIN_PRIMREFS = 6;
// new_sah_build: primitive counts <= DFS_MAX_PRIMREFS take the DFS_single_wg path.
const DFS_MAX_PRIMREFS = 256;
// 1 << 9 == 512, the BFS workgroup size that new_sah_build hard-codes.
// NOTE(review): not referenced by the metakernels in this file, which spell
// the rounding constant (511) and the shift (8, then 1) out by hand.
const BFS_WG_SIZE_SHIFT = 9;


// Layout of the scheduler block addressed through p_scheduler.
// The metakernels read the fields in pairs with load_qword: the first field
// of each pair lands in the .lo half of the register, the second in .hi.
struct Scheduler
{
    // qword at offset 0: workgroup counts read back after each scheduler pass.
    // The batched qnode phase also reads this same qword for its own counts.
    dword num_bfs_wgs;                // .lo - a value of 0 ends the BFS/DFS build loop
    dword num_dfs_wgs;                // .hi - also polled until it reads 0 before looping

    dword scheduler_postsync;         // semaphore dword: written by postsync stores, polled by semaphore_wait
    dword _pad1;                      // zeroed together with scheduler_postsync by new_sah_build's store_qword

    // qword read after the categorization pass in new_sah_build_batchable
    dword num_trivial_builds;         // .lo - dispatch width for DFS_trivial_batch
    dword num_single_builds;          // .hi - dispatch width for DFS_single_wg_batch

    // qword read after begin_batched, and again at the start of the batched qnode phase
    dword batched_build_wg_count;     // .lo
    dword batched_build_loop_mask;    // .hi - non-zero makes the outer build loop run again
};


// Arguments of the new_sah_build metakernel.
struct SAHBuildArgs
{
    qword p_num_primitives;           // address the primitive count is loaded from (load_dword)
    qword p_qnode_child_buffer;       // forwarded to the begin, Kickoff and Amplify kernels
    qword p_scheduler;                // address of the Scheduler block
    qword p_sah_globals;              // forwarded to the BFS/DFS/scheduler/qnode kernels
    qword p_globals;                  // forwarded to the DFS_trivial, DFS_single_wg and begin kernels
    qword p_primref_buffer;           // forwarded to the DFS_trivial, DFS_single_wg and begin kernels
    qword p_primref_index_buffers;    // forwarded to the DFS_trivial, DFS_single_wg and begin kernels
    qword p_bvh_base;                 // forwarded to the DFS_trivial, DFS_single_wg and begin kernels
    qword p_bvh2;                     // forwarded to the begin kernel
    qword p_root_buffer_counters;     // qnode loop: qword +0 is read (produced), +8 and +16 are written
    dword sah_build_flags;            // forwarded to the DFS, begin and qnode kernels
    dword leaf_size;                  // forwarded to the begin kernel
    dword leaf_type;                  // forwarded to the begin kernel
    dword max_internal_nodes;         // not read by new_sah_build in this file
};


// Performs the build with a single (1,1,1) dispatch of the single_pass_binsah
// kernel, handing all five arguments through unchanged and in order.
metakernel single_pass_binsah(
    qword build_globals,
    qword bvh_buffer,
    qword build_primref_buffer,
    qword build_primref_index_buffers,
    dword alloc_backpointers )
{
    dispatch single_pass_binsah (1,1,1)
        args( build_globals,
              bvh_buffer,
              build_primref_buffer,
              build_primref_index_buffers,
              alloc_backpointers );
}


// Builds a single BVH with the binned-SAH builder.
//
// The primitive count is loaded from build_args.p_num_primitives and selects
// one of three paths:
//   count <= DFS_MIN_PRIMREFS : one DFS_trivial workgroup
//   count <= DFS_MAX_PRIMREFS : one DFS_single_wg workgroup
//   otherwise                 : BFS/DFS scheduling loop, then the P/C qnode build
//
// Synchronisation in the full path uses Scheduler.scheduler_postsync: each
// dispatch that must complete before the next step carries a postsync store of
// a distinct value, and the command streamer waits for that value.
//
// Register use: REG0 holds the primitive count, then the BFS workgroup count,
// then (inside the loop) the scheduler's workgroup counts, and finally the
// counter address of the qnode loop.  REG1..REG7 are scratch per phase.
metakernel new_sah_build( SAHBuildArgs build_args )
{
    define REG_num_prims    REG0;

    // choose the build path from the primitive count
    {
        define C_MIN_PRIMREFS           REG1;
        define C_MAX_PRIMREFS           REG2;
        define REG_dispatch_trivial     REG3;
        define REG_dispatch_single_wg   REG4;

        REG_num_prims  = load_dword( build_args.p_num_primitives );
        C_MIN_PRIMREFS = DFS_MIN_PRIMREFS;
        C_MAX_PRIMREFS = DFS_MAX_PRIMREFS;

        REG_dispatch_trivial   = REG_num_prims <= C_MIN_PRIMREFS;
        REG_dispatch_single_wg = REG_num_prims <= C_MAX_PRIMREFS;

        // the trivial test comes first because it is the tighter bound
        goto l_dispatch_trivial   if(REG_dispatch_trivial.lo);
        goto l_dispatch_single_wg if(REG_dispatch_single_wg.lo);
        goto l_full_build;
    }

l_dispatch_trivial:
    {
        dispatch opencl_build_kernel_DFS_trivial    (1,1,1)
            args( build_args.p_globals,
                  build_args.p_bvh_base,
                  build_args.p_primref_buffer,
                  build_args.p_primref_index_buffers,
                  build_args.sah_build_flags
                  );

        control( wait_idle );
        goto l_done;
    }

l_dispatch_single_wg:
    {
        dispatch opencl_build_kernel_DFS_single_wg    (1,1,1)
            args( build_args.p_globals,
                  build_args.p_bvh_base,
                  build_args.p_primref_buffer,
                  build_args.p_primref_index_buffers,
                  build_args.sah_build_flags
                  );

        control( wait_idle );
        goto l_done;
    }


l_full_build:


    {
        define p_scheduler                  build_args.p_scheduler;
        // Parenthesized like p_scheduler_postsync below (and like the same define in
        // new_sah_build_batchable) so that '*p_num_dfs_wgs' always applies to the whole sum.
        define p_num_dfs_wgs                (build_args.p_scheduler + offsetof(Scheduler.num_dfs_wgs));
        define p_scheduler_postsync         (build_args.p_scheduler + offsetof(Scheduler.scheduler_postsync) );
        define C_0    REG1;
        define C_8    REG2;
        C_8 = 8;
        C_0 = 0;


        //
        //  Init pass
        //
        store_dword( p_scheduler_postsync, C_0.lo );

        // compute number of BFS WGs from prim-count
        // NOTE:  This code uses a hardcoded WG size of 512 for BFS
        //    If the BFS wg size ever changes, it needs to be touched
        //    This is necessary because DG2 shifter only supports POW2 shifts
        {
            define REG_scheduler_postsync    REG3;
            define C_511    REG4;
            define C_1      REG5;

            REG_scheduler_postsync = p_scheduler_postsync;
            C_511 = 511;
            C_1   = 1;

            // initialize scheduler semaphore (the qword store also zeroes Scheduler._pad1)
            store_qword( REG_scheduler_postsync, C_0 );

            // REG_num_prims becomes the BFS workgroup count: (num_prims + 511) >> 9,
            // with the shift by 9 split into a shift by 8 and a shift by 1
            REG_num_prims = REG_num_prims + C_511;
            REG_num_prims = REG_num_prims >> C_8;
            REG_num_prims = REG_num_prims >> C_1;

            // dispatch size consumed by the dispatch_indirect statements below
            DISPATCHDIM_X = REG_num_prims.lo;
            DISPATCHDIM_Y = 1;
            DISPATCHDIM_Z = 1;

            control( cs_store_fence ); // commit the semaphore write

            // launch scheduler init kernel
            dispatch opencl_build_kernel_BinnedSAH_begin (1,1,1)
                args(
                    build_args.p_scheduler,
                    build_args.leaf_size,
                    build_args.leaf_type,
                    build_args.p_primref_index_buffers,
                    build_args.p_primref_buffer,
                    build_args.p_bvh2,
                    build_args.p_bvh_base,
                    build_args.p_globals,
                    build_args.p_sah_globals,
                    build_args.p_qnode_child_buffer,
                    build_args.sah_build_flags
                )
                postsync store_dword( p_scheduler_postsync, 1 );

            // wait on init kernel
            semaphore_wait while( *p_scheduler_postsync != 1 );

            // launch BFS1 pass1
            dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass1_initial
                args( build_args.p_scheduler,
                      build_args.p_sah_globals)
                postsync store_dword( p_scheduler_postsync, 0 );

            // wait on BFS pass1
            semaphore_wait while( *p_scheduler_postsync != 0 );

            // launch BFS pass2; its postsync value (1) is what the build loop waits for
            dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass2_initial
                args( build_args.p_scheduler,
                      build_args.p_sah_globals )
                postsync store_dword( p_scheduler_postsync, 1 );
        }

        // after BFS pass 2 we drop into a scheduling loop

        l_build_loop:
        {
            // wait for the preceding BFS pass 2 (initial or indexed)
            semaphore_wait while( *p_scheduler_postsync != 1 );

            {
                dispatch opencl_build_kernel_BinnedSAH_scheduler(1,1,1)
                    args( build_args.p_scheduler, build_args.p_sah_globals )
                    postsync store_dword( p_scheduler_postsync, 0 );

                // wait on the scheduler
                semaphore_wait while( *p_scheduler_postsync != 0 );
            }

            // load and process the scheduler results
            //   one qword at Scheduler offset 0:  lo == num_bfs_wgs,  hi == num_dfs_wgs
            define REG_wg_counts    REG0;
            define REG_num_bfs_wgs  REG0.lo;
            define REG_num_dfs_wgs  REG0.hi;
            define REG_loop_break   REG1;
            define REG_p_scheduler  REG2;
            {
                REG_p_scheduler = p_scheduler;
                REG_wg_counts    = load_qword( REG_p_scheduler );

                define C_MASK_LO REG3 ;
                C_MASK_LO = 0xffffffff;

                // loop_break = (num_bfs_wgs == 0)
                REG_loop_break = REG_wg_counts  & C_MASK_LO;
                REG_loop_break = REG_loop_break == 0;
            }

            // dispatch new DFS WGs
            DISPATCHDIM_X = REG_num_dfs_wgs;
            dispatch_indirect opencl_build_kernel_BinnedSAH_DFS
                args( p_scheduler,
                      build_args.p_sah_globals );

            // jump out if there are no bfs WGs
            goto l_build_qnodes if (REG_loop_break);

            // dispatch new BFS1 WGs
            DISPATCHDIM_X = REG_num_bfs_wgs;
            dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass1_indexed
                args( p_scheduler,
                      build_args.p_sah_globals )
                postsync store_dword( p_scheduler_postsync, 2 );

            semaphore_wait while( *p_scheduler_postsync != 2 );

            // dispatch new BFS2 WGs
            dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass2_indexed
                args( p_scheduler,
                      build_args.p_sah_globals )
                postsync store_dword( p_scheduler_postsync, 1 );

            //goto l_build_loop if not(REG_num_dfs_wgs); //TODO: maybe add some logic to do "bool have_dfs_work" which will add some cycles but may be faster than checking semaphore

            // wait until all upcoming DFS WGs have finished launching
            //   so that the scheduler can refill the launch array
            // TODO_OPT:  Look at replacing this with a ring buffer so that scheduler stalls instead (and only rarely)
            semaphore_wait while( *p_num_dfs_wgs != 0 );


            goto l_build_loop;
        }
    }

l_build_qnodes:

    // wait for all outstanding dispatches before starting the qnode build
    control( wait_idle );

    // P/C qnode build

    dispatch opencl_build_kernel_BinnedSAH_BuildQNodes_Kickoff (1,1,1)
        args( build_args.p_sah_globals,
              build_args.p_qnode_child_buffer,
              build_args.sah_build_flags );

    {
        define p_pc_counters ( build_args.p_root_buffer_counters );

        define REG_addr      REG0;
        define REG_produced  REG1;
        define REG_consumed  REG2;
        define REG_have_work REG3;
        define REG_wg_count  REG4;
        define C_8 REG5;
        define C_16 REG6;
        define C_1 REG7;
        C_1 = 1;
        C_8 =  8;
        C_16 = 16;
        REG_addr =  build_args.p_root_buffer_counters; // HINT: should we use REG_addr or just pass separate arguments to metakernel to avoid add/sub from address

        REG_consumed = 0;

        // Each iteration reads the produced counter at +0 and publishes the
        // range [consumed, produced) at +8 / +16 for the next Amplify pass.
        l_qnode_loop:

            control( wait_idle ); // wait for previous pass

            // load counters and compute number of wgs to respawn
            REG_produced  = load_qword( REG_addr ); REG_addr = REG_addr + C_8;
            REG_wg_count  = REG_produced - REG_consumed;
            REG_have_work = REG_wg_count > 0;

            goto l_done if not(REG_have_work.lo);

            // save REG_consumed as a starting position in p_qnode_child_buffer
            store_qword(REG_addr, REG_consumed); REG_addr = REG_addr + C_8;

            // save REG_produced as ending position in p_qnode_child_buffer
            //   (the subtraction returns REG_addr to the base of the counters)
            store_qword(REG_addr, REG_produced); REG_addr = REG_addr - C_16;

            REG_consumed = REG_consumed + REG_wg_count; // update consumed for next iteration

            // calculate amount of workgroups to schedule:  (new entries + 1) >> 1
            REG_wg_count = REG_wg_count + C_1;
            REG_wg_count = REG_wg_count >> C_1;

            DISPATCHDIM_X = REG_wg_count.lo;

            control( cs_store_fence ); // commit the stores

            dispatch_indirect opencl_build_kernel_BinnedSAH_BuildQNodes_Amplify
                    args( build_args.p_sah_globals,
                          build_args.p_qnode_child_buffer,
                          build_args.sah_build_flags);

            goto l_qnode_loop;
    }

l_done:
}


// Arguments of the new_sah_build_batchable metakernel.
struct SAHBuildArgsBatchable
{
    qword p_globals_ptrs;                              // forwarded to the categorization kernel
    qword p_scheduler;                                 // address of the Scheduler block
    qword p_buffers_info;                              // forwarded to the categorization kernel
    qword p_sah_globals;                               // forwarded to the build and qnode kernels
    dword num_max_qnode_global_root_buffer_entries;    // forwarded to qnode_init_scheduler_batched
    dword num_builds;                                  // total build count; also forwarded to kernels
};


// Builds a batch of build_args.num_builds BVHs with the binned-SAH builder.
//
// Phases, in order:
//   1. a categorization kernel fills in the Scheduler block
//   2. the trivial and single-workgroup builds are dispatched
//   3. if any builds remain, an outer loop starts a block of builds and an
//      inner BFS/DFS scheduling loop runs until the scheduler reports no BFS work
//   4. the batched qnode phase runs until its scheduler reports no work
//
// Steps that must be ordered are synchronised either through
// Scheduler.scheduler_postsync (postsync store + semaphore_wait) or with
// control( wait_idle ).
metakernel new_sah_build_batchable( SAHBuildArgsBatchable build_args )
{
    define p_scheduler                  build_args.p_scheduler;
    define p_scheduler_postsync         (build_args.p_scheduler + offsetof(Scheduler.scheduler_postsync) );
    define p_num_dfs_wgs                (build_args.p_scheduler + offsetof(Scheduler.num_dfs_wgs));

    // initialize scheduler semaphore
    REG0.lo = 0;
    store_dword( p_scheduler_postsync, REG0.lo );


    // dispatch categorization pass (two workgroups)
    dispatch opencl_build_kernel_BinnedSAH_categorize_builds_and_init_scheduler(2,1,1)
        args(
              build_args.p_scheduler,
              build_args.p_globals_ptrs,
              build_args.p_buffers_info,
              build_args.p_sah_globals,
              build_args.num_builds
          )
          postsync store_dword( p_scheduler_postsync, 1 );

    // wait on the categorization pass
    semaphore_wait while( *p_scheduler_postsync != 1 );


    //  dispatch the trivial and single-WG passes
    {
        // one qword covers both counters:  lo == num_trivial_builds,  hi == num_single_builds
        REG0 = load_qword( build_args.p_scheduler + offsetof(Scheduler.num_trivial_builds) );
        DISPATCHDIM_X = REG0.lo;
        DISPATCHDIM_Y = 1;
        DISPATCHDIM_Z = 1;

        // dispatch trivial builds

        dispatch_indirect opencl_build_kernel_DFS_trivial_batch
            args( build_args.p_sah_globals );

        control( wait_idle );

        // dispatch single-wg builds

        DISPATCHDIM_X = REG0.hi;
        dispatch_indirect opencl_build_kernel_DFS_single_wg_batch
            args( build_args.p_sah_globals, build_args.p_scheduler );
    }

    // compute the number of builds not covered by the trivial passes
    // skip the builder loop if all builds are satisfied by trivial passes
    //   REG5 = num_builds - (num_trivial_builds + num_single_builds)
    {
        REG1 = REG0.lo;
        REG2 = REG0.hi;
        REG3 = build_args.num_builds;
        REG5 = REG2 + REG1;
        REG5 = REG3 - REG5;
        REG4 = REG5 == 0 ;

        goto l_done if (REG4.lo);
    }

    // REG5 holds the number of non-trivial builds at this point.
    // NOTE(review): the alias below is not referenced anywhere else in this
    //   metakernel, and REG5 is overwritten at the start of the qnode phase.
    //   It looks like a leftover of the old implementation kept as a comment
    //   near the end of this metakernel - confirm before relying on it.
    define REG_num_nontrivial REG5;

l_build_outer_loop:
    {

        // configure the scheduler to initiate a new block of builds

        dispatch opencl_build_kernel_BinnedSAH_begin_batched (1,1,1)
            args( build_args.p_scheduler, build_args.p_sah_globals )
            postsync store_dword( p_scheduler_postsync, 0 );

        // wait on init kernel
        semaphore_wait while( *p_scheduler_postsync != 0 );


        // read results produced by scheduler init kernel
        //   lo == BFS wg count.  hi == all ones if we need to loop again
        //   (REG4 must survive the inner loop; it is tested at l_continue_outer_loop)
        //
        REG0 = build_args.p_scheduler + offsetof(Scheduler.batched_build_wg_count);
        REG4 = load_qword( REG0 );

        // launch BFS1 pass1
        DISPATCHDIM_X = REG4.lo;
        dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass1_initial_batch
            args( build_args.p_scheduler,
                    build_args.p_sah_globals)
            postsync store_dword( p_scheduler_postsync, 1 );

        // wait on BFS pass1
        semaphore_wait while( *p_scheduler_postsync != 1 );

        // launch BFS pass2; its postsync value (0) is what the inner loop waits for
        dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass2_initial_batch
            args( build_args.p_scheduler,
                    build_args.p_sah_globals )
            postsync store_dword( p_scheduler_postsync, 0 );

        l_build_loop:
            {
                // wait for the preceding BFS pass 2 (initial or indexed)
                semaphore_wait while( *p_scheduler_postsync != 0 );

                {
                    dispatch opencl_build_kernel_BinnedSAH_scheduler(1,1,1)
                        args( build_args.p_scheduler, build_args.p_sah_globals )
                        postsync store_dword( p_scheduler_postsync, 1 );

                    // wait on the scheduler
                    semaphore_wait while( *p_scheduler_postsync != 1 );
                }

                // load and process the scheduler results
                //   one qword at Scheduler offset 0:  lo == num_bfs_wgs,  hi == num_dfs_wgs
                define REG_wg_counts    REG0;
                define REG_num_bfs_wgs  REG0.lo;
                define REG_num_dfs_wgs  REG0.hi;
                define REG_loop_break   REG1;
                define REG_p_scheduler  REG2;
                {
                    REG_p_scheduler = p_scheduler;
                    REG_wg_counts    = load_qword( REG_p_scheduler );

                    define C_MASK_LO REG3 ;
                    C_MASK_LO = 0xffffffff;

                    // loop_break = (num_bfs_wgs == 0)
                    REG_loop_break = REG_wg_counts  & C_MASK_LO;
                    REG_loop_break = REG_loop_break == 0;
                }

                // dispatch new DFS WGs
                DISPATCHDIM_X = REG_num_dfs_wgs;
                dispatch_indirect opencl_build_kernel_BinnedSAH_DFS
                    args( p_scheduler,
                          build_args.p_sah_globals );

                // jump out if there are no bfs WGs
                goto l_continue_outer_loop if (REG_loop_break);

                // dispatch new BFS1 WGs
                DISPATCHDIM_X = REG_num_bfs_wgs;
                dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass1_indexed_batch
                    args( p_scheduler,
                          build_args.p_sah_globals )
                    postsync store_dword( p_scheduler_postsync, 2 );

               semaphore_wait while( *p_scheduler_postsync != 2 );

                // dispatch new BFS2 WGs
                dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass2_indexed_batch
                    args( p_scheduler,
                          build_args.p_sah_globals )
                    postsync store_dword( p_scheduler_postsync, 0 );

                //goto l_build_loop if not(REG_num_dfs_wgs); //TODO: maybe add some logic to do "bool have_dfs_work" which will add some cycles but may be faster than checking semaphore

                // wait until all upcoming DFS WGs have finished launching
                //   so that the scheduler can refill the launch array
                // TODO_OPT:  Look at replacing this with a ring buffer so that scheduler stalls instead (and only rarely)
                semaphore_wait while( *p_num_dfs_wgs != 0 );

                goto l_build_loop;
            }


        l_continue_outer_loop:

            // REG4.hi is the batched_build_loop_mask read at the top of this
            // iteration; non-zero means another block of builds must be started
            goto l_build_outer_loop if(REG4.hi);

    }

////////
//
// Qnode build phase
//
////////

    //  Wait for all outstanding DFS dispatches to complete, then build the QNodes
    control( wait_idle );

    // register aliases for this phase (REG4 is free again: the outer loop has ended)
    define REG_wg_counts   REG1;
    define REG_p_scheduler REG2;
    define REG_have_work   REG3;
    define REG_GRB_NUM_MAX_ENTRIES    REG4;

    // init scheduler for qnode phase
    dispatch opencl_build_kernel_BinnedSAH_qnode_init_scheduler_batched(1,1,1)
        args( build_args.p_scheduler,
              build_args.num_builds,
              build_args.num_max_qnode_global_root_buffer_entries);

    REG_p_scheduler = p_scheduler;

    control( wait_idle );

    // qword at Scheduler offset 0; its low half is the dispatch width used below
    REG_wg_counts   = load_qword( REG_p_scheduler );

    DISPATCHDIM_X = REG_wg_counts.lo;

    // launch the qnode 'begin' kernel with the workgroup count just read
    dispatch_indirect opencl_build_kernel_BinnedSAH_qnode_begin_batched
        args( build_args.p_scheduler,
              build_args.p_sah_globals);

    // read results produced by init scheduler kernel
    //   lo == num of builds processed.  hi == num of maximum global root buffer entries
    //   (this overwrites the non-trivial build count that REG5 held until now)
    //
    REG0 = build_args.p_scheduler + offsetof(Scheduler.batched_build_wg_count);
    REG5 = load_qword( REG0 );

    // widen REG5.hi to a full qword so it can be compared against a qword below
    REG_GRB_NUM_MAX_ENTRIES.lo = REG5.hi;
    REG_GRB_NUM_MAX_ENTRIES.hi = 0;

l_qnode_loop:
    {
        control( wait_idle ); // wait for previous pass

        dispatch opencl_build_kernel_BinnedSAH_qnode_scheduler(1,1,1) args( build_args.p_scheduler );

        control( wait_idle );

        // the test below uses the whole qword; the loop ends once it reads 0
        REG_wg_counts   = load_qword( REG_p_scheduler );
        REG_have_work = REG_wg_counts > 0;

        goto l_done if not(REG_have_work.lo);

        DISPATCHDIM_X = REG_wg_counts.lo;

        dispatch_indirect opencl_build_kernel_BinnedSAH_BuildQNodes_Amplify_batch
                args( build_args.p_sah_globals,
                      build_args.p_scheduler );

        control( wait_idle );

        // keep only the high dword of the reloaded qword, zero-extended
        REG_wg_counts   = load_qword( REG_p_scheduler ); // reload values
        REG_wg_counts.lo = REG_wg_counts.hi;
        REG_wg_counts.hi = 0;

        // run the TryToFillGRB pass only while that value is below the maximum entry count
        REG_have_work = REG_wg_counts < REG_GRB_NUM_MAX_ENTRIES;

        goto l_qnode_loop if not(REG_have_work.lo);

        DISPATCHDIM_X = REG5.lo; // dispatch single workgroup for each build scheduled

        dispatch_indirect opencl_build_kernel_BinnedSAH_BuildQnodes_TryToFillGRB_batched
                args( build_args.p_sah_globals,
                      build_args.p_scheduler );

        goto l_qnode_loop;
    }

////////
//
// Old implementation - TODO: maybe add switch between two implementations?
//
////////
    //  Wait for all outstanding DFS dispatches to complete, then build the QNodes
    //DISPATCHDIM_X = REG5.lo;

    //dispatch_indirect opencl_build_kernel_BinnedSAH_BuildQNodes
    //    args( build_args.p_sah_globals, build_args.p_scheduler );


l_done:

    // reached both from the early exit (no non-trivial builds) and from the qnode loop
    control( wait_idle );

}