/*
  Copyright (c) 2020  Hayati Ayguen ( h_ayguen@web.de )

  bench for mixer algorithm/implementations

 */

#include <pf_mixer.h>

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <assert.h>
#include <string.h>

#define HAVE_SYS_TIMES

#ifdef HAVE_SYS_TIMES
#  include <sys/times.h>
#  include <unistd.h>
#endif

#define BENCH_REF_TRIG_FUNC       1
#define BENCH_OUT_OF_PLACE_ALGOS  0
#define BENCH_INPLACE_ALGOS       1

#define SAVE_BY_DEFAULT  0
#define SAVE_LIMIT_MSPS           16

#if 0
  #define BENCH_FILE_SHIFT_MATH_CC           "/home/ayguen/WindowsDesktop/mixer_test/A_shift_math_cc.bin"
  #define BENCH_FILE_ADD_FAST_CC             "/home/ayguen/WindowsDesktop/mixer_test/C_shift_addfast_cc.bin"
  #define BENCH_FILE_ADD_FAST_INP_C          "/home/ayguen/WindowsDesktop/mixer_test/C_shift_addfast_inp_c.bin"
  #define BENCH_FILE_UNROLL_INP_C            "/home/ayguen/WindowsDesktop/mixer_test/D_shift_unroll_inp_c.bin"
  #define BENCH_FILE_LTD_UNROLL_INP_C        "/home/ayguen/WindowsDesktop/mixer_test/E_shift_limited_unroll_inp_c.bin"
  #define BENCH_FILE_LTD_UNROLL_A_SSE_INP_C  "/home/ayguen/WindowsDesktop/mixer_test/F_shift_limited_unroll_A_sse_inp_c.bin"
  #define BENCH_FILE_LTD_UNROLL_B_SSE_INP_C  "/home/ayguen/WindowsDesktop/mixer_test/G_shift_limited_unroll_B_sse_inp_c.bin"
  #define BENCH_FILE_LTD_UNROLL_C_SSE_INP_C  "/home/ayguen/WindowsDesktop/mixer_test/H_shift_limited_unroll_C_sse_inp_c.bin"
  #define BENCH_FILE_REC_OSC_CC              ""
  #define BENCH_FILE_REC_OSC_INP_C           "/home/ayguen/WindowsDesktop/mixer_test/I_shift_recursive_osc_inp_c.bin"
  #define BENCH_FILE_REC_OSC_SSE_INP_C       "/home/ayguen/WindowsDesktop/mixer_test/J_shift_recursive_osc_sse_inp_c.bin"
#else
  #define BENCH_FILE_SHIFT_MATH_CC           ""
  #define BENCH_FILE_ADD_FAST_CC             ""
  #define BENCH_FILE_ADD_FAST_INP_C          ""
  #define BENCH_FILE_UNROLL_INP_C            ""
  #define BENCH_FILE_LTD_UNROLL_INP_C        ""
  #define BENCH_FILE_LTD_UNROLL_A_SSE_INP_C  ""
  #define BENCH_FILE_LTD_UNROLL_B_SSE_INP_C  ""
  #define BENCH_FILE_LTD_UNROLL_C_SSE_INP_C  ""
  #define BENCH_FILE_REC_OSC_CC              ""
  #define BENCH_FILE_REC_OSC_INP_C           ""
  #define BENCH_FILE_REC_OSC_SSE_INP_C       ""
#endif



#if defined(HAVE_SYS_TIMES)
    static double ttclk = 0.;

    static double uclock_sec(int find_start)
    {
        struct tms t0, t;
        if (ttclk == 0.)
        {
            ttclk = sysconf(_SC_CLK_TCK);
            fprintf(stderr, "sysconf(_SC_CLK_TCK) => %f\n", ttclk);
        }
        times(&t);
        if (find_start)
        {
            t0 = t;
            while (t0.tms_utime == t.tms_utime)
                times(&t);
        }
        /* use only the user time of this process - not realtime, which depends on OS-scheduler .. */
        return ((double)t.tms_utime) / ttclk;
    }

#elif 0
    // https://docs.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-getprocesstimes
    double uclock_sec(int find_start)
    {
        FILETIME a, b, c, d;
        if (GetProcessTimes(GetCurrentProcess(), &a, &b, &c, &d) != 0)
        {
            //  Returns total user time.
            //  Can be tweaked to include kernel times as well.
            return
                (double)(d.dwLowDateTime |
                    ((unsigned long long)d.dwHighDateTime << 32)) * 0.0000001;
        }
        else {
            //  Handle error
            return 0;
        }
    }

#else
    double uclock_sec(int find_start)
    { return (double)clock()/(double)CLOCKS_PER_SEC; }
#endif


void save(complexf * d, int B, int N, const char * fn)
{
    if (!fn || !fn[0])
    {
        if (! SAVE_BY_DEFAULT)
            return;
        fn = "/dev/shm/bench.bin";
    }
    FILE * f = fopen(fn, "wb");
    if (!f) {
        fprintf(stderr, "error writing result to %s\n", fn);
        return;
    }
    if ( N >= SAVE_LIMIT_MSPS * 1024 * 1024 )
        N = SAVE_LIMIT_MSPS * 1024 * 1024;
    for (int off = 0; off + B <= N; off += B)
    {
        fwrite(d+off, sizeof(complexf), B, f);
    }
    fclose(f);
}


double bench_shift_math_cc(int B, int N) {
    double t0, t1, tstop, T, nI;
    int iter, off;
    float phase = 0.0F;
    complexf *input = (complexf *)malloc(N * sizeof(complexf));
    complexf *output = (complexf *)malloc(N * sizeof(complexf));
    shift_recursive_osc_t gen_state;
    shift_recursive_osc_conf_t gen_conf;

    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);

    iter = 0;
    off = 0;
    t0 = uclock_sec(1);
    tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
    do {
        // work
        phase = shift_math_cc(input+off, output+off, B, -0.0009F, phase);
        off += B;
        ++iter;
        t1 = uclock_sec(0);
    } while ( t1 < tstop && off + B < N );

    save(output, B, off, BENCH_FILE_SHIFT_MATH_CC);

    free(input);
    free(output);
    T = ( t1 - t0 );  /* duration per fft() */
    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
    nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
    return (nI / T);    /* normalized iterations per second */
}


double bench_shift_table_cc(int B, int N) {
    double t0, t1, tstop, T, nI;
    int iter, off;
    int table_size=65536;
    float phase = 0.0F;
    complexf *input = (complexf *)malloc(N * sizeof(complexf));
    complexf *output = (complexf *)malloc(N * sizeof(complexf));
    shift_recursive_osc_t gen_state;
    shift_recursive_osc_conf_t gen_conf;

    shift_table_data_t table_data = shift_table_init(table_size);

    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);

    iter = 0;
    off = 0;
    t0 = uclock_sec(1);
    tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
    do {
        // work
        phase = shift_table_cc(input+off, output+off, B, -0.0009F, table_data, phase);

        off += B;
        ++iter;
        t1 = uclock_sec(0);
    } while ( t1 < tstop && off + B < N );

    save(output, B, off, NULL);
    free(input);
    free(output);
    T = ( t1 - t0 );  /* duration per fft() */
    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
    nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
    return (nI / T);    /* normalized iterations per second */
}


double bench_shift_addfast(int B, int N) {
    double t0, t1, tstop, T, nI;
    int iter, off;
    float phase = 0.0F;
    complexf *input = (complexf *)malloc(N * sizeof(complexf));
    complexf *output = (complexf *)malloc(N * sizeof(complexf));
    shift_recursive_osc_t gen_state;
    shift_recursive_osc_conf_t gen_conf;
    shift_addfast_data_t state = shift_addfast_init(-0.0009F);

    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);

    iter = 0;
    off = 0;
    t0 = uclock_sec(1);
    tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
    do {
        // work
        phase = shift_addfast_cc(input+off, output+off, B, &state, phase);

        off += B;
        ++iter;
        t1 = uclock_sec(0);
    } while ( t1 < tstop && off + B < N );

    save(output, B, off, BENCH_FILE_ADD_FAST_CC);

    free(input);
    free(output);
    T = ( t1 - t0 );  /* duration per fft() */
    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
    nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
    return (nI / T);    /* normalized iterations per second */
}

double bench_shift_addfast_inp(int B, int N) {
    double t0, t1, tstop, T, nI;
    int iter, off;
    float phase = 0.0F;
    complexf *input = (complexf *)malloc(N * sizeof(complexf));
    shift_recursive_osc_t gen_state;
    shift_recursive_osc_conf_t gen_conf;
    shift_addfast_data_t state = shift_addfast_init(-0.0009F);

    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);

    iter = 0;
    off = 0;
    t0 = uclock_sec(1);
    tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
    do {
        // work
        phase = shift_addfast_inp_c(input+off, B, &state, phase);

        off += B;
        ++iter;
        t1 = uclock_sec(0);
    } while ( t1 < tstop && off + B < N );

    save(input, B, off, BENCH_FILE_ADD_FAST_INP_C);

    free(input);
    T = ( t1 - t0 );  /* duration per fft() */
    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
    nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
    return (nI / T);    /* normalized iterations per second */
}


double bench_shift_unroll_oop(int B, int N) {
    double t0, t1, tstop, T, nI;
    int iter, off;
    float phase = 0.0F;
    complexf *input = (complexf *)malloc(N * sizeof(complexf));
    complexf *output = (complexf *)malloc(N * sizeof(complexf));
    shift_recursive_osc_t gen_state;
    shift_recursive_osc_conf_t gen_conf;
    shift_unroll_data_t state = shift_unroll_init(-0.0009F, B);

    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);

    iter = 0;
    off = 0;
    t0 = uclock_sec(1);
    tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
    do {
        // work
        phase = shift_unroll_cc(input+off, output+off, B, &state, phase);

        off += B;
        ++iter;
        t1 = uclock_sec(0);
    } while ( t1 < tstop && off + B < N );

    save(output, B, off, NULL);
    free(input);
    free(output);
    T = ( t1 - t0 );  /* duration per fft() */
    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
    nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
    return (nI / T);    /* normalized iterations per second */
}

double bench_shift_unroll_inp(int B, int N) {
    double t0, t1, tstop, T, nI;
    int iter, off;
    float phase = 0.0F;
    complexf *input = (complexf *)malloc(N * sizeof(complexf));
    shift_recursive_osc_t gen_state;
    shift_recursive_osc_conf_t gen_conf;
    shift_unroll_data_t state = shift_unroll_init(-0.0009F, B);

    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);

    iter = 0;
    off = 0;
    t0 = uclock_sec(1);
    tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
    do {
        // work
        phase = shift_unroll_inp_c(input+off, B, &state, phase);

        off += B;
        ++iter;
        t1 = uclock_sec(0);
    } while ( t1 < tstop && off + B < N );

    save(input, B, off, BENCH_FILE_UNROLL_INP_C);

    free(input);
    T = ( t1 - t0 );  /* duration per fft() */
    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
    nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
    return (nI / T);    /* normalized iterations per second */
}



double bench_shift_limited_unroll_oop(int B, int N) {
    double t0, t1, tstop, T, nI;
    int iter, off;
    complexf *input = (complexf *)malloc(N * sizeof(complexf));
    complexf *output = (complexf *)malloc(N * sizeof(complexf));
    shift_recursive_osc_t gen_state;
    shift_recursive_osc_conf_t gen_conf;
    shift_limited_unroll_data_t state = shift_limited_unroll_init(-0.0009F);

    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);

    iter = 0;
    off = 0;
    t0 = uclock_sec(1);
    tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
    do {
        // work
        shift_limited_unroll_cc(input+off, output+off, B, &state);

        off += B;
        ++iter;
        t1 = uclock_sec(0);
    } while ( t1 < tstop && off + B < N );

    save(output, B, off, NULL);
    free(input);
    free(output);
    T = ( t1 - t0 );  /* duration per fft() */
    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
    nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
    return (nI / T);    /* normalized iterations per second */
}


double bench_shift_limited_unroll_inp(int B, int N) {
    double t0, t1, tstop, T, nI;
    int iter, off;
    complexf *input = (complexf *)malloc(N * sizeof(complexf));
    shift_recursive_osc_t gen_state;
    shift_recursive_osc_conf_t gen_conf;
    shift_limited_unroll_data_t state = shift_limited_unroll_init(-0.0009F);

    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);

    iter = 0;
    off = 0;
    t0 = uclock_sec(1);
    tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
    do {
        // work
        shift_limited_unroll_inp_c(input+off, B, &state);

        off += B;
        ++iter;
        t1 = uclock_sec(0);
    } while ( t1 < tstop && off + B < N );

    save(input, B, off, BENCH_FILE_LTD_UNROLL_INP_C);

    free(input);
    T = ( t1 - t0 );  /* duration per fft() */
    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
    nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
    return (nI / T);    /* normalized iterations per second */
}


double bench_shift_limited_unroll_A_sse_inp(int B, int N) {
    double t0, t1, tstop, T, nI;
    int iter, off;
    complexf *input = (complexf *)malloc(N * sizeof(complexf));
    shift_recursive_osc_t gen_state;
    shift_recursive_osc_conf_t gen_conf;
    shift_limited_unroll_A_sse_data_t *state = malloc(sizeof(shift_limited_unroll_A_sse_data_t));

    *state = shift_limited_unroll_A_sse_init(-0.0009F, 0.0F);

    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);

    iter = 0;
    off = 0;
    t0 = uclock_sec(1);
    tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
    do {
        // work
        shift_limited_unroll_A_sse_inp_c(input+off, B, state);

        off += B;
        ++iter;
        t1 = uclock_sec(0);
    } while ( t1 < tstop && off + B < N );

    save(input, B, off, BENCH_FILE_LTD_UNROLL_A_SSE_INP_C);
    
    free(input);
    T = ( t1 - t0 );  /* duration per fft() */
    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
    nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
    return (nI / T);    /* normalized iterations per second */
}

double bench_shift_limited_unroll_B_sse_inp(int B, int N) {
    double t0, t1, tstop, T, nI;
    int iter, off;
    complexf *input = (complexf *)malloc(N * sizeof(complexf));
    shift_recursive_osc_t gen_state;
    shift_recursive_osc_conf_t gen_conf;
    shift_limited_unroll_B_sse_data_t *state = malloc(sizeof(shift_limited_unroll_B_sse_data_t));

    *state = shift_limited_unroll_B_sse_init(-0.0009F, 0.0F);

    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
    //shift_recursive_osc_init(0.0F, 0.0F, &gen_conf, &gen_state);
    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);

    iter = 0;
    off = 0;
    t0 = uclock_sec(1);
    tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
    do {
        // work
        shift_limited_unroll_B_sse_inp_c(input+off, B, state);

        off += B;
        ++iter;
        t1 = uclock_sec(0);
    } while ( t1 < tstop && off + B < N );

    save(input, B, off, BENCH_FILE_LTD_UNROLL_B_SSE_INP_C);
    
    free(input);
    T = ( t1 - t0 );  /* duration per fft() */
    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
    nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
    return (nI / T);    /* normalized iterations per second */
}

double bench_shift_limited_unroll_C_sse_inp(int B, int N) {
    double t0, t1, tstop, T, nI;
    int iter, off;
    complexf *input = (complexf *)malloc(N * sizeof(complexf));
    shift_recursive_osc_t gen_state;
    shift_recursive_osc_conf_t gen_conf;
    shift_limited_unroll_C_sse_data_t *state = malloc(sizeof(shift_limited_unroll_C_sse_data_t));

    *state = shift_limited_unroll_C_sse_init(-0.0009F, 0.0F);

    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);

    iter = 0;
    off = 0;
    t0 = uclock_sec(1);
    tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
    do {
        // work
        shift_limited_unroll_C_sse_inp_c(input+off, B, state);

        off += B;
        ++iter;
        t1 = uclock_sec(0);
    } while ( t1 < tstop && off + B < N );

    save(input, B, off, BENCH_FILE_LTD_UNROLL_C_SSE_INP_C);
    
    free(input);
    T = ( t1 - t0 );  /* duration per fft() */
    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
    nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
    return (nI / T);    /* normalized iterations per second */
}


double bench_shift_rec_osc_cc_oop(int B, int N) {
    double t0, t1, tstop, T, nI;
    int iter, off;
    float phase = 0.0F;
    complexf *input = (complexf *)malloc(N * sizeof(complexf));
    complexf *output = (complexf *)malloc(N * sizeof(complexf));
    shift_recursive_osc_t gen_state, shift_state;
    shift_recursive_osc_conf_t gen_conf, shift_conf;

    shift_recursive_osc_init(-0.0009F, 0.0F, &shift_conf, &shift_state);
    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);

    iter = 0;
    off = 0;
    t0 = uclock_sec(1);
    tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
    do {
        // work
        shift_recursive_osc_cc(input+off, output+off, B, &shift_conf, &shift_state);

        off += B;
        ++iter;
        t1 = uclock_sec(0);
    } while ( t1 < tstop && off + B < N );

    save(input, B, off, BENCH_FILE_REC_OSC_CC);

    save(output, B, off, NULL);
    free(input);
    free(output);
    T = ( t1 - t0 );  /* duration per fft() */
    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
    nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
    return (nI / T);    /* normalized iterations per second */
}


double bench_shift_rec_osc_cc_inp(int B, int N) {
    double t0, t1, tstop, T, nI;
    int iter, off;
    float phase = 0.0F;
    complexf *input = (complexf *)malloc(N * sizeof(complexf));
    shift_recursive_osc_t gen_state, shift_state;
    shift_recursive_osc_conf_t gen_conf, shift_conf;

    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);
    shift_recursive_osc_init(-0.0009F, 0.0F, &shift_conf, &shift_state);

    iter = 0;
    off = 0;
    t0 = uclock_sec(1);
    tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
    do {
        // work
        shift_recursive_osc_inp_c(input+off, B, &shift_conf, &shift_state);

        off += B;
        ++iter;
        t1 = uclock_sec(0);
    } while ( t1 < tstop && off + B < N );

    save(input, B, off, BENCH_FILE_REC_OSC_INP_C);
    free(input);
    T = ( t1 - t0 );  /* duration per fft() */
    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
    nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
    return (nI / T);    /* normalized iterations per second */
}


double bench_shift_rec_osc_sse_c_inp(int B, int N) {
    double t0, t1, tstop, T, nI;
    int iter, off;
    float phase = 0.0F;
    complexf *input = (complexf *)malloc(N * sizeof(complexf));
    shift_recursive_osc_t gen_state;
    shift_recursive_osc_conf_t gen_conf;

    shift_recursive_osc_sse_t *shift_state = malloc(sizeof(shift_recursive_osc_sse_t));
    shift_recursive_osc_sse_conf_t shift_conf;

    shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state);
    gen_recursive_osc_c(input, N, &gen_conf, &gen_state);

    shift_recursive_osc_sse_init(-0.0009F, 0.0F, &shift_conf, shift_state);

    iter = 0;
    off = 0;
    t0 = uclock_sec(1);
    tstop = t0 + 0.5;  /* benchmark duration: 500 ms */
    do {
        // work
        shift_recursive_osc_sse_inp_c(input+off, B, &shift_conf, shift_state);

        off += B;
        ++iter;
        t1 = uclock_sec(0);
    } while ( t1 < tstop && off + B < N );

    save(input, B, off, BENCH_FILE_REC_OSC_SSE_INP_C);
    free(input);
    T = ( t1 - t0 );  /* duration per fft() */
    printf("processed %f Msamples in %f ms\n", off * 1E-6, T*1E3);
    nI = ((double)iter) * B;  /* number of iterations "normalized" to O(N) = N */
    return (nI / T);    /* normalized iterations per second */
}



int main(int argc, char **argv)
{
    double rt;

    // process up to 64 MSample (512 MByte) in blocks of 8 kSamples (=64 kByte)
    int B = 8 * 1024;
    int N = 64 * 1024 * 1024;
    int showUsage = 0;

    if (argc == 1)
        showUsage = 1;

    if (1 < argc)
        B = atoi(argv[1]);
    if (2 < argc)
        N = atoi(argv[2]) * 1024 * 1024;

    if ( !B || !N || showUsage )
    {
        fprintf(stderr, "%s [<blockLength in samples> [<total # of MSamples>] ]\n", argv[0]);
        if ( !B || !N )
            return 0;
    }

    fprintf(stderr, "processing up to N = %d MSamples with blocke length of %d samples\n",
        N / (1024 * 1024), B );


#if BENCH_REF_TRIG_FUNC
    printf("\nstarting bench of shift_math_cc (out-of-place) with trig functions ..\n");
    rt = bench_shift_math_cc(B, N);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);
#endif

#if BENCH_OUT_OF_PLACE_ALGOS
    printf("starting bench of shift_table_cc (out-of-place) ..\n");
    rt = bench_shift_table_cc(B, N);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);

    printf("starting bench of shift_addfast_cc (out-of-place) ..\n");
    rt = bench_shift_addfast(B, N);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);

    printf("\nstarting bench of shift_unroll_cc (out-of-place) ..\n");
    rt = bench_shift_unroll_oop(B, N);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);

    printf("\nstarting bench of shift_limited_unroll_cc (out-of-place) ..\n");
    rt = bench_shift_limited_unroll_oop(B, N);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);

    printf("\nstarting bench of shift_recursive_osc_cc (out-of-place) ..\n");
    rt = bench_shift_rec_osc_cc_oop(B, N);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);
#endif

#if BENCH_INPLACE_ALGOS

    printf("starting bench of shift_addfast_inp_c in-place ..\n");
    rt = bench_shift_addfast_inp(B, N);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);

    printf("starting bench of shift_unroll_inp_c in-place ..\n");
    rt = bench_shift_unroll_inp(B, N);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);

    printf("starting bench of shift_limited_unroll_inp_c in-place ..\n");
    rt = bench_shift_limited_unroll_inp(B, N);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);

    if ( have_sse_shift_mixer_impl() )
    {
        printf("starting bench of shift_limited_unroll_A_sse_inp_c in-place ..\n");
        rt = bench_shift_limited_unroll_A_sse_inp(B, N);
        printf("  %f MSamples/sec\n\n", rt * 1E-6);

        printf("starting bench of shift_limited_unroll_B_sse_inp_c in-place ..\n");
        rt = bench_shift_limited_unroll_B_sse_inp(B, N);
        printf("  %f MSamples/sec\n\n", rt * 1E-6);

        printf("starting bench of shift_limited_unroll_C_sse_inp_c in-place ..\n");
        rt = bench_shift_limited_unroll_C_sse_inp(B, N);
        printf("  %f MSamples/sec\n\n", rt * 1E-6);
    }

    printf("starting bench of shift_recursive_osc_cc in-place ..\n");
    rt = bench_shift_rec_osc_cc_inp(B, N);
    printf("  %f MSamples/sec\n\n", rt * 1E-6);

    if ( have_sse_shift_mixer_impl() )
    {
        printf("starting bench of shift_recursive_osc_sse_c in-place ..\n");
        rt = bench_shift_rec_osc_sse_c_inp(B, N);
        printf("  %f MSamples/sec\n\n", rt * 1E-6);
    }
#endif

    return 0;
}

