import torch
from torch._inductor import ir
from torch._inductor.runtime.benchmarking import benchmarker


def to_channels_last(x):
    assert x.dim() == 4

    # NCHW -> NHWC
    stride_order = [3, 0, 2, 1]
    y = x.clone().as_strided(
        x.shape,
        ir.FlexibleLayout.stride_ordered(x.shape, stride_order),
    )
    y.copy_(x)
    assert torch.allclose(x, y)
    return y


def bench_conv(with_stack=True):
    x = torch.rand(256, 3, 224, 224).cuda()
    weight = torch.rand(64, 3, 7, 7).cuda()

    x_chan = to_channels_last(x)
    weight_chan = to_channels_last(weight)
    kwargs = {
        "stride": [2, 2],
        "padding": [3, 3],
        "dilation": [1, 1],
        "transposed": False,
        "output_padding": [0, 0],
        "groups": 1,
    }

    def baseline_fn():
        return torch.convolution(x, weight, bias=None, **kwargs)

    def test_fn():
        return torch.convolution(x_chan, weight_chan, bias=None, **kwargs)

    # warmup
    baseline_fn()
    test_fn()

    torch.cuda.synchronize()
    with torch.profiler.profile(with_stack=with_stack) as p:
        baseline_out = baseline_fn()
        test_out = test_fn()
        torch.cuda.synchronize()

    p.export_chrome_trace("/tmp/chrome.json")
    assert torch.allclose(baseline_out, test_out, atol=1e-3, rtol=1e-3), (
        baseline_out[0][0][0][:32],
        test_out[0][0][0][:32],
    )

    baseline_ms = benchmarker.benchmark_gpu(baseline_fn, rep=40)
    test_ms = benchmarker.benchmark_gpu(test_fn, rep=40)
    print(f"baseline {baseline_ms} test {test_ms} speedup {baseline_ms / test_ms:.3f}x")


def main():
    bench_conv()


if __name__ == "__main__":
    main()