import itertools
import os
import random
import sys


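# make benchmark_runner and benchmark_utils importable from the parent directory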
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from typing import Any, Tuple

from benchmark_runner import BenchmarkRunner  # type: ignore[import-not-found]
from benchmark_utils import (  # type: ignore[import-not-found]
    fits_in_memory,
    get_mm_tensors,
    get_random_between_pow2,
    set_precision,
)

import torch
from torch._inductor.utils import fresh_inductor_cache


class BenchmarkRunnerMM(BenchmarkRunner):  # type: ignore[misc, no-any-unimported]
    """
    BenchmarkRunner for mm.
    """

    def __init__(self) -> None:
        super().__init__("mm")

    def create_input(self) -> Tuple[Any, ...]:
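        """Sample a random dtype and (m, k, n) shape for one benchmark run."""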
        dtype = random.choice([torch.float32, torch.float16, torch.bfloat16])
        set_precision(dtype)
        m, k, n = self.get_m_k_n(dtype)
        return (m, k, n, dtype)

    def run_benchmark(
        self,
        m: int,
        k: int,
        n: int,
        dtype: Any,
    ) -> None:
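        """Benchmark max-autotune compilation of mm for one (m, k, n, dtype)."""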
        # for a given shape, test all possible combinations of transpose_left and transpose_right
        for transpose_left, transpose_right in itertools.product(
            [False, True], repeat=2
        ):
            print(
                f"m: {m}, k: {k}, n: {n}, transpose_left: {transpose_left}, transpose_right: {transpose_right}, dtype: {dtype}"
            )
            a, b = get_mm_tensors(
                m,
                k,
                n,
                transpose_left,
                transpose_right,
                dtype_left=dtype,
                dtype_right=dtype,
            )

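            # fresh cache so autotuning results from previous shapes are not reused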
            with fresh_inductor_cache():

                def mm(A: Any, B: Any) -> Any:
                    return torch.mm(A, B)

                cf = torch.compile(mm, mode="max-autotune-no-cudagraphs")
                cf(a, b)
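                # reset compiler state so the next combination compiles from scratch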
                torch.compiler.reset()

    def random_multiple_of_128(self, min_num: int = 7, max_num: int = 17) -> int:
        # pick a random exponent ran_pow2 between min_num and max_num - 1 and
        # return a random multiple of 128 between 2^ran_pow2 and 2^(ran_pow2+1)
        ran_pow2 = random.randint(min_num, max_num - 1)
        start = (2**ran_pow2) // 128
        end = (2 ** (ran_pow2 + 1)) // 128
        random_multiple = random.randint(start, end)
        return random_multiple * 128

    def get_distr_type(self) -> str:
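        """Choose which distribution to sample a matrix dimension from."""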
        # 85%: choose a random multiple of 128 between 2^10 and 2^17
        # 10%: choose a random power of 2 between 2^0 and 2^17
        #  4%: choose a random number between 1 and 131072
        #  1%: choose a random number between 2^i and 2^(i+1) with i in [1, 16]
        return random.choices(
            ["mult_128", "pow2", "uniform", "uniform-between-pow2"],
            [0.85, 0.1, 0.04, 0.01],
        )[0]

    def get_random_dim(self) -> int:
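        """Sample a single matrix dimension from the mixture in get_distr_type."""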
        distr_type = self.get_distr_type()
        if distr_type == "mult_128":
            return self.random_multiple_of_128(min_num=10, max_num=17)
        if distr_type == "pow2":
            return int(2 ** random.randint(0, 17))
        elif distr_type == "uniform-between-pow2":
            # TODO(AlnisM): make mypy work for torchgen/_autoheuristic/
            return int(get_random_between_pow2(min_power2=1, max_power2=17))
        elif distr_type == "uniform":
            return random.randint(1, 131072)
        print(f"random_type {distr_type} not supported")
        sys.exit(1)

    def get_m_k_n(self, dtype: Any) -> Tuple[int, int, int]:
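        """Sample (m, k, n) until the resulting mm tensors fit in memory."""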
        numel_max = 2**31

        # repeat until tensors fit in memory
        while True:
            m = self.get_random_dim()
            k = self.get_random_dim()
            n = self.get_random_dim()

            if m * k >= numel_max or m * n >= numel_max or k * n >= numel_max:
                # autotuning will not happen for tensors that are this large
                continue

            if fits_in_memory(dtype, m, k, n):
                return (m, k, n)


if __name__ == "__main__":
    runner = BenchmarkRunnerMM()
    runner.run()
