// Copyright 2021, VIXL authors
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
//   * Redistributions of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//   * Redistributions in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//   * Neither the name of ARM Limited nor the names of its contributors may be
//     used to endorse or promote products derived from this software without
//     specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND
// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "test-runner.h"
#include "test-utils.h"
#include "aarch64/test-utils-aarch64.h"

#include "aarch64/cpu-aarch64.h"
#include "aarch64/disasm-aarch64.h"
#include "aarch64/macro-assembler-aarch64.h"
#include "aarch64/simulator-aarch64.h"
#include "test-assembler-aarch64.h"

#define TEST_SVE(name) TEST_SVE_INNER("SIM", name)

namespace vixl {
namespace aarch64 {

TEST_SVE(sve_matmul) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
                          CPUFeatures::kSVEI8MM,
                          CPUFeatures::kNEON,
                          CPUFeatures::kCRC32);
  START();

  SetInitialMachineState(&masm);
  // state = 0xe2bd2480

  {
    ExactAssemblyScope scope(&masm, 50 * kInstructionSize);
    __ dci(0x45179979);  // smmla z25.s, z11.b, z23.b
    // vl128 state = 0xf1ca8a4d
    __ dci(0x45179b51);  // smmla z17.s, z26.b, z23.b
    // vl128 state = 0x4458ad10
    __ dci(0x45d79b53);  // ummla z19.s, z26.b, z23.b
    // vl128 state = 0x43d4d064
    __ dci(0x45d69b17);  // ummla z23.s, z24.b, z22.b
    // vl128 state = 0x601e77c8
    __ dci(0x45c69b33);  // ummla z19.s, z25.b, z6.b
    // vl128 state = 0x561b4e22
    __ dci(0x45c49b1b);  // ummla z27.s, z24.b, z4.b
    // vl128 state = 0x89b65d78
    __ dci(0x45dc9b1a);  // ummla z26.s, z24.b, z28.b
    // vl128 state = 0x85c9e62d
    __ dci(0x45d99b1b);  // ummla z27.s, z24.b, z25.b
    // vl128 state = 0x3fc74134
    __ dci(0x45d99b19);  // ummla z25.s, z24.b, z25.b
    // vl128 state = 0xa2fa347b
    __ dci(0x45d99b1b);  // ummla z27.s, z24.b, z25.b
    // vl128 state = 0xb9854782
    __ dci(0x45899b1a);  // usmmla z26.s, z24.b, z9.b
    // vl128 state = 0x7fd376d8
    __ dci(0x45099b8a);  // smmla z10.s, z28.b, z9.b
    // vl128 state = 0xb41d8433
    __ dci(0x45019bcb);  // smmla z11.s, z30.b, z1.b
    // vl128 state = 0xc9c0e80d
    __ dci(0x45019bdb);  // smmla z27.s, z30.b, z1.b
    // vl128 state = 0xf1130e02
    __ dci(0x45019b6b);  // smmla z11.s, z27.b, z1.b
    // vl128 state = 0x282d3dc7
    __ dci(0x45019b6f);  // smmla z15.s, z27.b, z1.b
    // vl128 state = 0x34570238
    __ dci(0x45859b6b);  // usmmla z11.s, z27.b, z5.b
    // vl128 state = 0xc451206a
    __ dci(0x45919b6a);  // usmmla z10.s, z27.b, z17.b
    // vl128 state = 0xa58e2ea8
    __ dci(0x45909a62);  // usmmla z2.s, z19.b, z16.b
    // vl128 state = 0x7b5f948d
    __ dci(0x45809a52);  // usmmla z18.s, z18.b, z0.b
    // vl128 state = 0xf746260d
    __ dci(0x45889b53);  // usmmla z19.s, z26.b, z8.b
    // vl128 state = 0xc31cc539
    __ dci(0x45809a57);  // usmmla z23.s, z18.b, z0.b
    // vl128 state = 0x736bb3ee
    __ dci(0x45809a96);  // usmmla z22.s, z20.b, z0.b
    // vl128 state = 0xbb05fef6
    __ dci(0x45809a92);  // usmmla z18.s, z20.b, z0.b
    // vl128 state = 0xbc594372
    __ dci(0x45809a82);  // usmmla z2.s, z20.b, z0.b
    // vl128 state = 0x87c5a584
    __ dci(0x45829ad2);  // usmmla z18.s, z22.b, z2.b
    // vl128 state = 0xa413f733
    __ dci(0x45889ad6);  // usmmla z22.s, z22.b, z8.b
    // vl128 state = 0x87ec445d
    __ dci(0x45c898d2);  // ummla z18.s, z6.b, z8.b
    // vl128 state = 0x3ca8a6e5
    __ dci(0x450898d0);  // smmla z16.s, z6.b, z8.b
    // vl128 state = 0x4300d87b
    __ dci(0x45189ad8);  // smmla z24.s, z22.b, z24.b
    // vl128 state = 0x38be2e8a
    __ dci(0x451c9bd9);  // smmla z25.s, z30.b, z28.b
    // vl128 state = 0x8a3e6103
    __ dci(0x45989bc9);  // usmmla z9.s, z30.b, z24.b
    // vl128 state = 0xc728e586
    __ dci(0x451c9bd9);  // smmla z25.s, z30.b, z28.b
    // vl128 state = 0x4cb44c0e
    __ dci(0x459c99d1);  // usmmla z17.s, z14.b, z28.b
    // vl128 state = 0x84ebcb36
    __ dci(0x459c99d5);  // usmmla z21.s, z14.b, z28.b
    // vl128 state = 0x8813d2e2
    __ dci(0x451c999d);  // smmla z29.s, z12.b, z28.b
    // vl128 state = 0x8f26ee51
    __ dci(0x451c999f);  // smmla z31.s, z12.b, z28.b
    // vl128 state = 0x5d626fd0
    __ dci(0x459e998f);  // usmmla z15.s, z12.b, z30.b
    // vl128 state = 0x6b64cc8f
    __ dci(0x459f991f);  // usmmla z31.s, z8.b, z31.b
    // vl128 state = 0x41648186
    __ dci(0x4587991e);  // usmmla z30.s, z8.b, z7.b
    // vl128 state = 0x701525ec
    __ dci(0x45079816);  // smmla z22.s, z0.b, z7.b
    // vl128 state = 0x61a2d024
    __ dci(0x450f9897);  // smmla z23.s, z4.b, z15.b
    // vl128 state = 0x82ba6bd5
    __ dci(0x450b98d3);  // smmla z19.s, z6.b, z11.b
    // vl128 state = 0xa842bbde
    __ dci(0x450b98db);  // smmla z27.s, z6.b, z11.b
    // vl128 state = 0x9977677a
    __ dci(0x451f98d3);  // smmla z19.s, z6.b, z31.b
    // vl128 state = 0xe6d6c2ef
    __ dci(0x451b9adb);  // smmla z27.s, z22.b, z27.b
    // vl128 state = 0xa535453f
    __ dci(0x450b98d9);  // smmla z25.s, z6.b, z11.b
    // vl128 state = 0xeda3f381
    __ dci(0x458b9adb);  // usmmla z27.s, z22.b, z11.b
    // vl128 state = 0xd72dbdef
    __ dci(0x45cb98da);  // ummla z26.s, z6.b, z11.b
    // vl128 state = 0xfae4975b
    __ dci(0x45c999d2);  // ummla z18.s, z14.b, z9.b
    // vl128 state = 0x0aa6e1f6
  }

  uint32_t state;
  ComputeMachineStateHash(&masm, &state);
  __ Mov(x0, reinterpret_cast<uint64_t>(&state));
  __ Ldr(w0, MemOperand(x0));

  END();
  if (CAN_RUN()) {
    RUN();
    uint32_t expected_hashes[] = {
        0x0aa6e1f6,
        0xba2d4547,
        0x0e72a647,
        0x15b8fc1b,
        0x92eddc98,
        0xe0c72bcf,
        0x36b4e3ba,
        0x1041114e,
        0x4d44ebd4,
        0xfe0e3cbf,
        0x81c43455,
        0x678617c5,
        0xf72fac1f,
        0xabdcd4e4,
        0x108864bd,
        0x035f6eca,
    };
    ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0);
  }
}

TEST_SVE(sve_fmatmul_s) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
                          CPUFeatures::kSVEF32MM,
                          CPUFeatures::kNEON,
                          CPUFeatures::kCRC32);
  START();

  SetInitialMachineState(&masm);
  // state = 0xe2bd2480

  {
    ExactAssemblyScope scope(&masm, 20 * kInstructionSize);
    __ dci(0x64a1e6ee);  // fmmla z14.s, z23.s, z1.s
    // vl128 state = 0x9db41bef
    __ dci(0x64b1e7fe);  // fmmla z30.s, z31.s, z17.s
    // vl128 state = 0xc1535e55
    __ dci(0x64b9e7d6);  // fmmla z22.s, z30.s, z25.s
    // vl128 state = 0xc65aad35
    __ dci(0x64bde6c6);  // fmmla z6.s, z22.s, z29.s
    // vl128 state = 0x68387c22
    __ dci(0x64b9e4c2);  // fmmla z2.s, z6.s, z25.s
    // vl128 state = 0xcf08b3a4
    __ dci(0x64b9e543);  // fmmla z3.s, z10.s, z25.s
    // vl128 state = 0x969bbe77
    __ dci(0x64b9e553);  // fmmla z19.s, z10.s, z25.s
    // vl128 state = 0xc3f514e1
    __ dci(0x64b9e557);  // fmmla z23.s, z10.s, z25.s
    // vl128 state = 0x4b351c29
    __ dci(0x64b9e773);  // fmmla z19.s, z27.s, z25.s
    // vl128 state = 0x5e026315
    __ dci(0x64bbe757);  // fmmla z23.s, z26.s, z27.s
    // vl128 state = 0x61684fe6
    __ dci(0x64bbe755);  // fmmla z21.s, z26.s, z27.s
    // vl128 state = 0x719b4ce0
    __ dci(0x64bfe554);  // fmmla z20.s, z10.s, z31.s
    // vl128 state = 0xdf3d2a1c
    __ dci(0x64bfe550);  // fmmla z16.s, z10.s, z31.s
    // vl128 state = 0x3279aab8
    __ dci(0x64bfe714);  // fmmla z20.s, z24.s, z31.s
    // vl128 state = 0x0b985869
    __ dci(0x64b7e756);  // fmmla z22.s, z26.s, z23.s
    // vl128 state = 0x14230587
    __ dci(0x64b7e737);  // fmmla z23.s, z25.s, z23.s
    // vl128 state = 0x2cb88e7f
    __ dci(0x64bfe767);  // fmmla z7.s, z27.s, z31.s
    // vl128 state = 0xb5ec0c65
    __ dci(0x64bfe777);  // fmmla z23.s, z27.s, z31.s
    // vl128 state = 0xb5e5eab0
    __ dci(0x64bfe715);  // fmmla z21.s, z24.s, z31.s
    // vl128 state = 0xd0491fb5
    __ dci(0x64b7e797);  // fmmla z23.s, z28.s, z23.s
    // vl128 state = 0x98a55a30
  }

  uint32_t state;
  ComputeMachineStateHash(&masm, &state);
  __ Mov(x0, reinterpret_cast<uint64_t>(&state));
  __ Ldr(w0, MemOperand(x0));

  END();
  if (CAN_RUN()) {
    RUN();
    uint32_t expected_hashes[] = {
        0x98a55a30,
        0x590b7715,
        0x4562ccf3,
        0x1f8653a6,
        0x5fe174d5,
        0xb300dcb8,
        0x3cefa79e,
        0xa22484c7,
        0x380697ec,
        0xde9e699b,
        0x99d21870,
        0x456cb46b,
        0x207d2615,
        0xecaf9678,
        0x0949e2d2,
        0xa764c43f,
    };
    ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0);
  }
}

}  // namespace aarch64
}  // namespace vixl
