/*
 * Copyright (c) Qualcomm Innovation Center, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */
#include <cstdint>
#include <dlfcn.h>
#include <executorch/backends/qualcomm/runtime/Logging.h>
#include <executorch/backends/qualcomm/runtime/SharedBuffer.h>

// Refer to the QNN HTP Shared Buffer Tutorial
// in Qualcomm® AI Engine Direct document
// Passed as the first argument (the heap id, going by its name) of the
// rpcmem_alloc call made in SharedBuffer::AllocMem.
constexpr uint8_t RPCMEM_HEAP_ID_SYSTEM = 25;
// Passed as the second argument (the allocation flags, going by its name) of
// the rpcmem_alloc call made in SharedBuffer::AllocMem.
constexpr uint8_t RPCMEM_DEFAULT_FLAGS = 1;

// Hash for CustomMemTensorInfo so it can be stored in unordered containers.
//
// Every field that operator== compares (tensor_addr, custom_mem, pos,
// tensor_bytes, each of the first `rank` shape entries, rank and dtype) is
// folded into the result, so equal objects always produce equal hashes.
std::size_t std::hash<CustomMemTensorInfo>::operator()(
    const CustomMemTensorInfo& info) const noexcept {
  // Order-sensitive combine in the style of boost::hash_combine. A plain XOR
  // of the field hashes cancels out whenever two fields hash to the same value
  // (for example tensor_addr == custom_mem, or two equal shape dimensions) and
  // cannot distinguish permutations of the shape.
  size_t seed = 0;
  const auto mix = [&seed](size_t value) {
    seed ^= value + static_cast<size_t>(0x9e3779b9) + (seed << 6) + (seed >> 2);
  };

  mix(std::hash<void*>()(info.tensor_addr));
  mix(std::hash<void*>()(info.custom_mem));
  mix(std::hash<size_t>()(info.pos));
  mix(std::hash<size_t>()(info.tensor_bytes));
  for (size_t i = 0; i < info.rank; ++i) {
    mix(std::hash<size_t>()(static_cast<size_t>(info.shape[i])));
  }
  mix(std::hash<uint32_t>()(info.rank));
  mix(std::hash<executorch::aten::ScalarType>()(info.dtype));
  return seed;
}

// Field-wise equality for CustomMemTensorInfo.
//
// Two infos are equal when tensor_addr, custom_mem, pos, tensor_bytes, rank
// and dtype match and the first `rank` entries of both shapes are identical.
bool operator==(
    const CustomMemTensorInfo& lhs,
    const CustomMemTensorInfo& rhs) {
  // Compare the scalar fields first. In particular the ranks must match
  // before the shapes are walked; otherwise rhs.shape would be indexed with
  // lhs.rank and could be read past its last valid dimension.
  if (lhs.tensor_addr != rhs.tensor_addr || lhs.custom_mem != rhs.custom_mem ||
      lhs.pos != rhs.pos || lhs.tensor_bytes != rhs.tensor_bytes ||
      lhs.rank != rhs.rank || lhs.dtype != rhs.dtype) {
    return false;
  }
  for (size_t i = 0; i < lhs.rank; ++i) {
    if (lhs.shape[i] != rhs.shape[i]) {
      return false;
    }
  }
  return true;
}

namespace executorch {
namespace backends {
namespace qnn {

using executorch::runtime::Error;

namespace {

// Rounds `offset` up to the next multiple of `alignment`.
//
// @param alignment Required alignment in bytes. A value of 0 means "no
//                  alignment requirement" and returns `offset` unchanged
//                  instead of performing a modulo by zero.
// @param offset    Address or offset to align; expected to be non-negative.
// @return `offset` itself when it is already aligned, otherwise the smallest
//         multiple of `alignment` that is greater than `offset`.
intptr_t alignTo(size_t alignment, intptr_t offset) {
  if (alignment == 0) {
    return offset;
  }
  // Do the arithmetic in one signed type so the remainder is not computed
  // with mixed signed/unsigned operands.
  const auto step = static_cast<intptr_t>(alignment);
  const intptr_t remainder = offset % step;
  return remainder == 0 ? offset : offset + (step - remainder);
}

} // namespace

// Out-of-class definition of the static mutex that
// SharedBuffer::GetSharedBufferManager() locks while it creates and
// initializes the shared buffer manager.
std::mutex SharedBuffer::init_mutex_;

// Returns the custom memory pointer recorded for `buf` in
// tensor_addr_to_custom_mem_, or nullptr when `buf` has no entry.
void* SharedBuffer::GetCustomMemBase(void* buf) {
  const auto entry = tensor_addr_to_custom_mem_.find(buf);
  if (entry != tensor_addr_to_custom_mem_.end()) {
    return entry->second;
  }
  return nullptr;
}

// Returns the pointer stored in restore_map_ for `buf` (AllocMem stores the
// original, unaligned allocation under the aligned address it hands out), or
// nullptr when `buf` is not a key of restore_map_.
void* SharedBuffer::GetUnAlignedAddr(void* buf) {
  const auto entry = restore_map_.find(buf);
  if (entry != restore_map_.end()) {
    return entry->second;
  }
  return nullptr;
}

// Returns the size recorded for `buf` in allocated_size_map_, or 0 when `buf`
// has no entry. AllocMem keys that map by the pointer returned from the RPC
// allocator, i.e. the unaligned address.
size_t SharedBuffer::GetAllocatedSize(void* buf) {
  const auto entry = allocated_size_map_.find(buf);
  if (entry != allocated_size_map_.end()) {
    return entry->second;
  }
  return 0;
}

// Returns the process-wide SharedBuffer instance.
//
// The call holds init_mutex_ for its whole duration. While the instance is not
// yet marked as initialized, Load() is attempted on aarch64 builds (other
// builds treat initialization as trivially successful) and the instance is
// marked initialized only when that step reports Error::Ok. The instance is
// returned in either case, so a failed Load() is retried on the next call.
SharedBuffer& SharedBuffer::GetSharedBufferManager() {
  std::lock_guard<std::mutex> guard(init_mutex_);
  static SharedBuffer shared_buffer_manager;
  if (shared_buffer_manager.GetInitialize()) {
    return shared_buffer_manager;
  }

#if defined(__aarch64__)
  const Error load_status = shared_buffer_manager.Load();
#else
  // For x86_64 platform
  const Error load_status = Error::Ok;
#endif
  if (load_status == Error::Ok) {
    shared_buffer_manager.SetInitialize(true);
  }
  return shared_buffer_manager;
}

// Closes the RPC library on aarch64 builds when this instance finished
// initialization; does nothing otherwise.
SharedBuffer::~SharedBuffer() {
#if defined(__aarch64__)
  if (initialize_) {
    // Unload through this object directly. Going through
    // GetSharedBufferManager() here would lock init_mutex_ and touch the
    // function-local static while that very object is being destroyed.
    // NOTE(review): assumes the singleton is the only SharedBuffer instance
    // that ever becomes initialized - confirm against the class declaration.
    // UnLoad() logs its own failure; nothing more can be done in a destructor.
    static_cast<void>(UnLoad());
  }
#endif
}

// Allocates `bytes` of RPC shared memory aligned to `alignment`.
//
// The underlying allocation is over-sized by `alignment` bytes so an aligned
// address can be carved out of it. The aligned address is recorded in
// restore_map_ (aligned -> original pointer) and the allocated size in
// allocated_size_map_ (original pointer -> size).
//
// @param bytes     Number of usable bytes requested.
// @param alignment Required alignment in bytes; 0 is treated as 1.
// @return The aligned pointer, or nullptr when the manager is not
//         initialized, the request does not fit the allocator's 32-bit size
//         argument, the RPC allocation fails, or the bookkeeping insert
//         fails.
void* SharedBuffer::AllocMem(size_t bytes, size_t alignment) {
  if (!initialize_) {
    QNN_EXECUTORCH_LOG_ERROR("Shared memory not initialized.");
    return nullptr;
  }
  // An alignment of 0 would make alignTo() divide by zero.
  const size_t effective_alignment = alignment == 0 ? 1 : alignment;

  // The size is handed to the allocator as int32_t, so reject requests that
  // would overflow or turn negative in the narrowing conversion below.
  constexpr size_t kMaxRpcAllocBytes = static_cast<size_t>(INT32_MAX);
  if (effective_alignment > kMaxRpcAllocBytes ||
      bytes > kMaxRpcAllocBytes - effective_alignment) {
    QNN_EXECUTORCH_LOG_ERROR(
        "Requested shared memory size exceeds the RPC allocator limit.");
    return nullptr;
  }

  // do alignment:
  const auto allocate_bytes =
      static_cast<int32_t>(bytes + effective_alignment);
  void* buf = rpc_mem_alloc_(
      RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, allocate_bytes);
  if (buf == nullptr) {
    QNN_EXECUTORCH_LOG_WARN("Failed to allocate the tensor by RPC memory.");
    return nullptr;
  }

  void* aligned_buf = reinterpret_cast<void*>(
      alignTo(effective_alignment, reinterpret_cast<intptr_t>(buf)));
  if (!restore_map_.insert({aligned_buf, buf}).second) {
    // The aligned address is already tracked, so this allocation cannot be
    // registered. Release it and report failure instead of returning a
    // pointer into memory that has just been freed.
    QNN_EXECUTORCH_LOG_ERROR("Failed to allocate the tensor by RPC memory.");
    rpc_mem_free_(buf);
    return nullptr;
  }
  // Assign rather than insert so that a leftover entry for a reused address
  // cannot keep an outdated size.
  allocated_size_map_[buf] = allocate_bytes;
  return aligned_buf;
}

// Returns the result of the loaded rpcmem_to_fd function for `buf`, or -1
// (after logging an error) when the manager has not been initialized.
int32_t SharedBuffer::MemToFd(void* buf) {
  if (initialize_) {
    return rpc_mem_to_fd_(buf);
  }
  QNN_EXECUTORCH_LOG_ERROR("Shared memory not initialized.");
  return -1;
}

void SharedBuffer::FreeMem(void* buf) {
  if (!initialize_) {
    QNN_EXECUTORCH_LOG_ERROR("Shared memory not initialized.");
  } else if (restore_map_.count(buf) == 0) {
    QNN_EXECUTORCH_LOG_WARN("Don't free an unallocated tensor.");
  } else {
    rpc_mem_free_(restore_map_[buf]);
    restore_map_.erase(buf);
  }
}

// Reports whether `buf` is currently a key of restore_map_, i.e. an aligned
// pointer that AllocMem registered and FreeMem has not yet removed.
bool SharedBuffer::IsAllocated(void* buf) {
  return restore_map_.find(buf) != restore_map_.end();
}

// Opens libcdsprpc.so and resolves rpcmem_alloc, rpcmem_free and
// rpcmem_to_fd from it.
//
// @return Error::Ok when the library and all three symbols are available;
//         Error::Internal otherwise. On failure the library handle and the
//         function pointers are left as nullptr.
Error SharedBuffer::Load() {
  // On Android, 32-bit and 64-bit libcdsprpc.so can be found at /vendor/lib/
  // and /vendor/lib64/ respectively.
  lib_cdsp_rpc_ = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL);
  if (lib_cdsp_rpc_ == nullptr) {
    // dlerror() may return NULL, which must not be passed to "%s".
    const char* open_error = dlerror();
    QNN_EXECUTORCH_LOG_ERROR(
        "Unable to load shared buffer. dlerror(): %s",
        open_error != nullptr ? open_error : "unknown error");
    return Error::Internal;
  }
  rpc_mem_alloc_ = reinterpret_cast<RpcMemAllocFn_t>( // NOLINT
      dlsym(lib_cdsp_rpc_, "rpcmem_alloc"));
  rpc_mem_free_ = reinterpret_cast<RpcMemFreeFn_t>( // NOLINT
      dlsym(lib_cdsp_rpc_, "rpcmem_free"));
  rpc_mem_to_fd_ = reinterpret_cast<RpcMemToFdFn_t>( // NOLINT
      dlsym(lib_cdsp_rpc_, "rpcmem_to_fd"));
  if (nullptr == rpc_mem_alloc_ || nullptr == rpc_mem_free_ ||
      nullptr == rpc_mem_to_fd_) {
    // Read the error before dlclose() so the message still refers to the
    // symbol lookup. A later successful dlsym() may have cleared it, hence
    // the NULL check.
    const char* symbol_error = dlerror();
    QNN_EXECUTORCH_LOG_ERROR(
        "Unable to access symbols in shared buffer. dlerror(): %s",
        symbol_error != nullptr ? symbol_error : "unknown error");
    dlclose(lib_cdsp_rpc_);
    // Do not keep a closed handle or pointers into an unloaded library.
    lib_cdsp_rpc_ = nullptr;
    rpc_mem_alloc_ = nullptr;
    rpc_mem_free_ = nullptr;
    rpc_mem_to_fd_ = nullptr;
    return Error::Internal;
  }
  return Error::Ok;
}

// Records that `tensor_addr` belongs to the custom memory `custom_mem`.
// An existing entry for the same `tensor_addr` is left untouched.
void SharedBuffer::AddCusomMemTensorAddr(void* tensor_addr, void* custom_mem) {
  tensor_addr_to_custom_mem_.emplace(tensor_addr, custom_mem);
}

// Stores `info` in custom_mem_tensor_info_set_ and records its
// tensor_addr -> custom_mem association in tensor_addr_to_custom_mem_.
// Entries that already exist in either container are left untouched.
void SharedBuffer::AddCusomMemTensorInfo(const CustomMemTensorInfo& info) {
  custom_mem_tensor_info_set_.insert(info);
  tensor_addr_to_custom_mem_.emplace(info.tensor_addr, info.custom_mem);
}

// Closes the library handle held in lib_cdsp_rpc_.
//
// @return Error::Ok when dlclose() succeeds; otherwise logs the dlerror()
//         text and returns Error::Internal.
Error SharedBuffer::UnLoad() {
  const int close_result = dlclose(lib_cdsp_rpc_);
  if (close_result == 0) {
    return Error::Ok;
  }
  QNN_EXECUTORCH_LOG_ERROR(
      "Unable to close shared buffer. dlerror(): %s", dlerror());
  return Error::Internal;
}
} // namespace qnn
} // namespace backends
} // namespace executorch