/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define ATRACE_TAG ATRACE_TAG_PACKAGE_MANAGER

#include "apexd_loop.h"

#include <ApexProperties.sysprop.h>
#include <android-base/file.h>
#include <android-base/logging.h>
#include <android-base/parseint.h>
#include <android-base/properties.h>
#include <android-base/stringprintf.h>
#include <android-base/strings.h>
#include <dirent.h>
#include <fcntl.h>
#include <libdm/dm.h>
#include <linux/fs.h>
#include <linux/loop.h>
#include <string>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <sys/statfs.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <unistd.h>
#include <utils/Trace.h>

#include <array>
#include <filesystem>
#include <mutex>
#include <string_view>

#include "apexd_utils.h"

using android::base::Basename;
using android::base::ErrnoError;
using android::base::Error;
using android::base::GetBoolProperty;
using android::base::ParseUint;
using android::base::ReadFileToString;
using android::base::Result;
using android::base::StartsWith;
using android::base::StringPrintf;
using android::base::unique_fd;
using android::dm::DeviceMapper;

namespace android {
namespace apex {
namespace loop {

// Identifier written into `lo_crypt_name` of every loop device configured by
// ConfigureLoopDevice(); DestroyLoopDevice() only clears loop devices whose
// `lo_crypt_name` starts with this prefix.
static constexpr const char* kApexLoopIdPrefix = "apex:";

// 128 kB read-ahead, which we currently use for /system as well
// Used by ConfigureReadAhead() as the fallback when the loopback_readahead
// sysprop has no value.
static constexpr const unsigned int kReadAheadKb = 128;

// If this object still holds a loop device fd, ask the kernel to disassociate
// the backing file from that device (LOOP_CLR_FD). A failure is only logged.
void LoopbackDeviceUniqueFd::MaybeCloseBad() {
  const int raw_fd = device_fd.get();
  if (raw_fd == -1) {
    // Nothing is held, so there is nothing to detach.
    return;
  }
  const int rc = ioctl(raw_fd, LOOP_CLR_FD);
  if (rc == -1) {
    PLOG(ERROR) << "Unable to clear fd for loopback device";
  }
}

// Switches the block device at `device_path` to a pass-through I/O scheduler
// by writing to /sys/block/<name>/queue/scheduler.
//
// Returns an error when `device_path` does not start with "/dev/", when the
// sysfs attribute cannot be opened or read, or when none of the candidate
// scheduler names could be written. Returns success without writing when the
// trimmed attribute content already equals one of the candidates.
Result<void> ConfigureScheduler(const std::string& device_path) {
  ATRACE_NAME("ConfigureScheduler");
  if (!StartsWith(device_path, "/dev/")) {
    return Error() << "Invalid argument " << device_path;
  }

  const std::string sysfs_path = StringPrintf(
      "/sys/block/%s/queue/scheduler", Basename(device_path).c_str());
  unique_fd scheduler_fd(open(sysfs_path.c_str(), O_RDWR | O_CLOEXEC));
  if (scheduler_fd.get() == -1) {
    return ErrnoError() << "Failed to open " << sysfs_path;
  }

  // Kernels before v4.1 only support 'noop'. Kernels [v4.1, v5.0) support
  // 'noop' and 'none'. Kernels v5.0 and later only support 'none'.
  static constexpr const std::array<std::string_view, 2> kNoScheduler = {
      "none", "noop"};

  std::string current;
  if (!ReadFileToString(sysfs_path, &current)) {
    return ErrnoError() << "Failed to read " << sysfs_path;
  }
  current = android::base::Trim(current);
  const auto already_set =
      std::count(kNoScheduler.begin(), kNoScheduler.end(), current);
  if (already_set != 0) {
    return {};
  }

  // Try every candidate in order; the first one the kernel accepts wins.
  for (const std::string_view& candidate : kNoScheduler) {
    const int written =
        write(scheduler_fd.get(), candidate.data(), candidate.size());
    if (written > 0) {
      return {};
    }
  }

  // No candidate was accepted; errno is the one left by the last write().
  return ErrnoError() << "Failed to write to " << sysfs_path;
}

// Maps a partition name to the name of the device that contains it, e.g.
// "sda26" -> "sda". The lookup scans the entries of /sys/class/block for one
// that has a child named `blockdev`; when no entry matches, `blockdev` itself
// is returned. Names containing '/' are rejected with an error.
static Result<std::string> PartitionParent(const std::string& blockdev) {
  const bool has_separator = blockdev.find('/') != std::string::npos;
  if (has_separator) {
    return Error() << "Invalid argument " << blockdev;
  }
  std::error_code ec;
  for (const auto& candidate :
       std::filesystem::directory_iterator("/sys/class/block", ec)) {
    const std::string candidate_dir = candidate.path().string();
    const std::string nested =
        StringPrintf("%s/%s", candidate_dir.c_str(), blockdev.c_str());
    if (!std::filesystem::exists(nested)) {
      continue;
    }
    return Basename(candidate_dir);
  }
  return blockdev;
}

// Looks up the name of the node under /dev/block whose device number
// (st_rdev) equals `dev`. Entries that cannot be stat()ed are skipped.
// Returns an empty string when no node matches.
static std::string BlockdevName(dev_t dev) {
  std::error_code ec;
  for (const auto& node :
       std::filesystem::directory_iterator("/dev/block", ec)) {
    const std::string node_path = node.path().string();
    struct stat node_stat;
    const bool stat_ok = stat(node_path.c_str(), &node_stat) >= 0;
    if (stat_ok && dev == node_stat.st_rdev) {
      return Basename(node_path);
    }
  }
  return std::string();
}

// For file `file_path`, retrieve the block device backing the filesystem on
// which the file exists and return the queue depth of the block device. The
// loop in this function may e.g. traverse the following hierarchy:
// /dev/block/dm-9 (system-verity; dm-verity)
// -> /dev/block/dm-1 (system_b; dm-linear)
// -> /dev/sda26
//
// The queue depth is read from /sys/class/block/<dev>/mq/0/nr_tags of the
// partition's parent device.
//
// Returns an error when:
//  - `file_path` cannot be stat()ed,
//  - no node under /dev/block matches the file's st_dev,
//  - the resolved path lacks the /dev/block/ prefix,
//  - the partition parent cannot be determined,
//  - nr_tags cannot be read or is not an unsigned integer.
static Result<uint32_t> BlockDeviceQueueDepth(const std::string& file_path) {
  struct stat statbuf;
  int res = stat(file_path.c_str(), &statbuf);
  if (res < 0) {
    return ErrnoErrorf("stat({})", file_path.c_str());
  }
  // Check the lookup result *before* prepending the directory: once the
  // prefix is added the string can never be empty, and continuing with a bare
  // "/dev/block/" would resolve to an unrelated device further down.
  const std::string blockdev_name = BlockdevName(statbuf.st_dev);
  if (blockdev_name.empty()) {
    return Errorf("Failed to convert {}:{} (path {})", major(statbuf.st_dev),
                  minor(statbuf.st_dev), file_path.c_str());
  }
  std::string blockdev = "/dev/block/" + blockdev_name;
  LOG(VERBOSE) << file_path << " -> " << blockdev;
  auto& dm = DeviceMapper::Instance();
  // Walk down the device-mapper stack until a device without a parent is
  // reached.
  for (;;) {
    std::optional<std::string> child = dm.GetParentBlockDeviceByPath(blockdev);
    if (!child) {
      break;
    }
    LOG(VERBOSE) << blockdev << " -> " << *child;
    blockdev = *child;
  }
  std::optional<std::string> maybe_blockdev =
      android::dm::ExtractBlockDeviceName(blockdev);
  if (!maybe_blockdev) {
    return Error() << "Failed to remove /dev/block/ prefix from " << blockdev;
  }
  Result<std::string> maybe_parent = PartitionParent(*maybe_blockdev);
  if (!maybe_parent.ok()) {
    return Error() << "Failed to determine parent of " << *maybe_blockdev;
  }
  blockdev = *maybe_parent;
  LOG(VERBOSE) << "Partition parent: " << blockdev;
  const std::string nr_tags_path =
      StringPrintf("/sys/class/block/%s/mq/0/nr_tags", blockdev.c_str());
  std::string nr_tags;
  if (!ReadFileToString(nr_tags_path, &nr_tags)) {
    return ErrnoError() << "Failed to read " << nr_tags_path;
  }
  nr_tags = android::base::Trim(nr_tags);
  LOG(VERBOSE) << file_path << " is backed by /dev/" << blockdev
               << " and that block device supports queue depth " << nr_tags;
  // Parse with validation so that garbage is reported instead of being
  // silently turned into a queue depth of 0 (and to avoid a long -> uint32_t
  // narrowing).
  uint32_t queue_depth = 0;
  if (!ParseUint(nr_tags.c_str(), &queue_depth)) {
    return Error() << "Failed to parse " << nr_tags << " read from "
                   << nr_tags_path;
  }
  return queue_depth;
}

// Makes 'nr_requests' of the loop device at `loop_device_path` match the
// queue depth of the block device that backs `file_path`.
//
// Returns an error when `loop_device_path` does not start with "/dev/", when
// the current nr_requests value cannot be read, parsed or opened for writing,
// or when the backing queue depth cannot be determined. A failed write is
// reported only when it was an attempt to lower the value.
Result<void> ConfigureQueueDepth(const std::string& loop_device_path,
                                 const std::string& file_path) {
  ATRACE_NAME("ConfigureQueueDepth");
  if (!StartsWith(loop_device_path, "/dev/")) {
    return Error() << "Invalid argument " << loop_device_path;
  }

  const std::string sysfs_path =
      StringPrintf("/sys/block/%s/queue/nr_requests",
                   Basename(loop_device_path).c_str());

  std::string current_text;
  if (!ReadFileToString(sysfs_path, &current_text)) {
    return ErrnoError() << "Failed to read " << sysfs_path;
  }
  current_text = android::base::Trim(current_text);
  uint32_t current_depth = 0;
  if (!ParseUint(current_text.c_str(), &current_depth)) {
    return Error() << "Failed to parse " << current_text;
  }

  unique_fd nr_requests_fd(open(sysfs_path.c_str(), O_RDWR | O_CLOEXEC));
  if (nr_requests_fd.get() == -1) {
    return ErrnoErrorf("Failed to open {}", sysfs_path);
  }

  const auto backing_depth = BlockDeviceQueueDepth(file_path);
  if (!backing_depth.ok()) {
    return backing_depth.error();
  }
  const uint32_t wanted_depth = *backing_depth;
  if (wanted_depth == current_depth) {
    // Already configured as desired.
    return {};
  }

  // Only report write failures if reducing the queue depth. Attempts to
  // increase the queue depth are rejected by the kernel if no I/O scheduler
  // is associated with the request queue.
  const bool written =
      WriteStringToFd(StringPrintf("%u", wanted_depth), nr_requests_fd);
  if (!written && wanted_depth < current_depth) {
    return ErrnoErrorf("Failed to write {} to {}", wanted_depth, sysfs_path);
  }
  return {};
}

// Writes the read-ahead size (in kB) to
// /sys/block/<name>/queue/read_ahead_kb of the device at `device_path`.
// The value comes from the loopback_readahead sysprop, falling back to
// kReadAheadKb when the property has no value.
//
// `device_path` must start with "/dev/" (enforced with CHECK). Returns an
// error when the sysfs attribute cannot be opened or written.
Result<void> ConfigureReadAhead(const std::string& device_path) {
  ATRACE_NAME("ConfigureReadAhead");
  CHECK(StartsWith(device_path, "/dev/"));

  const std::string sysfs_device = StringPrintf(
      "/sys/block/%s/queue/read_ahead_kb", Basename(device_path).c_str());
  unique_fd read_ahead_fd(open(sysfs_device.c_str(), O_RDWR | O_CLOEXEC));
  if (read_ahead_fd.get() == -1) {
    return ErrnoError() << "Failed to open " << sysfs_device;
  }

  const auto read_ahead_kb =
      android::sysprop::ApexProperties::loopback_readahead().value_or(
          kReadAheadKb);
  const std::string payload = std::to_string(read_ahead_kb);

  const int rc = TEMP_FAILURE_RETRY(
      write(read_ahead_fd.get(), payload.c_str(), payload.length()));
  if (rc < 0) {
    return ErrnoError() << "Failed to write to " << sysfs_device;
  }

  return {};
}

// Makes sure `num` idle loop devices exist, creating the missing ones through
// /dev/loop-control. Device IDs are probed in increasing order starting at 0;
// an ID counts towards `num` when it was newly created or when it already
// exists and is not in use.
//
// Returns an error when /dev/loop-control does not become available (see
// WaitForFile), cannot be opened, or when LOOP_CTL_ADD fails with anything
// other than EEXIST.
Result<void> PreAllocateLoopDevices(size_t num) {
  Result<void> loop_ready = WaitForFile("/dev/loop-control", 20s);
  if (!loop_ready.ok()) {
    return loop_ready;
  }
  unique_fd ctl_fd(
      TEMP_FAILURE_RETRY(open("/dev/loop-control", O_RDWR | O_CLOEXEC)));
  if (ctl_fd.get() == -1) {
    return ErrnoError() << "Failed to open loop-control";
  }

  int new_allocations = 0;  // for logging purpose

  // Assumption: loop device ID [0..num) is valid.
  // This is because pre-allocation happens during bootstrap.
  // Anyway Kernel pre-allocated loop devices
  // as many as CONFIG_BLK_DEV_LOOP_MIN_COUNT,
  // Within the amount of kernel-pre-allocation,
  // LOOP_CTL_ADD will fail with EEXIST
  for (size_t id = 0ul, cnt = 0; cnt < num; ++id) {
    int ret = ioctl(ctl_fd.get(), LOOP_CTL_ADD, id);
    // On success LOOP_CTL_ADD returns the index of the new device, which is 0
    // for loop0. Anything non-negative is therefore a success; only a
    // negative return leaves a meaningful errno behind.
    if (ret >= 0) {
      new_allocations++;
      cnt++;
    } else if (errno == EEXIST) {
      // When LOOP_CTL_ADD failed with EEXIST, it can check
      // whether it is already in use.
      // Otherwise, the loop devices pre-allocated by the kernel can be used.
      std::string loop_device = StringPrintf("/sys/block/loop%zu/loop", id);
      if (access(loop_device.c_str(), F_OK) == 0) {
        LOG(WARNING) << "Loop device " << id << " already in use";
      } else {
        cnt++;
      }
    } else {
      return ErrnoError() << "Failed LOOP_CTL_ADD id = " << id;
    }
  }

  // Don't wait until the dev nodes are actually created, which
  // will delay the boot. By simply returning here, the creation of the dev
  // nodes will be done in parallel with other boot processes, and we
  // just optimistically hope that they are all created when we actually
  // access them for activating APEXes. If the dev nodes are not ready
  // even then, we wait 50ms and a warning message will be printed (see
  // WaitForLoopDevice() below).
  LOG(INFO) << "Found " << (num - new_allocations)
            << " idle loopback devices that were "
            << "pre-allocated by kernel. Allocated " << new_allocations
            << " more.";
  return {};
}

// Handle for a loop device that has been opened but has no backing file
// attached yet. Once the backing file is set, ToOwned() hands both members
// over to a LoopbackDeviceUniqueFd.
struct EmptyLoopDevice {
  // Open descriptor of the loop device node.
  unique_fd fd;
  // Path of the device node that `fd` was opened from.
  std::string name;

  // Moves `fd` and `name` out of this object into the returned owner; this
  // object must not be used for the device afterwards.
  LoopbackDeviceUniqueFd ToOwned() {
    return LoopbackDeviceUniqueFd{std::move(fd), std::move(name)};
  }
};

// Attaches the file `target` to the not-yet-bound loop device `inner` and
// returns an owning handle for the configured device.
//
// The device exposes `image_size` bytes of `target` starting at
// `image_offset` (lo_sizelimit / lo_offset), is tagged with
// kApexLoopIdPrefix in lo_crypt_name, and has LO_FLAGS_AUTOCLEAR set.
//
// `target` is opened with O_DIRECT first. If that fails and the file lives
// on EROFS, squashfs or overlayfs, it is reopened for buffered I/O.
//
// The kernel is probed once per process for LOOP_CONFIGURE. When available
// the device is set up with that single ioctl; otherwise the sequence
// LOOP_SET_FD, LOOP_SET_STATUS64, BLKFLSBUF, LOOP_SET_BLOCK_SIZE is used.
//
// Returns an error when `target` cannot be opened or when any required ioctl
// fails. A LOOP_SET_BLOCK_SIZE failure is only logged.
static Result<LoopbackDeviceUniqueFd> ConfigureLoopDevice(
    EmptyLoopDevice&& inner, const std::string& target,
    const uint32_t image_offset, const size_t image_size) {
  static bool use_loop_configure;
  static std::once_flag once_flag;
  auto device_fd = inner.fd.get();
  std::call_once(once_flag, [&]() {
    // LOOP_CONFIGURE is a new ioctl in Linux 5.8 (and backported in Android
    // common) that allows atomically configuring a loop device. It is a lot
    // faster than the traditional LOOP_SET_FD/LOOP_SET_STATUS64 combo, but
    // it may not be available on updating devices, so try once before
    // deciding.
    struct loop_config config;
    memset(&config, 0, sizeof(config));
    config.fd = -1;
    if (ioctl(device_fd, LOOP_CONFIGURE, &config) == -1 && errno == EBADF) {
      // If the IOCTL exists, it will fail with EBADF for the -1 fd
      use_loop_configure = true;
    }
  });

  /*
   * Using O_DIRECT will tell the kernel that we want to use Direct I/O
   * on the underlying file, which we want to do to avoid double caching.
   * Note that Direct I/O won't be enabled immediately, because the block
   * size of the underlying block device may not match the default loop
   * device block size (512); when we call LOOP_SET_BLOCK_SIZE below, the
   * kernel driver will automatically enable Direct I/O when it sees that
   * condition is now met.
   */
  bool use_buffered_io = false;
  unique_fd target_fd(open(target.c_str(), O_RDONLY | O_CLOEXEC | O_DIRECT));
  if (target_fd.get() == -1) {
    struct statfs stbuf;
    // statfs() below may overwrite errno; keep the open() failure reason.
    int saved_errno = errno;
    // let's give another try with buffered I/O for EROFS and squashfs
    if (statfs(target.c_str(), &stbuf) != 0 ||
        (stbuf.f_type != EROFS_SUPER_MAGIC_V1 &&
         stbuf.f_type != SQUASHFS_MAGIC &&
         stbuf.f_type != OVERLAYFS_SUPER_MAGIC)) {
      return Error(saved_errno) << "Failed to open " << target;
    }
    LOG(WARNING) << "Fallback to buffered I/O for " << target;
    use_buffered_io = true;
    target_fd.reset(open(target.c_str(), O_RDONLY | O_CLOEXEC));
    if (target_fd.get() == -1) {
      return ErrnoError() << "Failed to open " << target;
    }
  }

  struct loop_info64 li;
  memset(&li, 0, sizeof(li));
  strlcpy((char*)li.lo_crypt_name, kApexLoopIdPrefix, LO_NAME_SIZE);
  li.lo_offset = image_offset;
  li.lo_sizelimit = image_size;
  // Automatically free loop device on last close.
  li.lo_flags |= LO_FLAGS_AUTOCLEAR;

  if (use_loop_configure) {
    struct loop_config config;
    memset(&config, 0, sizeof(config));
    config.fd = target_fd.get();
    config.info = li;
    config.block_size = 4096;
    if (!use_buffered_io) {
      // Set the flag on the copy that is handed to the kernel. Setting it on
      // `li` at this point would have no effect because `li` has already
      // been copied into `config.info`.
      config.info.lo_flags |= LO_FLAGS_DIRECT_IO;
    }

    if (ioctl(device_fd, LOOP_CONFIGURE, &config) == -1) {
      return ErrnoError() << "Failed to LOOP_CONFIGURE";
    }

    return inner.ToOwned();
  } else {
    if (ioctl(device_fd, LOOP_SET_FD, target_fd.get()) == -1) {
      return ErrnoError() << "Failed to LOOP_SET_FD";
    }
    // Now, we have a fully-owned loop device.
    LoopbackDeviceUniqueFd loop_device = inner.ToOwned();

    if (ioctl(device_fd, LOOP_SET_STATUS64, &li) == -1) {
      return ErrnoError() << "Failed to LOOP_SET_STATUS64";
    }

    // The BLKFLSBUF call below works around a kernel bug where the following
    // happens.
    // 1) The device runs with a value of loop.max_part > 0
    // 2) As part of LOOP_SET_FD above, we do a partition scan, which loads
    //    the first 2 pages of the underlying file into the buffer cache
    // 3) When we then change the offset with LOOP_SET_STATUS64, those pages
    //    are not invalidated from the cache.
    // 4) When we try to mount an ext4 filesystem on the loop device, the ext4
    //    code will try to find a superblock by reading 4k at offset 0; but,
    //    because we still have the old pages at offset 0 lying in the cache,
    //    those pages will be returned directly. However, those pages contain
    //    the data at offset 0 in the underlying file, not at the offset that
    //    we configured
    // 5) the ext4 driver fails to find a superblock in the (wrong) data, and
    //    fails to mount the filesystem.
    //
    // To work around this, explicitly flush the block device, which will
    // flush the buffer cache and make sure we actually read the data at the
    // correct offset.
    if (ioctl(device_fd, BLKFLSBUF, 0) == -1) {
      return ErrnoError() << "Failed to flush buffers on the loop device";
    }

    // Direct-IO requires the loop device to have the same block size as the
    // underlying filesystem.
    if (ioctl(device_fd, LOOP_SET_BLOCK_SIZE, 4096) == -1) {
      PLOG(WARNING) << "Failed to LOOP_SET_BLOCK_SIZE";
    }
    return loop_device;
  }
}

// Opens the device node of loop device number `num`, polling every 50ms until
// it shows up. Both /dev/block/loop<num> and /dev/loop<num> are tried.
//
// The number of polling rounds comes from the loop_wait_attempts sysprop
// (default 3); while "ro.cold_boot_done" is still false the round counter is
// restarted after every failed round. Returns an error when the node could
// not be opened within the allowed rounds.
static Result<EmptyLoopDevice> WaitForLoopDevice(int num) {
  const std::vector<std::string> node_paths = {
      StringPrintf("/dev/block/loop%d", num),
      StringPrintf("/dev/loop%d", num),
  };

  // apexd-bootstrap runs in parallel with ueventd to optimize boot time. In
  // rare cases apexd would attempt to mount an apex before ueventd created
  // a loop device for it. To work around this we keep polling for the loop
  // device to be created until ueventd's cold boot sequence is done.
  bool cold_boot_done = GetBoolProperty("ro.cold_boot_done", false);

  // Even though the kernel has created the loop device, we still depend on
  // ueventd to run to actually create the device node in userspace. To solve
  // this properly we should listen on the netlink socket for uevents, or use
  // inotify. For now, this will have to do.
  const size_t max_rounds =
      android::sysprop::ApexProperties::loop_wait_attempts().value_or(3u);
  size_t round = 0;
  while (round != max_rounds) {
    if (!cold_boot_done) {
      cold_boot_done = GetBoolProperty("ro.cold_boot_done", false);
    }
    for (const auto& node_path : node_paths) {
      unique_fd node_fd(open(node_path.c_str(), O_RDWR | O_CLOEXEC));
      if (node_fd.get() != -1) {
        return EmptyLoopDevice{std::move(node_fd), node_path};
      }
    }
    PLOG(WARNING) << "Loopback device " << num << " not ready. Waiting 50ms...";
    usleep(50000);
    // ueventd hasn't finished cold boot yet: restart the count, so the next
    // round is number 1 again. Otherwise advance to the next round.
    round = cold_boot_done ? round + 1 : 1;
  }

  return Error() << "Failed to open loopback device " << num;
}

// Obtains a free loop device from /dev/loop-control, waits for its device
// node and binds `target` to it via ConfigureLoopDevice().
//
// Everything from LOOP_CTL_GET_FREE until the device is configured runs under
// a function-local mutex. Returns an error when loop-control cannot be
// opened, no free device number is returned, the device node does not appear,
// or configuration fails.
static Result<LoopbackDeviceUniqueFd> CreateLoopDevice(
    const std::string& target, uint32_t image_offset, size_t image_size) {
  ATRACE_NAME("CreateLoopDevice");

  unique_fd control_fd(open("/dev/loop-control", O_RDWR | O_CLOEXEC));
  if (control_fd.get() == -1) {
    return ErrnoError() << "Failed to open loop-control";
  }

  static std::mutex mtx;
  std::lock_guard lock(mtx);

  const int free_index = ioctl(control_fd.get(), LOOP_CTL_GET_FREE);
  if (free_index == -1) {
    return ErrnoError() << "Failed LOOP_CTL_GET_FREE";
  }

  auto unbound_device = OR_RETURN(WaitForLoopDevice(free_index));
  CHECK_NE(unbound_device.fd.get(), -1);

  return ConfigureLoopDevice(std::move(unbound_device), target, image_offset,
                             image_size);
}

// Creates a loop device backed by `target` (window given by `image_offset`
// and `image_size`) and tunes its block-layer settings.
//
// Creation is retried every 5ms for up to one second. Afterwards the I/O
// scheduler and queue depth are configured on a best-effort basis (failures
// are only logged), whereas a failure to configure read-ahead is returned as
// an error.
Result<LoopbackDeviceUniqueFd> CreateAndConfigureLoopDevice(
    const std::string& target, uint32_t image_offset, size_t image_size) {
  ATRACE_NAME("CreateAndConfigureLoopDevice");
  // Do minimal amount of work while holding a mutex. We need it because
  // acquiring + configuring a loop device is not atomic. Ideally we should
  // pre-acquire all the loop devices in advance, so that when we run APEX
  // activation in-parallel, we can do it without holding any lock.
  // Unfortunately, this will require some refactoring of how we manage loop
  // devices, and probably some new loop-control ioctls, so for the time being
  // we just limit the scope that requires locking.
  android::base::Timer timer;
  Result<LoopbackDeviceUniqueFd> loop_device;
  while (timer.duration() < 1s) {
    loop_device = CreateLoopDevice(target, image_offset, image_size);
    if (loop_device.ok()) {
      break;
    }
    std::this_thread::sleep_for(5ms);
  }

  if (!loop_device.ok()) {
    return loop_device.error();
  }

  // Best effort: a device with the default scheduler still works.
  if (Result<void> sched_status = ConfigureScheduler(loop_device->name);
      !sched_status.ok()) {
    LOG(WARNING) << "Configuring I/O scheduler failed: "
                 << sched_status.error();
  }

  // Best effort as well.
  if (Result<void> qd_status = ConfigureQueueDepth(loop_device->name, target);
      !qd_status.ok()) {
    LOG(WARNING) << qd_status.error();
  }

  // Unlike the two steps above, a read-ahead failure aborts the setup.
  if (Result<void> read_ahead_status = ConfigureReadAhead(loop_device->name);
      !read_ahead_status.ok()) {
    return read_ahead_status.error();
  }

  return loop_device;
}

void DestroyLoopDevice(const std::string& path, const DestroyLoopFn& extra) {
  unique_fd fd(open(path.c_str(), O_RDWR | O_CLOEXEC));
  if (fd.get() == -1) {
    if (errno != ENOENT) {
      PLOG(WARNING) << "Failed to open " << path;
    }
    return;
  }

  struct loop_info64 li;
  if (ioctl(fd.get(), LOOP_GET_STATUS64, &li) < 0) {
    if (errno != ENXIO) {
      PLOG(WARNING) << "Failed to LOOP_GET_STATUS64 " << path;
    }
    return;
  }

  auto id = std::string((char*)li.lo_crypt_name);
  if (StartsWith(id, kApexLoopIdPrefix)) {
    extra(path, id);

    if (ioctl(fd.get(), LOOP_CLR_FD, 0) < 0) {
      PLOG(WARNING) << "Failed to LOOP_CLR_FD " << path;
    }
  }
}

}  // namespace loop
}  // namespace apex
}  // namespace android
