// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Implementation file for the sandbox2::Client class.

#include "sandboxed_api/sandbox2/client.h"

#include <fcntl.h>
#include <linux/bpf_common.h>
#include <linux/filter.h>
#include <linux/seccomp.h>
#include <sys/prctl.h>
#include <syscall.h>
#include <unistd.h>

#include <atomic>
#include <cerrno>
#include <cinttypes>
#include <cstdint>
#include <cstdlib>
#include <limits>
#include <memory>
#include <string>
#include <thread>  // NOLINT(build/c++11)
#include <utility>
#include <vector>

#include "absl/base/attributes.h"
#include "absl/base/macros.h"
#include "absl/container/flat_hash_map.h"
#include "absl/status/status.h"
#include "absl/strings/numbers.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_join.h"
#include "absl/strings/str_split.h"
#include "absl/strings/string_view.h"
#include "sandboxed_api/sandbox2/comms.h"
#include "sandboxed_api/sandbox2/policy.h"
#include "sandboxed_api/sandbox2/sanitizer.h"
#include "sandboxed_api/sandbox2/syscall.h"
#include "sandboxed_api/sandbox2/util/bpf_helper.h"
#include "sandboxed_api/util/raw_logging.h"

#ifndef SECCOMP_FILTER_FLAG_NEW_LISTENER
#define SECCOMP_FILTER_FLAG_NEW_LISTENER (1UL << 3)
#endif

namespace sandbox2 {
namespace {

// Installs `prog` as a seccomp filter with a user-notification listener and
// sends the resulting listener FD to the peer through `comms`.
//
// The FD is sent from a helper thread that is started before the filter is
// installed on the calling thread. After the helper has published its thread
// ID, a second, small filter is installed with SECCOMP_FILTER_FLAG_TSYNC.
//
// `prog` is taken by value; its `len` and `filter` fields are overwritten
// below to describe the second filter. Any failure is fatal.
void InitSeccompUnotify(sock_fprog prog, Comms* comms) {
  // The policy might not allow sending the notify FD.
  // Create a separate thread that won't get the seccomp policy to send the FD.
  // Synchronize with it using plain atomics + seccomp TSYNC, so we don't need
  // any additional syscalls.
  std::atomic<int> fd(-1);   // Listener FD, published by the calling thread.
  std::atomic<int> tid(-1);  // Helper thread ID, published by the helper.

  std::thread th([comms, &fd, &tid]() {
    // Spin until the calling thread has stored the listener FD.
    int notify_fd = -1;
    while (notify_fd == -1) {
      notify_fd = fd.load(std::memory_order_seq_cst);
    }
    SAPI_RAW_CHECK(comms->SendFD(notify_fd), "sending unotify fd");
    SAPI_RAW_CHECK(close(notify_fd) == 0, "closing unotify fd");
    // Install a one-instruction filter on this thread only (flags == 0).
    // NOTE(review): presumably this makes this thread's filter chain differ
    // from the calling thread's, so that the TSYNC call below keeps failing
    // with this thread's ID until the thread has exited -- confirm against
    // seccomp(2).
    sock_filter filter = ALLOW;
    struct sock_fprog allow_prog = {
        .len = 1,
        .filter = &filter,
    };
    int result = syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER, 0,
                         reinterpret_cast<uintptr_t>(&allow_prog));
    SAPI_RAW_PCHECK(result != -1, "setting seccomp filter");
    // Publish this thread's ID; the calling thread spins on it below.
    tid.store(syscall(__NR_gettid), std::memory_order_seq_cst);
  });
  // The helper is never joined. The lambda captures `fd` and `tid` by
  // reference, so they must stay alive until the helper has stored `tid`; the
  // spin loop below does not proceed before that store.
  th.detach();
  // Install the real policy on the calling thread. The non-error return value
  // is treated as the listener FD and handed to the helper through `fd`.
  int result = syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER,
                       SECCOMP_FILTER_FLAG_NEW_LISTENER,
                       reinterpret_cast<uintptr_t>(&prog));
  SAPI_RAW_PCHECK(result != -1, "setting seccomp filter");
  fd.store(result, std::memory_order_seq_cst);
  // Spin until the helper has sent the FD and published its thread ID.
  pid_t child = -1;
  while (child == -1) {
    child = tid.load(std::memory_order_seq_cst);
  }
  // Apply seccomp.
  // Second filter. Reading the jump offsets (and assuming ARG_32 expands to a
  // single instruction -- TODO confirm in bpf_helper.h):
  //   - a foreign architecture is answered with ALLOW;
  //   - a syscall other than seccomp skips 3 instructions and reaches ALLOW;
  //   - seccomp whose argument 3 equals kExecveMagic falls through to DENY;
  //   - any other seccomp call skips 1 instruction and reaches ALLOW.
  struct sock_filter code[] = {
      LOAD_ARCH,
      JNE32(sandbox2::Syscall::GetHostAuditArch(), ALLOW),
      LOAD_SYSCALL_NR,
      BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, __NR_seccomp, 0, 3),
      ARG_32(3),
      BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, internal::kExecveMagic, 0, 1),
      DENY,
      ALLOW,
  };
  prog.len = ABSL_ARRAYSIZE(code);
  prog.filter = code;
  // kExecveMagic is passed as an extra trailing syscall argument; it is the
  // value the filter above compares against.
  // The call is retried for as long as it returns the helper's thread ID.
  // NOTE(review): presumably that return value means the helper could not be
  // synchronized yet, i.e. it is still running -- confirm against seccomp(2).
  do {
    result = syscall(
        __NR_seccomp, SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
        reinterpret_cast<uintptr_t>(&prog), internal::kExecveMagic);
  } while (result == child);
  SAPI_RAW_CHECK(result == 0, "Enabling seccomp filter");
}

// Installs `prog` as a seccomp filter using SECCOMP_FILTER_FLAG_TSYNC.
//
// Two outcomes are treated as fatal: the syscall returning -1, and the syscall
// returning any other non-zero value (which is logged as a thread ID).
void InitSeccompRegular(sock_fprog prog) {
  const uintptr_t prog_addr = reinterpret_cast<uintptr_t>(&prog);
  const int result = syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER,
                             SECCOMP_FILTER_FLAG_TSYNC, prog_addr);
  // Plain failure of the syscall itself.
  SAPI_RAW_PCHECK(result != -1, "setting seccomp filter");
  // A positive result is reported below as the offending thread.
  SAPI_RAW_PCHECK(result == 0,
                  "synchronizing threads using SECCOMP_FILTER_FLAG_TSYNC flag "
                  "for thread=%d",
                  result);
}

}  // namespace

// Constructs a client that talks over `comms` and, if the kFDMapEnvVar
// environment variable is set, seeds fd_map_ from it.
//
// The variable is parsed as a comma-separated list of alternating name and FD
// number elements (the format GetFdMapEnvVar() produces). An unparsable FD
// number or a duplicate name is fatal. The variable is removed from the
// environment once it has been consumed.
Client::Client(Comms* comms) : comms_(comms) {
  const char* const serialized_map = getenv(kFDMapEnvVar);
  if (serialized_map == nullptr) {
    // No FD map was handed down through the environment.
    return;
  }

  // Splitting into a map pairs up consecutive elements as (key, value). The
  // string_views point into the environment string, so the variable must not
  // be unset before the loop below has finished.
  const absl::flat_hash_map<absl::string_view, absl::string_view> entries =
      absl::StrSplit(serialized_map, ',', absl::SkipEmpty());
  for (const auto& [name, mapped_fd] : entries) {
    int fd;
    SAPI_RAW_CHECK(absl::SimpleAtoi(mapped_fd, &fd), "failed to parse fd map");
    SAPI_RAW_CHECK(fd_map_.emplace(std::string(name), fd).second,
                   "could not insert mapping into fd map (duplicate)");
  }

  unsetenv(kFDMapEnvVar);
}

// Serializes fd_map_ into a `NAME=value` environment entry for kFDMapEnvVar.
// The value lists every mapping as `name,fd`, with mappings themselves also
// separated by commas.
std::string Client::GetFdMapEnvVar() const {
  const std::string serialized_map =
      absl::StrJoin(fd_map_, ",", absl::PairFormatter(","));
  return absl::StrCat(kFDMapEnvVar, "=", serialized_map);
}

// Prepares the process before the sandbox is enabled: first receives and
// installs the file descriptors (SetUpIPC), then fixes up and sets the current
// working directory (SetUpCwd).
//
// `preserved_fd` is forwarded unchanged to SetUpIPC(); it may be null.
void Client::PrepareEnvironment(int* preserved_fd) {
  SetUpIPC(preserved_fd);
  SetUpCwd();
}

// Receives the policy over comms_ (ReceivePolicy) and then applies it
// (ApplyPolicyAndBecomeTracee).
void Client::EnableSandbox() {
  ReceivePolicy();
  ApplyPolicyAndBecomeTracee();
}

// Convenience entry point: runs PrepareEnvironment() with its default
// argument, followed by EnableSandbox().
void Client::SandboxMeHere() {
  PrepareEnvironment();
  EnableSandbox();
}

// Establishes the current working directory of the process.
//
// First, if the current directory cannot be resolved to an absolute path
// (getcwd() fails with ENOENT or yields a path that does not start with '/'),
// the process changes into "/"; failure to do so is fatal. Any other getcwd()
// failure is fatal as well.
//
// Second, a working directory string is received over comms_. If it is
// non-empty the process tries to change into it; a failure there is only
// logged (at verbosity level 1) and otherwise ignored.
void Client::SetUpCwd() {
  {
    // Get the current working directory to check if we are in a mount
    // namespace.
    // Note: glibc 2.27 no longer returns a relative path in that case, but
    //       fails with ENOENT and returns a nullptr instead. The code still
    //       needs to run on lower version for the time being.
    char cwd_buf[PATH_MAX + 1] = {0};
    char* cwd = getcwd(cwd_buf, ABSL_ARRAYSIZE(cwd_buf));
    SAPI_RAW_PCHECK(cwd != nullptr || errno == ENOENT,
                    "no current working directory");

    // Outside of the mount namespace, the path is of the form
    // '(unreachable)/...'. Only check for the slash, since Linux might make up
    // other prefixes in the future.
    //
    // Test the getcwd() result rather than errno: errno is only meaningful
    // after a failed call and is not reset on success, so a stale ENOENT left
    // behind by unrelated earlier code must not trigger the corrective chdir.
    // After the check above, a null result implies errno == ENOENT.
    if (cwd == nullptr || cwd_buf[0] != '/') {
      SAPI_RAW_VLOG(1, "chdir into mount namespace, cwd was '%s'", cwd_buf);
      // If we are in a mount namespace but fail to chdir, then it can lead to a
      // sandbox escape -- we need to fail with FATAL if the chdir fails.
      SAPI_RAW_PCHECK(chdir("/") != -1, "corrective chdir");
    }
  }

  // Receive the user-supplied current working directory and change into it.
  std::string cwd;
  SAPI_RAW_CHECK(comms_->RecvString(&cwd), "receiving working directory");
  if (!cwd.empty()) {
    // On the other hand this chdir can fail without a sandbox escape. It will
    // probably not have the intended behavior though.
    if (chdir(cwd.c_str()) == -1 && SAPI_RAW_VLOG_IS_ON(1)) {
      SAPI_RAW_PLOG(
          INFO,
          "chdir(%s) failed, falling back to previous cwd or / (with "
          "namespaces). Use Executor::SetCwd() to set a working directory",
          cwd.c_str());
    }
  }
}

// Receives a list of file descriptors over comms_ and installs each one at
// the FD number requested for it.
//
// Wire format as read here: a uint32 count, followed by that many triples of
// (int32 requested FD number, the FD itself, name string). A requested number
// of -1 leaves the FD at the number it was received on. Entries with a
// non-empty name are recorded in fd_map_, which must be empty on entry.
//
// `preserved_fd` may be null. If it is non-null and a requested FD number
// collides with `*preserved_fd`, that FD is first duplicated to a fresh
// number and `*preserved_fd` is updated to it. The comms FD is likewise moved
// out of the way when a requested number collides with it.
//
// All receive failures, dup/dup2 failures and duplicate names are fatal.
void Client::SetUpIPC(int* preserved_fd) {
  uint32_t num_of_fd_pairs;
  SAPI_RAW_CHECK(comms_->RecvUint32(&num_of_fd_pairs),
                 "receiving number of fd pairs");
  SAPI_RAW_CHECK(fd_map_.empty(), "fd map not empty");

  SAPI_RAW_VLOG(1, "Will receive %d file descriptor pairs", num_of_fd_pairs);

  // Maps the current number of the preserved FD to the caller's variable, so
  // that the variable can be updated whenever the FD has to be moved.
  absl::flat_hash_map<int, int*> preserve_fds_map;
  if (preserved_fd) {
    preserve_fds_map.emplace(*preserved_fd, preserved_fd);
  }

  for (uint32_t i = 0; i < num_of_fd_pairs; ++i) {
    int32_t requested_fd;
    int32_t fd;
    std::string name;

    SAPI_RAW_CHECK(comms_->RecvInt32(&requested_fd), "receiving requested fd");
    SAPI_RAW_CHECK(comms_->RecvFD(&fd), "receiving current fd");
    SAPI_RAW_CHECK(comms_->RecvString(&name), "receiving name string");

    // The requested number is currently occupied by the preserved FD: move
    // the preserved FD to a new number before it gets overwritten below.
    if (auto it = preserve_fds_map.find(requested_fd);
        it != preserve_fds_map.end()) {
      int old_fd = it->first;
      int new_fd = dup(old_fd);
      SAPI_RAW_PCHECK(new_fd != -1, "Failed to duplicate preserved fd=%d",
                      old_fd);
      SAPI_RAW_LOG(INFO, "Moved preserved fd=%d to %d", old_fd, new_fd);
      close(old_fd);
      int* pfd = it->second;
      *pfd = new_fd;
      // Re-key the entry under the new number so later collisions are caught.
      preserve_fds_map.erase(it);
      preserve_fds_map.emplace(new_fd, pfd);
    }

    // The requested number is the comms connection itself: relocate comms so
    // the connection survives the dup2() below.
    if (requested_fd == comms_->GetConnectionFD()) {
      comms_->MoveToAnotherFd();
      SAPI_RAW_LOG(INFO,
                   "Trying to map over comms fd (%d). Remapped comms to %d",
                   requested_fd, comms_->GetConnectionFD());
    }

    // Nothing to do when no specific number was requested (-1) or the FD was
    // already received on the requested number.
    if (requested_fd != -1 && fd != requested_fd) {
      if (requested_fd > STDERR_FILENO && fcntl(requested_fd, F_GETFD) != -1) {
        // Dup2 will silently close the FD if one is already at requested_fd.
        // If someone is using the deferred sandbox entry, ie. SandboxMeHere,
        // the application might have something actually using that fd.
        // Therefore let's log a big warning if that FD is already in use.
        // Note: this check doesn't happen for STDIN,STDOUT,STDERR.
        SAPI_RAW_LOG(
            WARNING,
            "Cloning received fd %d over %d which is already open and will "
            "be silently closed. This may lead to unexpected behavior!",
            fd, requested_fd);
      }

      SAPI_RAW_VLOG(1, "Cloning received fd=%d onto fd=%d", fd, requested_fd);
      SAPI_RAW_PCHECK(dup2(fd, requested_fd) != -1, "");

      // Close the newly received FD if it differs from the new one.
      close(fd);
      fd = requested_fd;
    }

    // Only named FDs are retrievable later through the FD map.
    if (!name.empty()) {
      SAPI_RAW_CHECK(fd_map_.emplace(name, fd).second, "duplicate fd mapping");
    }
  }
}

// Receives the policy as a raw byte buffer over comms_ and stores it in
// policy_. A receive failure is fatal. The bytes are not validated here.
void Client::ReceivePolicy() {
  std::vector<uint8_t> bytes;
  SAPI_RAW_CHECK(comms_->RecvBytes(&bytes), "receive bytes");
  // Move rather than copy; `bytes` is not used afterwards.
  policy_ = std::move(bytes);
}

// Applies the policy stored in policy_ as a seccomp filter, after performing
// the final handshake with the executor.
//
// Steps, in order:
//   1. Wait for sanitizer background threads, if any.
//   2. Receive the peer credentials over comms_ (only the PID is used).
//   3. Set the process flags: dumpable, allowed ptracer (best effort),
//      no-new-privs, and keep-caps cleared.
//   4. Describe policy_ as a sock_fprog.
//   5. Send kClient2SandboxReady and wait for the executor's answer.
//   6. Install the filter: with a user-notification listener if the answer is
//      kSandbox2ClientUnotify, otherwise (answer must be kSandbox2ClientDone)
//      as a regular filter.
//
// Every failure except the PR_SET_PTRACER call is fatal.
void Client::ApplyPolicyAndBecomeTracee() {
  // When running under *SAN, we need to notify *SANs background thread that we
  // want it to exit and wait for it to be done. When not running under *SAN,
  // this function does nothing.
  sanitizer::WaitForSanitizer();

  // Creds can be received w/o synchronization, once the connection is
  // established.
  pid_t cred_pid;
  uid_t cred_uid ABSL_ATTRIBUTE_UNUSED;
  gid_t cred_gid ABSL_ATTRIBUTE_UNUSED;
  SAPI_RAW_CHECK(comms_->RecvCreds(&cred_pid, &cred_uid, &cred_gid),
                 "receiving credentials");

  SAPI_RAW_CHECK(prctl(PR_SET_DUMPABLE, 1) == 0,
                 "setting PR_SET_DUMPABLE flag");
  // Best effort: failure here is only logged, not treated as an error.
  if (prctl(PR_SET_PTRACER, cred_pid) == -1) {
    SAPI_RAW_VLOG(1, "No YAMA on this system. Continuing");
  }

  SAPI_RAW_CHECK(prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) == 0,
                 "setting PR_SET_NO_NEW_PRIVS flag");
  SAPI_RAW_CHECK(prctl(PR_SET_KEEPCAPS, 0) == 0,
                 "setting PR_SET_KEEPCAPS flag");

  // sock_fprog::len is 16 bits wide, so the instruction count must fit.
  sock_fprog prog;
  SAPI_RAW_CHECK(policy_.size() / sizeof(sock_filter) <=
                     std::numeric_limits<uint16_t>::max(),
                 "seccomp policy too long");
  prog.len = static_cast<uint16_t>(policy_.size() / sizeof(sock_filter));
  // Use data() rather than &front(): front() on an empty vector is undefined
  // behavior, whereas data() is always valid to call. An empty policy is then
  // left for the seccomp call to reject.
  prog.filter = reinterpret_cast<sock_filter*>(policy_.data());

  SAPI_RAW_VLOG(1,
                "Applying policy in PID %zd, sock_fprog.len: %" PRId16
                " entries (%" PRIuPTR " bytes)",
                syscall(__NR_gettid), prog.len, policy_.size());

  // Signal executor we are ready to have limits applied on us and be ptraced.
  // We want limits at the last moment to avoid triggering them too early and we
  // want ptrace at the last moment to avoid synchronization deadlocks.
  SAPI_RAW_CHECK(comms_->SendUint32(kClient2SandboxReady),
                 "sending ready signal to executor");
  uint32_t ret;  // wait for confirmation
  SAPI_RAW_CHECK(comms_->RecvUint32(&ret),
                 "receiving confirmation from executor");
  if (ret == kSandbox2ClientUnotify) {
    InitSeccompUnotify(prog, comms_);
  } else {
    SAPI_RAW_CHECK(ret == kSandbox2ClientDone,
                   "invalid confirmation from executor");
    InitSeccompRegular(prog);
  }
}

// Returns the FD registered under `name` and removes the entry from fd_map_,
// so each name can be retrieved only once. A missing name is fatal.
int Client::GetMappedFD(const std::string& name) {
  const auto it = fd_map_.find(name);
  SAPI_RAW_CHECK(it != fd_map_.end(),
                 "mapped fd not found (function called twice?)");

  // Read the value before erasing; the iterator is invalid afterwards.
  const int fd = it->second;
  fd_map_.erase(it);
  return fd;
}

// Reports whether fd_map_ currently holds an entry for `name`. Does not
// modify the map.
bool Client::HasMappedFD(const std::string& name) {
  return fd_map_.count(name) > 0;
}

void Client::SendLogsToSupervisor() {
  // This LogSink will register itself and send all logs to the executor until
  // the object is destroyed.
  logsink_ = std::make_unique<LogSink>(GetMappedFD(LogSink::kLogFDName));
}

// Returns the NetworkProxyClient, creating it on first use from the FD
// registered under NetworkProxyClient::kFDName. The returned pointer is owned
// by this Client. On first use a missing FD mapping is fatal (see
// GetMappedFD).
NetworkProxyClient* Client::GetNetworkProxyClient() {
  if (proxy_client_ != nullptr) {
    return proxy_client_.get();
  }

  // First call: consume the mapped FD and build the client around it.
  const int proxy_fd = GetMappedFD(NetworkProxyClient::kFDName);
  proxy_client_ = std::make_unique<NetworkProxyClient>(proxy_fd);
  return proxy_client_.get();
}

// Installs the network proxy handler on top of the NetworkProxyClient.
//
// Returns FailedPreconditionError when no FD is registered under
// NetworkProxyClient::kFDName (for example because it was already consumed);
// otherwise returns whatever NetworkProxyHandler::InstallNetworkProxyHandler()
// returns.
absl::Status Client::InstallNetworkProxyHandler() {
  const bool proxy_fd_available =
      fd_map_.find(NetworkProxyClient::kFDName) != fd_map_.end();
  if (proxy_fd_available) {
    return NetworkProxyHandler::InstallNetworkProxyHandler(
        GetNetworkProxyClient());
  }

  return absl::FailedPreconditionError(
      "InstallNetworkProxyHandler() must be called at most once after the "
      "sandbox is installed. Also, the NetworkProxyServer needs to be "
      "enabled.");
}

}  // namespace sandbox2
