// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Implementation of the sandbox2::ForkServer class.

#include "sandboxed_api/sandbox2/forkserver.h"

#include <fcntl.h>
#include <linux/filter.h>
#include <linux/seccomp.h>
#include <sched.h>
#include <sys/eventfd.h>
#include <sys/prctl.h>
#include <sys/resource.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <sys/wait.h>
#include <syscall.h>
#include <unistd.h>

#include <cerrno>
#include <csignal>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <initializer_list>
#include <string>
#include <utility>
#include <vector>

#include "absl/base/attributes.h"
#include "absl/container/flat_hash_map.h"
#include "absl/container/flat_hash_set.h"
#include "absl/status/status.h"
#include "absl/status/statusor.h"
#include "absl/strings/match.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_join.h"
#include "absl/strings/str_split.h"
#include "absl/strings/string_view.h"
#include "sys/capability.h" // AOSP: match libcap exported includes
#include "sandboxed_api/sandbox2/client.h"
#include "sandboxed_api/sandbox2/comms.h"
#include "sandboxed_api/sandbox2/fork_client.h"
#include "sandboxed_api/sandbox2/forkserver.pb.h"
#include "sandboxed_api/sandbox2/namespace.h"
#include "sandboxed_api/sandbox2/policy.h"
#include "sandboxed_api/sandbox2/sanitizer.h"
#include "sandboxed_api/sandbox2/syscall.h"
#include "sandboxed_api/sandbox2/util.h"
#include "sandboxed_api/sandbox2/util/bpf_helper.h"
#include "sandboxed_api/util/fileops.h"
#include "sandboxed_api/util/raw_logging.h"
#include "sandboxed_api/util/strerror.h"

namespace sandbox2 {
namespace {

using ::sapi::StrError;
using ::sapi::file_util::fileops::FDCloser;

// "Moves" FDs in move_fds from current to target FD number while keeping FDs
// in keep_fds open - potentially moving them to another FD number as well in
// case of collisions.
// Ignores invalid (-1) fds.
void MoveFDs(std::initializer_list<std::pair<int*, int>> move_fds,
             std::initializer_list<int*> keep_fds) {
  // Maps every live FD number to the variable holding it, so that the holder
  // can be patched if the FD has to be relocated out of the way of a target.
  absl::flat_hash_map<int, int*> fd_map;
  for (int* fd : keep_fds) {
    if (*fd != -1) {
      fd_map.emplace(*fd, fd);
    }
  }

  for (auto [old_fd, new_fd] : move_fds) {
    if (*old_fd != -1) {
      fd_map.emplace(*old_fd, old_fd);
    }
  }

  for (auto [old_fd, new_fd] : move_fds) {
    // Nothing to do for invalid FDs or FDs that already sit at their target.
    if (*old_fd == -1 || *old_fd == new_fd) {
      continue;
    }

    // Make sure we won't override another fd
    if (auto it = fd_map.find(new_fd); it != fd_map.end()) {
      int fd = dup(new_fd);
      SAPI_RAW_CHECK(fd != -1, "Duplicating an FD failed.");
      // Copy the holder pointer and drop the stale entry *before* inserting
      // the new one: an insertion may rehash the table, which would
      // invalidate both `it` and any reference into the element it points to.
      int* holder = it->second;
      fd_map.erase(it);
      *holder = fd;
      fd_map.emplace(fd, holder);
    }

    if (dup2(*old_fd, new_fd) == -1) {
      SAPI_RAW_PLOG(FATAL, "Moving temporary to proper FD failed.");
    }

    // The FD now lives at new_fd; release the old number and stop tracking it.
    close(*old_fd);
    fd_map.erase(*old_fd);
    *old_fd = new_fd;
  }
}

// Body of the custom init process: restricts itself with a minimal seccomp
// filter and then reaps children until `main_pid` exits. Never returns.
//
//   main_pid: PID whose termination ends this process (with exit code 0).
//   pipe_fd:  if it holds a valid FD, the termination info of `main_pid`
//             (si_code, si_status) followed by the children's rusage is
//             written to it before exiting.
//
// Exits with code 1 if waitid() fails.
ABSL_ATTRIBUTE_NORETURN void RunInitProcess(pid_t main_pid, FDCloser pipe_fd) {
  // Renaming the process is best-effort; a failure is only logged.
  if (prctl(PR_SET_NAME, "S2-INIT-PROC", 0, 0, 0) != 0) {
    SAPI_RAW_PLOG(WARNING, "prctl(PR_SET_NAME, 'S2-INIT-PROC')");
  }

  // Clear SA_NOCLDWAIT.
  struct sigaction sa;
  sa.sa_handler = SIG_DFL;
  sa.sa_flags = 0;
  sigemptyset(&sa.sa_mask);
  SAPI_RAW_CHECK(sigaction(SIGCHLD, &sa, nullptr) == 0,
                 "clearing SA_NOCLDWAIT");

  // Apply seccomp.
  // The filter rejects foreign architectures and allows only waitid and exit,
  // plus getrusage and write when a status pipe is present. Everything else
  // falls through to the trailing DENY rule.
  std::vector<sock_filter> code = {
      LOAD_ARCH,
      JNE32(sandbox2::Syscall::GetHostAuditArch(), DENY),

      LOAD_SYSCALL_NR,
      SYSCALL(__NR_waitid, ALLOW),
      SYSCALL(__NR_exit, ALLOW),
  };
  if (pipe_fd.get() >= 0) {
    code.insert(code.end(),
                {SYSCALL(__NR_getrusage, ALLOW), SYSCALL(__NR_write, ALLOW)});
  }
  code.push_back(DENY);

  struct sock_fprog prog {
    .len = static_cast<uint16_t>(code.size()), .filter = code.data(),
  };

  SAPI_RAW_CHECK(prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) == 0,
                 "Denying new privs");
  SAPI_RAW_CHECK(prctl(PR_SET_KEEPCAPS, 0) == 0, "Dropping caps");
  SAPI_RAW_CHECK(
      syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
              reinterpret_cast<uintptr_t>(&prog)) == 0,
      "Enabling seccomp filter");

  // From here on only the syscalls allowed by the filter above may be issued.
  siginfo_t info;
  // Reap children.
  for (;;) {
    int rv = TEMP_FAILURE_RETRY(waitid(P_ALL, -1, &info, WEXITED | __WALL));
    if (rv != 0) {
      _exit(1);
    }

    // Any other child is simply reaped; only the main process ends the loop.
    if (info.si_pid == main_pid) {
      if (pipe_fd.get() >= 0) {
        // Results of these writes are not checked (best effort).
        write(pipe_fd.get(), &info.si_code, sizeof(info.si_code));
        write(pipe_fd.get(), &info.si_status, sizeof(info.si_status));

        rusage usage{};
        getrusage(RUSAGE_CHILDREN, &usage);
        write(pipe_fd.get(), &usage, sizeof(usage));
      }
      _exit(0);
    }
  }
}

absl::Status SendPid(int signaling_fd) {
  // The payload byte is irrelevant: the sandboxee's PID travels in the
  // SCM_CREDENTIALS ancillary message, which gets attached to the message
  // because SO_PASSCRED is set on the socket.
  const char payload = ' ';
  const ssize_t sent = TEMP_FAILURE_RETRY(send(signaling_fd, &payload, 1, 0));
  if (sent == 1) {
    return absl::OkStatus();
  }
  return absl::ErrnoToStatus(errno, "Sending PID: send()");
}

// Receives the one-byte message sent by SendPid() on `signaling_fd` and
// returns the sender's PID taken from the attached SCM_CREDENTIALS ancillary
// message.
//
// Returns an error status if recvmsg() fails, if the peer closed the
// connection without sending anything, or if the message does not carry
// exactly one well-formed SCM_CREDENTIALS control message.
absl::StatusOr<pid_t> ReceivePid(int signaling_fd) {
  // The union ensures the control buffer is suitably aligned for a cmsghdr.
  union {
    struct cmsghdr cmh;
    char ctrl[CMSG_SPACE(sizeof(struct ucred))];
  } ucred_msg{};

  struct msghdr msgh {};
  struct iovec iov {};

  msgh.msg_iov = &iov;
  msgh.msg_iovlen = 1;
  msgh.msg_control = ucred_msg.ctrl;
  msgh.msg_controllen = sizeof(ucred_msg);

  char dummy = 0;
  iov.iov_base = &dummy;
  iov.iov_len = sizeof(char);

  const ssize_t received =
      TEMP_FAILURE_RETRY(recvmsg(signaling_fd, &msgh, MSG_WAITALL));
  if (received < 0) {
    return absl::ErrnoToStatus(errno, "Receiving pid failed: recvmsg");
  }
  if (received != 1) {
    // A return of 0 means the peer closed the socket. errno is not set in
    // that case, so it must not be converted into a status here.
    return absl::InternalError(
        "Receiving pid failed: connection closed before a PID was sent");
  }
  // CMSG_FIRSTHDR yields a null pointer when no control message was received.
  struct cmsghdr* cmsgp = CMSG_FIRSTHDR(&msgh);
  if (cmsgp == nullptr ||
      cmsgp->cmsg_len != CMSG_LEN(sizeof(struct ucred)) ||
      cmsgp->cmsg_level != SOL_SOCKET || cmsgp->cmsg_type != SCM_CREDENTIALS) {
    return absl::InternalError("Receiving pid failed");
  }
  auto* ucredp = reinterpret_cast<struct ucred*>(CMSG_DATA(cmsgp));
  return ucredp->pid;
}

// Scans /proc/<proc_id>/mountinfo and returns the first space-separated field
// of the first line whose fourth field is "/".
//
// Returns an internal error if the file cannot be opened and a not-found
// error if no line matches.
absl::StatusOr<std::string> GetRootMountId(const std::string& proc_id) {
  const std::string mountinfo_path =
      absl::StrCat("/proc/", proc_id, "/mountinfo");
  std::ifstream mountinfo(mountinfo_path);
  if (!mountinfo.good()) {
    return absl::InternalError("Failed to open mountinfo");
  }

  for (std::string entry; std::getline(mountinfo, entry);) {
    // Only the leading fields matter; the remainder stays in the last part.
    const std::vector<absl::string_view> fields =
        absl::StrSplit(entry, absl::MaxSplits(' ', 4));
    if (fields.size() < 4 || fields[3] != "/") {
      continue;
    }
    return std::string(fields[0]);
  }
  return absl::NotFoundError("Root entry not found in mountinfo");
}

bool IsLikelyChrooted() {
  absl::StatusOr<std::string> self_root_id = GetRootMountId("self");
  if (!self_root_id.ok()) {
    return absl::IsNotFound(self_root_id.status());
  }
  absl::StatusOr<std::string> init_root_id = GetRootMountId("1");
  if (!init_root_id.ok()) {
    return false;
  }
  return *self_root_id != *init_root_id;
}

}  // namespace

// Appends the execve() arguments and environment of `request` to `args` and
// `envp`. The environment additionally gets the variable that disables
// fork-servers in the child and, when it is set to a non-empty value in the
// current process, SAPI_VLOG_LEVEL.
void ForkServer::PrepareExecveArgs(const ForkRequest& request,
                                   std::vector<std::string>* args,
                                   std::vector<std::string>* envp) {
  // Copy arguments and environment straight from the request.
  args->insert(args->end(), request.args().begin(), request.args().end());
  envp->insert(envp->end(), request.envs().begin(), request.envs().end());

  // The child process should not start any fork-servers.
  envp->push_back(absl::StrCat(kForkServerDisableEnv, "=1"));

  // Forward the verbose-logging level if one is configured.
  constexpr char kSapiVlogLevel[] = "SAPI_VLOG_LEVEL";
  if (const char* vlog_level = getenv(kSapiVlogLevel);
      vlog_level != nullptr && vlog_level[0] != '\0') {
    envp->push_back(absl::StrCat(kSapiVlogLevel, "=", vlog_level));
  }

  SAPI_RAW_VLOG(1, "Will execute args:['%s'], environment:['%s']",
                absl::StrJoin(*args, "', '").c_str(),
                absl::StrJoin(*envp, "', '").c_str());
}

// Runs in the freshly forked child: sets up namespaces, drops capabilities,
// optionally spawns the custom init process, and finally sandboxes and/or
// executes the target as selected by the request mode.
//
//   request:          the fork request being served.
//   execve_fd:        FD of the binary to execute, or -1 for no execve.
//   uid, gid:         IDs forwarded to InitializeNamespaces().
//   signaling_fd:     socket used to report the sandboxee PID when a new PID
//                     namespace is requested.
//   status_fd:        handed to the init process, if one is spawned.
//   avoid_pivot_root: forwarded to InitializeNamespaces().
//
// Returns only when no execve is requested.
void ForkServer::LaunchChild(const ForkRequest& request, int execve_fd,
                             uid_t uid, gid_t gid, FDCloser signaling_fd,
                             FDCloser status_fd, bool avoid_pivot_root) const {
  SAPI_RAW_CHECK(request.mode() != FORKSERVER_FORK_UNSPECIFIED,
                 "Forkserver mode is unspecified");

  const bool will_execve = execve_fd != -1;
  const bool should_sandbox = request.mode() == FORKSERVER_FORK_EXECVE_SANDBOX;

  // Snapshot the open FDs now; the list is used further down to close them in
  // the init process. Failing to obtain it is not fatal - an empty set is
  // used instead.
  absl::StatusOr<absl::flat_hash_set<int>> open_fds = sanitizer::GetListOfFDs();
  if (!open_fds.ok()) {
    SAPI_RAW_LOG(WARNING, "Could not get list of current open FDs: %s",
                 std::string(open_fds.status().message()).c_str());
    open_fds = absl::flat_hash_set<int>();
  }
  SanitizeEnvironment();

  InitializeNamespaces(request, uid, gid, avoid_pivot_root);

  // cap_init() yields an empty capability set; applying it drops everything.
  auto caps = cap_init();
  SAPI_RAW_CHECK(cap_set_proc(caps) == 0, "while dropping capabilities");
  cap_free(caps);

  // A custom init process is only needed if a new PID NS is created.
  if (request.clone_flags() & CLONE_NEWPID) {
    // Spawn a child process
    pid_t child = util::ForkWithFlags(SIGCHLD);
    if (child < 0) {
      SAPI_RAW_PLOG(FATAL, "Could not spawn init process");
    }
    if (child != 0) {
      // This branch is the parent of the fork above: it becomes the init
      // process. The status FD must survive the mass close below.
      if (status_fd.get() >= 0) {
        open_fds->erase(status_fd.get());
      }
      // Close all open fds (equals to CloseAllFDsExcept but does not require
      // /proc to be available).
      for (const auto& fd : *open_fds) {
        close(fd);
      }
      RunInitProcess(child, std::move(status_fd));
    }
    // Send sandboxee pid
    auto status = SendPid(signaling_fd.get());
    SAPI_RAW_CHECK(status.ok(),
                   absl::StrCat("sending pid: ", status.message()).c_str());
  }
  // Neither FD is needed by the sandboxee itself.
  signaling_fd.Close();
  status_fd.Close();

  Client c(comms_);

  // Prepare the arguments before sandboxing (if needed), as doing it after
  // sandboxing can cause syscall violations (e.g. related to memory
  // management).
  std::vector<std::string> args;
  std::vector<std::string> envs;
  if (will_execve) {
    PrepareExecveArgs(request, &args, &envs);
  }

  // Sandboxing can be enabled either here - just before execve, or somewhere
  // inside the executed binary (e.g. after basic structures have been
  // initialized, and resources acquired). In the latter case, it's up to the
  // sandboxed binary to establish proper Comms channel (using
  // Comms::kSandbox2ClientCommsFD) and call sandbox2::Client::SandboxMeHere()
  if (should_sandbox) {
    // The following client calls are basically SandboxMeHere. We split it so
    // that we can set up the envp after we received the file descriptors but
    // before we enable the syscall filter.
    c.PrepareEnvironment(&execve_fd);
    // Only advertise the comms FD through the environment when it is not at
    // its default number.
    if (comms_->GetConnectionFD() != Comms::kSandbox2ClientCommsFD) {
      envs.push_back(absl::StrCat(Comms::kSandbox2CommsFDEnvVar, "=",
                                  comms_->GetConnectionFD()));
    }
    envs.push_back(c.GetFdMapEnvVar());
  }

  // Convert args and envs before enabling sandbox (it'll allocate which might
  // be blocked).
  util::CharPtrArray argv = util::CharPtrArray::FromStringVector(args);
  util::CharPtrArray envp = util::CharPtrArray::FromStringVector(envs);

  if (should_sandbox) {
    c.EnableSandbox();
  }

  if (will_execve) {
    ExecuteProcess(execve_fd, argv.data(), envp.data());
  }
}

// Receives one fork request over the comms channel, forks the sandboxee
// (through an intermediate process joined to the initial namespaces when user
// or mount namespaces are requested) and reports the resulting PIDs back to
// the requester.
//
// In the fork-server process: returns the sandboxee PID as seen by this
// process (-1 if it could not be determined), or -1 if the comms channel was
// terminated before a request arrived.
// In the child: hands over to LaunchChild() and returns 0 if that returns.
pid_t ForkServer::ServeRequest() {
  ForkRequest fork_request;
  if (!comms_->RecvProtoBuf(&fork_request)) {
    // A terminated channel is a regular shutdown; anything else is fatal.
    if (comms_->IsTerminated()) {
      return -1;
    }
    SAPI_RAW_LOG(FATAL, "Failed to receive ForkServer request");
  }
  int comms_fd;
  SAPI_RAW_CHECK(comms_->RecvFD(&comms_fd), "Failed to receive Comms FD");

  SAPI_RAW_CHECK(fork_request.mode() != FORKSERVER_FORK_UNSPECIFIED,
                 "Forkserver mode is unspecified");

  // The binary FD is only sent along for the two execve modes.
  int exec_fd = -1;
  if (fork_request.mode() == FORKSERVER_FORK_EXECVE ||
      fork_request.mode() == FORKSERVER_FORK_EXECVE_SANDBOX) {
    SAPI_RAW_CHECK(comms_->RecvFD(&exec_fd), "Failed to receive Exec FD");
  }

  // Make the kernel notify us with SIGCHLD when the process terminates.
  // We use sigaction(SIGCHLD, flags=SA_NOCLDWAIT) in combination with
  // this to make sure the zombie process is reaped immediately.
  int clone_flags = fork_request.clone_flags() | SIGCHLD;

  // Store uid and gid since they will change if CLONE_NEWUSER is set.
  uid_t uid = getuid();
  // NOTE(review): declared as uid_t rather than gid_t; it is later passed to
  // a gid_t parameter of LaunchChild().
  uid_t gid = getgid();

  // Status pipe, created only for the unotify monitor; both ends stay
  // invalid (-1) otherwise.
  FDCloser pipe_fds[2];
  {
    int pfds[2] = {-1, -1};
    if (fork_request.monitor_type() == FORKSERVER_MONITOR_UNOTIFY) {
      SAPI_RAW_PCHECK(pipe(pfds) == 0, "creating status pipe");
    }
    pipe_fds[0] = FDCloser(pfds[0]);
    pipe_fds[1] = FDCloser(pfds[1]);
  }

  // Socket pair over which the child side reports PIDs via SCM_CREDENTIALS;
  // SO_PASSCRED is enabled on both ends.
  int socketpair_fds[2];
  SAPI_RAW_PCHECK(
      socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, socketpair_fds) == 0,
      "creating signaling socketpair");
  for (int i = 0; i < 2; i++) {
    int val = 1;
    SAPI_RAW_PCHECK(setsockopt(socketpair_fds[i], SOL_SOCKET, SO_PASSCRED, &val,
                               sizeof(val)) == 0,
                    "setsockopt failed");
  }

  FDCloser signaling_fds[] = {FDCloser(socketpair_fds[0]),
                              FDCloser(socketpair_fds[1])};

  // Note: init_pid will be overwritten with the actual init pid if the init
  //       process was started or stays at 0 if that is not needed - no pidns.
  pid_t init_pid = 0;
  pid_t sandboxee_pid = -1;
  bool avoid_pivot_root = clone_flags & (CLONE_NEWUSER | CLONE_NEWNS);
  if (avoid_pivot_root) {
    // Create initial namespaces only when they're first needed.
    // This allows sandbox2 to be still used without any namespaces support
    if (initial_mntns_fd_ == -1) {
      CreateInitialNamespaces();
    }
    // We first just fork a child, which will join the initial namespaces
    // Note: Not a regular fork() as one really needs to be single-threaded to
    //       setns and this is not the case with TSAN.
    pid_t pid = util::ForkWithFlags(SIGCHLD);
    SAPI_RAW_PCHECK(pid != -1, "fork failed");
    if (pid == 0) {
      SAPI_RAW_PCHECK(setns(initial_userns_fd_, CLONE_NEWUSER) != -1,
                      "joining initial user namespace");
      SAPI_RAW_PCHECK(setns(initial_mntns_fd_, CLONE_NEWNS) != -1,
                      "joining initial mnt namespace");
      close(initial_userns_fd_);
      close(initial_mntns_fd_);
      // Do not create new userns it will be unshared later
      sandboxee_pid =
          util::ForkWithFlags((clone_flags & ~CLONE_NEWUSER) | CLONE_PARENT);
      if (sandboxee_pid == -1) {
        SAPI_RAW_LOG(ERROR, "util::ForkWithFlags(%x)", clone_flags);
      }
      // The intermediate process has done its job (or failed) and exits; only
      // the newly forked process continues below.
      if (sandboxee_pid != 0) {
        _exit(0);
      }
      // Send sandboxee pid
      absl::Status status = SendPid(signaling_fds[1].get());
      SAPI_RAW_CHECK(status.ok(),
                     absl::StrCat("sending pid: ", status.message()).c_str());
    }
  } else {
    sandboxee_pid = util::ForkWithFlags(clone_flags);
    if (sandboxee_pid == -1) {
      SAPI_RAW_LOG(ERROR, "util::ForkWithFlags(%x)", clone_flags);
    }
    if (sandboxee_pid == 0) {
      close(initial_userns_fd_);
      close(initial_mntns_fd_);
    }
  }

  // Child.
  if (sandboxee_pid == 0) {
    // Drop the ends of the socket pair and status pipe that belong to the
    // fork-server side.
    signaling_fds[0].Close();
    pipe_fds[0].Close();
    // Make sure we override the forkserver's comms fd
    comms_->Terminate();
    if (exec_fd != -1) {
      // Move the exec and comms FDs to their well-known numbers. The
      // signaling and pipe FDs are temporarily released from their closers
      // so that MoveFDs() can renumber them if they are in the way.
      int signaling_fd = signaling_fds[1].Release();
      int pipe_fd = pipe_fds[1].Release();
      MoveFDs({{&exec_fd, Comms::kSandbox2TargetExecFD},
               {&comms_fd, Comms::kSandbox2ClientCommsFD}},
              {&signaling_fd, &pipe_fd});
      signaling_fds[1] = FDCloser(signaling_fd);
      pipe_fds[1] = FDCloser(pipe_fd);
    }
    *comms_ = Comms(comms_fd);
    LaunchChild(fork_request, exec_fd, uid, gid, std::move(signaling_fds[1]),
                std::move(pipe_fds[1]), avoid_pivot_root);
    return sandboxee_pid;
  }

  // From here on only the fork-server process is running; it keeps just the
  // receiving end of the signaling socket.
  signaling_fds[1].Close();

  if (avoid_pivot_root) {
    // The process was forked by the intermediate process, so its PID is
    // learned over the signaling socket.
    if (auto pid = ReceivePid(signaling_fds[0].get()); !pid.ok()) {
      SAPI_RAW_LOG(ERROR, "%s", std::string(pid.status().message()).c_str());
    } else {
      sandboxee_pid = pid.value();
    }
  }

  if (fork_request.clone_flags() & CLONE_NEWPID) {
    // The pid of the init process is equal to the child process that we've
    // previously forked.
    init_pid = sandboxee_pid;
    sandboxee_pid = -1;
    // And the actual sandboxee is forked from the init process, so we need to
    // receive the actual PID.
    if (auto pid_or = ReceivePid(signaling_fds[0].get()); !pid_or.ok()) {
      SAPI_RAW_LOG(ERROR, "%s", std::string(pid_or.status().message()).c_str());
      // Without a sandboxee PID the init process is of no use: kill it and
      // report failure for both PIDs.
      if (init_pid != -1) {
        kill(init_pid, SIGKILL);
      }
      init_pid = -1;
    } else {
      sandboxee_pid = pid_or.value();
    }
  }

  // Parent.
  pipe_fds[1].Close();
  close(comms_fd);
  if (exec_fd >= 0) {
    close(exec_fd);
  }
  // Reply to the requester: init PID, sandboxee PID and, if one was created,
  // the read end of the status pipe.
  SAPI_RAW_CHECK(comms_->SendInt32(init_pid),
                 absl::StrCat("Failed to send init PID: ", init_pid).c_str());
  SAPI_RAW_CHECK(
      comms_->SendInt32(sandboxee_pid),
      absl::StrCat("Failed to send sandboxee PID: ", sandboxee_pid).c_str());

  if (pipe_fds[0].get() >= 0) {
    SAPI_RAW_CHECK(comms_->SendFD(pipe_fds[0].get()),
                   "Failed to send status pipe");
  }
  return sandboxee_pid;
}

// Reports whether the underlying comms channel has been terminated.
bool ForkServer::IsTerminated() const {
  const bool comms_terminated = comms_->IsTerminated();
  return comms_terminated;
}

bool ForkServer::Initialize() {
  // For safety drop as many capabilities as possible.
  // Note that cap_t is actually a pointer.
  cap_t have_caps = cap_get_proc();  // caps we currently have
  SAPI_RAW_CHECK(have_caps, "failed to cap_get_proc()");
  cap_t wanted_caps = cap_init();  // starts as empty set, ie. no caps
  SAPI_RAW_CHECK(wanted_caps, "failed to cap_init()");

  // CAP_SYS_PTRACE appears to be needed for apparmor (or possibly yama)
  // CAP_SETFCAP is needed on newer kernels (5.10 needs it, 4.15 does not)
  for (cap_value_t cap : {CAP_SYS_PTRACE, CAP_SETFCAP}) {
    for (cap_flag_t flag : {CAP_EFFECTIVE, CAP_PERMITTED}) {
      cap_flag_value_t value;
      int rc = cap_get_flag(have_caps, cap, flag, &value);
      SAPI_RAW_CHECK(!rc, "cap_get_flag");
      if (value == CAP_SET) {
        cap_value_t caps_to_set[1] = {
            cap,
        };
        rc = cap_set_flag(wanted_caps, flag, 1, caps_to_set, CAP_SET);
        SAPI_RAW_CHECK(!rc, "cap_set_flag");
      }
    }
  }

  SAPI_RAW_CHECK(!cap_set_proc(wanted_caps), "while dropping capabilities");
  SAPI_RAW_CHECK(!cap_free(wanted_caps), "while freeing wanted_caps");
  SAPI_RAW_CHECK(!cap_free(have_caps), "while freeing have_caps");

  // All processes spawned by the fork'd/execute'd process will see this process
  // as /sbin/init. Therefore it will receive (and ignore) their final status
  // (see the next comment as well). PR_SET_CHILD_SUBREAPER is available since
  // kernel version 3.4, so don't panic if it fails.
  if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) == -1) {
    SAPI_RAW_VLOG(3, "prctl(PR_SET_CHILD_SUBREAPER, 1): %s [%d]",
                  StrError(errno).c_str(), errno);
  }

  // Don't convert terminated child processes into zombies. It's up to the
  // sandbox (Monitor) to track them and receive/report their final status.
  struct sigaction sa;
  sa.sa_handler = SIG_DFL;
  sa.sa_flags = SA_NOCLDWAIT;
  sigemptyset(&sa.sa_mask);
  if (sigaction(SIGCHLD, &sa, nullptr) == -1) {
    SAPI_RAW_PLOG(ERROR, "sigaction(SIGCHLD, flags=SA_NOCLDWAIT)");
    return false;
  }
  return true;
}

// Forks a helper process into new user and mount namespaces and stores FDs
// referring to those namespaces in initial_userns_fd_ / initial_mntns_fd_.
//
// Parent and helper synchronize through two eventfds: the helper signals
// once the namespaces are set up, and the parent signals back once it has
// opened the namespace FDs, after which the helper exits. All failures are
// fatal.
void ForkServer::CreateInitialNamespaces() {
  // Spawn a new process to create initial user and mount namespaces to be used
  // as a base for each namespaced sandboxee.

  // Store uid and gid to create mappings after CLONE_NEWUSER
  uid_t uid = getuid();
  gid_t gid = getgid();

  // Eventfds used for synchronization, so that the namespace FDs are opened
  // before the helper process dies:
  //   create_efd: helper -> parent, "namespaces are set up".
  //   open_efd:   parent -> helper, "namespace FDs are open".
  FDCloser create_efd(eventfd(0, EFD_CLOEXEC));
  SAPI_RAW_PCHECK(create_efd.get() != -1, "creating eventfd");
  FDCloser open_efd(eventfd(0, EFD_CLOEXEC));
  SAPI_RAW_PCHECK(open_efd.get() != -1, "creating eventfd");
  pid_t pid = util::ForkWithFlags(CLONE_NEWUSER | CLONE_NEWNS | SIGCHLD);
  // Give a more helpful message for the EPERM case that looks like a chroot.
  if (pid == -1 && errno == EPERM && IsLikelyChrooted()) {
    SAPI_RAW_LOG(FATAL,
                 "failed to fork initial namespaces process: parent process is "
                 "likely chrooted");
  }
  SAPI_RAW_PCHECK(pid != -1, "failed to fork initial namespaces process");
  uint64_t value = 1;
  if (pid == 0) {
    // Helper process: set up the namespaces, announce them, then wait until
    // the parent holds FDs to them before exiting.
    Namespace::InitializeInitialNamespaces(uid, gid);
    SAPI_RAW_PCHECK(TEMP_FAILURE_RETRY(write(create_efd.get(), &value,
                                             sizeof(value))) == sizeof(value),
                    "synchronizing initial namespaces creation");
    SAPI_RAW_PCHECK(TEMP_FAILURE_RETRY(read(open_efd.get(), &value,
                                            sizeof(value))) == sizeof(value),
                    "synchronizing initial namespaces creation");
    SAPI_RAW_PCHECK(chroot("/realroot") == 0,
                    "chrooting prior to dumping coverage");
    util::DumpCoverageData();
    _exit(0);
  }
  // Parent: wait for the helper to finish setting up its namespaces.
  SAPI_RAW_PCHECK(TEMP_FAILURE_RETRY(read(create_efd.get(), &value,
                                          sizeof(value))) == sizeof(value),
                  "synchronizing initial namespaces creation");
  initial_userns_fd_ = open(absl::StrCat("/proc/", pid, "/ns/user").c_str(),
                            O_RDONLY | O_CLOEXEC);
  SAPI_RAW_PCHECK(initial_userns_fd_ != -1, "getting initial userns fd");
  initial_mntns_fd_ = open(absl::StrCat("/proc/", pid, "/ns/mnt").c_str(),
                           O_RDONLY | O_CLOEXEC);
  SAPI_RAW_PCHECK(initial_mntns_fd_ != -1, "getting initial mntns fd");
  // Both namespace FDs are open now; let the helper exit.
  SAPI_RAW_PCHECK(TEMP_FAILURE_RETRY(write(open_efd.get(), &value,
                                           sizeof(value))) == sizeof(value),
                  "synchronizing initial namespaces creation");
}

void ForkServer::SanitizeEnvironment() const {
  // Mark all file descriptors, except the standard ones (needed
  // for proper sandboxed process operations), as close-on-exec.
  absl::Status status = sanitizer::SanitizeCurrentProcess(
      {STDIN_FILENO, STDOUT_FILENO, STDERR_FILENO, comms_->GetConnectionFD()},
      /* close_fds = */ false);
  SAPI_RAW_CHECK(
      status.ok(),
      absl::StrCat("while sanitizing process: ", status.message()).c_str());
}

// Replaces the current process image with the binary behind `execve_fd`,
// using `argv` and `envp`. Everything after the execveat call only runs when
// that call failed: the failure is logged, a hint is added for the known
// causes, and the process is terminated.
void ForkServer::ExecuteProcess(int execve_fd, const char* const* argv,
                                const char* const* envp) {
  // Do not add any code before execve(), as it's subject to seccomp policies.
  // Indicate that it's a special execve(), by setting 4th, 5th and 6th syscall
  // argument to magic values.
  util::Execveat(execve_fd, "", argv, envp, AT_EMPTY_PATH,
                 internal::kExecveMagic);

  // Preserve errno before logging has a chance to overwrite it.
  const int exec_errno = errno;
  SAPI_RAW_PLOG(ERROR, "execveat failed");
  if (argv[0] != nullptr) {
    SAPI_RAW_LOG(ERROR, "argv[0]=%s", argv[0]);
  }

  switch (exec_errno) {
    case ENOSYS:
      SAPI_RAW_LOG(
          ERROR,
          "This is likely caused by running on a kernel that is too old.");
      break;
    case ENOENT:
      // Since we know the file exists, it must be that the file is
      // dynamically linked and the ELF interpreter is what's actually missing.
      if (execve_fd >= 0) {
        SAPI_RAW_LOG(ERROR,
                     "This is likely caused by running dynamically-linked "
                     "sandboxee without "
                     "calling .AddLibrariesForBinary() on the policy builder.");
      }
      break;
    default:
      break;
  }

  util::Syscall(__NR_exit_group, EXIT_FAILURE);
  abort();
}

// Sets up the namespaces described by `request`. Does nothing when the
// request carries no mount tree.
void ForkServer::InitializeNamespaces(const ForkRequest& request, uid_t uid,
                                      gid_t gid, bool avoid_pivot_root) {
  if (request.has_mount_tree()) {
    Namespace::InitializeNamespaces(
        uid, gid, request.clone_flags(), Mounts(request.mount_tree()),
        request.hostname(), avoid_pivot_root,
        request.allow_mount_propagation());
  }
}

}  // namespace sandbox2
