// Copyright 2012 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "net/dns/dns_hosts.h"

#include <string>
#include <utility>

#include "base/check.h"
#include "base/files/file_path.h"
#include "base/files/file_util.h"
#include "base/metrics/histogram_functions.h"
#include "base/strings/string_piece.h"
#include "base/strings/string_util.h"
#include "base/trace_event/memory_usage_estimator.h"
#include "build/build_config.h"
#include "net/base/cronet_buildflags.h"
#include "net/base/url_util.h"
#include "net/dns/dns_util.h"
#include "url/url_canon.h"

using base::StringPiece;

namespace net {

namespace {

// Parses the contents of a hosts file.  Returns one token (IP or hostname) at
// a time.  Doesn't copy anything; accepts the file as a StringPiece and
// returns tokens as StringPieces.
class HostsParser {
 public:
  explicit HostsParser(const StringPiece& text, ParseHostsCommaMode comma_mode)
      : text_(text),
        data_(text.data()),
        end_(text.size()),
        comma_mode_(comma_mode) {}

  HostsParser(const HostsParser&) = delete;
  HostsParser& operator=(const HostsParser&) = delete;

  // Advances to the next token (IP or hostname).  Returns whether another
  // token was available.  |token_is_ip| and |token| can be used to find out
  // the type and text of the token.
  bool Advance() {
    bool next_is_ip = (pos_ == 0);
    while (pos_ < end_ && pos_ != std::string::npos) {
      switch (text_[pos_]) {
        case ' ':
        case '\t':
          SkipWhitespace();
          break;

        case '\r':
        case '\n':
          next_is_ip = true;
          pos_++;
          break;

        case '#':
          SkipRestOfLine();
          break;

        case ',':
          if (comma_mode_ == PARSE_HOSTS_COMMA_IS_WHITESPACE) {
            SkipWhitespace();
            break;
          }

          // If comma_mode_ is COMMA_IS_TOKEN, fall through:
          [[fallthrough]];

        default: {
          size_t token_start = pos_;
          SkipToken();
          size_t token_end = (pos_ == std::string::npos) ? end_ : pos_;

          token_ = StringPiece(data_ + token_start, token_end - token_start);
          token_is_ip_ = next_is_ip;

          return true;
        }
      }
    }

    return false;
  }

  // Fast-forwards the parser to the next line.  Should be called if an IP
  // address doesn't parse, to avoid wasting time tokenizing hostnames that
  // will be ignored.
  void SkipRestOfLine() { pos_ = text_.find("\n", pos_); }

  // Returns whether the last-parsed token is an IP address (true) or a
  // hostname (false).
  bool token_is_ip() { return token_is_ip_; }

  // Returns the text of the last-parsed token as a StringPiece referencing
  // the same underlying memory as the StringPiece passed to the constructor.
  // Returns an empty StringPiece if no token has been parsed or the end of
  // the input string has been reached.
  const StringPiece& token() { return token_; }

 private:
  void SkipToken() {
    switch (comma_mode_) {
      case PARSE_HOSTS_COMMA_IS_TOKEN:
        pos_ = text_.find_first_of(" \t\n\r#", pos_);
        break;
      case PARSE_HOSTS_COMMA_IS_WHITESPACE:
        pos_ = text_.find_first_of(" ,\t\n\r#", pos_);
        break;
    }
  }

  void SkipWhitespace() {
    switch (comma_mode_) {
      case PARSE_HOSTS_COMMA_IS_TOKEN:
        pos_ = text_.find_first_not_of(" \t", pos_);
        break;
      case PARSE_HOSTS_COMMA_IS_WHITESPACE:
        pos_ = text_.find_first_not_of(" ,\t", pos_);
        break;
    }
  }

  const StringPiece text_;
  const char* data_;
  const size_t end_;

  size_t pos_ = 0;
  StringPiece token_;
  bool token_is_ip_ = false;

  const ParseHostsCommaMode comma_mode_;
};

void ParseHostsWithCommaMode(const std::string& contents,
                             DnsHosts* dns_hosts,
                             ParseHostsCommaMode comma_mode) {
  CHECK(dns_hosts);

  StringPiece ip_text;
  IPAddress ip;
  AddressFamily family = ADDRESS_FAMILY_IPV4;
  HostsParser parser(contents, comma_mode);
  while (parser.Advance()) {
    if (parser.token_is_ip()) {
      StringPiece new_ip_text = parser.token();
      // Some ad-blocking hosts files contain thousands of entries pointing to
      // the same IP address (usually 127.0.0.1).  Don't bother parsing the IP
      // again if it's the same as the one above it.
      if (new_ip_text != ip_text) {
        IPAddress new_ip;
        if (new_ip.AssignFromIPLiteral(parser.token())) {
          ip_text = new_ip_text;
          ip = new_ip;
          family = (ip.IsIPv4()) ? ADDRESS_FAMILY_IPV4 : ADDRESS_FAMILY_IPV6;
        } else {
          parser.SkipRestOfLine();
        }
      }
    } else {
      url::CanonHostInfo canonicalization_info;
      std::string canonicalized_host =
          CanonicalizeHost(parser.token(), &canonicalization_info);

      // Skip if token is invalid for host canonicalization, or if it
      // canonicalizes as an IP address.
      if (canonicalization_info.family != url::CanonHostInfo::NEUTRAL)
        continue;

      DnsHostsKey key(std::move(canonicalized_host), family);
      if (!IsCanonicalizedHostCompliant(key.first))
        continue;
      IPAddress* mapped_ip = &(*dns_hosts)[key];
      if (mapped_ip->empty())
        *mapped_ip = ip;
      // else ignore this entry (first hit counts)
    }
  }
}

}  // namespace

void ParseHostsWithCommaModeForTesting(const std::string& contents,
                                       DnsHosts* dns_hosts,
                                       ParseHostsCommaMode comma_mode) {
  ParseHostsWithCommaMode(contents, dns_hosts, comma_mode);
}

void ParseHosts(const std::string& contents, DnsHosts* dns_hosts) {
  ParseHostsCommaMode comma_mode;
#if BUILDFLAG(IS_APPLE)
  // Mac OS X allows commas to separate hostnames.
  comma_mode = PARSE_HOSTS_COMMA_IS_WHITESPACE;
#else
  // Linux allows commas in hostnames.
  comma_mode = PARSE_HOSTS_COMMA_IS_TOKEN;
#endif

  ParseHostsWithCommaMode(contents, dns_hosts, comma_mode);

  // TODO(crbug.com/1377305): Remove this when we have enough data.
  base::UmaHistogramCounts100000("Net.DNS.DnsHosts.Count", dns_hosts->size());

#if !BUILDFLAG(CRONET_BUILD)
  // Cronet disables tracing and doesn't provide an implementation of
  // base::trace_event::EstimateMemoryUsage for DnsHosts. Having this
  // conditional is preferred over a fake implementation to avoid reporting fake
  // metrics.
  base::UmaHistogramMemoryKB(
      "Net.DNS.DnsHosts.EstimateMemoryUsage",
      base::trace_event::EstimateMemoryUsage(*dns_hosts));
#endif  // !BUILDFLAG(CRONET_BUILD)
}

DnsHostsParser::~DnsHostsParser() = default;

DnsHostsFileParser::DnsHostsFileParser(base::FilePath hosts_file_path)
    : hosts_file_path_(std::move(hosts_file_path)) {}

DnsHostsFileParser::~DnsHostsFileParser() = default;

bool DnsHostsFileParser::ParseHosts(DnsHosts* dns_hosts) const {
  dns_hosts->clear();
  // Missing file indicates empty HOSTS.
  if (!base::PathExists(hosts_file_path_))
    return true;

  int64_t size;
  if (!base::GetFileSize(hosts_file_path_, &size))
    return false;

  // Reject HOSTS files larger than |kMaxHostsSize| bytes.
  const int64_t kMaxHostsSize = 1 << 25;  // 32MB

  // TODO(crbug.com/1377305): Remove this when we have enough data.
  base::UmaHistogramCustomCounts("Net.DNS.DnsHosts.FileSize", size, 1,
                                 kMaxHostsSize * 2, 50);
  if (size > kMaxHostsSize)
    return false;

  std::string contents;
  if (!base::ReadFileToString(hosts_file_path_, &contents))
    return false;

  net::ParseHosts(contents, dns_hosts);
  return true;
}

}  // namespace net
