/*
 * Copyright (C) 2020 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// See /docs/design-docs/protozero.md for rationale and results.

#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>

#include <memory>
#include <new>
#include <vector>

#include <benchmark/benchmark.h>

#include "perfetto/base/compiler.h"
#include "perfetto/protozero/static_buffer.h"

// Autogenerated headers in out/*/gen/
#include "src/protozero/test/example_proto/library.pbzero.h"
#include "src/protozero/test/example_proto/test_messages.pb.h"
#include "src/protozero/test/example_proto/test_messages.pbzero.h"

// Short alias for the classes generated by the protozero plugin (declared in
// the *.pbzero.h headers included above).
namespace pbzero = protozero::test::protos::pbzero;

// Short alias for the classes generated by the official protobuf compiler
// (declared in the *.pb.h header included above).
namespace pblite = protozero::test::protos;

namespace {

// Size in bytes of the slot handed to each benchmark iteration. It has to stay
// strictly larger than the biggest payload that a single iteration writes.
constexpr size_t kBufPerIteration{512};

// Iterations write cyclically over a 64 MB buffer, to simulate a realistic
// tracing scenario.
constexpr size_t kTotalWorkingSetSize{64u * 1024u * 1024u};
alignas(uint64_t) char g_out_buffer[kTotalWorkingSetSize];

// Start of the slot that the current iteration writes into.
char* g_cur = &g_out_buffer[0];

// Fake input shared by the benchmarks below. Words 0-3 feed the integer
// setters. Words 4-7 are read back as raw bytes and passed as a C string to
// set_field_string(): the zero top byte of the last word guarantees that the
// string is NUL-terminated (after 31 characters on a little-endian CPU).
uint64_t g_fake_input_simple[] = {0x12345678,
                                  0x90ABCDEF,
                                  0x11111111,
                                  0xFFFFFFFF,
                                  0x6666666666666666ULL,
                                  0x6666666666666666ULL,
                                  0x6666666666666666ULL,
                                  0x0066666666666666ULL};

// Speed-of-light serializer. A very simple C++ class that just appends data
// into a linear buffer making all sorts of favourable assumptions. It does not
// use any binary-stable encoding, it does not perform bound checking,
// all writes are 64-bit aligned, it doesn't deal with any thread-safety.
// The speed-of-light serializer serves as a reference for how fast a serializer
// could be if argument marshalling and bound checking were zero cost.
struct SOLMsg {
  // Copies the raw bytes of |x| at the write cursor and then advances the
  // cursor by one full 64-bit slot, whatever sizeof(T) is, so that the next
  // write is 64-bit aligned as well.
  template <typename T>
  void Append(T x) {
    static_assert(sizeof(T) <= sizeof(uint64_t),
                  "SOLMsg::Append() reserves a single 64-bit slot per value");
    // The reinterpret_cast is to give favorable alignment guarantees.
    // The memcpy will be elided by the compiler, which will emit just a
    // 64-bit aligned mov instruction.
    memcpy(reinterpret_cast<void*>(ptr_), &x, sizeof(x));
    ptr_ += sizeof(uint64_t);
  }

  void set_field_int32(int32_t x) { Append(x); }
  void set_field_uint32(uint32_t x) { Append(x); }
  void set_field_int64(int64_t x) { Append(x); }
  void set_field_uint64(uint64_t x) { Append(x); }

  // Copies |str|, NUL terminator included, at the write cursor and moves the
  // cursor past it, rounded up to the next 64-bit boundary.
  // stpcpy() is used rather than strcpy() because strcpy() returns its
  // destination argument: assigning that to |ptr_| would leave the cursor
  // where it was, and a following Append() would overwrite the string.
  // stpcpy() returns a pointer to the copied terminator, for the same single
  // pass over the input.
  void set_field_string(const char* str) {
    constexpr size_t kSlot = sizeof(uint64_t);
    const size_t used = static_cast<size_t>(stpcpy(ptr_, str) + 1 - storage_);
    ptr_ = storage_ + ((used + kSlot - 1) & ~(kSlot - 1));
  }

  // Constructs a child message in the memory that immediately follows this
  // object. No bound checking: the caller must make sure that memory is
  // available (the benchmarks place the root message inside an output slot).
  SOLMsg* add_field_nested() { return new (this + 1) SOLMsg(); }

  // Room for one copy of the fake input, plus 8 spare bytes.
  alignas(uint64_t) char storage_[sizeof(g_fake_input_simple) + 8];

  // Write cursor: where the next field is going to be written.
  char* ptr_ = &storage_[0];
};

// Sets the four integer fields and the string field of |msg| from
// g_fake_input_simple. T can be any type that exposes the set_field_*()
// setters invoked below.
template <typename T>
PERFETTO_ALWAYS_INLINE void FillMessage_Simple(T* msg) {
  // Pass the input through the benchmark library's optimization barrier
  // before reading it.
  benchmark::DoNotOptimize(g_fake_input_simple);

  const uint64_t* const words = g_fake_input_simple;
  msg->set_field_int32(static_cast<int32_t>(words[0]));
  msg->set_field_uint32(static_cast<uint32_t>(words[1]));
  msg->set_field_int64(static_cast<int64_t>(words[2]));
  msg->set_field_uint64(words[3]);
  // The remaining words are handed over as a C string.
  msg->set_field_string(reinterpret_cast<const char*>(words + 4));
}

// Fills |msg| like FillMessage_Simple() does, then adds a nested child to it
// and fills that one too, recursively. Four messages get filled in total
// (depth 0 to 3): the deepest one gets no child.
template <typename T>
PERFETTO_ALWAYS_INLINE void FillMessage_Nested(T* msg, int depth = 0) {
  constexpr int kDeepestLevel = 3;

  benchmark::DoNotOptimize(g_fake_input_simple);
  FillMessage_Simple(msg);
  if (depth >= kDeepestLevel)
    return;

  // The type of the child is deduced again by the recursive call.
  FillMessage_Nested(msg->add_field_nested(), depth + 1);
}

// Called at the end of every benchmark iteration: reads back the slot that
// g_cur points to (see below for the reason) and then moves g_cur to the slot
// to be used by the next iteration.
PERFETTO_ALWAYS_INLINE void Clobber(benchmark::State& state) {
  uint64_t* buf = reinterpret_cast<uint64_t*>(g_cur);

  // Read-back the data written to have a realistic evaluation of the
  // speed-of-light scenario. This is to deal with architecture of modern CPUs.
  // If we write a bunch of memory bytes, never read-back from them, and then
  // just over-write them, the CPU can just throw away the whole stream of
  // instructions that produced them, if that's still in flight and tracked in
  // the out-of-order units.
  // The buf[i] ^= buf[i - 1] chain below forces the CPU to consume the result
  // of the writes: every word of the slot ends up depending on all the words
  // that precede it. Note that buf[0] is overwritten to seed the chain.
  buf[0] = reinterpret_cast<uint64_t>(&state);
  for (size_t i = 1; i < kBufPerIteration / sizeof(uint64_t); i++)
    buf[i] ^= buf[i - 1];
  // Branch on the last word of the chain, so that the whole chain is consumed.
  // The value 42 is presumably arbitrary.
  if (buf[(kBufPerIteration / sizeof(uint64_t)) - 1] == 42)
    PERFETTO_LOG(".");
  benchmark::DoNotOptimize(buf);

  // Slots are laid out back to back in g_out_buffer. The slot index is derived
  // from the iteration count and wraps around after kWrap slots.
  // NOTE(review): this relies on state.iterations() changing at every pass of
  // the calling loop; confirm that still holds before changing how the
  // benchmarks iterate over |state|.
  constexpr size_t kWrap = kTotalWorkingSetSize / kBufPerIteration;
  g_cur = &g_out_buffer[(state.iterations() % kWrap) * kBufPerIteration];
  benchmark::ClobberMemory();
}

}  // namespace

// Flat message, classes generated by the official protobuf compiler: fills a
// pblite::EveryField and serializes it into the current output slot.
static void BM_Protozero_Simple_Libprotobuf(benchmark::State& state) {
  while (state.KeepRunning()) {
    {
      // Inner scope, so that the message is destroyed (and its RAII finalizers
      // are accounted for) before Clobber() runs.
      pblite::EveryField proto;
      FillMessage_Simple(&proto);
      proto.SerializeToArray(g_cur, kBufPerIteration);
    }
    Clobber(state);
  }
}

// Flat message, protozero: the message is written into the current output slot
// through a StaticBuffered wrapper.
static void BM_Protozero_Simple_Protozero(benchmark::State& state) {
  using RootMsg = protozero::StaticBuffered<pbzero::EveryField>;
  while (state.KeepRunning()) {
    {
      // Inner scope, so that the wrapper is destroyed before Clobber() runs.
      RootMsg root(g_cur, kBufPerIteration);
      FillMessage_Simple(root.get());
    }
    Clobber(state);
  }
}

// Flat message, speed-of-light reference: a SOLMsg is constructed in place at
// the start of the current output slot and then filled.
static void BM_Protozero_Simple_SpeedOfLight(benchmark::State& state) {
  while (state.KeepRunning()) {
    FillMessage_Simple(new (g_cur) SOLMsg());
    Clobber(state);
  }
}

// Nested messages, classes generated by the official protobuf compiler: fills
// a pblite::EveryField together with its chain of nested children, then
// serializes the whole tree into the current output slot.
static void BM_Protozero_Nested_Libprotobuf(benchmark::State& state) {
  while (state.KeepRunning()) {
    {
      // Inner scope, so that the message tree is destroyed before Clobber()
      // runs.
      pblite::EveryField proto;
      FillMessage_Nested(&proto);
      proto.SerializeToArray(g_cur, kBufPerIteration);
    }
    Clobber(state);
  }
}

// Nested messages, protozero: the root message and its chain of nested
// children are written into the current output slot through a StaticBuffered
// wrapper.
static void BM_Protozero_Nested_Protozero(benchmark::State& state) {
  using RootMsg = protozero::StaticBuffered<pbzero::EveryField>;
  while (state.KeepRunning()) {
    {
      // Inner scope, so that the wrapper is destroyed before Clobber() runs.
      RootMsg root(g_cur, kBufPerIteration);
      FillMessage_Nested(root.get());
    }
    Clobber(state);
  }
}

// Nested messages, speed-of-light reference: the root SOLMsg is constructed in
// place at the start of the current output slot; each nested child is placed
// right after its parent by SOLMsg::add_field_nested().
static void BM_Protozero_Nested_SpeedOfLight(benchmark::State& state) {
  while (state.KeepRunning()) {
    FillMessage_Nested(new (g_cur) SOLMsg());
    Clobber(state);
  }
}

// Flat message: libprotobuf vs protozero vs the speed-of-light reference.
BENCHMARK(BM_Protozero_Simple_Libprotobuf);
BENCHMARK(BM_Protozero_Simple_Protozero);
BENCHMARK(BM_Protozero_Simple_SpeedOfLight);

// Same three serializers, with a chain of nested messages.
BENCHMARK(BM_Protozero_Nested_Libprotobuf);
BENCHMARK(BM_Protozero_Nested_Protozero);
BENCHMARK(BM_Protozero_Nested_SpeedOfLight);
