/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef SRC_PROFILING_MEMORY_BOOKKEEPING_H_
#define SRC_PROFILING_MEMORY_BOOKKEEPING_H_

#include <map>
#include <vector>

#include "perfetto/base/time.h"
#include "src/profiling/common/callstack_trie.h"
#include "src/profiling/common/interner.h"
#include "src/profiling/memory/unwound_messages.h"

// Below is an illustration of the bookkeeping system state where
// PID 1 does the following allocations:
// 0x123: 128 bytes at [bar main]
// 0x234: 128 bytes at [bar main]
// 0xf00: 512 bytes at [foo main]
// PID 1 allocated but previously freed 1024 bytes at [bar main]
//
// PID 2 does the following allocations:
// 0x345: 512 bytes at [foo main]
// 0x456:  32 bytes at [foo main]
// PID 2 allocated but already freed 1235 bytes at [foo main]
// PID 2 allocated and freed 2048 bytes in main.
//
// +---------------------------------+   +-------------------+
// | +---------+    HeapTracker PID 1|   | GlobalCallstackTri|
// | |0x123 128+---+    +----------+ |   |           +---+   |
// | |         |   +---->alloc:1280+----------------->bar|   |
// | |0x234 128+---+    |free: 1024| |   |           +-^-+   |
// | |         |        +----------+ |   |   +---+     ^     |
// | |0xf00 512+---+                 | +----->foo|     |     |
// | +--------+|   |    +----------+ | | |   +-^-+     |     |
// |               +---->alloc: 512+---+ |     |       |     |
// |                    |free:    0| | | |     +--+----+     |
// |                    +----------+ | | |        |          |
// |                                 | | |      +-+--+       |
// +---------------------------------+ | |      |main|       |
//                                     | |      +--+-+       |
// +---------------------------------+ | |         ^         |
// | +---------+    HeapTracker PID 2| | +-------------------+
// | |0x345 512+---+    +----------+ | |           |
// | |         |   +---->alloc:1779+---+           |
// | |0x456  32+---+    |free: 1235| |             |
// | +---------+        +----------+ |             |
// |                                 |             |
// |                    +----------+ |             |
// |                    |alloc:2048+---------------+
// |                    |free: 2048| |
// |                    +----------+ |
// |                                 |
// +---------------------------------+
//   Allocation    CallstackAllocations        Node
//
// The active allocations are on the leftmost side, modeled as the class
// HeapTracker::Allocation.
//
// The total allocated and freed bytes per callsite are in the middle, modeled
// as the HeapTracker::CallstackAllocations class.
// Note that (1280 - 1024) = 256, so alloc - free is equal to the total of the
// currently active allocations.
// Note in PID 2 there is a CallstackAllocations with 2048 allocated and 2048
// freed bytes. This is not currently referenced by any Allocations (as it
// should, as 2048 - 2048 = 0, which would mean that the total size of the
// allocations referencing it should be 0). This is because we haven't dumped
// this state yet, so the CallstackAllocations will be kept around until the
// next dump, written to the trace, and then destroyed.
//
// On the right hand side is the GlobalCallstackTrie, with nodes representing
// distinct callstacks. They have no information about the currently allocated
// or freed bytes, they only contain a reference count to destroy them as
// soon as they are no longer referenced by a HeapTracker.

namespace perfetto {
namespace profiling {

// Snapshot for memory allocations of a particular process. Shares callsites
// with other processes.
class HeapTracker {
 public:
  struct CallstackMaxAllocations {
    uint64_t max;
    uint64_t cur;
    uint64_t max_count;
    uint64_t cur_count;
  };
  struct CallstackTotalAllocations {
    uint64_t allocated;
    uint64_t freed;
    uint64_t allocation_count;
    uint64_t free_count;
  };

  // Sum of all the allocations for a given callstack.
  struct CallstackAllocations {
    explicit CallstackAllocations(GlobalCallstackTrie::Node* n) : node(n) {}

    uint64_t allocs = 0;

    union {
      CallstackMaxAllocations retain_max;
      CallstackTotalAllocations totals;
    } value = {};

    GlobalCallstackTrie::Node* const node;

    ~CallstackAllocations() { GlobalCallstackTrie::DecrementNode(node); }

    bool operator<(const CallstackAllocations& other) const {
      return node < other.node;
    }
  };

  // Caller needs to ensure that callsites outlives the HeapTracker.
  explicit HeapTracker(GlobalCallstackTrie* callsites, bool dump_at_max_mode)
      : callsites_(callsites), dump_at_max_mode_(dump_at_max_mode) {}

  void RecordMalloc(const std::vector<unwindstack::FrameData>& callstack,
                    const std::vector<std::string>& build_ids,
                    uint64_t address,
                    uint64_t sample_size,
                    uint64_t alloc_size,
                    uint64_t sequence_number,
                    uint64_t timestamp);

  template <typename F>
  void GetCallstackAllocations(F fn) {
    // There are two reasons we remove the unused callstack allocations on the
    // next iteration of Dump:
    // * We need to remove them after the callstacks were dumped, which
    //   currently happens after the allocations are dumped.
    // * This way, we do not destroy and recreate callstacks as frequently.
    for (auto it_and_alloc : dead_callstack_allocations_) {
      auto& it = it_and_alloc.first;
      uint64_t allocated = it_and_alloc.second;
      const CallstackAllocations& alloc = it->second;
      // For non-dump-at-max, we need to check, even if there are still no
      // allocations referencing this callstack, whether there were any
      // allocations that happened but were freed again. If that was the case,
      // we need to keep the callsite, because the next dump will indicate a
      // different self_alloc and self_freed.
      if (alloc.allocs == 0 &&
          (dump_at_max_mode_ ||
           alloc.value.totals.allocation_count == allocated)) {
        // TODO(fmayer): We could probably be smarter than throw away
        // our whole frames cache.
        ClearFrameCache();
        callstack_allocations_.erase(it);
      }
    }
    dead_callstack_allocations_.clear();

    for (auto it = callstack_allocations_.begin();
         it != callstack_allocations_.end(); ++it) {
      const CallstackAllocations& alloc = it->second;
      fn(alloc);

      if (alloc.allocs == 0)
        dead_callstack_allocations_.emplace_back(
            it, !dump_at_max_mode_ ? alloc.value.totals.allocation_count : 0);
    }
  }

  template <typename F>
  void GetAllocations(F fn) {
    for (const auto& addr_and_allocation : allocations_) {
      const Allocation& alloc = addr_and_allocation.second;
      fn(addr_and_allocation.first, alloc.sample_size, alloc.alloc_size,
         alloc.callstack_allocations()->node->id());
    }
  }

  void RecordFree(uint64_t address,
                  uint64_t sequence_number,
                  uint64_t timestamp) {
    RecordOperation(sequence_number, {address, timestamp});
  }

  void ClearFrameCache() { frame_cache_.clear(); }

  uint64_t dump_timestamp() {
    return dump_at_max_mode_ ? max_timestamp_ : committed_timestamp_;
  }

  uint64_t GetSizeForTesting(const std::vector<unwindstack::FrameData>& stack,
                             std::vector<std::string> build_ids);
  uint64_t GetMaxForTesting(const std::vector<unwindstack::FrameData>& stack,
                            std::vector<std::string> build_ids);
  uint64_t GetMaxCountForTesting(
      const std::vector<unwindstack::FrameData>& stack,
      std::vector<std::string> build_ids);

  uint64_t GetTimestampForTesting() { return committed_timestamp_; }

 private:
  struct Allocation {
    Allocation(uint64_t size,
               uint64_t asize,
               uint64_t seq,
               CallstackAllocations* csa)
        : sample_size(size), alloc_size(asize), sequence_number(seq) {
      SetCallstackAllocations(csa);
    }

    Allocation() = default;
    Allocation(const Allocation&) = delete;
    Allocation(Allocation&& other) noexcept {
      sample_size = other.sample_size;
      alloc_size = other.alloc_size;
      sequence_number = other.sequence_number;
      callstack_allocations_ = other.callstack_allocations_;
      other.callstack_allocations_ = nullptr;
    }

    ~Allocation() { SetCallstackAllocations(nullptr); }

    void SetCallstackAllocations(CallstackAllocations* callstack_allocations) {
      if (callstack_allocations_)
        callstack_allocations_->allocs--;
      callstack_allocations_ = callstack_allocations;
      if (callstack_allocations_)
        callstack_allocations_->allocs++;
    }

    CallstackAllocations* callstack_allocations() const {
      return callstack_allocations_;
    }

    uint64_t sample_size;
    uint64_t alloc_size;
    uint64_t sequence_number;

   private:
    CallstackAllocations* callstack_allocations_ = nullptr;
  };

  struct PendingOperation {
    uint64_t allocation_address;
    uint64_t timestamp;
  };

  CallstackAllocations* MaybeCreateCallstackAllocations(
      GlobalCallstackTrie::Node* node) {
    auto callstack_allocations_it = callstack_allocations_.find(node);
    if (callstack_allocations_it == callstack_allocations_.end()) {
      GlobalCallstackTrie::IncrementNode(node);
      bool inserted;
      std::tie(callstack_allocations_it, inserted) =
          callstack_allocations_.emplace(node, node);
      PERFETTO_DCHECK(inserted);
    }
    return &callstack_allocations_it->second;
  }

  void RecordOperation(uint64_t sequence_number,
                       const PendingOperation& operation);

  // Commits a malloc or free operation.
  // See comment of pending_operations_ for encoding of malloc and free
  // operations.
  //
  // Committing a malloc operation: Add the allocations size to
  // CallstackAllocation::allocated.
  // Committing a free operation: Add the allocation's size to
  // CallstackAllocation::freed and delete the allocation.
  void CommitOperation(uint64_t sequence_number,
                       const PendingOperation& operation);

  void AddToCallstackAllocations(uint64_t ts, const Allocation& alloc) {
    if (dump_at_max_mode_) {
      current_unfreed_ += alloc.sample_size;
      alloc.callstack_allocations()->value.retain_max.cur += alloc.sample_size;
      alloc.callstack_allocations()->value.retain_max.cur_count++;

      if (current_unfreed_ <= max_unfreed_)
        return;

      if (max_sequence_number_ == alloc.sequence_number - 1) {
        // We know the only CallstackAllocation that has max != cur is the
        // one we just updated.
        alloc.callstack_allocations()->value.retain_max.max =
            alloc.callstack_allocations()->value.retain_max.cur;
        alloc.callstack_allocations()->value.retain_max.max_count =
            alloc.callstack_allocations()->value.retain_max.cur_count;
      } else {
        for (auto& p : callstack_allocations_) {
          // We need to reset max = cur for every CallstackAllocation, as we
          // do not know which ones have changed since the last max.
          // TODO(fmayer): Add an index to speed this up
          CallstackAllocations& csa = p.second;
          csa.value.retain_max.max = csa.value.retain_max.cur;
          csa.value.retain_max.max_count = csa.value.retain_max.cur_count;
        }
      }
      max_sequence_number_ = alloc.sequence_number;
      max_unfreed_ = current_unfreed_;
      max_timestamp_ = ts;
    } else {
      alloc.callstack_allocations()->value.totals.allocated +=
          alloc.sample_size;
      alloc.callstack_allocations()->value.totals.allocation_count++;
    }
  }

  void SubtractFromCallstackAllocations(const Allocation& alloc) {
    if (dump_at_max_mode_) {
      current_unfreed_ -= alloc.sample_size;
      alloc.callstack_allocations()->value.retain_max.cur -= alloc.sample_size;
      alloc.callstack_allocations()->value.retain_max.cur_count--;
    } else {
      alloc.callstack_allocations()->value.totals.freed += alloc.sample_size;
      alloc.callstack_allocations()->value.totals.free_count++;
    }
  }

  // We cannot use an interner here, because after the last allocation goes
  // away, we still need to keep the CallstackAllocations around until the next
  // dump.
  std::map<GlobalCallstackTrie::Node*, CallstackAllocations>
      callstack_allocations_;

  std::vector<std::pair<decltype(callstack_allocations_)::iterator, uint64_t>>
      dead_callstack_allocations_;

  std::map<uint64_t /* allocation address */, Allocation> allocations_;

  // An operation is either a commit of an allocation or freeing of an
  // allocation. An operation is a free if its seq_id is larger than
  // the sequence_number of the corresponding allocation. It is a commit if its
  // seq_id is equal to the sequence_number of the corresponding allocation.
  //
  // If its seq_id is less than the sequence_number of the corresponding
  // allocation it could be either, but is ignored either way.
  std::map<uint64_t /* seq_id */, PendingOperation /* allocation address */>
      pending_operations_;

  uint64_t committed_timestamp_ = 0;
  // The sequence number all mallocs and frees have been handled up to.
  uint64_t committed_sequence_number_ = 0;
  GlobalCallstackTrie* callsites_;

  const bool dump_at_max_mode_ = false;
  // The following members are only used if dump_at_max_mode_ == true.
  uint64_t max_sequence_number_ = 0;
  uint64_t current_unfreed_ = 0;
  uint64_t max_unfreed_ = 0;
  uint64_t max_timestamp_ = 0;

  // We index by abspc, which is unique as long as the maps do not change.
  // This is why we ClearFrameCache after we reparsed maps.
  std::unordered_map<uint64_t /* abs pc */, Interned<Frame>> frame_cache_;
};

}  // namespace profiling
}  // namespace perfetto

#endif  // SRC_PROFILING_MEMORY_BOOKKEEPING_H_
