/*
 * Copyright (C) 2023 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "src/trace_processor/sorter/trace_token_buffer.h"

#include <algorithm>
#include <cstdint>
#include <cstring>
#include <functional>
#include <limits>
#include <optional>
#include <utility>

#include "perfetto/base/compiler.h"
#include "perfetto/base/logging.h"
#include "perfetto/trace_processor/ref_counted.h"
#include "perfetto/trace_processor/trace_blob.h"
#include "perfetto/trace_processor/trace_blob_view.h"
#include "src/trace_processor/importers/common/parser_types.h"
#include "src/trace_processor/importers/proto/packet_sequence_state_generation.h"
#include "src/trace_processor/util/bump_allocator.h"

namespace perfetto::trace_processor {
namespace {

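// 8-byte bitpacked header prepended to every compressed TrackEventData
// allocation. It packs the interning indices for the blob and sequence state
// together with presence bits for the optional fields.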
struct alignas(8) TrackEventDataDescriptor {
  static constexpr uint8_t kMaxOffsetFromInternedBlobBits = 25;
  static constexpr uint32_t kMaxOffsetFromInternedBlob =
      (1u << kMaxOffsetFromInternedBlobBits) - 1;

  static constexpr uint8_t kMaxExtraCountersBits = 4;
  static constexpr uint8_t kMaxExtraCounters = (1 << kMaxExtraCountersBits) - 1;
  static_assert(TrackEventData::kMaxNumExtraCounters <= kMaxExtraCounters,
                "Counter bits must be able to fit TrackEventData counters");

  uint16_t intern_blob_index;
  uint16_t intern_seq_index;
  uint32_t intern_blob_offset : kMaxOffsetFromInternedBlobBits;
  uint32_t has_thread_timestamp : 1;
  uint32_t has_thread_instruction_count : 1;
  uint32_t has_counter_value : 1;
  uint32_t extra_counter_count : kMaxExtraCountersBits;
};
static_assert(sizeof(TrackEventDataDescriptor) == 8,
              "TrackEventDataDescriptor must be small");
static_assert(alignof(TrackEventDataDescriptor) == 8,
              "TrackEventDataDescriptor must be 8-aligned");

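// Move-constructs a T from the bytes at |*ptr|, destroys the stored object
// and advances |*ptr| past it.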
template <typename T>
T ExtractFromPtr(uint8_t** ptr) {
  T* typed_ptr = reinterpret_cast<T*>(*ptr);
  T value(std::move(*typed_ptr));
  typed_ptr->~T();
  *ptr += sizeof(T);
  return value;
}

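// Placement-news |value| at |ptr| and returns a pointer one past the stored
// object.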
template <typename T>
uint8_t* AppendToPtr(uint8_t* ptr, T value) {
  new (ptr) T(std::move(value));
  return ptr + sizeof(T);
}

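// Returns the number of bytes needed to store the TrackEventData described by
// |desc|: the descriptor itself, the packet size and any optional fields
// which are present.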
uint32_t GetAllocSize(const TrackEventDataDescriptor& desc) {
  uint32_t alloc_size = sizeof(TrackEventDataDescriptor);
  alloc_size += sizeof(uint64_t);
  alloc_size += desc.has_thread_instruction_count * sizeof(int64_t);
  alloc_size += desc.has_thread_timestamp * sizeof(int64_t);
  alloc_size += desc.has_counter_value * sizeof(double);
  alloc_size += desc.extra_counter_count * sizeof(double);
  return alloc_size;
}

}  // namespace

TraceTokenBuffer::Id TraceTokenBuffer::Append(TrackEventData ted) {
  // TrackEventData (and TracePacketData) are two big contributors to the peak
  // memory usage of the sorter. The main reasons for this are a) object
  // padding and b) using more bits than necessary to store their contents.
  //
  // The purpose of this function is to "compress" the contents of
  // TrackEventData by using techniques like bitpacking, interning and
  // variable-length encoding to ensure that only the data which really needs
  // to be stored takes up memory.

  // Compress all the booleans indicating the presence of a value into 4 bits
  // instead of the 4 bytes they would take up inside std::optional.
  TrackEventDataDescriptor desc;
  desc.has_thread_instruction_count = ted.thread_instruction_count.has_value();
  desc.has_thread_timestamp = ted.thread_timestamp.has_value();
  desc.has_counter_value = std::not_equal_to<double>()(ted.counter_value, 0);
  desc.extra_counter_count = ted.CountExtraCounterValues();

  // Allocate enough memory using the BumpAllocator to store the data in |ted|.
  // Also figure out the interned index.
  BumpAllocator::AllocId alloc_id =
      AllocAndResizeInternedVectors(GetAllocSize(desc));
  InternedIndex interned_index = GetInternedIndex(alloc_id);

  // Compute the interning information for the TraceBlob and the SequenceState.
  TracePacketData& tpd = ted.trace_packet_data;
  desc.intern_blob_offset = InternTraceBlob(interned_index, tpd.packet);
  desc.intern_blob_index =
      static_cast<uint16_t>(interned_blobs_.at(interned_index).size() - 1);
  desc.intern_seq_index =
      InternSeqState(interned_index, std::move(tpd.sequence_state));

  // Store the descriptor.
  uint8_t* ptr = static_cast<uint8_t*>(allocator_.GetPointer(alloc_id));
  ptr = AppendToPtr(ptr, desc);

  // Store the packet size.
  uint64_t packet_size = static_cast<uint64_t>(tpd.packet.size());
  ptr = AppendToPtr(ptr, packet_size);

  // Append the "optional" fields of TrackEventData only when they are set.
  if (desc.has_thread_instruction_count) {
    ptr = AppendToPtr(ptr, ted.thread_instruction_count.value());
  }
  if (desc.has_thread_timestamp) {
    ptr = AppendToPtr(ptr, ted.thread_timestamp.value());
  }
  if (desc.has_counter_value) {
    ptr = AppendToPtr(ptr, ted.counter_value);
  }
  for (uint32_t i = 0; i < desc.extra_counter_count; ++i) {
    ptr = AppendToPtr(ptr, ted.extra_counter_values[i]);
  }
  return Id{alloc_id};
}

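// Reverses Append: reads back the descriptor, rebuilds the TraceBlobView and
// sequence state from the interning tables (reclaiming the refcounts "leaked"
// by the Intern* functions) and frees the allocation.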
template <>
TrackEventData TraceTokenBuffer::Extract<TrackEventData>(Id id) {
  uint8_t* ptr = static_cast<uint8_t*>(allocator_.GetPointer(id.alloc_id));
  TrackEventDataDescriptor desc =
      ExtractFromPtr<TrackEventDataDescriptor>(&ptr);
  uint64_t packet_size = ExtractFromPtr<uint64_t>(&ptr);

  InternedIndex interned_index = GetInternedIndex(id.alloc_id);
  BlobWithOffset& bwo =
      interned_blobs_.at(interned_index)[desc.intern_blob_index];
  TraceBlobView tbv(RefPtr<TraceBlob>::FromReleasedUnsafe(bwo.blob),
                    bwo.offset_in_blob + desc.intern_blob_offset,
                    static_cast<uint32_t>(packet_size));
  auto seq = RefPtr<PacketSequenceStateGeneration>::FromReleasedUnsafe(
      interned_seqs_.at(interned_index)[desc.intern_seq_index]);

  TrackEventData ted{std::move(tbv), std::move(seq)};
  if (desc.has_thread_instruction_count) {
    ted.thread_instruction_count = ExtractFromPtr<int64_t>(&ptr);
  }
  if (desc.has_thread_timestamp) {
    ted.thread_timestamp = ExtractFromPtr<int64_t>(&ptr);
  }
  if (desc.has_counter_value) {
    ted.counter_value = ExtractFromPtr<double>(&ptr);
  }
  for (uint32_t i = 0; i < desc.extra_counter_count; ++i) {
    ted.extra_counter_values[i] = ExtractFromPtr<double>(&ptr);
  }
  allocator_.Free(id.alloc_id);
  return ted;
}

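// Returns the offset of |tbv| relative to the last interned (blob, offset)
// pair, interning a new pair if the blob differs or the relative offset would
// overflow the descriptor field.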
uint32_t TraceTokenBuffer::InternTraceBlob(InternedIndex interned_index,
                                           const TraceBlobView& tbv) {
  BlobWithOffsets& blobs = interned_blobs_.at(interned_index);
  if (blobs.empty()) {
    return AddTraceBlob(interned_index, tbv);
  }

  BlobWithOffset& last_blob = blobs.back();
  if (last_blob.blob != tbv.blob().get()) {
    return AddTraceBlob(interned_index, tbv);
  }
  PERFETTO_CHECK(last_blob.offset_in_blob <= tbv.offset());

  // To allow the offset stored in the descriptor to fit in
  // kMaxOffsetFromInternedBlobBits bits, we intern not only the TraceBlob
  // pointer but also the offset. With this double indirection, the descriptor
  // only ever stores a small relative offset, at the cost of interning blobs
  // here more often: this more than pays for itself as in the majority of
  // cases the offsets are small anyway.
  size_t rel_offset = tbv.offset() - last_blob.offset_in_blob;
  if (rel_offset > TrackEventDataDescriptor::kMaxOffsetFromInternedBlob) {
    return AddTraceBlob(interned_index, tbv);
  }

  // Intentionally "leak" this pointer. This essentially keeps the refcount
  // of this TraceBlob one higher than the number of RefPtrs pointing to it,
  // which lets us avoid storing the same RefPtr n times.
  //
  // Calls to this function are paired with Extract<TrackEventData>, which
  // picks up this "leaked" pointer.
  TraceBlob* leaked = tbv.blob().ReleaseUnsafe();
  base::ignore_result(leaked);
  return static_cast<uint32_t>(rel_offset);
}

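// Returns the index of |ptr| in the per-chunk sequence-state table, appending
// it if it was not interned recently.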
uint16_t TraceTokenBuffer::InternSeqState(
    InternedIndex interned_index,
    RefPtr<PacketSequenceStateGeneration> ptr) {
  // Look back at most 32 elements. This should be far enough in most cases
  // unless either: a) we are essentially round-robining between >32 sequences
  // or b) we are churning through generations. Either case seems pathological.
  SequenceStates& states = interned_seqs_.at(interned_index);
  size_t lookback = std::min<size_t>(32u, states.size());
  for (uint32_t i = 0; i < lookback; ++i) {
    uint16_t idx = static_cast<uint16_t>(states.size() - 1 - i);
    if (states[idx] == ptr.get()) {
      // Intentionally "leak" this pointer. See |InternTraceBlob| for an
      // explanation.
      PacketSequenceStateGeneration* leaked = ptr.ReleaseUnsafe();
      base::ignore_result(leaked);
      return idx;
    }
  }
  states.emplace_back(ptr.ReleaseUnsafe());
  PERFETTO_CHECK(states.size() <= std::numeric_limits<uint16_t>::max());
  return static_cast<uint16_t>(states.size() - 1);
}

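// Appends a new (blob, offset) pair to the per-chunk table. A freshly
// interned blob always has a relative offset of 0.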
uint32_t TraceTokenBuffer::AddTraceBlob(InternedIndex interned_index,
                                        const TraceBlobView& tbv) {
  BlobWithOffsets& blobs = interned_blobs_.at(interned_index);
  blobs.emplace_back(BlobWithOffset{tbv.blob().ReleaseUnsafe(), tbv.offset()});
  PERFETTO_CHECK(blobs.size() <= std::numeric_limits<uint16_t>::max());
  return 0u;
}

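// Releases fully-extracted chunks at the front of the allocator and drops the
// interning tables corresponding to them.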
void TraceTokenBuffer::FreeMemory() {
  uint64_t erased = allocator_.EraseFrontFreeChunks();
  PERFETTO_CHECK(erased <= std::numeric_limits<size_t>::max());
  interned_blobs_.erase_front(static_cast<size_t>(erased));
  interned_seqs_.erase_front(static_cast<size_t>(erased));
  PERFETTO_CHECK(interned_blobs_.size() == interned_seqs_.size());
}

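// Allocates |size| bytes and, if the allocation spilled into a new chunk,
// grows the interning tables so they stay in sync with the allocator's
// chunks.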
BumpAllocator::AllocId TraceTokenBuffer::AllocAndResizeInternedVectors(
    uint32_t size) {
  uint64_t erased = allocator_.erased_front_chunks_count();
  BumpAllocator::AllocId alloc_id = allocator_.Alloc(size);
  uint64_t allocator_chunks_size = alloc_id.chunk_index - erased + 1;

  // The allocator should never "remove" chunks from being tracked.
  PERFETTO_DCHECK(allocator_chunks_size >= interned_blobs_.size());

  // We should add at most one chunk in the allocator.
  uint64_t chunks_added = allocator_chunks_size - interned_blobs_.size();
  PERFETTO_DCHECK(chunks_added <= 1);
  PERFETTO_DCHECK(interned_blobs_.size() == interned_seqs_.size());
  for (uint64_t i = 0; i < chunks_added; ++i) {
    interned_blobs_.emplace_back();
    interned_seqs_.emplace_back();
  }
  return alloc_id;
}

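// Maps an AllocId to the index of its chunk's interning tables, accounting
// for chunks already erased from the front.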
TraceTokenBuffer::InternedIndex TraceTokenBuffer::GetInternedIndex(
    BumpAllocator::AllocId alloc_id) {
  uint64_t interned_index =
      alloc_id.chunk_index - allocator_.erased_front_chunks_count();
  PERFETTO_DCHECK(interned_index <= std::numeric_limits<size_t>::max());
  PERFETTO_DCHECK(interned_index < interned_blobs_.size());
  PERFETTO_DCHECK(interned_index < interned_seqs_.size());
  PERFETTO_DCHECK(interned_blobs_.size() == interned_seqs_.size());
  return static_cast<size_t>(interned_index);
}

}  // namespace perfetto::trace_processor
