/*
 * Copyright 2022 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#undef LOG_TAG
#define LOG_TAG "GpuWork"
#define ATRACE_TAG ATRACE_TAG_GRAPHICS

#include "gpuwork/GpuWork.h"

#include <android-base/stringprintf.h>
#include <binder/PermissionCache.h>
#include <bpf/WaitForProgsLoaded.h>
#include <libbpf.h>
#include <log/log.h>
#include <stats_event.h>
#include <statslog.h>
#include <unistd.h>
#include <utils/Trace.h>

#include <chrono>
#include <cinttypes>
#include <cstdint>
#include <cstring>
#include <limits>
#include <map>
#include <memory>
#include <random>
#include <set>
#include <thread>
#include <tuple>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include "gpuwork/gpu_work.h"

// One millisecond, expressed in nanoseconds.
#define ONE_MS_IN_NS (1000000)

namespace android {
namespace gpuwork {

namespace {

bool lessThanGpuIdUid(const android::gpuwork::GpuIdUid& l, const android::gpuwork::GpuIdUid& r) {
    return std::tie(l.gpu_id, l.uid) < std::tie(r.gpu_id, r.uid);
}

size_t hashGpuIdUid(const android::gpuwork::GpuIdUid& gpuIdUid) {
    return static_cast<size_t>((gpuIdUid.gpu_id << 5U) + gpuIdUid.uid);
}

bool equalGpuIdUid(const android::gpuwork::GpuIdUid& l, const android::gpuwork::GpuIdUid& r) {
    return std::tie(l.gpu_id, l.uid) == std::tie(r.gpu_id, r.uid);
}

// Gets a BPF map from |mapPath|.
template <class Key, class Value>
bool getBpfMap(const char* mapPath, bpf::BpfMap<Key, Value>* out) {
    errno = 0;
    auto map = bpf::BpfMap<Key, Value>(mapPath);
    if (!map.isValid()) {
        ALOGW("Failed to create bpf map from %s [%d(%s)]", mapPath, errno, strerror(errno));
        return false;
    }
    *out = std::move(map);
    return true;
}

template <typename SourceType>
inline int32_t cast_int32(SourceType) = delete;

template <typename SourceType>
inline int32_t bitcast_int32(SourceType) = delete;

template <>
inline int32_t bitcast_int32<uint32_t>(uint32_t source) {
    int32_t result;
    memcpy(&result, &source, sizeof(result));
    return result;
}

} // namespace

using base::StringAppendF;

GpuWork::~GpuWork() {
    // If we created our clearer thread, then we must stop it and join it.
    if (mMapClearerThread.joinable()) {
        // Tell the thread to terminate.
        {
            std::scoped_lock lock(mMutex);
            mIsTerminating = true;
            mIsTerminatingConditionVariable.notify_all();
        }

        // Now, we can join it.
        mMapClearerThread.join();
    }

    {
        std::scoped_lock lock(mMutex);
        if (mStatsdRegistered) {
            AStatsManager_clearPullAtomCallback(android::util::GPU_WORK_PER_UID);
        }
    }

    bpf_detach_tracepoint("power", "gpu_work_period");
}

void GpuWork::initialize() {
    // Make sure BPF programs are loaded.
    bpf::waitForProgsLoaded();

    waitForPermissions();

    // Get the BPF maps before trying to attach the BPF program; if we can't get
    // the maps then there is no point in attaching the BPF program.
    {
        std::lock_guard lock(mMutex);

        if (!getBpfMap("/sys/fs/bpf/map_gpuWork_gpu_work_map", &mGpuWorkMap)) {
            return;
        }

        if (!getBpfMap("/sys/fs/bpf/map_gpuWork_gpu_work_global_data", &mGpuWorkGlobalDataMap)) {
            return;
        }

        mPreviousMapClearTimePoint = std::chrono::steady_clock::now();
    }

    // Attach the tracepoint.
    if (!attachTracepoint("/sys/fs/bpf/prog_gpuWork_tracepoint_power_gpu_work_period", "power",
                          "gpu_work_period")) {
        return;
    }

    // Create the map clearer thread, and store it to |mMapClearerThread|.
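    // The thread runs |periodicallyClearMap| until |mIsTerminating| is set by the
    // destructor; it wakes roughly once per |kMapClearerWaitDurationSeconds| and
    // clears the BPF map if it is getting close to full (see |clearMapIfNeeded|).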
    std::thread thread([this]() { periodicallyClearMap(); });

    mMapClearerThread.swap(thread);

    {
        std::lock_guard lock(mMutex);
        AStatsManager_setPullAtomCallback(int32_t{android::util::GPU_WORK_PER_UID}, nullptr,
                                          GpuWork::pullAtomCallback, this);
        mStatsdRegistered = true;
    }

    ALOGI("Initialized!");

    mInitialized.store(true);
}

void GpuWork::dump(const Vector<String16>& /* args */, std::string* result) {
    if (!mInitialized.load()) {
        result->append("GPU work information is not available.\n");
        return;
    }

    // Ordered map ensures output data is sorted.
    std::map<GpuIdUid, UidTrackingInfo, decltype(&lessThanGpuIdUid)> dumpMap(&lessThanGpuIdUid);

    {
        std::lock_guard lock(mMutex);

        if (!mGpuWorkMap.isValid()) {
            result->append("GPU work map is not available.\n");
            return;
        }

        // Iteration of BPF hash maps can be unreliable (no data races, but elements
        // may be repeated), as the map is typically being modified by other
        // threads. The buckets are all preallocated. Our eBPF program only updates
        // entries (in-place) or adds entries. |GpuWork| only iterates or clears the
        // map while holding |mMutex|. Given this, we should be able to iterate over
        // all elements reliably. Nevertheless, we copy into a map to avoid
        // duplicates.

        // Note that userspace reads of BPF maps make a copy of the value, and
        // thus the returned value is not being concurrently accessed by the BPF
        // program (no atomic reads needed below).

        mGpuWorkMap.iterateWithValue(
                [&dumpMap](const GpuIdUid& key, const UidTrackingInfo& value,
                           const android::bpf::BpfMap<GpuIdUid, UidTrackingInfo>&)
                        -> base::Result<void> {
                    dumpMap[key] = value;
                    return {};
                });
    }

    // Dump work information.
    // E.g.
    // GPU work information.
    // gpu_id uid total_active_duration_ns total_inactive_duration_ns
    // 0 1000 0 0
    // 0 1003 1234 123
    // [errors:3]0 1006 4567 456

    // Header.
    result->append("GPU work information.\ngpu_id uid total_active_duration_ns "
                   "total_inactive_duration_ns\n");

    for (const auto& idToUidInfo : dumpMap) {
        if (idToUidInfo.second.error_count) {
            StringAppendF(result, "[errors:%" PRIu32 "]", idToUidInfo.second.error_count);
        }
        StringAppendF(result, "%" PRIu32 " %" PRIu32 " %" PRIu64 " %" PRIu64 "\n",
                      idToUidInfo.first.gpu_id, idToUidInfo.first.uid,
                      idToUidInfo.second.total_active_duration_ns,
                      idToUidInfo.second.total_inactive_duration_ns);
    }
}

bool GpuWork::attachTracepoint(const char* programPath, const char* tracepointGroup,
                               const char* tracepointName) {
    errno = 0;
    base::unique_fd fd(bpf::retrieveProgram(programPath));

    if (fd < 0) {
        ALOGW("Failed to retrieve pinned program from %s [%d(%s)]", programPath, errno,
              strerror(errno));
        return false;
    }

    // Attach the program to the tracepoint. The tracepoint is automatically enabled.
    errno = 0;
    int count = 0;
    while (bpf_attach_tracepoint(fd.get(), tracepointGroup, tracepointName) < 0) {
        if (++count > kGpuWaitTimeoutSeconds) {
            ALOGW("Failed to attach bpf program to %s/%s tracepoint [%d(%s)]", tracepointGroup,
                  tracepointName, errno, strerror(errno));
            return false;
        }
        // Retry until GPU driver loaded or timeout.
        if (mStop.load()) return false;
        sleep(1);
        errno = 0;
    }

    return true;
}

AStatsManager_PullAtomCallbackReturn GpuWork::pullAtomCallback(int32_t atomTag,
                                                               AStatsEventList* data,
                                                               void* cookie) {
    ATRACE_CALL();

    GpuWork* gpuWork = reinterpret_cast<GpuWork*>(cookie);
    if (atomTag == android::util::GPU_WORK_PER_UID) {
        return gpuWork->pullWorkAtoms(data);
    }

    return AStatsManager_PULL_SKIP;
}

AStatsManager_PullAtomCallbackReturn GpuWork::pullWorkAtoms(AStatsEventList* data) {
    ATRACE_CALL();

    if (!data || !mInitialized.load()) {
        return AStatsManager_PULL_SKIP;
    }

    std::lock_guard lock(mMutex);

    if (!mGpuWorkMap.isValid()) {
        return AStatsManager_PULL_SKIP;
    }

    std::unordered_map<GpuIdUid, UidTrackingInfo, decltype(&hashGpuIdUid),
                       decltype(&equalGpuIdUid)>
            workMap(32, &hashGpuIdUid, &equalGpuIdUid);

    // Iteration of BPF hash maps can be unreliable (no data races, but elements
    // may be repeated), as the map is typically being modified by other
    // threads. The buckets are all preallocated. Our eBPF program only updates
    // entries (in-place) or adds entries. |GpuWork| only iterates or clears the
    // map while holding |mMutex|. Given this, we should be able to iterate over
    // all elements reliably. Nevertheless, we copy into a map to avoid
    // duplicates.

    // Note that userspace reads of BPF maps make a copy of the value, and thus
    // the returned value is not being concurrently accessed by the BPF program
    // (no atomic reads needed below).

    mGpuWorkMap.iterateWithValue(
            [&workMap](const GpuIdUid& key, const UidTrackingInfo& value,
                       const android::bpf::BpfMap<GpuIdUid, UidTrackingInfo>&)
                    -> base::Result<void> {
                workMap[key] = value;
                return {};
            });

    // Get a list of just the UIDs; the order does not matter.
    std::vector<Uid> uids;
    // Get a list of the GPU IDs, in order.
    std::set<uint32_t> gpuIds;
    {
        // To avoid adding duplicate UIDs.
        std::unordered_set<Uid> addedUids;

        for (const auto& workInfo : workMap) {
            if (addedUids.insert(workInfo.first.uid).second) {
                // Insertion was successful.
                uids.push_back(workInfo.first.uid);
            }
            gpuIds.insert(workInfo.first.gpu_id);
        }
    }

    ALOGI("pullWorkAtoms: uids.size() == %zu", uids.size());
    ALOGI("pullWorkAtoms: gpuIds.size() == %zu", gpuIds.size());

    if (gpuIds.size() > kNumGpusHardLimit) {
        // If we observe a very high number of GPUs then something has probably
        // gone wrong, so don't log any atoms.
        return AStatsManager_PULL_SKIP;
    }

    size_t numSampledUids = kNumSampledUids;

    if (gpuIds.size() > kNumGpusSoftLimit) {
        // If we observe a high number of GPUs then we just sample 1 UID.
        numSampledUids = 1;
    }

    // Remove all UIDs that do not have at least |kMinGpuTimeNanoseconds| on at
    // least one GPU.
    {
        auto uidIt = uids.begin();
        while (uidIt != uids.end()) {
            bool hasEnoughGpuTime = false;
            for (uint32_t gpuId : gpuIds) {
                auto infoIt = workMap.find(GpuIdUid{gpuId, *uidIt});
                if (infoIt == workMap.end()) {
                    continue;
                }
                if (infoIt->second.total_active_duration_ns +
                            infoIt->second.total_inactive_duration_ns >=
                    kMinGpuTimeNanoseconds) {
                    hasEnoughGpuTime = true;
                    break;
                }
            }
            if (hasEnoughGpuTime) {
                ++uidIt;
            } else {
                uidIt = uids.erase(uidIt);
            }
        }
    }

    ALOGI("pullWorkAtoms: after removing uids with very low GPU time: uids.size() == %zu",
          uids.size());

    std::random_device device;
    std::default_random_engine random_engine(device());

    // If we have more than |numSampledUids| UIDs, choose |numSampledUids|
    // random UIDs. We swap them to the front of the list. Given the list
    // indices 0..i..n-1, we have the following inclusive-inclusive ranges:
    // - [0, i-1] == the randomly chosen elements.
    // - [i, n-1] == the remaining unchosen elements.
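    // This is a partial Fisher-Yates shuffle: each iteration swaps a uniformly
    // chosen element from the unchosen range [i, n-1] into position i, so after
    // |numSampledUids| iterations the front of |uids| is a uniform random sample
    // (without replacement) of the original list.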
    if (uids.size() > numSampledUids) {
        for (size_t i = 0; i < numSampledUids; ++i) {
            std::uniform_int_distribution<size_t> uniform_dist(i, uids.size() - 1);
            size_t random_index = uniform_dist(random_engine);
            std::swap(uids[i], uids[random_index]);
        }
        // Only keep the front |numSampledUids| elements.
        uids.resize(numSampledUids);
    }

    ALOGI("pullWorkAtoms: after random selection: uids.size() == %zu", uids.size());

    auto now = std::chrono::steady_clock::now();
    long long duration =
            std::chrono::duration_cast<std::chrono::seconds>(now - mPreviousMapClearTimePoint)
                    .count();
    if (duration > std::numeric_limits<int32_t>::max() || duration < 0) {
        // This is essentially impossible. If it does somehow happen, give up,
        // but still clear the map.
        clearMap();
        return AStatsManager_PULL_SKIP;
    }

    // Log an atom for each (gpu id, uid) pair for which we have data.
    for (uint32_t gpuId : gpuIds) {
        for (Uid uid : uids) {
            auto it = workMap.find(GpuIdUid{gpuId, uid});
            if (it == workMap.end()) {
                continue;
            }
            const UidTrackingInfo& info = it->second;

            uint64_t total_active_duration_ms = info.total_active_duration_ns / ONE_MS_IN_NS;
            uint64_t total_inactive_duration_ms = info.total_inactive_duration_ns / ONE_MS_IN_NS;

            // Skip this atom if any numbers are out of range. |duration| is
            // already checked above.
            if (total_active_duration_ms > std::numeric_limits<int32_t>::max() ||
                total_inactive_duration_ms > std::numeric_limits<int32_t>::max()) {
                continue;
            }

            ALOGI("pullWorkAtoms: adding stats for GPU ID %" PRIu32 "; UID %" PRIu32, gpuId, uid);
            android::util::addAStatsEvent(data, int32_t{android::util::GPU_WORK_PER_UID},
                                          // uid
                                          bitcast_int32(uid),
                                          // gpu_id
                                          bitcast_int32(gpuId),
                                          // time_duration_seconds
                                          static_cast<int32_t>(duration),
                                          // total_active_duration_millis
                                          static_cast<int32_t>(total_active_duration_ms),
                                          // total_inactive_duration_millis
                                          static_cast<int32_t>(total_inactive_duration_ms));
        }
    }
    clearMap();
    return AStatsManager_PULL_SUCCESS;
}

void GpuWork::periodicallyClearMap() {
    std::unique_lock lock(mMutex);

    auto previousTime = std::chrono::steady_clock::now();

    while (true) {
        if (mIsTerminating) {
            break;
        }
        auto nextTime = std::chrono::steady_clock::now();
        auto differenceSeconds =
                std::chrono::duration_cast<std::chrono::seconds>(nextTime - previousTime);
        if (differenceSeconds.count() > kMapClearerWaitDurationSeconds) {
            // It has been >1 hour, so clear the map, if needed.
            clearMapIfNeeded();
            // We only update |previousTime| if we actually checked the map.
            previousTime = nextTime;
        }
        // Sleep for ~1 hour. It does not matter if we don't check the map for 2
        // hours.
        mIsTerminatingConditionVariable.wait_for(lock,
                                                 std::chrono::seconds{
                                                         kMapClearerWaitDurationSeconds});
    }
}

void GpuWork::clearMapIfNeeded() {
    if (!mInitialized.load() || !mGpuWorkMap.isValid() || !mGpuWorkGlobalDataMap.isValid()) {
        ALOGW("Map clearing could not occur because we are not initialized properly");
        return;
    }

    base::Result<GlobalData> globalData = mGpuWorkGlobalDataMap.readValue(0);
    if (!globalData.ok()) {
        ALOGW("Could not read BPF global data map entry");
        return;
    }

    // Note that userspace reads of BPF maps make a copy of the value, and thus
    // the return value is not being concurrently accessed by the BPF program
    // (no atomic reads needed below).

    uint64_t numEntries = globalData.value().num_map_entries;

    // If the map is <=75% full, we do nothing.
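    // Clearing only once the map is more than 75% full keeps the accumulated
    // per-UID totals around for as long as possible; new (GPU ID, UID) pairs can
    // no longer be tracked once the fixed-size map reaches |kMaxTrackedGpuIdUids|
    // entries.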
    if (numEntries <= (kMaxTrackedGpuIdUids / 4) * 3) {
        return;
    }

    clearMap();
}

void GpuWork::clearMap() {
    if (!mInitialized.load() || !mGpuWorkMap.isValid() || !mGpuWorkGlobalDataMap.isValid()) {
        ALOGW("Map clearing could not occur because we are not initialized properly");
        return;
    }

    base::Result<GlobalData> globalData = mGpuWorkGlobalDataMap.readValue(0);
    if (!globalData.ok()) {
        ALOGW("Could not read BPF global data map entry");
        return;
    }

    // Iterating BPF maps to delete keys is tricky. If we just repeatedly call
    // |getFirstKey()| and delete that, we may loop forever (or for a long time)
    // because our BPF program might be repeatedly re-adding keys. Also, even if
    // we limit the number of elements we try to delete, we might only delete
    // new entries, leaving old entries in the map. If we delete a key A and
    // then call |getNextKey(A)|, the first key in the map is returned, so we
    // have the same issue.
    //
    // Thus, we instead get the next key and then delete the previous key. We
    // also limit the number of deletions we try, just in case.

    base::Result<GpuIdUid> key = mGpuWorkMap.getFirstKey();

    for (size_t i = 0; i < kMaxTrackedGpuIdUids; ++i) {
        if (!key.ok()) {
            break;
        }

        base::Result<GpuIdUid> previousKey = key;
        key = mGpuWorkMap.getNextKey(previousKey.value());

        mGpuWorkMap.deleteValue(previousKey.value());
    }

    // Reset our counter; |globalData| is a copy of the data, so we have to use
    // |writeValue|.
    globalData.value().num_map_entries = 0;
    mGpuWorkGlobalDataMap.writeValue(0, globalData.value(), BPF_ANY);

    // Update |mPreviousMapClearTimePoint| so we know when we started collecting
    // the stats.
    mPreviousMapClearTimePoint = std::chrono::steady_clock::now();
}

void GpuWork::waitForPermissions() {
    const String16 permissionRegisterStatsPullAtom(kPermissionRegisterStatsPullAtom);
    int count = 0;
    while (!PermissionCache::checkPermission(permissionRegisterStatsPullAtom, getpid(),
                                             getuid())) {
        if (++count > kPermissionsWaitTimeoutSeconds) {
            ALOGW("Timed out waiting for android.permission.REGISTER_STATS_PULL_ATOM");
            return;
        }
        // Retry.
        sleep(1);
    }
}

} // namespace gpuwork
} // namespace android