/*
 * Copyright 2022 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#undef LOG_TAG
#define LOG_TAG "GpuWork"
#define ATRACE_TAG ATRACE_TAG_GRAPHICS

#include "gpuwork/GpuWork.h"

#include <android-base/stringprintf.h>
#include <binder/PermissionCache.h>
#include <bpf/WaitForProgsLoaded.h>
#include <libbpf.h>
#include <log/log.h>
#include <random>
#include <stats_event.h>
#include <statslog.h>
#include <unistd.h>
#include <utils/Timers.h>
#include <utils/Trace.h>

#include <bit>
#include <chrono>
#include <cstdint>
#include <limits>
#include <map>
#include <mutex>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include "gpuwork/gpuWork.h"

#define ONE_MS_IN_NS (10000000)

namespace android {
namespace gpuwork {

namespace {

bool lessThanGpuIdUid(const android::gpuwork::GpuIdUid& l, const android::gpuwork::GpuIdUid& r) {
    return std::tie(l.gpu_id, l.uid) < std::tie(r.gpu_id, r.uid);
}

size_t hashGpuIdUid(const android::gpuwork::GpuIdUid& gpuIdUid) {
    return static_cast<size_t>((gpuIdUid.gpu_id << 5U) + gpuIdUid.uid);
}

bool equalGpuIdUid(const android::gpuwork::GpuIdUid& l, const android::gpuwork::GpuIdUid& r) {
    return std::tie(l.gpu_id, l.uid) == std::tie(r.gpu_id, r.uid);
}

// Gets a BPF map from |mapPath|.
template <class Key, class Value>
bool getBpfMap(const char* mapPath, bpf::BpfMap<Key, Value>* out) {
    errno = 0;
    auto map = bpf::BpfMap<Key, Value>(mapPath);
    if (!map.isValid()) {
        ALOGW("Failed to create bpf map from %s [%d(%s)]", mapPath, errno, strerror(errno));
        return false;
    }
    *out = std::move(map);
    return true;
}

template <typename SourceType>
inline int32_t cast_int32(SourceType) = delete;

template <typename SourceType>
inline int32_t bitcast_int32(SourceType) = delete;

template <>
inline int32_t bitcast_int32<uint32_t>(uint32_t source) {
    int32_t result;
    memcpy(&result, &source, sizeof(result));
    return result;
}

} // namespace

using base::StringAppendF;

GpuWork::~GpuWork() {
    // If we created our clearer thread, then we must stop it and join it.
    if (mMapClearerThread.joinable()) {
        // Tell the thread to terminate.
        {
            std::scoped_lock<std::mutex> lock(mMutex);
            mIsTerminating = true;
            mIsTerminatingConditionVariable.notify_all();
        }

        // Now, we can join it.
        mMapClearerThread.join();
    }

    {
        std::scoped_lock<std::mutex> lock(mMutex);
        if (mStatsdRegistered) {
            AStatsManager_clearPullAtomCallback(android::util::GPU_WORK_PER_UID);
        }
    }

    bpf_detach_tracepoint("power", "gpu_work_period");
}

void GpuWork::initialize() {
    // Make sure BPF programs are loaded.
    bpf::waitForProgsLoaded();

    waitForPermissions();

    // Get the BPF maps before trying to attach the BPF program; if we can't get
    // the maps then there is no point in attaching the BPF program.
    {
        std::lock_guard<std::mutex> lock(mMutex);

        if (!getBpfMap("/sys/fs/bpf/map_gpuWork_gpu_work_map", &mGpuWorkMap)) {
            return;
        }

        if (!getBpfMap("/sys/fs/bpf/map_gpuWork_gpu_work_global_data", &mGpuWorkGlobalDataMap)) {
            return;
        }

        mPreviousMapClearTimePoint = std::chrono::steady_clock::now();
    }

    // Attach the tracepoint.
    if (!attachTracepoint("/sys/fs/bpf/prog_gpuWork_tracepoint_power_gpu_work_period", "power",
                          "gpu_work_period")) {
        return;
    }

    // Create the map clearer thread, and store it to |mMapClearerThread|.
    std::thread thread([this]() { periodicallyClearMap(); });

    mMapClearerThread.swap(thread);

    {
        std::lock_guard<std::mutex> lock(mMutex);
        AStatsManager_setPullAtomCallback(int32_t{android::util::GPU_WORK_PER_UID}, nullptr,
                                          GpuWork::pullAtomCallback, this);
        mStatsdRegistered = true;
    }

    ALOGI("Initialized!");

    mInitialized.store(true);
}

void GpuWork::dump(const Vector<String16>& /* args */, std::string* result) {
    if (!mInitialized.load()) {
        result->append("GPU work information is not available.\n");
        return;
    }

    // Ordered map ensures output data is sorted.
    std::map<GpuIdUid, UidTrackingInfo, decltype(lessThanGpuIdUid)*> dumpMap(&lessThanGpuIdUid);

    {
        std::lock_guard<std::mutex> lock(mMutex);

        if (!mGpuWorkMap.isValid()) {
            result->append("GPU work map is not available.\n");
            return;
        }

        // Iteration of BPF hash maps can be unreliable (no data races, but elements
        // may be repeated), as the map is typically being modified by other
        // threads. The buckets are all preallocated. Our eBPF program only updates
        // entries (in-place) or adds entries. |GpuWork| only iterates or clears the
        // map while holding |mMutex|. Given this, we should be able to iterate over
        // all elements reliably. Nevertheless, we copy into a map to avoid
        // duplicates.

        // Note that userspace reads of BPF maps make a copy of the value, and
        // thus the returned value is not being concurrently accessed by the BPF
        // program (no atomic reads needed below).

        mGpuWorkMap.iterateWithValue(
                [&dumpMap](const GpuIdUid& key, const UidTrackingInfo& value,
                           const android::bpf::BpfMap<GpuIdUid, UidTrackingInfo>&)
                        -> base::Result<void> {
                    dumpMap[key] = value;
                    return {};
                });
    }

    // Dump work information.
    // E.g.
    // GPU work information.
    // gpu_id uid total_active_duration_ns total_inactive_duration_ns
    // 0 1000 0 0
    // 0 1003 1234 123
    // [errors:3]0 1006 4567 456

    // Header.
    result->append("GPU work information.\ngpu_id uid total_active_duration_ns "
                   "total_inactive_duration_ns\n");

    for (const auto& idToUidInfo : dumpMap) {
        if (idToUidInfo.second.error_count) {
            StringAppendF(result, "[errors:%" PRIu32 "]", idToUidInfo.second.error_count);
        }
        StringAppendF(result, "%" PRIu32 " %" PRIu32 " %" PRIu64 " %" PRIu64 "\n",
                      idToUidInfo.first.gpu_id, idToUidInfo.first.uid,
                      idToUidInfo.second.total_active_duration_ns,
                      idToUidInfo.second.total_inactive_duration_ns);
    }
}

bool GpuWork::attachTracepoint(const char* programPath, const char* tracepointGroup,
                               const char* tracepointName) {
    errno = 0;
    base::unique_fd fd(bpf::retrieveProgram(programPath));
    if (fd < 0) {
        ALOGW("Failed to retrieve pinned program from %s [%d(%s)]", programPath, errno,
              strerror(errno));
        return false;
    }

    // Attach the program to the tracepoint. The tracepoint is automatically enabled.
    errno = 0;
    int count = 0;
    while (bpf_attach_tracepoint(fd.get(), tracepointGroup, tracepointName) < 0) {
        if (++count > kGpuWaitTimeoutSeconds) {
            ALOGW("Failed to attach bpf program to %s/%s tracepoint [%d(%s)]", tracepointGroup,
                  tracepointName, errno, strerror(errno));
            return false;
        }
        // Retry until GPU driver loaded or timeout.
        if (mStop.load()) return false;
        sleep(1);
        errno = 0;
    }

    return true;
}

AStatsManager_PullAtomCallbackReturn GpuWork::pullAtomCallback(int32_t atomTag,
                                                               AStatsEventList* data,
                                                               void* cookie) {
    ATRACE_CALL();

    GpuWork* gpuWork = reinterpret_cast<GpuWork*>(cookie);
    if (atomTag == android::util::GPU_WORK_PER_UID) {
        return gpuWork->pullWorkAtoms(data);
    }

    return AStatsManager_PULL_SKIP;
}

AStatsManager_PullAtomCallbackReturn GpuWork::pullWorkAtoms(AStatsEventList* data) {
    ATRACE_CALL();

    if (!data || !mInitialized.load()) {
        return AStatsManager_PULL_SKIP;
    }

    std::lock_guard<std::mutex> lock(mMutex);

    if (!mGpuWorkMap.isValid()) {
        return AStatsManager_PULL_SKIP;
    }

    std::unordered_map<GpuIdUid, UidTrackingInfo, decltype(hashGpuIdUid)*, decltype(equalGpuIdUid)*>
            workMap(32, &hashGpuIdUid, &equalGpuIdUid);

    // Iteration of BPF hash maps can be unreliable (no data races, but elements
    // may be repeated), as the map is typically being modified by other
    // threads. The buckets are all preallocated. Our eBPF program only updates
    // entries (in-place) or adds entries. |GpuWork| only iterates or clears the
    // map while holding |mMutex|. Given this, we should be able to iterate over
    // all elements reliably. Nevertheless, we copy into a map to avoid
    // duplicates.

    // Note that userspace reads of BPF maps make a copy of the value, and thus
    // the returned value is not being concurrently accessed by the BPF program
    // (no atomic reads needed below).

    mGpuWorkMap.iterateWithValue([&workMap](const GpuIdUid& key, const UidTrackingInfo& value,
                                            const android::bpf::BpfMap<GpuIdUid, UidTrackingInfo>&)
                                         -> base::Result<void> {
        workMap[key] = value;
        return {};
    });

    // Get a list of just the UIDs; the order does not matter.
    std::vector<Uid> uids;
    // Get a list of the GPU IDs, in order.
    std::set<uint32_t> gpuIds;
    {
        // To avoid adding duplicate UIDs.
        std::unordered_set<Uid> addedUids;

        for (const auto& workInfo : workMap) {
            if (addedUids.insert(workInfo.first.uid).second) {
                // Insertion was successful.
                uids.push_back(workInfo.first.uid);
            }
            gpuIds.insert(workInfo.first.gpu_id);
        }
    }

    ALOGI("pullWorkAtoms: uids.size() == %zu", uids.size());
    ALOGI("pullWorkAtoms: gpuIds.size() == %zu", gpuIds.size());

    if (gpuIds.size() > kNumGpusHardLimit) {
        // If we observe a very high number of GPUs then something has probably
        // gone wrong, so don't log any atoms.
        return AStatsManager_PULL_SKIP;
    }

    size_t numSampledUids = kNumSampledUids;

    if (gpuIds.size() > kNumGpusSoftLimit) {
        // If we observe a high number of GPUs then we just sample 1 UID.
        numSampledUids = 1;
    }

    // Remove all UIDs that do not have at least |kMinGpuTimeNanoseconds| on at
    // least one GPU.
    {
        auto uidIt = uids.begin();
        while (uidIt != uids.end()) {
            bool hasEnoughGpuTime = false;
            for (uint32_t gpuId : gpuIds) {
                auto infoIt = workMap.find(GpuIdUid{gpuId, *uidIt});
                if (infoIt == workMap.end()) {
                    continue;
                }
                if (infoIt->second.total_active_duration_ns +
                            infoIt->second.total_inactive_duration_ns >=
                    kMinGpuTimeNanoseconds) {
                    hasEnoughGpuTime = true;
                    break;
                }
            }
            if (hasEnoughGpuTime) {
                ++uidIt;
            } else {
                uidIt = uids.erase(uidIt);
            }
        }
    }

    ALOGI("pullWorkAtoms: after removing uids with very low GPU time: uids.size() == %zu",
          uids.size());

    std::random_device device;
    std::default_random_engine random_engine(device());

    // If we have more than |numSampledUids| UIDs, choose |numSampledUids|
    // random UIDs. We swap them to the front of the list. Given the list
    // indices 0..i..n-1, we have the following inclusive-inclusive ranges:
    // - [0, i-1] == the randomly chosen elements.
    // - [i, n-1] == the remaining unchosen elements.
    if (uids.size() > numSampledUids) {
        for (size_t i = 0; i < numSampledUids; ++i) {
            std::uniform_int_distribution<size_t> uniform_dist(i, uids.size() - 1);
            size_t random_index = uniform_dist(random_engine);
            std::swap(uids[i], uids[random_index]);
        }
        // Only keep the front |numSampledUids| elements.
        uids.resize(numSampledUids);
    }

    ALOGI("pullWorkAtoms: after random selection: uids.size() == %zu", uids.size());

    auto now = std::chrono::steady_clock::now();
    long long duration =
            std::chrono::duration_cast<std::chrono::seconds>(now - mPreviousMapClearTimePoint)
                    .count();
    if (duration > std::numeric_limits<int32_t>::max() || duration < 0) {
        // This is essentially impossible. If it does somehow happen, give up,
        // but still clear the map.
        clearMap();
        return AStatsManager_PULL_SKIP;
    }

    // Log an atom for each (gpu id, uid) pair for which we have data.
    for (uint32_t gpuId : gpuIds) {
        for (Uid uid : uids) {
            auto it = workMap.find(GpuIdUid{gpuId, uid});
            if (it == workMap.end()) {
                continue;
            }
            const UidTrackingInfo& info = it->second;

            uint64_t total_active_duration_ms = info.total_active_duration_ns / ONE_MS_IN_NS;
            uint64_t total_inactive_duration_ms = info.total_inactive_duration_ns / ONE_MS_IN_NS;

            // Skip this atom if any numbers are out of range. |duration| is
            // already checked above.
            if (total_active_duration_ms > std::numeric_limits<int32_t>::max() ||
                total_inactive_duration_ms > std::numeric_limits<int32_t>::max()) {
                continue;
            }

            ALOGI("pullWorkAtoms: adding stats for GPU ID %" PRIu32 "; UID %" PRIu32, gpuId, uid);
            android::util::addAStatsEvent(data, int32_t{android::util::GPU_WORK_PER_UID},
                                          // uid
                                          bitcast_int32(uid),
                                          // gpu_id
                                          bitcast_int32(gpuId),
                                          // time_duration_seconds
                                          static_cast<int32_t>(duration),
                                          // total_active_duration_millis
                                          static_cast<int32_t>(total_active_duration_ms),
                                          // total_inactive_duration_millis
                                          static_cast<int32_t>(total_inactive_duration_ms));
        }
    }
    clearMap();
    return AStatsManager_PULL_SUCCESS;
}

void GpuWork::periodicallyClearMap() {
    std::unique_lock<std::mutex> lock(mMutex);

    auto previousTime = std::chrono::steady_clock::now();

    while (true) {
        if (mIsTerminating) {
            break;
        }
        auto nextTime = std::chrono::steady_clock::now();
        auto differenceSeconds =
                std::chrono::duration_cast<std::chrono::seconds>(nextTime - previousTime);
        if (differenceSeconds.count() > kMapClearerWaitDurationSeconds) {
            // It has been >1 hour, so clear the map, if needed.
            clearMapIfNeeded();
            // We only update |previousTime| if we actually checked the map.
            previousTime = nextTime;
        }
        // Sleep for ~1 hour. It does not matter if we don't check the map for 2
        // hours.
        mIsTerminatingConditionVariable.wait_for(lock,
                                                 std::chrono::seconds{
                                                         kMapClearerWaitDurationSeconds});
    }
}

void GpuWork::clearMapIfNeeded() {
    if (!mInitialized.load() || !mGpuWorkMap.isValid() || !mGpuWorkGlobalDataMap.isValid()) {
        ALOGW("Map clearing could not occur because we are not initialized properly");
        return;
    }

    base::Result<GlobalData> globalData = mGpuWorkGlobalDataMap.readValue(0);
    if (!globalData.ok()) {
        ALOGW("Could not read BPF global data map entry");
        return;
    }

    // Note that userspace reads of BPF maps make a copy of the value, and thus
    // the return value is not being concurrently accessed by the BPF program
    // (no atomic reads needed below).

    uint64_t numEntries = globalData.value().num_map_entries;

    // If the map is <=75% full, we do nothing.
    if (numEntries <= (kMaxTrackedGpuIdUids / 4) * 3) {
        return;
    }

    clearMap();
}

void GpuWork::clearMap() {
    if (!mInitialized.load() || !mGpuWorkMap.isValid() || !mGpuWorkGlobalDataMap.isValid()) {
        ALOGW("Map clearing could not occur because we are not initialized properly");
        return;
    }

    base::Result<GlobalData> globalData = mGpuWorkGlobalDataMap.readValue(0);
    if (!globalData.ok()) {
        ALOGW("Could not read BPF global data map entry");
        return;
    }

    // Iterating BPF maps to delete keys is tricky. If we just repeatedly call
    // |getFirstKey()| and delete that, we may loop forever (or for a long time)
    // because our BPF program might be repeatedly re-adding keys. Also, even if
    // we limit the number of elements we try to delete, we might only delete
    // new entries, leaving old entries in the map. If we delete a key A and
    // then call |getNextKey(A)|, the first key in the map is returned, so we
    // have the same issue.
    //
    // Thus, we instead get the next key and then delete the previous key. We
    // also limit the number of deletions we try, just in case.

    base::Result<GpuIdUid> key = mGpuWorkMap.getFirstKey();

    for (size_t i = 0; i < kMaxTrackedGpuIdUids; ++i) {
        if (!key.ok()) {
            break;
        }
        base::Result<GpuIdUid> previousKey = key;
        key = mGpuWorkMap.getNextKey(previousKey.value());
        mGpuWorkMap.deleteValue(previousKey.value());
    }

    // Reset our counter; |globalData| is a copy of the data, so we have to use
    // |writeValue|.
    globalData.value().num_map_entries = 0;
    mGpuWorkGlobalDataMap.writeValue(0, globalData.value(), BPF_ANY);

    // Update |mPreviousMapClearTimePoint| so we know when we started collecting
    // the stats.
    mPreviousMapClearTimePoint = std::chrono::steady_clock::now();
}

void GpuWork::waitForPermissions() {
    const String16 permissionRegisterStatsPullAtom(kPermissionRegisterStatsPullAtom);
    int count = 0;
    while (!PermissionCache::checkPermission(permissionRegisterStatsPullAtom, getpid(), getuid())) {
        if (++count > kPermissionsWaitTimeoutSeconds) {
            ALOGW("Timed out waiting for android.permission.REGISTER_STATS_PULL_ATOM");
            return;
        }
        // Retry.
        sleep(1);
    }
}

} // namespace gpuwork
} // namespace android
