/*
 * timeInState eBPF program
 *
 * Copyright (C) 2018 Google
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version
 * 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 */

#include <android_bpf_defs.h>
#include <bpf_timeinstate.h>
#include <errno.h>

#ifdef ENABLE_LIBBPF
#include <linux/bpf.h>
#include <private/android_filesystem_config.h>
#include <stdbool.h>
#endif  // ENABLE_LIBBPF

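// Total CPU time (ns) at each frequency index, summed across all tasks. Indices
// at or above MAX_FREQS_FOR_TOTAL are folded into the last bucket.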
DEFINE_BPF_MAP_GRW(total_time_in_state_map, PERCPU_ARRAY, uint32_t, uint64_t, MAX_FREQS_FOR_TOTAL,
                   AID_SYSTEM)

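// Per-UID time (ns) at each frequency, bucketed FREQS_PER_ENTRY frequencies per
// map entry.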
DEFINE_BPF_MAP_GRW(uid_time_in_state_map, PERCPU_HASH, time_key_t, tis_val_t, 1024, AID_SYSTEM)

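// Per-UID time (ns) broken down by the number of concurrently active CPUs, both
// system-wide (.active) and within the same cpufreq policy (.policy).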
DEFINE_BPF_MAP_GRW(uid_concurrent_times_map, PERCPU_HASH, time_key_t, concurrent_val_t, 1024,
                   AID_SYSTEM)
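// Timestamp (ns) of the most recent update to each UID's stats.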
DEFINE_BPF_MAP_GRW(uid_last_update_map, HASH, uint32_t, uint64_t, 1024, AID_SYSTEM)

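// Per-CPU bookkeeping: the timestamp of the last sched_switch seen on this CPU,
// and the pid that event switched in.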
DEFINE_BPF_MAP_GWO(cpu_last_update_map, PERCPU_ARRAY, uint32_t, uint64_t, 1, AID_SYSTEM)
DEFINE_BPF_MAP_GWO(cpu_last_pid_map, PERCPU_ARRAY, uint32_t, pid_t, 1, AID_SYSTEM)

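// cpu_policy_map maps a CPU number to its cpufreq policy id. policy_freq_idx_map
// holds each policy's current frequency index, 1-based so that 0 means
// "not yet initialized".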
DEFINE_BPF_MAP_GWO(cpu_policy_map, ARRAY, uint32_t, uint32_t, 1024, AID_SYSTEM)
DEFINE_BPF_MAP_GWO(policy_freq_idx_map, ARRAY, uint32_t, uint8_t, 1024, AID_SYSTEM)

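// Maps a (policy, frequency) pair to its 1-based frequency index; populated from
// userspace.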
DEFINE_BPF_MAP_GWO(freq_to_idx_map, HASH, freq_idx_key_t, uint8_t, 2048, AID_SYSTEM)

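// Counts of currently active (non-idle) CPUs, system-wide and per cpufreq policy.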
DEFINE_BPF_MAP_GWO(nr_active_map, ARRAY, uint32_t, uint32_t, 1, AID_SYSTEM)
DEFINE_BPF_MAP_GWO(policy_nr_active_map, ARRAY, uint32_t, uint32_t, 1024, AID_SYSTEM)

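// State for per-process time-in-state tracking: the set of tracked tgids, an
// optional per-thread aggregation key, and the aggregated per-process results.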
DEFINE_BPF_MAP_GWO(pid_tracked_hash_map, HASH, uint32_t, pid_t, MAX_TRACKED_PIDS, AID_SYSTEM)
DEFINE_BPF_MAP_GWO(pid_tracked_map, ARRAY, uint32_t, tracked_pid_t, MAX_TRACKED_PIDS, AID_SYSTEM)
DEFINE_BPF_MAP_GWO(pid_task_aggregation_map, HASH, pid_t, uint16_t, 1024, AID_SYSTEM)
DEFINE_BPF_MAP_GRO(pid_time_in_state_map, PERCPU_HASH, aggregated_task_tis_key_t, tis_val_t, 1024,
                   AID_SYSTEM)

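// The format of the sched/sched_switch event is described in
// adb shell cat /d/tracing/events/sched/sched_switch/format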
struct switch_args {
    unsigned long long ignore;
    char prev_comm[16];
    int prev_pid;
    int prev_prio;
    long long prev_state;
    char next_comm[16];
    int next_pid;
    int next_prio;
};

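// Charges delta ns of CPU time to uid's time-in-state and concurrent-times stats
// and records time as the uid's last update. freq_idx is the 0-based frequency
// index; active and policy_active are 0-based indices derived from the active
// CPU counts.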
static inline __always_inline void update_uid(uint32_t uid, uint64_t delta, uint64_t time,
                                              uint8_t freq_idx, uint32_t active,
                                              uint32_t policy_active) {
    time_key_t key = {.uid = uid, .bucket = freq_idx / FREQS_PER_ENTRY};
    tis_val_t* val = bpf_uid_time_in_state_map_lookup_elem(&key);
    if (!val) {
        tis_val_t zero_val = {.ar = {0}};
        bpf_uid_time_in_state_map_update_elem(&key, &zero_val, BPF_NOEXIST);
        val = bpf_uid_time_in_state_map_lookup_elem(&key);
    }
    if (val) val->ar[freq_idx % FREQS_PER_ENTRY] += delta;

    key.bucket = active / CPUS_PER_ENTRY;
    concurrent_val_t* ct = bpf_uid_concurrent_times_map_lookup_elem(&key);
    if (!ct) {
        concurrent_val_t zero_val = {.active = {0}, .policy = {0}};
        bpf_uid_concurrent_times_map_update_elem(&key, &zero_val, BPF_NOEXIST);
        ct = bpf_uid_concurrent_times_map_lookup_elem(&key);
    }
    if (ct) ct->active[active % CPUS_PER_ENTRY] += delta;

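    // If policy_active falls in the same bucket we just looked up, reuse ct;
    // otherwise look up (creating it if necessary) the entry for the policy bucket.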
    if (policy_active / CPUS_PER_ENTRY != key.bucket) {
        key.bucket = policy_active / CPUS_PER_ENTRY;
        ct = bpf_uid_concurrent_times_map_lookup_elem(&key);
        if (!ct) {
            concurrent_val_t zero_val = {.active = {0}, .policy = {0}};
            bpf_uid_concurrent_times_map_update_elem(&key, &zero_val, BPF_NOEXIST);
            ct = bpf_uid_concurrent_times_map_lookup_elem(&key);
        }
    }
    if (ct) ct->policy[policy_active % CPUS_PER_ENTRY] += delta;
    uint64_t* uid_last_update = bpf_uid_last_update_map_lookup_elem(&uid);
    if (uid_last_update) {
        *uid_last_update = time;
    } else {
        bpf_uid_last_update_map_update_elem(&uid, &time, BPF_NOEXIST);
    }
    return;
}

DEFINE_BPF_PROG("tracepoint/sched/sched_switch", AID_ROOT, AID_SYSTEM,
                tracepoint_sched_sched_switch)
(struct switch_args* args) {
    const int ALLOW = 1;  // return 1 to avoid blocking simpleperf from receiving events.
    uint32_t zero = 0;
    uint64_t* last = bpf_cpu_last_update_map_lookup_elem(&zero);
    if (!last) return ALLOW;
    uint64_t old_last = *last;
    uint64_t time = bpf_ktime_get_ns();
    *last = time;

    // With suspend-to-ram, it's possible to see prev_pid==0 twice in a row on the same CPU. Add a
    // check to ensure prev_pid matches the previous next_pid to avoid incorrectly incrementing our
    // active CPU counts a second time in this scenario.
    pid_t *cpu_pidp = bpf_cpu_last_pid_map_lookup_elem(&zero);
    if (!cpu_pidp) return ALLOW;
    pid_t cpu_pid = *cpu_pidp;
    *cpu_pidp = args->next_pid;
    if (old_last && args->prev_pid != cpu_pid) return ALLOW;

    uint32_t* active = bpf_nr_active_map_lookup_elem(&zero);
    if (!active) return ALLOW;

    uint32_t cpu = bpf_get_smp_processor_id();
    uint32_t* policyp = bpf_cpu_policy_map_lookup_elem(&cpu);
    if (!policyp) return ALLOW;
    uint32_t policy = *policyp;

    uint32_t* policy_active = bpf_policy_nr_active_map_lookup_elem(&policy);
    if (!policy_active) return ALLOW;

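    // *active and *policy_active include this CPU, so the 0-based indices used for
    // concurrent-times accounting are the counts minus one.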
    uint32_t nactive = *active - 1;
    uint32_t policy_nactive = *policy_active - 1;

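    // The CPU becomes active when it exits idle (prev_pid == 0), or the first time
    // we see it (old_last == 0) while it is running a real task.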
    if (!args->prev_pid || (!old_last && args->next_pid)) {
        __sync_fetch_and_add(active, 1);
        __sync_fetch_and_add(policy_active, 1);
    }

    // Return here in 2 scenarios:
    // 1) prev_pid == 0, so we're exiting idle. No UID stats need updating, and active CPUs can't be
    //    decreasing.
    // 2) old_last == 0, so this is the first time we've seen this CPU. Any delta will be invalid,
    //    and our active CPU counts don't include this CPU yet so we shouldn't decrement them even
    //    if we're going idle.
    if (!args->prev_pid || !old_last) return ALLOW;

    if (!args->next_pid) {
        __sync_fetch_and_add(active, -1);
        __sync_fetch_and_add(policy_active, -1);
    }

    uint8_t* freq_idxp = bpf_policy_freq_idx_map_lookup_elem(&policy);
    if (!freq_idxp || !*freq_idxp) return ALLOW;
    // freq_to_idx_map uses 1 as its minimum index so that *freq_idxp == 0 only when uninitialized
    uint8_t freq_idx = *freq_idxp - 1;

    // The bpf_get_current_uid_gid() helper function returns a u64 value, with the
    // lower 32 bits containing the UID and the upper 32 bits containing the GID.
    // In rare cases (usually when something is very wrong with the kernel) the
    // helper can instead return -EINVAL, in which case we should just return early.
    unsigned long long uid_gid = bpf_get_current_uid_gid();
    if (uid_gid == (unsigned long long)(-EINVAL)) return ALLOW;

    // Keep only the UID from the lower 32 bits of the uid_gid value.
    uint32_t uid = uid_gid & 0xFFFFFFFF;

    uint64_t delta = time - old_last;

    // For UIDs in the SDK sandbox range, we account per-UID times twice, both to the corresponding
    // app uid and to the "virtual" UID AID_SDK_SANDBOX which is reserved for collecting total times
    // across all SDK sandbox UIDs. Special handling for this reserved UID in framework code
    // prevents double counting in systemwide totals.
    if (((uid % AID_USER_OFFSET) >= AID_SDK_SANDBOX_PROCESS_START) &&
        ((uid % AID_USER_OFFSET) <= AID_SDK_SANDBOX_PROCESS_END)) {
        uid -= AID_SDK_SANDBOX_PROCESS_START - AID_APP_START;
        update_uid(uid, delta, time, freq_idx, nactive, policy_nactive);
        update_uid(AID_SDK_SANDBOX, delta, time, freq_idx, nactive, policy_nactive);
    } else {
        update_uid(uid, delta, time, freq_idx, nactive, policy_nactive);
    }

    // Add delta to the total time-in-state; frequency indices at or above
    // MAX_FREQS_FOR_TOTAL all accumulate in the last bucket.
    const uint32_t total_freq_idx = freq_idx < MAX_FREQS_FOR_TOTAL ? freq_idx :
                                    MAX_FREQS_FOR_TOTAL - 1;
    uint64_t* total = bpf_total_time_in_state_map_lookup_elem(&total_freq_idx);
    if (total) *total += delta;

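    // bpf_get_current_pid_tgid() packs the thread id in the lower 32 bits and the
    // thread group (process) id in the upper 32 bits. At this tracepoint, current
    // is still the task being switched out, so tgid pairs with args->prev_pid.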
    const int pid = args->prev_pid;
    const pid_t tgid = bpf_get_current_pid_tgid() >> 32;
    bool is_tgid_tracked = false;

    // Older eBPF verifiers reject loops (newer kernels accept bounded loops), so
    // instruct the C compiler to unroll the loop into a series of steps.
    #pragma unroll
    for (uint32_t index = 0; index < MAX_TRACKED_PIDS; index++) {
        const uint32_t key = index;
        tracked_pid_t* tracked_pid = bpf_pid_tracked_map_lookup_elem(&key);
        if (!tracked_pid) continue;
        if (tracked_pid->state == TRACKED_PID_STATE_UNUSED) {
            // Reached the end of the list
            break;
        }

        if (tracked_pid->state == TRACKED_PID_STATE_ACTIVE && tracked_pid->pid == tgid) {
            is_tgid_tracked = true;
            break;
        }
    }

    if (is_tgid_tracked) {
        // If this process is marked for time-in-state tracking, aggregate the CPU time-in-state
        // with other threads sharing the same TGID and aggregation key.
        uint16_t* aggregation_key = bpf_pid_task_aggregation_map_lookup_elem(&pid);
        aggregated_task_tis_key_t task_key = {
                .tgid = tgid,
                .aggregation_key = aggregation_key ? *aggregation_key : 0,
                .bucket = freq_idx / FREQS_PER_ENTRY};
        tis_val_t* task_val = bpf_pid_time_in_state_map_lookup_elem(&task_key);
        if (!task_val) {
            tis_val_t zero_val = {.ar = {0}};
            bpf_pid_time_in_state_map_update_elem(&task_key, &zero_val, BPF_NOEXIST);
            task_val = bpf_pid_time_in_state_map_lookup_elem(&task_key);
        }
        if (task_val) task_val->ar[freq_idx % FREQS_PER_ENTRY] += delta;
    }
    return ALLOW;
}

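// The format of the power/cpu_frequency event is described in
// adb shell cat /d/tracing/events/power/cpu_frequency/format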
struct cpufreq_args {
    unsigned long long ignore;
    unsigned int state;
    unsigned int cpu_id;
};

DEFINE_BPF_PROG("tracepoint/power/cpu_frequency", AID_ROOT, AID_SYSTEM,
                tracepoint_power_cpu_frequency)
(struct cpufreq_args* args) {
    const int ALLOW = 1;  // return 1 to avoid blocking simpleperf from receiving events.
    uint32_t cpu = args->cpu_id;
    unsigned int new = args->state;
    uint32_t* policyp = bpf_cpu_policy_map_lookup_elem(&cpu);
    if (!policyp) return ALLOW;
    uint32_t policy = *policyp;
    freq_idx_key_t key = {.policy = policy, .freq = new};
    uint8_t* idxp = bpf_freq_to_idx_map_lookup_elem(&key);
    if (!idxp) return ALLOW;
    uint8_t idx = *idxp;
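    // idx is 1-based (see the sched_switch handler) and is stored as-is so that a
    // 0 in policy_freq_idx_map still means "uninitialized".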
    bpf_policy_freq_idx_map_update_elem(&policy, &idx, BPF_ANY);
    return ALLOW;
}

// The format of the sched/sched_process_free event is described in
// adb shell cat /d/tracing/events/sched/sched_process_free/format
struct sched_process_free_args {
    unsigned long long ignore;
    char comm[16];
    pid_t pid;
    int prio;
};

DEFINE_BPF_PROG("tracepoint/sched/sched_process_free", AID_ROOT, AID_SYSTEM,
                tracepoint_sched_sched_process_free)
(struct sched_process_free_args* args) {
    const int ALLOW = 1;

    int pid = args->pid;
    bool is_last = true;

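    // Scan the tracked-pid list from the back. If no ACTIVE entries follow the pid
    // being freed, its slot becomes UNUSED (end of list); otherwise it is marked
    // EXITED and kept as a hole so the forward scan in the sched_switch handler
    // does not stop early.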
    // Older eBPF verifiers reject loops (newer kernels accept bounded loops), so
    // instruct the C compiler to unroll the loop into a series of steps.
    #pragma unroll
    for (uint32_t index = 0; index < MAX_TRACKED_PIDS; index++) {
        const uint32_t key = MAX_TRACKED_PIDS - index - 1;
        tracked_pid_t* tracked_pid = bpf_pid_tracked_map_lookup_elem(&key);
        if (!tracked_pid) continue;
        if (tracked_pid->pid == pid) {
            tracked_pid->pid = 0;
            tracked_pid->state = is_last ? TRACKED_PID_STATE_UNUSED : TRACKED_PID_STATE_EXITED;
            bpf_pid_tracked_hash_map_delete_elem(&key);
            break;
        }
        if (tracked_pid->state == TRACKED_PID_STATE_ACTIVE) {
            is_last = false;
        }
    }

    bpf_pid_task_aggregation_map_delete_elem(&pid);
    return ALLOW;
}

LICENSE("GPL");
