/* * Copyright (C) 2013 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #define LOG_TAG "lowmemorykiller" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "reaper.h" #include "statslog.h" #include "watchdog.h" /* * Define LMKD_TRACE_KILLS to record lmkd kills in kernel traces * to profile and correlate with OOM kills */ #ifdef LMKD_TRACE_KILLS #define ATRACE_TAG ATRACE_TAG_ALWAYS #include static inline void trace_kill_start(const char *desc) { ATRACE_BEGIN(desc); } static inline void trace_kill_end() { ATRACE_END(); } #else /* LMKD_TRACE_KILLS */ static inline void trace_kill_start(const char *) {} static inline void trace_kill_end() {} #endif /* LMKD_TRACE_KILLS */ #ifndef __unused #define __unused __attribute__((__unused__)) #endif #define ZONEINFO_PATH "/proc/zoneinfo" #define MEMINFO_PATH "/proc/meminfo" #define VMSTAT_PATH "/proc/vmstat" #define PROC_STATUS_TGID_FIELD "Tgid:" #define PROC_STATUS_RSS_FIELD "VmRSS:" #define PROC_STATUS_SWAP_FIELD "VmSwap:" #define NODE_STATS_MARKER " per-node stats" #define PERCEPTIBLE_APP_ADJ 200 #define PREVIOUS_APP_ADJ 700 /* Android Logger event logtags (see event.logtags) */ #define KILLINFO_LOG_TAG 10195355 /* gid containing AID_SYSTEM required */ #define INKERNEL_MINFREE_PATH "/sys/module/lowmemorykiller/parameters/minfree" #define INKERNEL_ADJ_PATH "/sys/module/lowmemorykiller/parameters/adj" #define EIGHT_MEGA (1 << 23) #define TARGET_UPDATE_MIN_INTERVAL_MS 1000 #define THRASHING_RESET_INTERVAL_MS 1000 #define NS_PER_MS (NS_PER_SEC / MS_PER_SEC) #define US_PER_MS (US_PER_SEC / MS_PER_SEC) /* Defined as ProcessList.SYSTEM_ADJ in ProcessList.java */ #define SYSTEM_ADJ (-900) #define STRINGIFY(x) STRINGIFY_INTERNAL(x) #define STRINGIFY_INTERNAL(x) #x #define PROCFS_PATH_MAX 64 /* * Read lmk property with persist.device_config.lmkd_native. overriding ro.lmk. * persist.device_config.lmkd_native.* properties are being set by experiments. If a new property * can be controlled by an experiment then use GET_LMK_PROPERTY instead of property_get_xxx and * add "on property" triggers in lmkd.rc to react to the experiment flag changes. */ #define GET_LMK_PROPERTY(type, name, def) \ property_get_##type("persist.device_config.lmkd_native." name, \ property_get_##type("ro.lmk." name, def)) /* * PSI monitor tracking window size. * PSI monitor generates events at most once per window, * therefore we poll memory state for the duration of * PSI_WINDOW_SIZE_MS after the event happens. */ #define PSI_WINDOW_SIZE_MS 1000 /* Polling period after PSI signal when pressure is high */ #define PSI_POLL_PERIOD_SHORT_MS 10 /* Polling period after PSI signal when pressure is low */ #define PSI_POLL_PERIOD_LONG_MS 100 #define FAIL_REPORT_RLIMIT_MS 1000 /* * System property defaults */ /* ro.lmk.swap_free_low_percentage property defaults */ #define DEF_LOW_SWAP 10 /* ro.lmk.thrashing_limit property defaults */ #define DEF_THRASHING_LOWRAM 30 #define DEF_THRASHING 100 /* ro.lmk.thrashing_limit_decay property defaults */ #define DEF_THRASHING_DECAY_LOWRAM 50 #define DEF_THRASHING_DECAY 10 /* ro.lmk.psi_partial_stall_ms property defaults */ #define DEF_PARTIAL_STALL_LOWRAM 200 #define DEF_PARTIAL_STALL 70 /* ro.lmk.psi_complete_stall_ms property defaults */ #define DEF_COMPLETE_STALL 700 /* ro.lmk.direct_reclaim_threshold_ms property defaults */ #define DEF_DIRECT_RECL_THRESH_MS 0 /* ro.lmk.swap_compression_ratio property defaults */ #define DEF_SWAP_COMP_RATIO 1 /* ro.lmk.lowmem_min_oom_score defaults */ #define DEF_LOWMEM_MIN_SCORE (PREVIOUS_APP_ADJ + 1) #define LMKD_REINIT_PROP "lmkd.reinit" #define WATCHDOG_TIMEOUT_SEC 2 /* default to old in-kernel interface if no memory pressure events */ static bool use_inkernel_interface = true; static bool has_inkernel_module; /* memory pressure levels */ enum vmpressure_level { VMPRESS_LEVEL_LOW = 0, VMPRESS_LEVEL_MEDIUM, VMPRESS_LEVEL_CRITICAL, VMPRESS_LEVEL_COUNT }; static const char *level_name[] = { "low", "medium", "critical" }; struct { int64_t min_nr_free_pages; /* recorded but not used yet */ int64_t max_nr_free_pages; } low_pressure_mem = { -1, -1 }; struct psi_threshold { enum psi_stall_type stall_type; int threshold_ms; }; /* Listener for direct reclaim and kswapd state changes */ static std::unique_ptr memevent_listener(nullptr); static struct timespec direct_reclaim_start_tm; static struct timespec kswapd_start_tm; static int level_oomadj[VMPRESS_LEVEL_COUNT]; static int mpevfd[VMPRESS_LEVEL_COUNT] = { -1, -1, -1 }; static bool pidfd_supported; static int last_kill_pid_or_fd = -1; static struct timespec last_kill_tm; enum vmpressure_level prev_level = VMPRESS_LEVEL_LOW; static bool monitors_initialized; static bool boot_completed_handled = false; /* lmkd configurable parameters */ static bool debug_process_killing; static bool enable_pressure_upgrade; static int64_t upgrade_pressure; static int64_t downgrade_pressure; static bool low_ram_device; static bool kill_heaviest_task; static unsigned long kill_timeout_ms; static int pressure_after_kill_min_score; static bool use_minfree_levels; static bool per_app_memcg; static int swap_free_low_percentage; static int psi_partial_stall_ms; static int psi_complete_stall_ms; static int thrashing_limit_pct; static int thrashing_limit_decay_pct; static int thrashing_critical_pct; static int swap_util_max; static int64_t filecache_min_kb; static int64_t stall_limit_critical; static bool use_psi_monitors = false; static int kpoll_fd; static bool delay_monitors_until_boot; static int direct_reclaim_threshold_ms; static int swap_compression_ratio; static int lowmem_min_oom_score; static struct psi_threshold psi_thresholds[VMPRESS_LEVEL_COUNT] = { { PSI_SOME, 70 }, /* 70ms out of 1sec for partial stall */ { PSI_SOME, 100 }, /* 100ms out of 1sec for partial stall */ { PSI_FULL, 70 }, /* 70ms out of 1sec for complete stall */ }; static uint64_t mp_event_count; static android_log_context ctx; static Reaper reaper; static int reaper_comm_fd[2]; enum polling_update { POLLING_DO_NOT_CHANGE, POLLING_START, POLLING_PAUSE, POLLING_RESUME, }; /* * Data used for periodic polling for the memory state of the device. * Note that when system is not polling poll_handler is set to NULL, * when polling starts poll_handler gets set and is reset back to * NULL when polling stops. */ struct polling_params { struct event_handler_info* poll_handler; struct event_handler_info* paused_handler; struct timespec poll_start_tm; struct timespec last_poll_tm; int polling_interval_ms; enum polling_update update; }; /* data required to handle events */ struct event_handler_info { int data; void (*handler)(int data, uint32_t events, struct polling_params *poll_params); }; /* data required to handle socket events */ struct sock_event_handler_info { int sock; pid_t pid; uint32_t async_event_mask; struct event_handler_info handler_info; }; /* max supported number of data connections (AMS, init, tests) */ #define MAX_DATA_CONN 3 /* socket event handler data */ static struct sock_event_handler_info ctrl_sock; static struct sock_event_handler_info data_sock[MAX_DATA_CONN]; /* vmpressure event handler data */ static struct event_handler_info vmpressure_hinfo[VMPRESS_LEVEL_COUNT]; /* * 1 ctrl listen socket, 3 ctrl data socket, 3 memory pressure levels, * 1 lmk events + 1 fd to wait for process death + 1 fd to receive kill failure notifications * + 1 fd to receive memevent_listener notifications */ #define MAX_EPOLL_EVENTS (1 + MAX_DATA_CONN + VMPRESS_LEVEL_COUNT + 1 + 1 + 1 + 1) static int epollfd; static int maxevents; /* OOM score values used by both kernel and framework */ #define OOM_SCORE_ADJ_MIN (-1000) #define OOM_SCORE_ADJ_MAX 1000 static std::array lowmem_adj; static std::array lowmem_minfree; static int lowmem_targets_size; /* Fields to parse in /proc/zoneinfo */ /* zoneinfo per-zone fields */ enum zoneinfo_zone_field { ZI_ZONE_NR_FREE_PAGES = 0, ZI_ZONE_MIN, ZI_ZONE_LOW, ZI_ZONE_HIGH, ZI_ZONE_PRESENT, ZI_ZONE_NR_FREE_CMA, ZI_ZONE_FIELD_COUNT }; static const char* const zoneinfo_zone_field_names[ZI_ZONE_FIELD_COUNT] = { "nr_free_pages", "min", "low", "high", "present", "nr_free_cma", }; /* zoneinfo per-zone special fields */ enum zoneinfo_zone_spec_field { ZI_ZONE_SPEC_PROTECTION = 0, ZI_ZONE_SPEC_PAGESETS, ZI_ZONE_SPEC_FIELD_COUNT, }; static const char* const zoneinfo_zone_spec_field_names[ZI_ZONE_SPEC_FIELD_COUNT] = { "protection:", "pagesets", }; /* see __MAX_NR_ZONES definition in kernel mmzone.h */ #define MAX_NR_ZONES 6 union zoneinfo_zone_fields { struct { int64_t nr_free_pages; int64_t min; int64_t low; int64_t high; int64_t present; int64_t nr_free_cma; } field; int64_t arr[ZI_ZONE_FIELD_COUNT]; }; struct zoneinfo_zone { union zoneinfo_zone_fields fields; int64_t protection[MAX_NR_ZONES]; int64_t max_protection; }; /* zoneinfo per-node fields */ enum zoneinfo_node_field { ZI_NODE_NR_INACTIVE_FILE = 0, ZI_NODE_NR_ACTIVE_FILE, ZI_NODE_FIELD_COUNT }; static const char* const zoneinfo_node_field_names[ZI_NODE_FIELD_COUNT] = { "nr_inactive_file", "nr_active_file", }; union zoneinfo_node_fields { struct { int64_t nr_inactive_file; int64_t nr_active_file; } field; int64_t arr[ZI_NODE_FIELD_COUNT]; }; struct zoneinfo_node { int id; int zone_count; struct zoneinfo_zone zones[MAX_NR_ZONES]; union zoneinfo_node_fields fields; }; /* for now two memory nodes is more than enough */ #define MAX_NR_NODES 2 struct zoneinfo { int node_count; struct zoneinfo_node nodes[MAX_NR_NODES]; int64_t totalreserve_pages; int64_t total_inactive_file; int64_t total_active_file; }; /* Fields to parse in /proc/meminfo */ enum meminfo_field { MI_NR_FREE_PAGES = 0, MI_CACHED, MI_SWAP_CACHED, MI_BUFFERS, MI_SHMEM, MI_UNEVICTABLE, MI_TOTAL_SWAP, MI_FREE_SWAP, MI_ACTIVE_ANON, MI_INACTIVE_ANON, MI_ACTIVE_FILE, MI_INACTIVE_FILE, MI_SRECLAIMABLE, MI_SUNRECLAIM, MI_KERNEL_STACK, MI_PAGE_TABLES, MI_ION_HELP, MI_ION_HELP_POOL, MI_CMA_FREE, MI_FIELD_COUNT }; static const char* const meminfo_field_names[MI_FIELD_COUNT] = { "MemFree:", "Cached:", "SwapCached:", "Buffers:", "Shmem:", "Unevictable:", "SwapTotal:", "SwapFree:", "Active(anon):", "Inactive(anon):", "Active(file):", "Inactive(file):", "SReclaimable:", "SUnreclaim:", "KernelStack:", "PageTables:", "ION_heap:", "ION_heap_pool:", "CmaFree:", }; union meminfo { struct { int64_t nr_free_pages; int64_t cached; int64_t swap_cached; int64_t buffers; int64_t shmem; int64_t unevictable; int64_t total_swap; int64_t free_swap; int64_t active_anon; int64_t inactive_anon; int64_t active_file; int64_t inactive_file; int64_t sreclaimable; int64_t sunreclaimable; int64_t kernel_stack; int64_t page_tables; int64_t ion_heap; int64_t ion_heap_pool; int64_t cma_free; /* fields below are calculated rather than read from the file */ int64_t nr_file_pages; int64_t total_gpu_kb; int64_t easy_available; } field; int64_t arr[MI_FIELD_COUNT]; }; /* Fields to parse in /proc/vmstat */ enum vmstat_field { VS_FREE_PAGES, VS_INACTIVE_FILE, VS_ACTIVE_FILE, VS_WORKINGSET_REFAULT, VS_WORKINGSET_REFAULT_FILE, VS_PGSCAN_KSWAPD, VS_PGSCAN_DIRECT, VS_PGSCAN_DIRECT_THROTTLE, VS_PGREFILL, VS_FIELD_COUNT }; static const char* const vmstat_field_names[VS_FIELD_COUNT] = { "nr_free_pages", "nr_inactive_file", "nr_active_file", "workingset_refault", "workingset_refault_file", "pgscan_kswapd", "pgscan_direct", "pgscan_direct_throttle", "pgrefill", }; union vmstat { struct { int64_t nr_free_pages; int64_t nr_inactive_file; int64_t nr_active_file; int64_t workingset_refault; int64_t workingset_refault_file; int64_t pgscan_kswapd; int64_t pgscan_direct; int64_t pgscan_direct_throttle; int64_t pgrefill; } field; int64_t arr[VS_FIELD_COUNT]; }; enum field_match_result { NO_MATCH, PARSE_FAIL, PARSE_SUCCESS }; struct adjslot_list { struct adjslot_list *next; struct adjslot_list *prev; }; struct proc { struct adjslot_list asl; int pid; int pidfd; uid_t uid; int oomadj; pid_t reg_pid; /* PID of the process that registered this record */ bool valid; struct proc *pidhash_next; }; struct reread_data { const char* const filename; int fd; }; #define PIDHASH_SZ 1024 static struct proc *pidhash[PIDHASH_SZ]; #define pid_hashfn(x) ((((x) >> 8) ^ (x)) & (PIDHASH_SZ - 1)) #define ADJTOSLOT(adj) ((adj) + -OOM_SCORE_ADJ_MIN) #define ADJTOSLOT_COUNT (ADJTOSLOT(OOM_SCORE_ADJ_MAX) + 1) // protects procadjslot_list from concurrent access static std::shared_mutex adjslot_list_lock; // procadjslot_list should be modified only from the main thread while exclusively holding // adjslot_list_lock. Readers from non-main threads should hold adjslot_list_lock shared lock. static struct adjslot_list procadjslot_list[ADJTOSLOT_COUNT]; #define MAX_DISTINCT_OOM_ADJ 32 #define KILLCNT_INVALID_IDX 0xFF /* * Because killcnt array is sparse a two-level indirection is used * to keep the size small. killcnt_idx stores index of the element in * killcnt array. Index KILLCNT_INVALID_IDX indicates an unused slot. */ static uint8_t killcnt_idx[ADJTOSLOT_COUNT]; static uint16_t killcnt[MAX_DISTINCT_OOM_ADJ]; static int killcnt_free_idx = 0; static uint32_t killcnt_total = 0; static int pagesize; static long page_k; /* page size in kB */ static bool update_props(); static bool init_monitors(); static void destroy_monitors(); static bool init_memevent_listener_monitoring(); static int clamp(int low, int high, int value) { return std::max(std::min(value, high), low); } static bool parse_int64(const char* str, int64_t* ret) { char* endptr; long long val = strtoll(str, &endptr, 10); if (str == endptr || val > INT64_MAX) { return false; } *ret = (int64_t)val; return true; } static int find_field(const char* name, const char* const field_names[], int field_count) { for (int i = 0; i < field_count; i++) { if (!strcmp(name, field_names[i])) { return i; } } return -1; } static enum field_match_result match_field(const char* cp, const char* ap, const char* const field_names[], int field_count, int64_t* field, int *field_idx) { int i = find_field(cp, field_names, field_count); if (i < 0) { return NO_MATCH; } *field_idx = i; return parse_int64(ap, field) ? PARSE_SUCCESS : PARSE_FAIL; } /* * Read file content from the beginning up to max_len bytes or EOF * whichever happens first. */ static ssize_t read_all(int fd, char *buf, size_t max_len) { ssize_t ret = 0; off_t offset = 0; while (max_len > 0) { ssize_t r = TEMP_FAILURE_RETRY(pread(fd, buf, max_len, offset)); if (r == 0) { break; } if (r == -1) { return -1; } ret += r; buf += r; offset += r; max_len -= r; } return ret; } /* * Read a new or already opened file from the beginning. * If the file has not been opened yet data->fd should be set to -1. * To be used with files which are read often and possibly during high * memory pressure to minimize file opening which by itself requires kernel * memory allocation and might result in a stall on memory stressed system. */ static char *reread_file(struct reread_data *data) { /* start with page-size buffer and increase if needed */ static ssize_t buf_size = pagesize; static char *new_buf, *buf = NULL; ssize_t size; if (data->fd == -1) { /* First-time buffer initialization */ if (!buf && (buf = static_cast(malloc(buf_size))) == nullptr) { return NULL; } data->fd = TEMP_FAILURE_RETRY(open(data->filename, O_RDONLY | O_CLOEXEC)); if (data->fd < 0) { ALOGE("%s open: %s", data->filename, strerror(errno)); return NULL; } } while (true) { size = read_all(data->fd, buf, buf_size - 1); if (size < 0) { ALOGE("%s read: %s", data->filename, strerror(errno)); close(data->fd); data->fd = -1; return NULL; } if (size < buf_size - 1) { break; } /* * Since we are reading /proc files we can't use fstat to find out * the real size of the file. Double the buffer size and keep retrying. */ if ((new_buf = static_cast(realloc(buf, buf_size * 2))) == nullptr) { errno = ENOMEM; return NULL; } buf = new_buf; buf_size *= 2; } buf[size] = 0; return buf; } static bool claim_record(struct proc* procp, pid_t pid) { if (procp->reg_pid == pid) { /* Record already belongs to the registrant */ return true; } if (procp->reg_pid == 0) { /* Old registrant is gone, claim the record */ procp->reg_pid = pid; return true; } /* The record is owned by another registrant */ return false; } static void remove_claims(pid_t pid) { int i; for (i = 0; i < PIDHASH_SZ; i++) { struct proc* procp = pidhash[i]; while (procp) { if (procp->reg_pid == pid) { procp->reg_pid = 0; } procp = procp->pidhash_next; } } } static void ctrl_data_close(int dsock_idx) { struct epoll_event epev; ALOGI("closing lmkd data connection"); if (epoll_ctl(epollfd, EPOLL_CTL_DEL, data_sock[dsock_idx].sock, &epev) == -1) { // Log a warning and keep going ALOGW("epoll_ctl for data connection socket failed; errno=%d", errno); } maxevents--; close(data_sock[dsock_idx].sock); data_sock[dsock_idx].sock = -1; /* Mark all records of the old registrant as unclaimed */ remove_claims(data_sock[dsock_idx].pid); } static ssize_t ctrl_data_read(int dsock_idx, char* buf, size_t bufsz, struct ucred* sender_cred) { struct iovec iov = {buf, bufsz}; char control[CMSG_SPACE(sizeof(struct ucred))]; struct msghdr hdr = { NULL, 0, &iov, 1, control, sizeof(control), 0, }; ssize_t ret; ret = TEMP_FAILURE_RETRY(recvmsg(data_sock[dsock_idx].sock, &hdr, 0)); if (ret == -1) { ALOGE("control data socket read failed; %s", strerror(errno)); return -1; } if (ret == 0) { ALOGE("Got EOF on control data socket"); return -1; } struct ucred* cred = NULL; struct cmsghdr* cmsg = CMSG_FIRSTHDR(&hdr); while (cmsg != NULL) { if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_CREDENTIALS) { cred = (struct ucred*)CMSG_DATA(cmsg); break; } cmsg = CMSG_NXTHDR(&hdr, cmsg); } if (cred == NULL) { ALOGE("Failed to retrieve sender credentials"); /* Close the connection */ ctrl_data_close(dsock_idx); return -1; } memcpy(sender_cred, cred, sizeof(struct ucred)); /* Store PID of the peer */ data_sock[dsock_idx].pid = cred->pid; return ret; } static int ctrl_data_write(int dsock_idx, char* buf, size_t bufsz) { int ret = 0; ret = TEMP_FAILURE_RETRY(write(data_sock[dsock_idx].sock, buf, bufsz)); if (ret == -1) { ALOGE("control data socket write failed; errno=%d", errno); } else if (ret == 0) { ALOGE("Got EOF on control data socket"); ret = -1; } return ret; } /* * Write the pid/uid pair over the data socket, note: all active clients * will receive this unsolicited notification. */ static void ctrl_data_write_lmk_kill_occurred(pid_t pid, uid_t uid, int64_t rss_kb) { LMKD_CTRL_PACKET packet; size_t len = lmkd_pack_set_prockills(packet, pid, uid, static_cast(rss_kb)); for (int i = 0; i < MAX_DATA_CONN; i++) { if (data_sock[i].sock >= 0 && data_sock[i].async_event_mask & 1 << LMK_ASYNC_EVENT_KILL) { ctrl_data_write(i, (char*)packet, len); } } } /* * Write the kill_stat/memory_stat over the data socket to be propagated via AMS to statsd */ static void stats_write_lmk_kill_occurred(struct kill_stat *kill_st, struct memory_stat *mem_st) { LMK_KILL_OCCURRED_PACKET packet; const size_t len = lmkd_pack_set_kill_occurred(packet, kill_st, mem_st); if (len == 0) { return; } for (int i = 0; i < MAX_DATA_CONN; i++) { if (data_sock[i].sock >= 0 && data_sock[i].async_event_mask & 1 << LMK_ASYNC_EVENT_STAT) { ctrl_data_write(i, packet, len); } } } static void stats_write_lmk_kill_occurred_pid(int pid, struct kill_stat *kill_st, struct memory_stat *mem_st) { kill_st->taskname = stats_get_task_name(pid); if (kill_st->taskname != NULL) { stats_write_lmk_kill_occurred(kill_st, mem_st); } } static void poll_kernel(int poll_fd) { if (poll_fd == -1) { // not waiting return; } while (1) { char rd_buf[256]; int bytes_read = TEMP_FAILURE_RETRY(pread(poll_fd, (void*)rd_buf, sizeof(rd_buf) - 1, 0)); if (bytes_read <= 0) break; rd_buf[bytes_read] = '\0'; int64_t pid; int64_t uid; int64_t group_leader_pid; int64_t rss_in_pages; struct memory_stat mem_st = {}; int16_t oom_score_adj; int16_t min_score_adj; int64_t starttime; char* taskname = 0; int64_t rss_kb; int fields_read = sscanf(rd_buf, "%" SCNd64 " %" SCNd64 " %" SCNd64 " %" SCNd64 " %" SCNd64 " %" SCNd64 " %" SCNd16 " %" SCNd16 " %" SCNd64 "\n%m[^\n]", &pid, &uid, &group_leader_pid, &mem_st.pgfault, &mem_st.pgmajfault, &rss_in_pages, &oom_score_adj, &min_score_adj, &starttime, &taskname); /* only the death of the group leader process is logged */ if (fields_read == 10 && group_leader_pid == pid) { mem_st.rss_in_bytes = rss_in_pages * pagesize; rss_kb = mem_st.rss_in_bytes >> 10; ctrl_data_write_lmk_kill_occurred((pid_t)pid, (uid_t)uid, rss_kb); mem_st.process_start_time_ns = starttime * (NS_PER_SEC / sysconf(_SC_CLK_TCK)); struct kill_stat kill_st = { .uid = static_cast(uid), .kill_reason = NONE, .oom_score = oom_score_adj, .min_oom_score = min_score_adj, .free_mem_kb = 0, .free_swap_kb = 0, }; stats_write_lmk_kill_occurred_pid(pid, &kill_st, &mem_st); } free(taskname); } } static bool init_poll_kernel() { kpoll_fd = TEMP_FAILURE_RETRY(open("/proc/lowmemorykiller", O_RDONLY | O_NONBLOCK | O_CLOEXEC)); if (kpoll_fd < 0) { ALOGE("kernel lmk event file could not be opened; errno=%d", errno); return false; } return true; } static struct proc *pid_lookup(int pid) { struct proc *procp; for (procp = pidhash[pid_hashfn(pid)]; procp && procp->pid != pid; procp = procp->pidhash_next) ; return procp; } static void adjslot_insert(struct adjslot_list *head, struct adjslot_list *new_element) { struct adjslot_list *next = head->next; new_element->prev = head; new_element->next = next; next->prev = new_element; head->next = new_element; } static void adjslot_remove(struct adjslot_list *old) { struct adjslot_list *prev = old->prev; struct adjslot_list *next = old->next; next->prev = prev; prev->next = next; } static struct adjslot_list *adjslot_tail(struct adjslot_list *head) { struct adjslot_list *asl = head->prev; return asl == head ? NULL : asl; } // Should be modified only from the main thread. static void proc_slot(struct proc *procp) { int adjslot = ADJTOSLOT(procp->oomadj); std::scoped_lock lock(adjslot_list_lock); adjslot_insert(&procadjslot_list[adjslot], &procp->asl); } // Should be modified only from the main thread. static void proc_unslot(struct proc *procp) { std::scoped_lock lock(adjslot_list_lock); adjslot_remove(&procp->asl); } static void proc_insert(struct proc *procp) { int hval = pid_hashfn(procp->pid); procp->pidhash_next = pidhash[hval]; pidhash[hval] = procp; proc_slot(procp); } // Can be called only from the main thread. static int pid_remove(int pid) { int hval = pid_hashfn(pid); struct proc *procp; struct proc *prevp; for (procp = pidhash[hval], prevp = NULL; procp && procp->pid != pid; procp = procp->pidhash_next) prevp = procp; if (!procp) return -1; if (!prevp) pidhash[hval] = procp->pidhash_next; else prevp->pidhash_next = procp->pidhash_next; proc_unslot(procp); /* * Close pidfd here if we are not waiting for corresponding process to die, * in which case stop_wait_for_proc_kill() will close the pidfd later */ if (procp->pidfd >= 0 && procp->pidfd != last_kill_pid_or_fd) { close(procp->pidfd); } free(procp); return 0; } static void pid_invalidate(int pid) { std::shared_lock lock(adjslot_list_lock); struct proc *procp = pid_lookup(pid); if (procp) { procp->valid = false; } } /* * Write a string to a file. * Returns false if the file does not exist. */ static bool writefilestring(const char *path, const char *s, bool err_if_missing) { int fd = open(path, O_WRONLY | O_CLOEXEC); ssize_t len = strlen(s); ssize_t ret; if (fd < 0) { if (err_if_missing) { ALOGE("Error opening %s; errno=%d", path, errno); } return false; } ret = TEMP_FAILURE_RETRY(write(fd, s, len)); if (ret < 0) { ALOGE("Error writing %s; errno=%d", path, errno); } else if (ret < len) { ALOGE("Short write on %s; length=%zd", path, ret); } close(fd); return true; } static inline long get_time_diff_ms(struct timespec *from, struct timespec *to) { return (to->tv_sec - from->tv_sec) * (long)MS_PER_SEC + (to->tv_nsec - from->tv_nsec) / (long)NS_PER_MS; } /* Reads /proc/pid/status into buf. */ static bool read_proc_status(int pid, char *buf, size_t buf_sz) { char path[PROCFS_PATH_MAX]; int fd; ssize_t size; snprintf(path, PROCFS_PATH_MAX, "/proc/%d/status", pid); fd = open(path, O_RDONLY | O_CLOEXEC); if (fd < 0) { return false; } size = read_all(fd, buf, buf_sz - 1); close(fd); if (size <= 0) { return false; } buf[size] = 0; return true; } /* Looks for tag in buf and parses the first integer */ static bool parse_status_tag(char *buf, const char *tag, int64_t *out) { char *pos = buf; while (true) { pos = strstr(pos, tag); /* Stop if tag not found or found at the line beginning */ if (pos == NULL || pos == buf || pos[-1] == '\n') { break; } pos++; } if (pos == NULL) { return false; } pos += strlen(tag); while (*pos == ' ') ++pos; return parse_int64(pos, out); } static int proc_get_size(int pid) { char path[PROCFS_PATH_MAX]; char line[LINE_MAX]; int fd; int rss = 0; int total; ssize_t ret; /* gid containing AID_READPROC required */ snprintf(path, PROCFS_PATH_MAX, "/proc/%d/statm", pid); fd = open(path, O_RDONLY | O_CLOEXEC); if (fd == -1) return -1; ret = read_all(fd, line, sizeof(line) - 1); if (ret < 0) { close(fd); return -1; } line[ret] = '\0'; sscanf(line, "%d %d ", &total, &rss); close(fd); return rss; } static char *proc_get_name(int pid, char *buf, size_t buf_size) { char path[PROCFS_PATH_MAX]; int fd; char *cp; ssize_t ret; /* gid containing AID_READPROC required */ snprintf(path, PROCFS_PATH_MAX, "/proc/%d/cmdline", pid); fd = open(path, O_RDONLY | O_CLOEXEC); if (fd == -1) { return NULL; } ret = read_all(fd, buf, buf_size - 1); close(fd); if (ret <= 0) { return NULL; } buf[ret] = '\0'; cp = strchr(buf, ' '); if (cp) { *cp = '\0'; } return buf; } static void register_oom_adj_proc(const struct lmk_procprio& proc, struct ucred* cred) { char val[20]; int soft_limit_mult; bool is_system_server; struct passwd *pwdrec; struct proc* procp; int oom_adj_score = proc.oomadj; /* lmkd should not change soft limits for services */ if (proc.ptype == PROC_TYPE_APP && per_app_memcg) { if (proc.oomadj >= 900) { soft_limit_mult = 0; } else if (proc.oomadj >= 800) { soft_limit_mult = 0; } else if (proc.oomadj >= 700) { soft_limit_mult = 0; } else if (proc.oomadj >= 600) { // Launcher should be perceptible, don't kill it. oom_adj_score = 200; soft_limit_mult = 1; } else if (proc.oomadj >= 500) { soft_limit_mult = 0; } else if (proc.oomadj >= 400) { soft_limit_mult = 0; } else if (proc.oomadj >= 300) { soft_limit_mult = 1; } else if (proc.oomadj >= 200) { soft_limit_mult = 8; } else if (proc.oomadj >= 100) { soft_limit_mult = 10; } else if (proc.oomadj >= 0) { soft_limit_mult = 20; } else { // Persistent processes will have a large // soft limit 512MB. soft_limit_mult = 64; } std::string soft_limit_path; if (!CgroupGetAttributePathForTask("MemSoftLimit", proc.pid, &soft_limit_path)) { ALOGE("Querying MemSoftLimit path failed"); return; } snprintf(val, sizeof(val), "%d", soft_limit_mult * EIGHT_MEGA); /* * system_server process has no memcg under /dev/memcg/apps but should be * registered with lmkd. This is the best way so far to identify it. */ is_system_server = (oom_adj_score == SYSTEM_ADJ && (pwdrec = getpwnam("system")) != NULL && proc.uid == pwdrec->pw_uid); writefilestring(soft_limit_path.c_str(), val, !is_system_server); } procp = pid_lookup(proc.pid); if (!procp) { int pidfd = -1; if (pidfd_supported) { pidfd = TEMP_FAILURE_RETRY(pidfd_open(proc.pid, 0)); if (pidfd < 0) { ALOGE("pidfd_open for pid %d failed; errno=%d", proc.pid, errno); return; } } procp = static_cast(calloc(1, sizeof(struct proc))); if (!procp) { // Oh, the irony. May need to rebuild our state. return; } procp->pid = proc.pid; procp->pidfd = pidfd; procp->uid = proc.uid; procp->reg_pid = cred->pid; procp->oomadj = oom_adj_score; procp->valid = true; proc_insert(procp); } else { if (!claim_record(procp, cred->pid)) { char buf[LINE_MAX]; char *taskname = proc_get_name(cred->pid, buf, sizeof(buf)); /* Only registrant of the record can remove it */ ALOGE("%s (%d, %d) attempts to modify a process registered by another client", taskname ? taskname : "A process ", cred->uid, cred->pid); return; } proc_unslot(procp); procp->oomadj = oom_adj_score; proc_slot(procp); } } static void apply_proc_prio(const struct lmk_procprio& params, struct ucred* cred) { char path[PROCFS_PATH_MAX]; char val[20]; int64_t tgid; char buf[pagesize]; if (params.oomadj < OOM_SCORE_ADJ_MIN || params.oomadj > OOM_SCORE_ADJ_MAX) { ALOGE("Invalid PROCPRIO oomadj argument %d", params.oomadj); return; } if (params.ptype < PROC_TYPE_FIRST || params.ptype >= PROC_TYPE_COUNT) { ALOGE("Invalid PROCPRIO process type argument %d", params.ptype); return; } /* Check if registered process is a thread group leader */ if (read_proc_status(params.pid, buf, sizeof(buf))) { if (parse_status_tag(buf, PROC_STATUS_TGID_FIELD, &tgid) && tgid != params.pid) { ALOGE("Attempt to register a task that is not a thread group leader " "(tid %d, tgid %" PRId64 ")", params.pid, tgid); return; } } /* gid containing AID_READPROC required */ /* CAP_SYS_RESOURCE required */ /* CAP_DAC_OVERRIDE required */ snprintf(path, sizeof(path), "/proc/%d/oom_score_adj", params.pid); snprintf(val, sizeof(val), "%d", params.oomadj); if (!writefilestring(path, val, false)) { ALOGW("Failed to open %s; errno=%d: process %d might have been killed", path, errno, params.pid); /* If this file does not exist the process is dead. */ return; } if (use_inkernel_interface) { stats_store_taskname(params.pid, proc_get_name(params.pid, path, sizeof(path))); return; } register_oom_adj_proc(params, cred); } static void cmd_procprio(LMKD_CTRL_PACKET packet, int field_count, struct ucred* cred) { struct lmk_procprio proc_prio; lmkd_pack_get_procprio(packet, field_count, &proc_prio); apply_proc_prio(proc_prio, cred); } static void cmd_procremove(LMKD_CTRL_PACKET packet, struct ucred *cred) { struct lmk_procremove params; struct proc *procp; lmkd_pack_get_procremove(packet, ¶ms); if (use_inkernel_interface) { /* * Perform an extra check before the pid is removed, after which it * will be impossible for poll_kernel to get the taskname. poll_kernel() * is potentially a long-running blocking function; however this method * handles AMS requests but does not block AMS. */ poll_kernel(kpoll_fd); stats_remove_taskname(params.pid); return; } procp = pid_lookup(params.pid); if (!procp) { return; } if (!claim_record(procp, cred->pid)) { char buf[LINE_MAX]; char *taskname = proc_get_name(cred->pid, buf, sizeof(buf)); /* Only registrant of the record can remove it */ ALOGE("%s (%d, %d) attempts to unregister a process registered by another client", taskname ? taskname : "A process ", cred->uid, cred->pid); return; } /* * WARNING: After pid_remove() procp is freed and can't be used! * Therefore placed at the end of the function. */ pid_remove(params.pid); } static void cmd_procpurge(struct ucred *cred) { int i; struct proc *procp; struct proc *next; if (use_inkernel_interface) { stats_purge_tasknames(); return; } for (i = 0; i < PIDHASH_SZ; i++) { procp = pidhash[i]; while (procp) { next = procp->pidhash_next; /* Purge only records created by the requestor */ if (claim_record(procp, cred->pid)) { pid_remove(procp->pid); } procp = next; } } } static void cmd_subscribe(int dsock_idx, LMKD_CTRL_PACKET packet) { struct lmk_subscribe params; lmkd_pack_get_subscribe(packet, ¶ms); data_sock[dsock_idx].async_event_mask |= 1 << params.evt_type; } static void inc_killcnt(int oomadj) { int slot = ADJTOSLOT(oomadj); uint8_t idx = killcnt_idx[slot]; if (idx == KILLCNT_INVALID_IDX) { /* index is not assigned for this oomadj */ if (killcnt_free_idx < MAX_DISTINCT_OOM_ADJ) { killcnt_idx[slot] = killcnt_free_idx; killcnt[killcnt_free_idx] = 1; killcnt_free_idx++; } else { ALOGW("Number of distinct oomadj levels exceeds %d", MAX_DISTINCT_OOM_ADJ); } } else { /* * wraparound is highly unlikely and is detectable using total * counter because it has to be equal to the sum of all counters */ killcnt[idx]++; } /* increment total kill counter */ killcnt_total++; } static int get_killcnt(int min_oomadj, int max_oomadj) { int slot; int count = 0; if (min_oomadj > max_oomadj) return 0; /* special case to get total kill count */ if (min_oomadj > OOM_SCORE_ADJ_MAX) return killcnt_total; while (min_oomadj <= max_oomadj && (slot = ADJTOSLOT(min_oomadj)) < ADJTOSLOT_COUNT) { uint8_t idx = killcnt_idx[slot]; if (idx != KILLCNT_INVALID_IDX) { count += killcnt[idx]; } min_oomadj++; } return count; } static int cmd_getkillcnt(LMKD_CTRL_PACKET packet) { struct lmk_getkillcnt params; if (use_inkernel_interface) { /* kernel driver does not expose this information */ return 0; } lmkd_pack_get_getkillcnt(packet, ¶ms); return get_killcnt(params.min_oomadj, params.max_oomadj); } static void cmd_target(int ntargets, LMKD_CTRL_PACKET packet) { int i; struct lmk_target target; char minfree_str[PROPERTY_VALUE_MAX]; char *pstr = minfree_str; char *pend = minfree_str + sizeof(minfree_str); static struct timespec last_req_tm; struct timespec curr_tm; if (ntargets < 1 || ntargets > (int)lowmem_adj.size()) { return; } /* * Ratelimit minfree updates to once per TARGET_UPDATE_MIN_INTERVAL_MS * to prevent DoS attacks */ if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) { ALOGE("Failed to get current time"); return; } if (get_time_diff_ms(&last_req_tm, &curr_tm) < TARGET_UPDATE_MIN_INTERVAL_MS) { ALOGE("Ignoring frequent updated to lmkd limits"); return; } last_req_tm = curr_tm; for (i = 0; i < ntargets; i++) { lmkd_pack_get_target(packet, i, &target); lowmem_minfree[i] = target.minfree; lowmem_adj[i] = target.oom_adj_score; pstr += snprintf(pstr, pend - pstr, "%d:%d,", target.minfree, target.oom_adj_score); if (pstr >= pend) { /* if no more space in the buffer then terminate the loop */ pstr = pend; break; } } lowmem_targets_size = ntargets; /* Override the last extra comma */ pstr[-1] = '\0'; property_set("sys.lmk.minfree_levels", minfree_str); if (has_inkernel_module) { char minfreestr[128]; char killpriostr[128]; minfreestr[0] = '\0'; killpriostr[0] = '\0'; for (i = 0; i < lowmem_targets_size; i++) { char val[40]; if (i) { strlcat(minfreestr, ",", sizeof(minfreestr)); strlcat(killpriostr, ",", sizeof(killpriostr)); } snprintf(val, sizeof(val), "%d", use_inkernel_interface ? lowmem_minfree[i] : 0); strlcat(minfreestr, val, sizeof(minfreestr)); snprintf(val, sizeof(val), "%d", use_inkernel_interface ? lowmem_adj[i] : 0); strlcat(killpriostr, val, sizeof(killpriostr)); } writefilestring(INKERNEL_MINFREE_PATH, minfreestr, true); writefilestring(INKERNEL_ADJ_PATH, killpriostr, true); } } static void cmd_procs_prio(LMKD_CTRL_PACKET packet, const int field_count, struct ucred* cred) { struct lmk_procs_prio params; const int procs_count = lmkd_pack_get_procs_prio(packet, ¶ms, field_count); if (procs_count < 0) { ALOGE("LMK_PROCS_PRIO received invalid packet format"); return; } for (int i = 0; i < procs_count; i++) { apply_proc_prio(params.procs[i], cred); } } static void ctrl_command_handler(int dsock_idx) { LMKD_CTRL_PACKET packet; struct ucred cred; int len; enum lmk_cmd cmd; int nargs; int targets; int kill_cnt; int result; len = ctrl_data_read(dsock_idx, (char *)packet, CTRL_PACKET_MAX_SIZE, &cred); if (len <= 0) return; if (len < (int)sizeof(int)) { ALOGE("Wrong control socket read length len=%d", len); return; } cmd = lmkd_pack_get_cmd(packet); nargs = len / sizeof(int) - 1; if (nargs < 0) goto wronglen; switch(cmd) { case LMK_TARGET: targets = nargs / 2; if (nargs & 0x1 || targets > (int)lowmem_adj.size()) { goto wronglen; } cmd_target(targets, packet); break; case LMK_PROCPRIO: /* process type field is optional for backward compatibility */ if (nargs < 3 || nargs > 4) goto wronglen; cmd_procprio(packet, nargs, &cred); break; case LMK_PROCREMOVE: if (nargs != 1) goto wronglen; cmd_procremove(packet, &cred); break; case LMK_PROCPURGE: if (nargs != 0) goto wronglen; cmd_procpurge(&cred); break; case LMK_GETKILLCNT: if (nargs != 2) goto wronglen; kill_cnt = cmd_getkillcnt(packet); len = lmkd_pack_set_getkillcnt_repl(packet, kill_cnt); if (ctrl_data_write(dsock_idx, (char *)packet, len) != len) return; break; case LMK_SUBSCRIBE: if (nargs != 1) goto wronglen; cmd_subscribe(dsock_idx, packet); break; case LMK_PROCKILL: /* This command code is NOT expected at all */ ALOGE("Received unexpected command code %d", cmd); break; case LMK_UPDATE_PROPS: if (nargs != 0) goto wronglen; result = -1; if (update_props()) { if (!use_inkernel_interface && monitors_initialized) { /* Reinitialize monitors to apply new settings */ destroy_monitors(); if (init_monitors()) { result = 0; } } else { result = 0; } if (direct_reclaim_threshold_ms > 0 && !memevent_listener) { ALOGW("Kernel support for direct_reclaim_threshold_ms is not found"); direct_reclaim_threshold_ms = 0; } } len = lmkd_pack_set_update_props_repl(packet, result); if (ctrl_data_write(dsock_idx, (char *)packet, len) != len) { ALOGE("Failed to report operation results"); } if (!result) { ALOGI("Properties reinitilized"); } else { /* New settings can't be supported, crash to be restarted */ ALOGE("New configuration is not supported. Exiting..."); exit(1); } break; case LMK_START_MONITORING: if (nargs != 0) goto wronglen; // Registration is needed only if it was skipped earlier. if (monitors_initialized) return; if (!property_get_bool("sys.boot_completed", false)) { ALOGE("LMK_START_MONITORING cannot be handled before boot completed"); return; } if (!init_monitors()) { /* Failure to start psi monitoring, crash to be restarted */ ALOGE("Failure to initialize monitoring. Exiting..."); exit(1); } ALOGI("Initialized monitors after boot completed."); break; case LMK_BOOT_COMPLETED: if (nargs != 0) goto wronglen; if (boot_completed_handled) { /* Notify we have already handled post boot-up operations */ result = 1; } else if (!property_get_bool("sys.boot_completed", false)) { ALOGE("LMK_BOOT_COMPLETED cannot be handled before boot completed"); result = -1; } else { /* * Initialize the memevent listener after boot is completed to prevent * waiting, during boot-up, for BPF programs to be loaded. */ if (init_memevent_listener_monitoring()) { ALOGI("Using memevents for direct reclaim and kswapd detection"); } else { ALOGI("Using vmstats for direct reclaim and kswapd detection"); if (direct_reclaim_threshold_ms > 0) { ALOGW("Kernel support for direct_reclaim_threshold_ms is not found"); direct_reclaim_threshold_ms = 0; } } result = 0; boot_completed_handled = true; } len = lmkd_pack_set_boot_completed_notif_repl(packet, result); if (ctrl_data_write(dsock_idx, (char*)packet, len) != len) { ALOGE("Failed to report boot-completed operation results"); } break; case LMK_PROCS_PRIO: cmd_procs_prio(packet, nargs, &cred); break; default: ALOGE("Received unknown command code %d", cmd); return; } return; wronglen: ALOGE("Wrong control socket read length cmd=%d len=%d", cmd, len); } static void ctrl_data_handler(int data, uint32_t events, struct polling_params *poll_params __unused) { if (events & EPOLLIN) { ctrl_command_handler(data); } } static int get_free_dsock() { for (int i = 0; i < MAX_DATA_CONN; i++) { if (data_sock[i].sock < 0) { return i; } } return -1; } static void ctrl_connect_handler(int data __unused, uint32_t events __unused, struct polling_params *poll_params __unused) { struct epoll_event epev; int free_dscock_idx = get_free_dsock(); if (free_dscock_idx < 0) { /* * Number of data connections exceeded max supported. This should not * happen but if it does we drop all existing connections and accept * the new one. This prevents inactive connections from monopolizing * data socket and if we drop ActivityManager connection it will * immediately reconnect. */ for (int i = 0; i < MAX_DATA_CONN; i++) { ctrl_data_close(i); } free_dscock_idx = 0; } data_sock[free_dscock_idx].sock = accept(ctrl_sock.sock, NULL, NULL); if (data_sock[free_dscock_idx].sock < 0) { ALOGE("lmkd control socket accept failed; errno=%d", errno); return; } ALOGI("lmkd data connection established"); /* use data to store data connection idx */ data_sock[free_dscock_idx].handler_info.data = free_dscock_idx; data_sock[free_dscock_idx].handler_info.handler = ctrl_data_handler; data_sock[free_dscock_idx].async_event_mask = 0; epev.events = EPOLLIN; epev.data.ptr = (void *)&(data_sock[free_dscock_idx].handler_info); if (epoll_ctl(epollfd, EPOLL_CTL_ADD, data_sock[free_dscock_idx].sock, &epev) == -1) { ALOGE("epoll_ctl for data connection socket failed; errno=%d", errno); ctrl_data_close(free_dscock_idx); return; } maxevents++; } /* * /proc/zoneinfo parsing routines * Expected file format is: * * Node , zone * ( * per-node stats * ( )+ * )? * (pages free * ( )+ * pagesets * ()* * )+ * ... */ static void zoneinfo_parse_protection(char *buf, struct zoneinfo_zone *zone) { int zone_idx; int64_t max = 0; char *save_ptr; for (buf = strtok_r(buf, "(), ", &save_ptr), zone_idx = 0; buf && zone_idx < MAX_NR_ZONES; buf = strtok_r(NULL, "), ", &save_ptr), zone_idx++) { long long zoneval = strtoll(buf, &buf, 0); if (zoneval > max) { max = (zoneval > INT64_MAX) ? INT64_MAX : zoneval; } zone->protection[zone_idx] = zoneval; } zone->max_protection = max; } static int zoneinfo_parse_zone(char **buf, struct zoneinfo_zone *zone) { for (char *line = strtok_r(NULL, "\n", buf); line; line = strtok_r(NULL, "\n", buf)) { char *cp; char *ap; char *save_ptr; int64_t val; int field_idx; enum field_match_result match_res; cp = strtok_r(line, " ", &save_ptr); if (!cp) { return false; } field_idx = find_field(cp, zoneinfo_zone_spec_field_names, ZI_ZONE_SPEC_FIELD_COUNT); if (field_idx >= 0) { /* special field */ if (field_idx == ZI_ZONE_SPEC_PAGESETS) { /* no mode fields we are interested in */ return true; } /* protection field */ ap = strtok_r(NULL, ")", &save_ptr); if (ap) { zoneinfo_parse_protection(ap, zone); } continue; } ap = strtok_r(NULL, " ", &save_ptr); if (!ap) { continue; } match_res = match_field(cp, ap, zoneinfo_zone_field_names, ZI_ZONE_FIELD_COUNT, &val, &field_idx); if (match_res == PARSE_FAIL) { return false; } if (match_res == PARSE_SUCCESS) { zone->fields.arr[field_idx] = val; } if (field_idx == ZI_ZONE_PRESENT && val == 0) { /* zone is not populated, stop parsing it */ return true; } } return false; } static int zoneinfo_parse_node(char **buf, struct zoneinfo_node *node) { int fields_to_match = ZI_NODE_FIELD_COUNT; for (char *line = strtok_r(NULL, "\n", buf); line; line = strtok_r(NULL, "\n", buf)) { char *cp; char *ap; char *save_ptr; int64_t val; int field_idx; enum field_match_result match_res; cp = strtok_r(line, " ", &save_ptr); if (!cp) { return false; } ap = strtok_r(NULL, " ", &save_ptr); if (!ap) { return false; } match_res = match_field(cp, ap, zoneinfo_node_field_names, ZI_NODE_FIELD_COUNT, &val, &field_idx); if (match_res == PARSE_FAIL) { return false; } if (match_res == PARSE_SUCCESS) { node->fields.arr[field_idx] = val; fields_to_match--; if (!fields_to_match) { return true; } } } return false; } static int zoneinfo_parse(struct zoneinfo *zi) { static struct reread_data file_data = { .filename = ZONEINFO_PATH, .fd = -1, }; char *buf; char *save_ptr; char *line; char zone_name[LINE_MAX + 1]; struct zoneinfo_node *node = NULL; int node_idx = 0; int zone_idx = 0; memset(zi, 0, sizeof(struct zoneinfo)); if ((buf = reread_file(&file_data)) == NULL) { return -1; } for (line = strtok_r(buf, "\n", &save_ptr); line; line = strtok_r(NULL, "\n", &save_ptr)) { int node_id; if (sscanf(line, "Node %d, zone %" STRINGIFY(LINE_MAX) "s", &node_id, zone_name) == 2) { if (!node || node->id != node_id) { line = strtok_r(NULL, "\n", &save_ptr); if (strncmp(line, NODE_STATS_MARKER, strlen(NODE_STATS_MARKER)) != 0) { /* * per-node stats are only present in the first non-empty zone of * the node. */ continue; } /* new node is found */ if (node) { node->zone_count = zone_idx + 1; node_idx++; if (node_idx == MAX_NR_NODES) { /* max node count exceeded */ ALOGE("%s parse error", file_data.filename); return -1; } } node = &zi->nodes[node_idx]; node->id = node_id; zone_idx = 0; if (!zoneinfo_parse_node(&save_ptr, node)) { ALOGE("%s parse error", file_data.filename); return -1; } } else { /* new zone is found */ zone_idx++; } if (!zoneinfo_parse_zone(&save_ptr, &node->zones[zone_idx])) { ALOGE("%s parse error", file_data.filename); return -1; } } } if (!node) { ALOGE("%s parse error", file_data.filename); return -1; } node->zone_count = zone_idx + 1; zi->node_count = node_idx + 1; /* calculate totals fields */ for (node_idx = 0; node_idx < zi->node_count; node_idx++) { node = &zi->nodes[node_idx]; for (zone_idx = 0; zone_idx < node->zone_count; zone_idx++) { struct zoneinfo_zone *zone = &zi->nodes[node_idx].zones[zone_idx]; zi->totalreserve_pages += zone->max_protection + zone->fields.field.high; } zi->total_inactive_file += node->fields.field.nr_inactive_file; zi->total_active_file += node->fields.field.nr_active_file; } return 0; } /* /proc/meminfo parsing routines */ static bool meminfo_parse_line(char *line, union meminfo *mi) { char *cp = line; char *ap; char *save_ptr; int64_t val; int field_idx; enum field_match_result match_res; cp = strtok_r(line, " ", &save_ptr); if (!cp) { return false; } ap = strtok_r(NULL, " ", &save_ptr); if (!ap) { return false; } match_res = match_field(cp, ap, meminfo_field_names, MI_FIELD_COUNT, &val, &field_idx); if (match_res == PARSE_SUCCESS) { mi->arr[field_idx] = val / page_k; } return (match_res != PARSE_FAIL); } static int64_t read_gpu_total_kb() { static android::base::unique_fd fd( android::bpf::mapRetrieveRO("/sys/fs/bpf/map_gpuMem_gpu_mem_total_map")); static constexpr uint64_t kBpfKeyGpuTotalUsage = 0; uint64_t value; if (!fd.ok()) { return 0; } return android::bpf::findMapEntry(fd, &kBpfKeyGpuTotalUsage, &value) ? 0 : (int32_t)(value / 1024); } static int meminfo_parse(union meminfo *mi) { static struct reread_data file_data = { .filename = MEMINFO_PATH, .fd = -1, }; char *buf; char *save_ptr; char *line; memset(mi, 0, sizeof(union meminfo)); if ((buf = reread_file(&file_data)) == NULL) { return -1; } for (line = strtok_r(buf, "\n", &save_ptr); line; line = strtok_r(NULL, "\n", &save_ptr)) { if (!meminfo_parse_line(line, mi)) { ALOGE("%s parse error", file_data.filename); return -1; } } mi->field.nr_file_pages = mi->field.cached + mi->field.swap_cached + mi->field.buffers; mi->field.total_gpu_kb = read_gpu_total_kb(); mi->field.easy_available = mi->field.nr_free_pages + mi->field.inactive_file; return 0; } // In the case of ZRAM, mi->field.free_swap can't be used directly because swap space is taken // from the free memory or reclaimed. Use the lowest of free_swap and easily available memory to // measure free swap because they represent how much swap space the system will consider to use // and how much it can actually use. // Swap compression ratio in the calculation can be adjusted using swap_compression_ratio tunable. // By setting swap_compression_ratio to 0, available memory can be ignored. static inline int64_t get_free_swap(union meminfo *mi) { if (swap_compression_ratio) return std::min(mi->field.free_swap, mi->field.easy_available * swap_compression_ratio); return mi->field.free_swap; } /* /proc/vmstat parsing routines */ static bool vmstat_parse_line(char *line, union vmstat *vs) { char *cp; char *ap; char *save_ptr; int64_t val; int field_idx; enum field_match_result match_res; cp = strtok_r(line, " ", &save_ptr); if (!cp) { return false; } ap = strtok_r(NULL, " ", &save_ptr); if (!ap) { return false; } match_res = match_field(cp, ap, vmstat_field_names, VS_FIELD_COUNT, &val, &field_idx); if (match_res == PARSE_SUCCESS) { vs->arr[field_idx] = val; } return (match_res != PARSE_FAIL); } static int vmstat_parse(union vmstat *vs) { static struct reread_data file_data = { .filename = VMSTAT_PATH, .fd = -1, }; char *buf; char *save_ptr; char *line; memset(vs, 0, sizeof(union vmstat)); if ((buf = reread_file(&file_data)) == NULL) { return -1; } for (line = strtok_r(buf, "\n", &save_ptr); line; line = strtok_r(NULL, "\n", &save_ptr)) { if (!vmstat_parse_line(line, vs)) { ALOGE("%s parse error", file_data.filename); return -1; } } return 0; } static int psi_parse(struct reread_data *file_data, struct psi_stats stats[], bool full) { char *buf; char *save_ptr; char *line; if ((buf = reread_file(file_data)) == NULL) { return -1; } line = strtok_r(buf, "\n", &save_ptr); if (parse_psi_line(line, PSI_SOME, stats)) { return -1; } if (full) { line = strtok_r(NULL, "\n", &save_ptr); if (parse_psi_line(line, PSI_FULL, stats)) { return -1; } } return 0; } static int psi_parse_mem(struct psi_data *psi_data) { static struct reread_data file_data = { .filename = psi_resource_file[PSI_MEMORY], .fd = -1, }; return psi_parse(&file_data, psi_data->mem_stats, true); } static int psi_parse_io(struct psi_data *psi_data) { static struct reread_data file_data = { .filename = psi_resource_file[PSI_IO], .fd = -1, }; return psi_parse(&file_data, psi_data->io_stats, true); } static int psi_parse_cpu(struct psi_data *psi_data) { static struct reread_data file_data = { .filename = psi_resource_file[PSI_CPU], .fd = -1, }; return psi_parse(&file_data, psi_data->cpu_stats, false); } enum wakeup_reason { Event, Polling }; struct wakeup_info { struct timespec wakeup_tm; struct timespec prev_wakeup_tm; struct timespec last_event_tm; int wakeups_since_event; int skipped_wakeups; }; /* * After the initial memory pressure event is received lmkd schedules periodic wakeups to check * the memory conditions and kill if needed (polling). This is done because pressure events are * rate-limited and memory conditions can change in between events. Therefore after the initial * event there might be multiple wakeups. This function records the wakeup information such as the * timestamps of the last event and the last wakeup, the number of wakeups since the last event * and how many of those wakeups were skipped (some wakeups are skipped if previously killed * process is still freeing its memory). */ static void record_wakeup_time(struct timespec *tm, enum wakeup_reason reason, struct wakeup_info *wi) { wi->prev_wakeup_tm = wi->wakeup_tm; wi->wakeup_tm = *tm; if (reason == Event) { wi->last_event_tm = *tm; wi->wakeups_since_event = 0; wi->skipped_wakeups = 0; } else { wi->wakeups_since_event++; } } struct kill_info { enum kill_reasons kill_reason; const char *kill_desc; int thrashing; int max_thrashing; }; static void killinfo_log(struct proc* procp, int min_oom_score, int rss_kb, int swap_kb, struct kill_info *ki, union meminfo *mi, struct wakeup_info *wi, struct timespec *tm, struct psi_data *pd) { /* log process information */ android_log_write_int32(ctx, procp->pid); android_log_write_int32(ctx, procp->uid); android_log_write_int32(ctx, procp->oomadj); android_log_write_int32(ctx, min_oom_score); android_log_write_int32(ctx, std::min(rss_kb, (int)INT32_MAX)); android_log_write_int32(ctx, ki ? ki->kill_reason : NONE); /* log meminfo fields */ for (int field_idx = 0; field_idx < MI_FIELD_COUNT; field_idx++) { android_log_write_int32(ctx, mi ? std::min(mi->arr[field_idx] * page_k, (int64_t)INT32_MAX) : 0); } /* log lmkd wakeup information */ if (wi) { android_log_write_int32(ctx, (int32_t)get_time_diff_ms(&wi->last_event_tm, tm)); android_log_write_int32(ctx, (int32_t)get_time_diff_ms(&wi->prev_wakeup_tm, tm)); android_log_write_int32(ctx, wi->wakeups_since_event); android_log_write_int32(ctx, wi->skipped_wakeups); } else { android_log_write_int32(ctx, 0); android_log_write_int32(ctx, 0); android_log_write_int32(ctx, 0); android_log_write_int32(ctx, 0); } android_log_write_int32(ctx, std::min(swap_kb, (int)INT32_MAX)); android_log_write_int32(ctx, mi ? (int32_t)mi->field.total_gpu_kb : 0); if (ki) { android_log_write_int32(ctx, ki->thrashing); android_log_write_int32(ctx, ki->max_thrashing); } else { android_log_write_int32(ctx, 0); android_log_write_int32(ctx, 0); } if (pd) { android_log_write_float32(ctx, pd->mem_stats[PSI_SOME].avg10); android_log_write_float32(ctx, pd->mem_stats[PSI_FULL].avg10); android_log_write_float32(ctx, pd->io_stats[PSI_SOME].avg10); android_log_write_float32(ctx, pd->io_stats[PSI_FULL].avg10); android_log_write_float32(ctx, pd->cpu_stats[PSI_SOME].avg10); } else { for (int i = 0; i < 5; i++) { android_log_write_float32(ctx, 0); } } android_log_write_list(ctx, LOG_ID_EVENTS); android_log_reset(ctx); } // Note: returned entry is only an anchor and does not hold a valid process info. // When called from a non-main thread, adjslot_list_lock read lock should be taken. static struct proc *proc_adj_head(int oomadj) { return (struct proc *)&procadjslot_list[ADJTOSLOT(oomadj)]; } // When called from a non-main thread, adjslot_list_lock read lock should be taken. static struct proc *proc_adj_tail(int oomadj) { return (struct proc *)adjslot_tail(&procadjslot_list[ADJTOSLOT(oomadj)]); } // When called from a non-main thread, adjslot_list_lock read lock should be taken. static struct proc *proc_adj_prev(int oomadj, int pid) { struct adjslot_list *head = &procadjslot_list[ADJTOSLOT(oomadj)]; struct adjslot_list *curr = adjslot_tail(&procadjslot_list[ADJTOSLOT(oomadj)]); while (curr != head) { if (((struct proc *)curr)->pid == pid) { return (struct proc *)curr->prev; } curr = curr->prev; } return NULL; } // Can be called only from the main thread. static struct proc *proc_get_heaviest(int oomadj) { struct adjslot_list *head = &procadjslot_list[ADJTOSLOT(oomadj)]; struct adjslot_list *curr = head->next; struct proc *maxprocp = NULL; int maxsize = 0; while (curr != head) { int pid = ((struct proc *)curr)->pid; int tasksize = proc_get_size(pid); if (tasksize < 0) { struct adjslot_list *next = curr->next; pid_remove(pid); curr = next; } else { if (tasksize > maxsize) { maxsize = tasksize; maxprocp = (struct proc *)curr; } curr = curr->next; } } return maxprocp; } static bool find_victim(int oom_score, int prev_pid, struct proc &target_proc) { struct proc *procp; std::shared_lock lock(adjslot_list_lock); if (!prev_pid) { procp = proc_adj_tail(oom_score); } else { procp = proc_adj_prev(oom_score, prev_pid); if (!procp) { // pid was removed, restart at the tail procp = proc_adj_tail(oom_score); } } // the list is empty at this oom_score or we looped through it if (!procp || procp == proc_adj_head(oom_score)) { return false; } // make a copy because original might be destroyed after adjslot_list_lock is released target_proc = *procp; return true; } static void watchdog_callback() { int prev_pid = 0; ALOGW("lmkd watchdog timed out!"); for (int oom_score = OOM_SCORE_ADJ_MAX; oom_score >= 0;) { struct proc target; if (!find_victim(oom_score, prev_pid, target)) { oom_score--; prev_pid = 0; continue; } if (target.valid && reaper.kill({ target.pidfd, target.pid, target.uid }, true) == 0) { ALOGW("lmkd watchdog killed process %d, oom_score_adj %d", target.pid, oom_score); killinfo_log(&target, 0, 0, 0, NULL, NULL, NULL, NULL, NULL); // Can't call pid_remove() from non-main thread, therefore just invalidate the record pid_invalidate(target.pid); break; } prev_pid = target.pid; } } static Watchdog watchdog(WATCHDOG_TIMEOUT_SEC, watchdog_callback); static bool is_kill_pending(void) { char buf[24]; if (last_kill_pid_or_fd < 0) { return false; } if (pidfd_supported) { return true; } /* when pidfd is not supported base the decision on /proc/ existence */ snprintf(buf, sizeof(buf), "/proc/%d/", last_kill_pid_or_fd); if (access(buf, F_OK) == 0) { return true; } return false; } static bool is_waiting_for_kill(void) { return pidfd_supported && last_kill_pid_or_fd >= 0; } static void stop_wait_for_proc_kill(bool finished) { struct epoll_event epev; if (last_kill_pid_or_fd < 0) { return; } if (debug_process_killing) { struct timespec curr_tm; if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) { /* * curr_tm is used here merely to report kill duration, so this failure is not fatal. * Log an error and continue. */ ALOGE("Failed to get current time"); } if (finished) { ALOGI("Process got killed in %ldms", get_time_diff_ms(&last_kill_tm, &curr_tm)); } else { ALOGI("Stop waiting for process kill after %ldms", get_time_diff_ms(&last_kill_tm, &curr_tm)); } } if (pidfd_supported) { /* unregister fd */ if (epoll_ctl(epollfd, EPOLL_CTL_DEL, last_kill_pid_or_fd, &epev)) { // Log an error and keep going ALOGE("epoll_ctl for last killed process failed; errno=%d", errno); } maxevents--; close(last_kill_pid_or_fd); } last_kill_pid_or_fd = -1; } static void kill_done_handler(int data __unused, uint32_t events __unused, struct polling_params *poll_params) { stop_wait_for_proc_kill(true); poll_params->update = POLLING_RESUME; } static void kill_fail_handler(int data __unused, uint32_t events __unused, struct polling_params *poll_params) { int pid; // Extract pid from the communication pipe. Clearing the pipe this way allows further // epoll_wait calls to sleep until the next event. if (TEMP_FAILURE_RETRY(read(reaper_comm_fd[0], &pid, sizeof(pid))) != sizeof(pid)) { ALOGE("thread communication read failed: %s", strerror(errno)); } stop_wait_for_proc_kill(false); poll_params->update = POLLING_RESUME; } static void start_wait_for_proc_kill(int pid_or_fd) { static struct event_handler_info kill_done_hinfo = { 0, kill_done_handler }; struct epoll_event epev; if (last_kill_pid_or_fd >= 0) { /* Should not happen but if it does we should stop previous wait */ ALOGE("Attempt to wait for a kill while another wait is in progress"); stop_wait_for_proc_kill(false); } last_kill_pid_or_fd = pid_or_fd; if (!pidfd_supported) { /* If pidfd is not supported just store PID and exit */ return; } epev.events = EPOLLIN; epev.data.ptr = (void *)&kill_done_hinfo; if (epoll_ctl(epollfd, EPOLL_CTL_ADD, last_kill_pid_or_fd, &epev) != 0) { ALOGE("epoll_ctl for last kill failed; errno=%d", errno); close(last_kill_pid_or_fd); last_kill_pid_or_fd = -1; return; } maxevents++; } /* Kill one process specified by procp. Returns the size (in pages) of the process killed */ static int kill_one_process(struct proc* procp, int min_oom_score, struct kill_info *ki, union meminfo *mi, struct wakeup_info *wi, struct timespec *tm, struct psi_data *pd) { int pid = procp->pid; int pidfd = procp->pidfd; uid_t uid = procp->uid; char *taskname; int kill_result; int result = -1; struct memory_stat *mem_st; struct kill_stat kill_st; int64_t tgid; int64_t rss_kb; int64_t swap_kb; char buf[pagesize]; char desc[LINE_MAX]; if (!procp->valid || !read_proc_status(pid, buf, sizeof(buf))) { goto out; } if (!parse_status_tag(buf, PROC_STATUS_TGID_FIELD, &tgid)) { ALOGE("Unable to parse tgid from /proc/%d/status", pid); goto out; } if (tgid != pid) { ALOGE("Possible pid reuse detected (pid %d, tgid %" PRId64 ")!", pid, tgid); goto out; } // Zombie processes will not have RSS / Swap fields. if (!parse_status_tag(buf, PROC_STATUS_RSS_FIELD, &rss_kb)) { goto out; } if (!parse_status_tag(buf, PROC_STATUS_SWAP_FIELD, &swap_kb)) { goto out; } taskname = proc_get_name(pid, buf, sizeof(buf)); // taskname will point inside buf, do not reuse buf onwards. if (!taskname) { goto out; } mem_st = stats_read_memory_stat(per_app_memcg, pid, uid, rss_kb * 1024, swap_kb * 1024); snprintf(desc, sizeof(desc), "lmk,%d,%d,%d,%d,%d", pid, ki ? (int)ki->kill_reason : -1, procp->oomadj, min_oom_score, ki ? ki->max_thrashing : -1); result = lmkd_free_memory_before_kill_hook(procp, rss_kb / page_k, procp->oomadj, ki ? (int)ki->kill_reason : -1); if (result > 0) { /* * Memory was freed elsewhere; no need to kill. Note: intentionally do not * pid_remove(pid) since it was not killed. */ ALOGI("Skipping kill; %ld kB freed elsewhere.", result * page_k); return result; } trace_kill_start(desc); start_wait_for_proc_kill(pidfd < 0 ? pid : pidfd); kill_result = reaper.kill({ pidfd, pid, uid }, false); trace_kill_end(); if (kill_result) { stop_wait_for_proc_kill(false); ALOGE("kill(%d): errno=%d", pid, errno); /* Delete process record even when we fail to kill so that we don't get stuck on it */ goto out; } last_kill_tm = *tm; inc_killcnt(procp->oomadj); if (ki) { kill_st.kill_reason = ki->kill_reason; kill_st.thrashing = ki->thrashing; kill_st.max_thrashing = ki->max_thrashing; ALOGI("Kill '%s' (%d), uid %d, oom_score_adj %d to free %" PRId64 "kB rss, %" PRId64 "kB swap; reason: %s", taskname, pid, uid, procp->oomadj, rss_kb, swap_kb, ki->kill_desc); } else { kill_st.kill_reason = NONE; kill_st.thrashing = 0; kill_st.max_thrashing = 0; ALOGI("Kill '%s' (%d), uid %d, oom_score_adj %d to free %" PRId64 "kB rss, %" PRId64 "kb swap", taskname, pid, uid, procp->oomadj, rss_kb, swap_kb); } killinfo_log(procp, min_oom_score, rss_kb, swap_kb, ki, mi, wi, tm, pd); kill_st.uid = static_cast(uid); kill_st.taskname = taskname; kill_st.oom_score = procp->oomadj; kill_st.min_oom_score = min_oom_score; kill_st.free_mem_kb = mi->field.nr_free_pages * page_k; kill_st.free_swap_kb = get_free_swap(mi) * page_k; stats_write_lmk_kill_occurred(&kill_st, mem_st); ctrl_data_write_lmk_kill_occurred((pid_t)pid, uid, rss_kb); result = rss_kb / page_k; out: /* * WARNING: After pid_remove() procp is freed and can't be used! * Therefore placed at the end of the function. */ pid_remove(pid); return result; } /* * Find one process to kill at or above the given oom_score_adj level. * Returns size of the killed process. */ static int find_and_kill_process(int min_score_adj, struct kill_info *ki, union meminfo *mi, struct wakeup_info *wi, struct timespec *tm, struct psi_data *pd) { int i; int killed_size = 0; bool choose_heaviest_task = kill_heaviest_task; for (i = OOM_SCORE_ADJ_MAX; i >= min_score_adj; i--) { struct proc *procp; if (!choose_heaviest_task && i <= PERCEPTIBLE_APP_ADJ) { /* * If we have to choose a perceptible process, choose the heaviest one to * hopefully minimize the number of victims. */ choose_heaviest_task = true; } while (true) { procp = choose_heaviest_task ? proc_get_heaviest(i) : proc_adj_tail(i); if (!procp) break; killed_size = kill_one_process(procp, min_score_adj, ki, mi, wi, tm, pd); if (killed_size >= 0) { break; } } if (killed_size) { break; } } return killed_size; } static int64_t get_memory_usage(struct reread_data *file_data) { int64_t mem_usage; char *buf; if ((buf = reread_file(file_data)) == NULL) { return -1; } if (!parse_int64(buf, &mem_usage)) { ALOGE("%s parse error", file_data->filename); return -1; } if (mem_usage == 0) { ALOGE("No memory!"); return -1; } return mem_usage; } void record_low_pressure_levels(union meminfo *mi) { if (low_pressure_mem.min_nr_free_pages == -1 || low_pressure_mem.min_nr_free_pages > mi->field.nr_free_pages) { if (debug_process_killing) { ALOGI("Low pressure min memory update from %" PRId64 " to %" PRId64, low_pressure_mem.min_nr_free_pages, mi->field.nr_free_pages); } low_pressure_mem.min_nr_free_pages = mi->field.nr_free_pages; } /* * Free memory at low vmpressure events occasionally gets spikes, * possibly a stale low vmpressure event with memory already * freed up (no memory pressure should have been reported). * Ignore large jumps in max_nr_free_pages that would mess up our stats. */ if (low_pressure_mem.max_nr_free_pages == -1 || (low_pressure_mem.max_nr_free_pages < mi->field.nr_free_pages && mi->field.nr_free_pages - low_pressure_mem.max_nr_free_pages < low_pressure_mem.max_nr_free_pages * 0.1)) { if (debug_process_killing) { ALOGI("Low pressure max memory update from %" PRId64 " to %" PRId64, low_pressure_mem.max_nr_free_pages, mi->field.nr_free_pages); } low_pressure_mem.max_nr_free_pages = mi->field.nr_free_pages; } } enum vmpressure_level upgrade_level(enum vmpressure_level level) { return (enum vmpressure_level)((level < VMPRESS_LEVEL_CRITICAL) ? level + 1 : level); } enum vmpressure_level downgrade_level(enum vmpressure_level level) { return (enum vmpressure_level)((level > VMPRESS_LEVEL_LOW) ? level - 1 : level); } enum zone_watermark { WMARK_MIN = 0, WMARK_LOW, WMARK_HIGH, WMARK_NONE }; struct zone_watermarks { long high_wmark; long low_wmark; long min_wmark; }; /* * Returns lowest breached watermark or WMARK_NONE. */ static enum zone_watermark get_lowest_watermark(union meminfo *mi, struct zone_watermarks *watermarks) { int64_t nr_free_pages = mi->field.nr_free_pages - mi->field.cma_free; if (nr_free_pages < watermarks->min_wmark) { return WMARK_MIN; } if (nr_free_pages < watermarks->low_wmark) { return WMARK_LOW; } if (nr_free_pages < watermarks->high_wmark) { return WMARK_HIGH; } return WMARK_NONE; } void calc_zone_watermarks(struct zoneinfo *zi, struct zone_watermarks *watermarks) { memset(watermarks, 0, sizeof(struct zone_watermarks)); for (int node_idx = 0; node_idx < zi->node_count; node_idx++) { struct zoneinfo_node *node = &zi->nodes[node_idx]; for (int zone_idx = 0; zone_idx < node->zone_count; zone_idx++) { struct zoneinfo_zone *zone = &node->zones[zone_idx]; if (!zone->fields.field.present) { continue; } watermarks->high_wmark += zone->max_protection + zone->fields.field.high; watermarks->low_wmark += zone->max_protection + zone->fields.field.low; watermarks->min_wmark += zone->max_protection + zone->fields.field.min; } } } static int calc_swap_utilization(union meminfo *mi) { int64_t swap_used = mi->field.total_swap - get_free_swap(mi); int64_t total_swappable = mi->field.active_anon + mi->field.inactive_anon + mi->field.shmem + swap_used; return total_swappable > 0 ? (swap_used * 100) / total_swappable : 0; } static void mp_event_psi(int data, uint32_t events, struct polling_params *poll_params) { enum reclaim_state { NO_RECLAIM = 0, KSWAPD_RECLAIM, DIRECT_RECLAIM, }; static int64_t init_ws_refault; static int64_t prev_workingset_refault; static int64_t base_file_lru; static int64_t init_pgscan_kswapd; static int64_t init_pgscan_direct; static int64_t init_pgrefill; static bool killing; static int thrashing_limit = thrashing_limit_pct; static struct zone_watermarks watermarks; static struct timespec wmark_update_tm; static struct wakeup_info wi; static struct timespec thrashing_reset_tm; static int64_t prev_thrash_growth = 0; static bool check_filecache = false; static int max_thrashing = 0; union meminfo mi; union vmstat vs; struct psi_data psi_data; struct timespec curr_tm; int64_t thrashing = 0; bool swap_is_low = false; enum vmpressure_level level = (enum vmpressure_level)data; enum kill_reasons kill_reason = NONE; bool cycle_after_kill = false; enum reclaim_state reclaim = NO_RECLAIM; enum zone_watermark wmark = WMARK_NONE; char kill_desc[LINE_MAX]; bool cut_thrashing_limit = false; int min_score_adj = 0; int swap_util = 0; int64_t swap_low_threshold; long since_thrashing_reset_ms; int64_t workingset_refault_file; bool critical_stall = false; bool in_direct_reclaim; long direct_reclaim_duration_ms; bool in_kswapd_reclaim; mp_event_count++; if (debug_process_killing) { ALOGI("%s memory pressure event #%" PRIu64 " is triggered", level_name[level], mp_event_count); } if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) { ALOGE("Failed to get current time"); return; } if (events > 0 ) { /* Ignore a lower event within the first polling window. */ if (level < prev_level) { if (debug_process_killing) ALOGI("Ignoring %s pressure event; occurred too soon.", level_name[level]); return; } prev_level = level; } else { /* Reset event level after the first polling window. */ prev_level = VMPRESS_LEVEL_LOW; } record_wakeup_time(&curr_tm, events ? Event : Polling, &wi); bool kill_pending = is_kill_pending(); if (kill_pending && (kill_timeout_ms == 0 || get_time_diff_ms(&last_kill_tm, &curr_tm) < static_cast(kill_timeout_ms))) { /* Skip while still killing a process */ wi.skipped_wakeups++; goto no_kill; } /* * Process is dead or kill timeout is over, stop waiting. This has no effect if pidfds are * supported and death notification already caused waiting to stop. */ stop_wait_for_proc_kill(!kill_pending); if (vmstat_parse(&vs) < 0) { ALOGE("Failed to parse vmstat!"); return; } /* Starting 5.9 kernel workingset_refault vmstat field was renamed workingset_refault_file */ workingset_refault_file = vs.field.workingset_refault ? : vs.field.workingset_refault_file; if (meminfo_parse(&mi) < 0) { ALOGE("Failed to parse meminfo!"); return; } /* Reset states after process got killed */ if (killing) { killing = false; cycle_after_kill = true; /* Reset file-backed pagecache size and refault amounts after a kill */ base_file_lru = vs.field.nr_inactive_file + vs.field.nr_active_file; init_ws_refault = workingset_refault_file; thrashing_reset_tm = curr_tm; prev_thrash_growth = 0; } /* Check free swap levels */ if (swap_free_low_percentage) { swap_low_threshold = mi.field.total_swap * swap_free_low_percentage / 100; swap_is_low = get_free_swap(&mi) < swap_low_threshold; } else { swap_low_threshold = 0; } if (memevent_listener) { in_direct_reclaim = direct_reclaim_start_tm.tv_sec != 0 || direct_reclaim_start_tm.tv_nsec != 0; in_kswapd_reclaim = kswapd_start_tm.tv_sec != 0 || kswapd_start_tm.tv_nsec != 0; } else { in_direct_reclaim = vs.field.pgscan_direct != init_pgscan_direct; in_kswapd_reclaim = (vs.field.pgscan_kswapd != init_pgscan_kswapd) || (vs.field.pgrefill != init_pgrefill); } /* Identify reclaim state */ if (in_direct_reclaim) { init_pgscan_direct = vs.field.pgscan_direct; init_pgscan_kswapd = vs.field.pgscan_kswapd; init_pgrefill = vs.field.pgrefill; direct_reclaim_duration_ms = get_time_diff_ms(&direct_reclaim_start_tm, &curr_tm); reclaim = DIRECT_RECLAIM; } else if (in_kswapd_reclaim) { init_pgscan_kswapd = vs.field.pgscan_kswapd; init_pgrefill = vs.field.pgrefill; reclaim = KSWAPD_RECLAIM; } else if (workingset_refault_file == prev_workingset_refault) { /* * Device is not thrashing and not reclaiming, bail out early until we see these stats * changing */ goto no_kill; } prev_workingset_refault = workingset_refault_file; /* * It's possible we fail to find an eligible process to kill (ex. no process is * above oom_adj_min). When this happens, we should retry to find a new process * for a kill whenever a new eligible process is available. This is especially * important for a slow growing refault case. While retrying, we should keep * monitoring new thrashing counter as someone could release the memory to mitigate * the thrashing. Thus, when thrashing reset window comes, we decay the prev thrashing * counter by window counts. If the counter is still greater than thrashing limit, * we preserve the current prev_thrash counter so we will retry kill again. Otherwise, * we reset the prev_thrash counter so we will stop retrying. */ since_thrashing_reset_ms = get_time_diff_ms(&thrashing_reset_tm, &curr_tm); if (since_thrashing_reset_ms > THRASHING_RESET_INTERVAL_MS) { long windows_passed; /* Calculate prev_thrash_growth if we crossed THRASHING_RESET_INTERVAL_MS */ prev_thrash_growth = (workingset_refault_file - init_ws_refault) * 100 / (base_file_lru + 1); windows_passed = (since_thrashing_reset_ms / THRASHING_RESET_INTERVAL_MS); /* * Decay prev_thrashing unless over-the-limit thrashing was registered in the window we * just crossed, which means there were no eligible processes to kill. We preserve the * counter in that case to ensure a kill if a new eligible process appears. */ if (windows_passed > 1 || prev_thrash_growth < thrashing_limit) { prev_thrash_growth >>= windows_passed; } /* Record file-backed pagecache size when crossing THRASHING_RESET_INTERVAL_MS */ base_file_lru = vs.field.nr_inactive_file + vs.field.nr_active_file; init_ws_refault = workingset_refault_file; thrashing_reset_tm = curr_tm; thrashing_limit = thrashing_limit_pct; } else { /* Calculate what % of the file-backed pagecache refaulted so far */ thrashing = (workingset_refault_file - init_ws_refault) * 100 / (base_file_lru + 1); } /* Add previous cycle's decayed thrashing amount */ thrashing += prev_thrash_growth; if (max_thrashing < thrashing) { max_thrashing = thrashing; } update_watermarks: /* * Refresh watermarks once per min in case user updated one of the margins. * TODO: b/140521024 replace this periodic update with an API for AMS to notify LMKD * that zone watermarks were changed by the system software. */ if (watermarks.high_wmark == 0 || get_time_diff_ms(&wmark_update_tm, &curr_tm) > 60000) { struct zoneinfo zi; if (zoneinfo_parse(&zi) < 0) { ALOGE("Failed to parse zoneinfo!"); return; } calc_zone_watermarks(&zi, &watermarks); wmark_update_tm = curr_tm; } /* Find out which watermark is breached if any */ wmark = get_lowest_watermark(&mi, &watermarks); if (!psi_parse_mem(&psi_data)) { critical_stall = psi_data.mem_stats[PSI_FULL].avg10 > (float)stall_limit_critical; } /* * TODO: move this logic into a separate function * Decide if killing a process is necessary and record the reason */ if (cycle_after_kill && wmark < WMARK_LOW) { /* * Prevent kills not freeing enough memory which might lead to OOM kill. * This might happen when a process is consuming memory faster than reclaim can * free even after a kill. Mostly happens when running memory stress tests. */ min_score_adj = pressure_after_kill_min_score; kill_reason = PRESSURE_AFTER_KILL; strncpy(kill_desc, "min watermark is breached even after kill", sizeof(kill_desc)); } else if (level == VMPRESS_LEVEL_CRITICAL && events != 0) { /* * Device is too busy reclaiming memory which might lead to ANR. * Critical level is triggered when PSI complete stall (all tasks are blocked because * of the memory congestion) breaches the configured threshold. */ kill_reason = NOT_RESPONDING; strncpy(kill_desc, "device is not responding", sizeof(kill_desc)); } else if (swap_is_low && thrashing > thrashing_limit_pct) { /* Page cache is thrashing while swap is low */ kill_reason = LOW_SWAP_AND_THRASHING; snprintf(kill_desc, sizeof(kill_desc), "device is low on swap (%" PRId64 "kB < %" PRId64 "kB) and thrashing (%" PRId64 "%%)", get_free_swap(&mi) * page_k, swap_low_threshold * page_k, thrashing); /* Do not kill perceptible apps unless below min watermark or heavily thrashing */ if (wmark > WMARK_MIN && thrashing < thrashing_critical_pct) { min_score_adj = PERCEPTIBLE_APP_ADJ + 1; } check_filecache = true; } else if (swap_is_low && wmark < WMARK_HIGH) { /* Both free memory and swap are low */ kill_reason = LOW_MEM_AND_SWAP; snprintf(kill_desc, sizeof(kill_desc), "%s watermark is breached and swap is low (%" PRId64 "kB < %" PRId64 "kB)", wmark < WMARK_LOW ? "min" : "low", get_free_swap(&mi) * page_k, swap_low_threshold * page_k); /* Do not kill perceptible apps unless below min watermark or heavily thrashing */ if (wmark > WMARK_MIN && thrashing < thrashing_critical_pct) { min_score_adj = PERCEPTIBLE_APP_ADJ + 1; } } else if (wmark < WMARK_HIGH && swap_util_max < 100 && (swap_util = calc_swap_utilization(&mi)) > swap_util_max) { /* * Too much anon memory is swapped out but swap is not low. * Non-swappable allocations created memory pressure. */ kill_reason = LOW_MEM_AND_SWAP_UTIL; snprintf(kill_desc, sizeof(kill_desc), "%s watermark is breached and swap utilization" " is high (%d%% > %d%%)", wmark < WMARK_LOW ? "min" : "low", swap_util, swap_util_max); } else if (wmark < WMARK_HIGH && thrashing > thrashing_limit) { /* Page cache is thrashing while memory is low */ kill_reason = LOW_MEM_AND_THRASHING; snprintf(kill_desc, sizeof(kill_desc), "%s watermark is breached and thrashing (%" PRId64 "%%)", wmark < WMARK_LOW ? "min" : "low", thrashing); cut_thrashing_limit = true; /* Do not kill perceptible apps unless thrashing at critical levels */ if (thrashing < thrashing_critical_pct) { min_score_adj = PERCEPTIBLE_APP_ADJ + 1; } check_filecache = true; } else if (reclaim == DIRECT_RECLAIM && thrashing > thrashing_limit) { /* Page cache is thrashing while in direct reclaim (mostly happens on lowram devices) */ kill_reason = DIRECT_RECL_AND_THRASHING; snprintf(kill_desc, sizeof(kill_desc), "device is in direct reclaim and thrashing (%" PRId64 "%%)", thrashing); cut_thrashing_limit = true; /* Do not kill perceptible apps unless thrashing at critical levels */ if (thrashing < thrashing_critical_pct) { min_score_adj = PERCEPTIBLE_APP_ADJ + 1; } check_filecache = true; } else if (reclaim == DIRECT_RECLAIM && direct_reclaim_threshold_ms > 0 && direct_reclaim_duration_ms > direct_reclaim_threshold_ms) { kill_reason = DIRECT_RECL_STUCK; snprintf(kill_desc, sizeof(kill_desc), "device is stuck in direct reclaim (%ldms > %dms)", direct_reclaim_duration_ms, direct_reclaim_threshold_ms); } else if (check_filecache) { int64_t file_lru_kb = (vs.field.nr_inactive_file + vs.field.nr_active_file) * page_k; if (file_lru_kb < filecache_min_kb) { /* File cache is too low after thrashing, keep killing background processes */ kill_reason = LOW_FILECACHE_AFTER_THRASHING; snprintf(kill_desc, sizeof(kill_desc), "filecache is low (%" PRId64 "kB < %" PRId64 "kB) after thrashing", file_lru_kb, filecache_min_kb); min_score_adj = PERCEPTIBLE_APP_ADJ + 1; } else { /* File cache is big enough, stop checking */ check_filecache = false; } } /* Check if a cached app should be killed */ if (kill_reason == NONE && wmark < WMARK_HIGH) { kill_reason = LOW_MEM; snprintf(kill_desc, sizeof(kill_desc), "%s watermark is breached", wmark < WMARK_LOW ? "min" : "low"); min_score_adj = lowmem_min_oom_score; } /* Kill a process if necessary */ if (kill_reason != NONE) { struct kill_info ki = { .kill_reason = kill_reason, .kill_desc = kill_desc, .thrashing = (int)thrashing, .max_thrashing = max_thrashing, }; static bool first_kill = true; /* Make sure watermarks are correct before the first kill */ if (first_kill) { first_kill = false; watermarks.high_wmark = 0; // force recomputation goto update_watermarks; } /* Allow killing perceptible apps if the system is stalled */ if (critical_stall) { min_score_adj = 0; } psi_parse_io(&psi_data); psi_parse_cpu(&psi_data); int pages_freed = find_and_kill_process(min_score_adj, &ki, &mi, &wi, &curr_tm, &psi_data); if (pages_freed > 0) { killing = true; max_thrashing = 0; if (cut_thrashing_limit) { /* * Cut thrasing limit by thrashing_limit_decay_pct percentage of the current * thrashing limit until the system stops thrashing. */ thrashing_limit = (thrashing_limit * (100 - thrashing_limit_decay_pct)) / 100; } } } no_kill: /* Do not poll if kernel supports pidfd waiting */ if (is_waiting_for_kill()) { /* Pause polling if we are waiting for process death notification */ poll_params->update = POLLING_PAUSE; return; } /* * Start polling after initial PSI event; * extend polling while device is in direct reclaim or process is being killed; * do not extend when kswapd reclaims because that might go on for a long time * without causing memory pressure */ if (events || killing || reclaim == DIRECT_RECLAIM) { poll_params->update = POLLING_START; } /* Decide the polling interval */ if (swap_is_low || killing) { /* Fast polling during and after a kill or when swap is low */ poll_params->polling_interval_ms = PSI_POLL_PERIOD_SHORT_MS; } else { /* By default use long intervals */ poll_params->polling_interval_ms = PSI_POLL_PERIOD_LONG_MS; } } static std::string GetCgroupAttributePath(const char* attr) { std::string path; if (!CgroupGetAttributePath(attr, &path)) { ALOGE("Unknown cgroup attribute %s", attr); } return path; } // The implementation of this function relies on memcg statistics that are only available in the // v1 cgroup hierarchy. static void mp_event_common(int data, uint32_t events, struct polling_params *poll_params) { unsigned long long evcount; int64_t mem_usage, memsw_usage; int64_t mem_pressure; union meminfo mi; struct zoneinfo zi; struct timespec curr_tm; static unsigned long kill_skip_count = 0; enum vmpressure_level level = (enum vmpressure_level)data; long other_free = 0, other_file = 0; int min_score_adj; int minfree = 0; static const std::string mem_usage_path = GetCgroupAttributePath("MemUsage"); static struct reread_data mem_usage_file_data = { .filename = mem_usage_path.c_str(), .fd = -1, }; static const std::string memsw_usage_path = GetCgroupAttributePath("MemAndSwapUsage"); static struct reread_data memsw_usage_file_data = { .filename = memsw_usage_path.c_str(), .fd = -1, }; static struct wakeup_info wi; mp_event_count++; if (debug_process_killing) { ALOGI("%s memory pressure event #%" PRIu64 " is triggered", level_name[level], mp_event_count); } if (!use_psi_monitors) { /* * Check all event counters from low to critical * and upgrade to the highest priority one. By reading * eventfd we also reset the event counters. */ for (int lvl = VMPRESS_LEVEL_LOW; lvl < VMPRESS_LEVEL_COUNT; lvl++) { if (mpevfd[lvl] != -1 && TEMP_FAILURE_RETRY(read(mpevfd[lvl], &evcount, sizeof(evcount))) > 0 && evcount > 0 && lvl > level) { level = static_cast(lvl); } } } /* Start polling after initial PSI event */ if (use_psi_monitors && events) { /* Override polling params only if current event is more critical */ if (!poll_params->poll_handler || data > poll_params->poll_handler->data) { poll_params->polling_interval_ms = PSI_POLL_PERIOD_SHORT_MS; poll_params->update = POLLING_START; } } if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) { ALOGE("Failed to get current time"); return; } record_wakeup_time(&curr_tm, events ? Event : Polling, &wi); if (kill_timeout_ms && get_time_diff_ms(&last_kill_tm, &curr_tm) < static_cast(kill_timeout_ms)) { /* * If we're within the no-kill timeout, see if there's pending reclaim work * from the last killed process. If so, skip killing for now. */ if (is_kill_pending()) { kill_skip_count++; wi.skipped_wakeups++; return; } /* * Process is dead, stop waiting. This has no effect if pidfds are supported and * death notification already caused waiting to stop. */ stop_wait_for_proc_kill(true); } else { /* * Killing took longer than no-kill timeout. Stop waiting for the last process * to die because we are ready to kill again. */ stop_wait_for_proc_kill(false); } if (kill_skip_count > 0) { ALOGI("%lu memory pressure events were skipped after a kill!", kill_skip_count); kill_skip_count = 0; } if (meminfo_parse(&mi) < 0 || zoneinfo_parse(&zi) < 0) { ALOGE("Failed to get free memory!"); return; } if (use_minfree_levels) { int i; other_free = mi.field.nr_free_pages - zi.totalreserve_pages; if (mi.field.nr_file_pages > (mi.field.shmem + mi.field.unevictable + mi.field.swap_cached)) { other_file = (mi.field.nr_file_pages - mi.field.shmem - mi.field.unevictable - mi.field.swap_cached); } else { other_file = 0; } min_score_adj = OOM_SCORE_ADJ_MAX + 1; for (i = 0; i < lowmem_targets_size; i++) { minfree = lowmem_minfree[i]; if (other_free < minfree && other_file < minfree) { min_score_adj = lowmem_adj[i]; break; } } if (min_score_adj == OOM_SCORE_ADJ_MAX + 1) { if (debug_process_killing && lowmem_targets_size) { ALOGI("Ignore %s memory pressure event " "(free memory=%ldkB, cache=%ldkB, limit=%ldkB)", level_name[level], other_free * page_k, other_file * page_k, (long)lowmem_minfree[lowmem_targets_size - 1] * page_k); } return; } goto do_kill; } if (level == VMPRESS_LEVEL_LOW) { record_low_pressure_levels(&mi); } if (level_oomadj[level] > OOM_SCORE_ADJ_MAX) { /* Do not monitor this pressure level */ return; } if ((mem_usage = get_memory_usage(&mem_usage_file_data)) < 0) { goto do_kill; } if ((memsw_usage = get_memory_usage(&memsw_usage_file_data)) < 0) { goto do_kill; } // Calculate percent for swappinness. mem_pressure = (mem_usage * 100) / memsw_usage; if (enable_pressure_upgrade && level != VMPRESS_LEVEL_CRITICAL) { // We are swapping too much. if (mem_pressure < upgrade_pressure) { level = upgrade_level(level); if (debug_process_killing) { ALOGI("Event upgraded to %s", level_name[level]); } } } // If we still have enough swap space available, check if we want to // ignore/downgrade pressure events. if (get_free_swap(&mi) >= mi.field.total_swap * swap_free_low_percentage / 100) { // If the pressure is larger than downgrade_pressure lmk will not // kill any process, since enough memory is available. if (mem_pressure > downgrade_pressure) { if (debug_process_killing) { ALOGI("Ignore %s memory pressure", level_name[level]); } return; } else if (level == VMPRESS_LEVEL_CRITICAL && mem_pressure > upgrade_pressure) { if (debug_process_killing) { ALOGI("Downgrade critical memory pressure"); } // Downgrade event, since enough memory available. level = downgrade_level(level); } } do_kill: if (low_ram_device) { /* For Go devices kill only one task */ if (find_and_kill_process(use_minfree_levels ? min_score_adj : level_oomadj[level], NULL, &mi, &wi, &curr_tm, NULL) == 0) { if (debug_process_killing) { ALOGI("Nothing to kill"); } } } else { int pages_freed; static struct timespec last_report_tm; static unsigned long report_skip_count = 0; if (!use_minfree_levels) { /* Free up enough memory to downgrate the memory pressure to low level */ if (mi.field.nr_free_pages >= low_pressure_mem.max_nr_free_pages) { if (debug_process_killing) { ALOGI("Ignoring pressure since more memory is " "available (%" PRId64 ") than watermark (%" PRId64 ")", mi.field.nr_free_pages, low_pressure_mem.max_nr_free_pages); } return; } min_score_adj = level_oomadj[level]; } pages_freed = find_and_kill_process(min_score_adj, NULL, &mi, &wi, &curr_tm, NULL); if (pages_freed == 0 && min_score_adj == 0) { lmkd_no_kill_candidates_hook(); } if (pages_freed == 0) { /* Rate limit kill reports when nothing was reclaimed */ if (get_time_diff_ms(&last_report_tm, &curr_tm) < FAIL_REPORT_RLIMIT_MS) { report_skip_count++; return; } } /* Log whenever we kill or when report rate limit allows */ if (use_minfree_levels) { ALOGI("Reclaimed %ldkB, cache(%ldkB) and free(%" PRId64 "kB)-reserved(%" PRId64 "kB) " "below min(%ldkB) for oom_score_adj %d", pages_freed * page_k, other_file * page_k, mi.field.nr_free_pages * page_k, zi.totalreserve_pages * page_k, minfree * page_k, min_score_adj); } else { ALOGI("Reclaimed %ldkB at oom_score_adj %d", pages_freed * page_k, min_score_adj); } if (report_skip_count > 0) { ALOGI("Suppressed %lu failed kill reports", report_skip_count); report_skip_count = 0; } last_report_tm = curr_tm; } if (is_waiting_for_kill()) { /* pause polling if we are waiting for process death notification */ poll_params->update = POLLING_PAUSE; } } static bool init_mp_psi(enum vmpressure_level level, bool use_new_strategy) { int fd; /* Do not register a handler if threshold_ms is not set */ if (!psi_thresholds[level].threshold_ms) { return true; } fd = init_psi_monitor(psi_thresholds[level].stall_type, psi_thresholds[level].threshold_ms * US_PER_MS, PSI_WINDOW_SIZE_MS * US_PER_MS); if (fd < 0) { return false; } vmpressure_hinfo[level].handler = use_new_strategy ? mp_event_psi : mp_event_common; vmpressure_hinfo[level].data = level; if (register_psi_monitor(epollfd, fd, &vmpressure_hinfo[level]) < 0) { destroy_psi_monitor(fd); return false; } maxevents++; mpevfd[level] = fd; return true; } static void destroy_mp_psi(enum vmpressure_level level) { int fd = mpevfd[level]; if (fd < 0) { return; } if (unregister_psi_monitor(epollfd, fd) < 0) { ALOGE("Failed to unregister psi monitor for %s memory pressure; errno=%d", level_name[level], errno); } maxevents--; destroy_psi_monitor(fd); mpevfd[level] = -1; } enum class MemcgVersion { kNotFound, kV1, kV2, }; static MemcgVersion __memcg_version() { std::string cgroupv2_path, memcg_path; if (!CgroupGetControllerPath("memory", &memcg_path)) { return MemcgVersion::kNotFound; } return CgroupGetControllerPath(CGROUPV2_HIERARCHY_NAME, &cgroupv2_path) && cgroupv2_path == memcg_path ? MemcgVersion::kV2 : MemcgVersion::kV1; } static MemcgVersion memcg_version() { static MemcgVersion version = __memcg_version(); return version; } static void memevent_listener_notification(int data __unused, uint32_t events __unused, struct polling_params* poll_params __unused) { struct timespec curr_tm; std::vector mem_events; if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) { direct_reclaim_start_tm.tv_sec = 0; direct_reclaim_start_tm.tv_nsec = 0; ALOGE("Failed to get current time for memevent listener notification."); return; } if (!memevent_listener->getMemEvents(mem_events)) { direct_reclaim_start_tm.tv_sec = 0; direct_reclaim_start_tm.tv_nsec = 0; ALOGE("Failed fetching memory listener events."); return; } for (const mem_event_t& mem_event : mem_events) { switch (mem_event.type) { /* Direct Reclaim */ case MEM_EVENT_DIRECT_RECLAIM_BEGIN: direct_reclaim_start_tm = curr_tm; break; case MEM_EVENT_DIRECT_RECLAIM_END: direct_reclaim_start_tm.tv_sec = 0; direct_reclaim_start_tm.tv_nsec = 0; break; /* kswapd */ case MEM_EVENT_KSWAPD_WAKE: kswapd_start_tm = curr_tm; break; case MEM_EVENT_KSWAPD_SLEEP: kswapd_start_tm.tv_sec = 0; kswapd_start_tm.tv_nsec = 0; break; } } } static bool init_memevent_listener_monitoring() { static struct event_handler_info direct_reclaim_poll_hinfo = {0, memevent_listener_notification}; if (memevent_listener) return true; // Make sure bpf programs are loaded, else we'll wait until they are loaded android::bpf::waitForProgsLoaded(); memevent_listener = std::make_unique( android::bpf::memevents::MemEventClient::LMKD); if (!memevent_listener->ok()) { ALOGE("Failed to initialize memevents listener"); memevent_listener.reset(); return false; } if (!memevent_listener->registerEvent(MEM_EVENT_DIRECT_RECLAIM_BEGIN) || !memevent_listener->registerEvent(MEM_EVENT_DIRECT_RECLAIM_END)) { ALOGE("Failed to register direct reclaim memevents"); memevent_listener.reset(); return false; } if (!memevent_listener->registerEvent(MEM_EVENT_KSWAPD_WAKE) || !memevent_listener->registerEvent(MEM_EVENT_KSWAPD_SLEEP)) { ALOGE("Failed to register kswapd memevents"); memevent_listener.reset(); return false; } int memevent_listener_fd = memevent_listener->getRingBufferFd(); if (memevent_listener_fd < 0) { memevent_listener.reset(); ALOGE("Invalid memevent_listener fd: %d", memevent_listener_fd); return false; } struct epoll_event epev; epev.events = EPOLLIN; epev.data.ptr = (void*)&direct_reclaim_poll_hinfo; if (epoll_ctl(epollfd, EPOLL_CTL_ADD, memevent_listener_fd, &epev) < 0) { ALOGE("Failed registering memevent_listener fd: %d; errno=%d", memevent_listener_fd, errno); memevent_listener.reset(); return false; } direct_reclaim_start_tm.tv_sec = 0; direct_reclaim_start_tm.tv_nsec = 0; maxevents++; return true; } static bool init_psi_monitors() { /* * When PSI is used on low-ram devices or on high-end devices without memfree levels * use new kill strategy based on zone watermarks, free swap and thrashing stats. * Also use the new strategy if memcg has not been mounted in the v1 cgroups hiearchy since * the old strategy relies on memcg attributes that are available only in the v1 cgroups * hiearchy. */ bool use_new_strategy = GET_LMK_PROPERTY(bool, "use_new_strategy", low_ram_device || !use_minfree_levels); if (!use_new_strategy && memcg_version() != MemcgVersion::kV1) { ALOGE("Old kill strategy can only be used with v1 cgroup hierarchy"); return false; } /* In default PSI mode override stall amounts using system properties */ if (use_new_strategy) { /* Do not use low pressure level */ psi_thresholds[VMPRESS_LEVEL_LOW].threshold_ms = 0; psi_thresholds[VMPRESS_LEVEL_MEDIUM].threshold_ms = psi_partial_stall_ms; psi_thresholds[VMPRESS_LEVEL_CRITICAL].threshold_ms = psi_complete_stall_ms; } if (!init_mp_psi(VMPRESS_LEVEL_LOW, use_new_strategy)) { return false; } if (!init_mp_psi(VMPRESS_LEVEL_MEDIUM, use_new_strategy)) { destroy_mp_psi(VMPRESS_LEVEL_LOW); return false; } if (!init_mp_psi(VMPRESS_LEVEL_CRITICAL, use_new_strategy)) { destroy_mp_psi(VMPRESS_LEVEL_MEDIUM); destroy_mp_psi(VMPRESS_LEVEL_LOW); return false; } return true; } static bool init_mp_common(enum vmpressure_level level) { // The implementation of this function relies on memcg statistics that are only available in the // v1 cgroup hierarchy. if (memcg_version() != MemcgVersion::kV1) { ALOGE("%s: global monitoring is only available for the v1 cgroup hierarchy", __func__); return false; } int mpfd; int evfd; int evctlfd; char buf[256]; struct epoll_event epev; int ret; int level_idx = (int)level; const char *levelstr = level_name[level_idx]; /* gid containing AID_SYSTEM required */ mpfd = open(GetCgroupAttributePath("MemPressureLevel").c_str(), O_RDONLY | O_CLOEXEC); if (mpfd < 0) { ALOGI("No kernel memory.pressure_level support (errno=%d)", errno); goto err_open_mpfd; } evctlfd = open(GetCgroupAttributePath("MemCgroupEventControl").c_str(), O_WRONLY | O_CLOEXEC); if (evctlfd < 0) { ALOGI("No kernel memory cgroup event control (errno=%d)", errno); goto err_open_evctlfd; } evfd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); if (evfd < 0) { ALOGE("eventfd failed for level %s; errno=%d", levelstr, errno); goto err_eventfd; } ret = snprintf(buf, sizeof(buf), "%d %d %s", evfd, mpfd, levelstr); if (ret >= (ssize_t)sizeof(buf)) { ALOGE("cgroup.event_control line overflow for level %s", levelstr); goto err; } ret = TEMP_FAILURE_RETRY(write(evctlfd, buf, strlen(buf) + 1)); if (ret == -1) { ALOGE("cgroup.event_control write failed for level %s; errno=%d", levelstr, errno); goto err; } epev.events = EPOLLIN; /* use data to store event level */ vmpressure_hinfo[level_idx].data = level_idx; vmpressure_hinfo[level_idx].handler = mp_event_common; epev.data.ptr = (void *)&vmpressure_hinfo[level_idx]; ret = epoll_ctl(epollfd, EPOLL_CTL_ADD, evfd, &epev); if (ret == -1) { ALOGE("epoll_ctl for level %s failed; errno=%d", levelstr, errno); goto err; } maxevents++; mpevfd[level] = evfd; close(evctlfd); return true; err: close(evfd); err_eventfd: close(evctlfd); err_open_evctlfd: close(mpfd); err_open_mpfd: return false; } static void destroy_mp_common(enum vmpressure_level level) { struct epoll_event epev; int fd = mpevfd[level]; if (fd < 0) { return; } if (epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, &epev)) { // Log an error and keep going ALOGE("epoll_ctl for level %s failed; errno=%d", level_name[level], errno); } maxevents--; close(fd); mpevfd[level] = -1; } static void kernel_event_handler(int data __unused, uint32_t events __unused, struct polling_params *poll_params __unused) { poll_kernel(kpoll_fd); } static bool init_monitors() { ALOGI("Wakeup counter is reset from %" PRIu64 " to 0", mp_event_count); mp_event_count = 0; /* Try to use psi monitor first if kernel has it */ use_psi_monitors = GET_LMK_PROPERTY(bool, "use_psi", true) && init_psi_monitors(); /* Fall back to vmpressure */ if (!use_psi_monitors && (!init_mp_common(VMPRESS_LEVEL_LOW) || !init_mp_common(VMPRESS_LEVEL_MEDIUM) || !init_mp_common(VMPRESS_LEVEL_CRITICAL))) { ALOGE("Kernel does not support memory pressure events or in-kernel low memory killer"); return false; } if (use_psi_monitors) { ALOGI("Using psi monitors for memory pressure detection"); } else { ALOGI("Using vmpressure for memory pressure detection"); } monitors_initialized = true; return true; } static void destroy_monitors() { if (use_psi_monitors) { destroy_mp_psi(VMPRESS_LEVEL_CRITICAL); destroy_mp_psi(VMPRESS_LEVEL_MEDIUM); destroy_mp_psi(VMPRESS_LEVEL_LOW); } else { destroy_mp_common(VMPRESS_LEVEL_CRITICAL); destroy_mp_common(VMPRESS_LEVEL_MEDIUM); destroy_mp_common(VMPRESS_LEVEL_LOW); } } static void drop_reaper_comm() { close(reaper_comm_fd[0]); close(reaper_comm_fd[1]); } static bool setup_reaper_comm() { if (pipe(reaper_comm_fd)) { ALOGE("pipe failed: %s", strerror(errno)); return false; } // Ensure main thread never blocks on read int flags = fcntl(reaper_comm_fd[0], F_GETFL); if (fcntl(reaper_comm_fd[0], F_SETFL, flags | O_NONBLOCK)) { ALOGE("fcntl failed: %s", strerror(errno)); drop_reaper_comm(); return false; } return true; } static bool init_reaper() { if (!reaper.is_reaping_supported()) { ALOGI("Process reaping is not supported"); return false; } if (!setup_reaper_comm()) { ALOGE("Failed to create thread communication channel"); return false; } // Setup epoll handler struct epoll_event epev; static struct event_handler_info kill_failed_hinfo = { 0, kill_fail_handler }; epev.events = EPOLLIN; epev.data.ptr = (void *)&kill_failed_hinfo; if (epoll_ctl(epollfd, EPOLL_CTL_ADD, reaper_comm_fd[0], &epev)) { ALOGE("epoll_ctl failed: %s", strerror(errno)); drop_reaper_comm(); return false; } if (!reaper.init(reaper_comm_fd[1])) { ALOGE("Failed to initialize reaper object"); if (epoll_ctl(epollfd, EPOLL_CTL_DEL, reaper_comm_fd[0], &epev)) { ALOGE("epoll_ctl failed: %s", strerror(errno)); } drop_reaper_comm(); return false; } maxevents++; return true; } static int init(void) { static struct event_handler_info kernel_poll_hinfo = { 0, kernel_event_handler }; struct reread_data file_data = { .filename = ZONEINFO_PATH, .fd = -1, }; struct epoll_event epev; int pidfd; int i; int ret; // Initialize page size pagesize = getpagesize(); page_k = pagesize / 1024; epollfd = epoll_create(MAX_EPOLL_EVENTS); if (epollfd == -1) { ALOGE("epoll_create failed (errno=%d)", errno); return -1; } // mark data connections as not connected for (int i = 0; i < MAX_DATA_CONN; i++) { data_sock[i].sock = -1; } ctrl_sock.sock = android_get_control_socket("lmkd"); if (ctrl_sock.sock < 0) { ALOGE("get lmkd control socket failed"); return -1; } ret = listen(ctrl_sock.sock, MAX_DATA_CONN); if (ret < 0) { ALOGE("lmkd control socket listen failed (errno=%d)", errno); return -1; } epev.events = EPOLLIN; ctrl_sock.handler_info.handler = ctrl_connect_handler; epev.data.ptr = (void *)&(ctrl_sock.handler_info); if (epoll_ctl(epollfd, EPOLL_CTL_ADD, ctrl_sock.sock, &epev) == -1) { ALOGE("epoll_ctl for lmkd control socket failed (errno=%d)", errno); return -1; } maxevents++; has_inkernel_module = !access(INKERNEL_MINFREE_PATH, W_OK); use_inkernel_interface = has_inkernel_module; if (use_inkernel_interface) { ALOGI("Using in-kernel low memory killer interface"); if (init_poll_kernel()) { epev.events = EPOLLIN; epev.data.ptr = (void*)&kernel_poll_hinfo; if (epoll_ctl(epollfd, EPOLL_CTL_ADD, kpoll_fd, &epev) != 0) { ALOGE("epoll_ctl for lmk events failed (errno=%d)", errno); close(kpoll_fd); kpoll_fd = -1; } else { maxevents++; /* let the others know it does support reporting kills */ property_set("sys.lmk.reportkills", "1"); } } } else { // Do not register monitors until boot completed for devices configured // for delaying monitors. This is done to save CPU cycles for low // resource devices during boot up. if (!delay_monitors_until_boot || property_get_bool("sys.boot_completed", false)) { if (!init_monitors()) { return -1; } } /* let the others know it does support reporting kills */ property_set("sys.lmk.reportkills", "1"); } for (i = 0; i <= ADJTOSLOT(OOM_SCORE_ADJ_MAX); i++) { procadjslot_list[i].next = &procadjslot_list[i]; procadjslot_list[i].prev = &procadjslot_list[i]; } memset(killcnt_idx, KILLCNT_INVALID_IDX, sizeof(killcnt_idx)); /* * Read zoneinfo as the biggest file we read to create and size the initial * read buffer and avoid memory re-allocations during memory pressure */ if (reread_file(&file_data) == NULL) { ALOGE("Failed to read %s: %s", file_data.filename, strerror(errno)); } /* check if kernel supports pidfd_open syscall */ pidfd = TEMP_FAILURE_RETRY(pidfd_open(getpid(), 0)); if (pidfd < 0) { pidfd_supported = (errno != ENOSYS); } else { pidfd_supported = true; close(pidfd); } ALOGI("Process polling is %s", pidfd_supported ? "supported" : "not supported" ); if (!lmkd_init_hook()) { ALOGE("Failed to initialize LMKD hooks."); return -1; } return 0; } static bool polling_paused(struct polling_params *poll_params) { return poll_params->paused_handler != NULL; } static void resume_polling(struct polling_params *poll_params, struct timespec curr_tm) { poll_params->poll_start_tm = curr_tm; poll_params->poll_handler = poll_params->paused_handler; poll_params->polling_interval_ms = PSI_POLL_PERIOD_SHORT_MS; poll_params->paused_handler = NULL; } static void call_handler(struct event_handler_info* handler_info, struct polling_params *poll_params, uint32_t events) { struct timespec curr_tm; watchdog.start(); poll_params->update = POLLING_DO_NOT_CHANGE; handler_info->handler(handler_info->data, events, poll_params); clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm); if (poll_params->poll_handler == handler_info) { poll_params->last_poll_tm = curr_tm; } switch (poll_params->update) { case POLLING_START: /* * Poll for the duration of PSI_WINDOW_SIZE_MS after the * initial PSI event because psi events are rate-limited * at one per sec. */ poll_params->poll_start_tm = curr_tm; poll_params->poll_handler = handler_info; poll_params->last_poll_tm = curr_tm; break; case POLLING_PAUSE: poll_params->paused_handler = handler_info; poll_params->poll_handler = NULL; break; case POLLING_RESUME: resume_polling(poll_params, curr_tm); break; case POLLING_DO_NOT_CHANGE: if (poll_params->poll_handler && get_time_diff_ms(&poll_params->poll_start_tm, &curr_tm) > PSI_WINDOW_SIZE_MS) { /* Polled for the duration of PSI window, time to stop */ poll_params->poll_handler = NULL; } break; } watchdog.stop(); } static void mainloop(void) { struct event_handler_info* handler_info; struct polling_params poll_params; struct timespec curr_tm; struct epoll_event *evt; long delay = -1; poll_params.poll_handler = NULL; poll_params.paused_handler = NULL; while (1) { struct epoll_event events[MAX_EPOLL_EVENTS]; int nevents; int i; if (poll_params.poll_handler) { bool poll_now; clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm); if (poll_params.update == POLLING_RESUME) { /* Just transitioned into POLLING_RESUME, poll immediately. */ poll_now = true; nevents = 0; } else { /* Calculate next timeout */ delay = get_time_diff_ms(&poll_params.last_poll_tm, &curr_tm); delay = (delay < poll_params.polling_interval_ms) ? poll_params.polling_interval_ms - delay : poll_params.polling_interval_ms; /* Wait for events until the next polling timeout */ nevents = epoll_wait(epollfd, events, maxevents, delay); /* Update current time after wait */ clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm); poll_now = (get_time_diff_ms(&poll_params.last_poll_tm, &curr_tm) >= poll_params.polling_interval_ms); } if (poll_now) { call_handler(poll_params.poll_handler, &poll_params, 0); } } else { if (kill_timeout_ms && is_waiting_for_kill()) { clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm); delay = kill_timeout_ms - get_time_diff_ms(&last_kill_tm, &curr_tm); /* Wait for pidfds notification or kill timeout to expire */ nevents = (delay > 0) ? epoll_wait(epollfd, events, maxevents, delay) : 0; if (nevents == 0) { /* Kill notification timed out */ stop_wait_for_proc_kill(false); if (polling_paused(&poll_params)) { clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm); poll_params.update = POLLING_RESUME; resume_polling(&poll_params, curr_tm); } } } else { /* Wait for events with no timeout */ nevents = epoll_wait(epollfd, events, maxevents, -1); } } if (nevents == -1) { if (errno == EINTR) continue; ALOGE("epoll_wait failed (errno=%d)", errno); continue; } /* * First pass to see if any data socket connections were dropped. * Dropped connection should be handled before any other events * to deallocate data connection and correctly handle cases when * connection gets dropped and reestablished in the same epoll cycle. * In such cases it's essential to handle connection closures first. */ for (i = 0, evt = &events[0]; i < nevents; ++i, evt++) { if ((evt->events & EPOLLHUP) && evt->data.ptr) { handler_info = (struct event_handler_info*)evt->data.ptr; if (handler_info->handler == kill_done_handler) { call_handler(handler_info, &poll_params, evt->events); } else { ALOGI("lmkd data connection dropped"); watchdog.start(); ctrl_data_close(handler_info->data); watchdog.stop(); } } } /* Second pass to handle all other events */ for (i = 0, evt = &events[0]; i < nevents; ++i, evt++) { if (evt->events & EPOLLERR) { ALOGD("EPOLLERR on event #%d", i); } if (evt->events & EPOLLHUP) { /* This case was handled in the first pass */ continue; } if (evt->data.ptr) { handler_info = (struct event_handler_info*)evt->data.ptr; call_handler(handler_info, &poll_params, evt->events); } } } } int issue_reinit() { int sock; sock = lmkd_connect(); if (sock < 0) { ALOGE("failed to connect to lmkd: %s", strerror(errno)); return -1; } enum update_props_result res = lmkd_update_props(sock); switch (res) { case UPDATE_PROPS_SUCCESS: ALOGI("lmkd updated properties successfully"); break; case UPDATE_PROPS_SEND_ERR: ALOGE("failed to send lmkd request: %s", strerror(errno)); break; case UPDATE_PROPS_RECV_ERR: ALOGE("failed to receive lmkd reply: %s", strerror(errno)); break; case UPDATE_PROPS_FORMAT_ERR: ALOGE("lmkd reply is invalid"); break; case UPDATE_PROPS_FAIL: ALOGE("lmkd failed to update its properties"); break; } close(sock); return res == UPDATE_PROPS_SUCCESS ? 0 : -1; } static int on_boot_completed() { int sock; sock = lmkd_connect(); if (sock < 0) { ALOGE("failed to connect to lmkd: %s", strerror(errno)); return -1; } enum boot_completed_notification_result res = lmkd_notify_boot_completed(sock); switch (res) { case BOOT_COMPLETED_NOTIF_SUCCESS: break; case BOOT_COMPLETED_NOTIF_ALREADY_HANDLED: ALOGW("lmkd already handled boot-completed operations"); break; case BOOT_COMPLETED_NOTIF_SEND_ERR: ALOGE("failed to send lmkd request: %m"); break; case BOOT_COMPLETED_NOTIF_RECV_ERR: ALOGE("failed to receive request: %m"); break; case BOOT_COMPLETED_NOTIF_FORMAT_ERR: ALOGE("lmkd reply is invalid"); break; case BOOT_COMPLETED_NOTIF_FAILS: ALOGE("lmkd failed to receive boot-completed notification"); break; } close(sock); return res == BOOT_COMPLETED_NOTIF_SUCCESS ? 0 : -1; } static bool update_props() { /* By default disable low level vmpressure events */ level_oomadj[VMPRESS_LEVEL_LOW] = GET_LMK_PROPERTY(int32, "low", OOM_SCORE_ADJ_MAX + 1); level_oomadj[VMPRESS_LEVEL_MEDIUM] = GET_LMK_PROPERTY(int32, "medium", 800); level_oomadj[VMPRESS_LEVEL_CRITICAL] = GET_LMK_PROPERTY(int32, "critical", 0); debug_process_killing = GET_LMK_PROPERTY(bool, "debug", false); /* By default disable upgrade/downgrade logic */ enable_pressure_upgrade = GET_LMK_PROPERTY(bool, "critical_upgrade", false); upgrade_pressure = (int64_t)GET_LMK_PROPERTY(int32, "upgrade_pressure", 100); downgrade_pressure = (int64_t)GET_LMK_PROPERTY(int32, "downgrade_pressure", 100); kill_heaviest_task = GET_LMK_PROPERTY(bool, "kill_heaviest_task", false); low_ram_device = property_get_bool("ro.config.low_ram", false); kill_timeout_ms = (unsigned long)GET_LMK_PROPERTY(int32, "kill_timeout_ms", 100); pressure_after_kill_min_score = (unsigned long)GET_LMK_PROPERTY(int32, "pressure_after_kill_min_score", 0); use_minfree_levels = GET_LMK_PROPERTY(bool, "use_minfree_levels", false); per_app_memcg = property_get_bool("ro.config.per_app_memcg", low_ram_device); swap_free_low_percentage = clamp(0, 100, GET_LMK_PROPERTY(int32, "swap_free_low_percentage", DEF_LOW_SWAP)); psi_partial_stall_ms = GET_LMK_PROPERTY(int32, "psi_partial_stall_ms", low_ram_device ? DEF_PARTIAL_STALL_LOWRAM : DEF_PARTIAL_STALL); psi_complete_stall_ms = GET_LMK_PROPERTY(int32, "psi_complete_stall_ms", DEF_COMPLETE_STALL); thrashing_limit_pct = std::max(0, GET_LMK_PROPERTY(int32, "thrashing_limit", low_ram_device ? DEF_THRASHING_LOWRAM : DEF_THRASHING)); thrashing_limit_decay_pct = clamp(0, 100, GET_LMK_PROPERTY(int32, "thrashing_limit_decay", low_ram_device ? DEF_THRASHING_DECAY_LOWRAM : DEF_THRASHING_DECAY)); thrashing_critical_pct = std::max( 0, GET_LMK_PROPERTY(int32, "thrashing_limit_critical", thrashing_limit_pct * 3)); swap_util_max = clamp(0, 100, GET_LMK_PROPERTY(int32, "swap_util_max", 100)); filecache_min_kb = GET_LMK_PROPERTY(int64, "filecache_min_kb", 0); stall_limit_critical = GET_LMK_PROPERTY(int64, "stall_limit_critical", 100); delay_monitors_until_boot = GET_LMK_PROPERTY(bool, "delay_monitors_until_boot", false); direct_reclaim_threshold_ms = GET_LMK_PROPERTY(int64, "direct_reclaim_threshold_ms", DEF_DIRECT_RECL_THRESH_MS); swap_compression_ratio = GET_LMK_PROPERTY(int64, "swap_compression_ratio", DEF_SWAP_COMP_RATIO); lowmem_min_oom_score = std::max(PERCEPTIBLE_APP_ADJ + 1, GET_LMK_PROPERTY(int32, "lowmem_min_oom_score", DEF_LOWMEM_MIN_SCORE)); reaper.enable_debug(debug_process_killing); /* Call the update props hook */ if (!lmkd_update_props_hook()) { ALOGE("Failed to update LMKD hook props."); return false; } return true; } int main(int argc, char **argv) { if ((argc > 1) && argv[1]) { if (!strcmp(argv[1], "--reinit")) { if (property_set(LMKD_REINIT_PROP, "")) { ALOGE("Failed to reset " LMKD_REINIT_PROP " property"); } return issue_reinit(); } else if (!strcmp(argv[1], "--boot_completed")) { return on_boot_completed(); } } if (!update_props()) { ALOGE("Failed to initialize props, exiting."); return -1; } ctx = create_android_logger(KILLINFO_LOG_TAG); if (!init()) { if (!use_inkernel_interface) { /* * MCL_ONFAULT pins pages as they fault instead of loading * everything immediately all at once. (Which would be bad, * because as of this writing, we have a lot of mapped pages we * never use.) Old kernels will see MCL_ONFAULT and fail with * EINVAL; we ignore this failure. * * N.B. read the man page for mlockall. MCL_CURRENT | MCL_ONFAULT * pins ⊆ MCL_CURRENT, converging to just MCL_CURRENT as we fault * in pages. */ /* CAP_IPC_LOCK required */ if (mlockall(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT) && (errno != EINVAL)) { ALOGW("mlockall failed %s", strerror(errno)); } /* CAP_NICE required */ struct sched_param param = { .sched_priority = 1, }; if (sched_setscheduler(0, SCHED_FIFO, ¶m)) { ALOGW("set SCHED_FIFO failed %s", strerror(errno)); } } if (init_reaper()) { ALOGI("Process reaper initialized with %d threads in the pool", reaper.thread_cnt()); } if (!watchdog.init()) { ALOGE("Failed to initialize the watchdog"); } mainloop(); } android_log_destroy(&ctx); ALOGI("exiting"); return 0; }