Linux内核-内存管理: Out Of Memory Management 源码分析

Posted by LB on Sun, Mar 8, 2020

我们在这篇文章中描述了Linux内核在 Out Of Memory Management 场景下的相关策略,接下来我们将对Linux 5.0内核的OOM相关源码进行分析。

一. 关键数据结构

针对源码部分,我们首先需要阐述oom_kill部分的核心数据结构。

文件路径:/linux/include/linux/oom.h

1.1 oom_control 结构体

首先,我们给出具体的内核定义:

 1/*
 2 * Details of the page allocation that triggered the oom killer that are used to
 3 * determine what should be killed.
 4 */
 5struct oom_control {
 6	/* Used to determine cpuset */
 7	struct zonelist *zonelist;
 8
 9	/* Used to determine mempolicy */
10	nodemask_t *nodemask;
11
12	/* Memory cgroup in which oom is invoked, or NULL for global oom */
13	struct mem_cgroup *memcg;
14
15	/* Used to determine cpuset and node locality requirement */
16	const gfp_t gfp_mask;
17
18	/*
19	 * order == -1 means the oom kill is required by sysrq, otherwise only
20	 * for display purposes.
21	 */
22	const int order;
23
24	/* Used by oom implementation, do not set */
25	unsigned long totalpages;
26	struct task_struct *chosen;
27	unsigned long chosen_points;
28
29	/* Used to print the constraint info. */
30	enum oom_constraint constraint;
31};

上面这个结构体中,着重强调以下几个参数:

  1. gfp_mask:触发OOM的那次页分配所使用的GFP(Get Free Pages)分配标志掩码,在oom阶段被用来判断cpuset与节点局部性要求,以及该次分配是否允许文件系统操作(__GFP_FS)等。
  2. totalpages : 总共的内存页。
  3. chosen :被选择待kill的进程task结构。
  4. chosen_points:被选择待kill的进程分数。
  5. constraint:oom分配约束的类型枚举类型。

二. out_of_memory 函数

当操作系统内存不够使用的时候,这个函数杀死“最佳进程”,这个函数并非最优操作,只是尽量让系统运行良好。

文件路径:/linux/mm/oom_kill.c

这部分内核源码如下:

 11033  bool out_of_memory(struct oom_control *oc)
 21034  {
 31035  	unsigned long freed = 0;
 41036  	enum oom_constraint constraint = CONSTRAINT_NONE;
 51037  
 61038  	if (oom_killer_disabled)
 71039  		return false;
 81040  
 91041  	if (!is_memcg_oom(oc)) {
101042  		blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
111043  		if (freed > 0)
121044  			/* Got some memory back in the last second. */
131045  			return true;
141046  	}
151047  
161048  	/*
171049  	 * If current has a pending SIGKILL or is exiting, then automatically
181050  	 * select it.  The goal is to allow it to allocate so that it may
191051  	 * quickly exit and free its memory.
201052  	 */
211053  	if (task_will_free_mem(current)) {
221054  		mark_oom_victim(current);
231055  		wake_oom_reaper(current);
241056  		return true;
251057  	}
261058  
271059  	/*
281060  	 * The OOM killer does not compensate for IO-less reclaim.
291061  	 * pagefault_out_of_memory lost its gfp context so we have to
301062  	 * make sure exclude 0 mask - all other users should have at least
311063  	 * ___GFP_DIRECT_RECLAIM to get here.
321064  	 */
331065  	if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS))
341066  		return true;
351067  
361068  	/*
371069  	 * Check if there were limitations on the allocation (only relevant for
381070  	 * NUMA and memcg) that may require different handling.
391071  	 */
401072  	constraint = constrained_alloc(oc);
411073  	if (constraint != CONSTRAINT_MEMORY_POLICY)
421074  		oc->nodemask = NULL;
431075  	check_panic_on_oom(oc, constraint);
441076  
451077  	if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
461078  	    current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
471079  	    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
481080  		get_task_struct(current);
491081  		oc->chosen = current;
501082  		oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
511083  		return true;
521084  	}
531085  
541086  	select_bad_process(oc);
551087  	/* Found nothing?!?! */
561088  	if (!oc->chosen) {
571089  		dump_header(oc, NULL);
581090  		pr_warn("Out of memory and no killable processes...\n");
591091  		/*
601092  		 * If we got here due to an actual allocation at the
611093  		 * system level, we cannot survive this and will enter
621094  		 * an endless loop in the allocator. Bail out now.
631095  		 */
641096  		if (!is_sysrq_oom(oc) && !is_memcg_oom(oc))
651097  			panic("System is deadlocked on memory\n");
661098  	}
671099  	if (oc->chosen && oc->chosen != (void *)-1UL)
681100  		oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
691101  				 "Memory cgroup out of memory");
701102  	return !!oc->chosen;
711103  }

针对上面的代码,进行相关阐述如下:

  • 1038-1039 : 如果内核没有开启oom_killer 则退出执行流程。
  • 1041-1046:如果不是memcg(内存cgroup)触发的OOM,则先调用oom_notify_list通知链,让已注册的回调尝试释放内存;如果释放量freed大于0,则代表已经回收了一些内存,直接返回true。
  • 1053-1057:如果当前进程有挂起的SIGKILL或正在退出,则直接选择它作为OOM受害者(mark_oom_victim),并唤醒oom reaper,使其能够尽快退出并释放内存。
  • 1072-1072:确定分配约束的类型。
  • 1075-1075:根据panic_on_oom这个sysctl的设置以及分配约束类型,判断内核是否需要直接panic。
  • 1077-1084:如果不是memcg OOM、开启了sysctl_oom_kill_allocating_task,并且当前(触发分配的)任务拥有mm、可被kill且oom_score_adj不等于OOM_SCORE_ADJ_MIN,则直接kill当前任务并返回;否则继续向下执行,进入选择流程。
  • 1086-1086:select_bad_process函数非常重要,这个函数是从操作系统进程中选择一个“最佳”待杀进程进行kill操作。
  • 1088-1098 : 如果select_bad_process没有找到“最佳”kill进程,则内核判断是否进入了内存僵局,如果进入则崩溃系统。
  • 1099-1101:如果找到了“最佳”kill进程,则进行kill操作,并上报日志。

三. select_bad_process、oom_evaluate_task、oom_badness函数

3.1 select_bad_process函数

这部分源代码如下:

 1366  /*
 2367   * Simple selection loop. We choose the process with the highest number of
 3368   * 'points'. In case scan was aborted, oc->chosen is set to -1.
 4369   */
 5370  static void select_bad_process(struct oom_control *oc)
 6371  {
 7372  	if (is_memcg_oom(oc))
 8373  		mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
 9374  	else {
10375  		struct task_struct *p;
11376  
12377  		rcu_read_lock();
13378  		for_each_process(p)
14379  			if (oom_evaluate_task(p, oc))
15380  				break;
16381  		rcu_read_unlock();
17382  	}
18383  
19384  	oc->chosen_points = oc->chosen_points * 1000 / oc->totalpages;
20385  }
  • 377-381:针对进程进行遍历操作,针对每个进程进行评估工作,并选出“最佳”进程。

3.2 oom_evaluate_task 函数

这部分源代码如下:

 1315  static int oom_evaluate_task(struct task_struct *task, void *arg)
 2316  {
 3317  	struct oom_control *oc = arg;
 4318  	unsigned long points;
 5319  
 6320  	if (oom_unkillable_task(task, NULL, oc->nodemask))
 7321  		goto next;
 8322  
 9323  	/*
10324  	 * This task already has access to memory reserves and is being killed.
11325  	 * Don't allow any other task to have access to the reserves unless
12326  	 * the task has MMF_OOM_SKIP because chances that it would release
13327  	 * any memory is quite low.
14328  	 */
15329  	if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {
16330  		if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))
17331  			goto next;
18332  		goto abort;
19333  	}
20334  
21335  	/*
22336  	 * If task is allocating a lot of memory and has been marked to be
23337  	 * killed first if it triggers an oom, then select it.
24338  	 */
25339  	if (oom_task_origin(task)) {
26340  		points = ULONG_MAX;
27341  		goto select;
28342  	}
29343  
30344  	points = oom_badness(task, NULL, oc->nodemask, oc->totalpages);
31345  	if (!points || points < oc->chosen_points)
32346  		goto next;
33347  
34348  	/* Prefer thread group leaders for display purposes */
35349  	if (points == oc->chosen_points && thread_group_leader(oc->chosen))
36350  		goto next;
37351  select:
38352  	if (oc->chosen)
39353  		put_task_struct(oc->chosen);
40354  	get_task_struct(task);
41355  	oc->chosen = task;
42356  	oc->chosen_points = points;
43357  next:
44358  	return 0;
45359  abort:
46360  	if (oc->chosen)
47361  		put_task_struct(oc->chosen);
48362  	oc->chosen = (void *)-1UL;
49363  	return 1;
50364  }

针对上面的代码,进行相关阐述如下:

  • 320 - 321 :如果当前任务不可被杀、则继续选择
  • 329 - 333 :如果不是sysrq触发的OOM,且该任务已经是OOM受害者(已可访问保留内存并正在被杀死):若其mm已设置MMF_OOM_SKIP(它再释放出内存的可能性很低),则跳过该任务继续扫描;否则中止本次扫描(oc->chosen被置为-1),不再让其他任务获得保留内存的访问权。
  • 339 - 343 : 如果该任务正在大量分配内存、并且已被标记为“一旦触发OOM就优先被杀”(oom_task_origin),则将其分数设为ULONG_MAX并直接选中它。
  • 344 - 344 : oom_badness函数,计算当前的进程的 不良评分。
  • 351 - 356 :更新oom_control结构体:释放对之前所选任务的引用,获取新任务的引用,并将chosen和chosen_points更新为新的“最佳”进程及其分数。

3.3 oom_badness 函数

这个函数的功能是对进程进行不良评分,函数的源代码如下:

 1192  /**
 2193   * oom_badness - heuristic function to determine which candidate task to kill
 3194   * @p: task struct of which task we should calculate
 4195   * @totalpages: total present RAM allowed for page allocation
 5196   * @memcg: task's memory controller, if constrained
 6197   * @nodemask: nodemask passed to page allocator for mempolicy ooms
 7198   *
 8199   * The heuristic for determining which task to kill is made to be as simple and
 9200   * predictable as possible.  The goal is to return the highest value for the
10201   * task consuming the most memory to avoid subsequent oom failures.
11202   */
12203  unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
13204  			  const nodemask_t *nodemask, unsigned long totalpages)
14205  {
15206  	long points;
16207  	long adj;
17208  
18209  	if (oom_unkillable_task(p, memcg, nodemask))
19210  		return 0;
20211  
21212  	p = find_lock_task_mm(p);
22213  	if (!p)
23214  		return 0;
24215  
25216  	/*
26217  	 * Do not even consider tasks which are explicitly marked oom
27218  	 * unkillable or have been already oom reaped or the are in
28219  	 * the middle of vfork
29220  	 */
30221  	adj = (long)p->signal->oom_score_adj;
31222  	if (adj == OOM_SCORE_ADJ_MIN ||
32223  			test_bit(MMF_OOM_SKIP, &p->mm->flags) ||
33224  			in_vfork(p)) {
34225  		task_unlock(p);
35226  		return 0;
36227  	}
37228  
38229  	/*
39230  	 * The baseline for the badness score is the proportion of RAM that each
40231  	 * task's rss, pagetable and swap space use.
41232  	 */
42233  	points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
43234  		mm_pgtables_bytes(p->mm) / PAGE_SIZE;
44235  	task_unlock(p);
45236  
46237  	/* Normalize to oom_score_adj units 此处是为了调整adj的单位,增加adj的加法权重 */ 
47238  	adj *= totalpages / 1000;
48239  	points += adj;
49240  
50241  	/*
51242  	 * Never return 0 for an eligible task regardless of the root bonus and
52243  	 * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here).
53244  	 */
54245  	return points > 0 ? points : 1;
55246  }

针对上面的代码,进行相关阐述如下:

  • 221 - 221 :这行非常关键,这个是读取进程的oom_score_adj值(/proc/PID/oom_score_adj)每个进程都有一个oom_score的属性,oom killer会杀死oom_score较大的进程。
  • 222 - 227:如果进程的oom_score_adj值 和 OOM_SCORE_ADJ_MIN (-1000)相等 ,或者进程的mm已设置MMF_OOM_SKIP标志,或者进程正处于vfork过程中(in_vfork),则不考虑此进程(返回0)。
  • 233 - 234:不良评分的基准是每个任务的RSS、页表和交换空间所占用的内存页数之和。
  • 237 - 239 : 标准化oom_score_adj的单位:将adj乘以 totalpages / 1000 ,即把adj换算成内存页数后再加到points上。

特别注意:Linux 3.10 内核针对系统管理员进程(拥有CAP_SYS_ADMIN能力的进程)会把不良分数降低 3% , 相当于优先避免kill 系统管理员进程;上面给出的5.0版本oom_badness代码中已经没有这段逻辑。3.10内核中的相关代码如下:

1	/*
2	 * Root processes get 3% bonus, just like the __vm_enough_memory()
3	 * implementation used by LSMs.
4	 */
5	if (has_capability_noaudit(p, CAP_SYS_ADMIN))
6		points -= (points * 3) / 100;