forked from mirrors/linux
		
	oom: move oom_adj value from task_struct to mm_struct
The per-task oom_adj value is a characteristic of its mm more than the task itself since it's not possible to oom kill any thread that shares the mm. If a task were to be killed while attached to an mm that could not be freed because another thread were set to OOM_DISABLE, it would have needlessly been terminated since there is no potential for future memory freeing.

This patch moves oomkilladj (now more appropriately named oom_adj) from struct task_struct to struct mm_struct. This requires task_lock() on a task to check its oom_adj value to protect against exec, but it's already necessary to take the lock when dereferencing the mm to find the total VM size for the badness heuristic.

This fixes a livelock if the oom killer chooses a task and another thread sharing the same memory has an oom_adj value of OOM_DISABLE. This occurs because oom_kill_task() repeatedly returns 1 and refuses to kill the chosen task while select_bad_process() will repeatedly choose the same task during the next retry.

Taking task_lock() in select_bad_process() to check for OOM_DISABLE and in oom_kill_task() to check for threads sharing the same memory will be removed in the next patch in this series where it will no longer be necessary.

Writing to /proc/pid/oom_adj for a kthread will now return -EINVAL since these threads are immune from oom killing already. They simply report an oom_adj value of OOM_DISABLE.

Cc: Nick Piggin <npiggin@suse.de>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
		
							parent
							
								
									c9e444103b
								
							
						
					
					
						commit
						2ff05b2b4e
					
				
					 5 changed files with 50 additions and 21 deletions
				
			
		| 
						 | 
					@ -1003,11 +1003,13 @@ CHAPTER 3: PER-PROCESS PARAMETERS
 | 
				
			||||||
3.1 /proc/<pid>/oom_adj - Adjust the oom-killer score
 | 
					3.1 /proc/<pid>/oom_adj - Adjust the oom-killer score
 | 
				
			||||||
------------------------------------------------------
 | 
					------------------------------------------------------
 | 
				
			||||||
 | 
					
 | 
				
			||||||
This file can be used to adjust the score used to select which processes
 | 
					This file can be used to adjust the score used to select which processes should
 | 
				
			||||||
should be killed in an  out-of-memory  situation.  Giving it a high score will
 | 
					be killed in an out-of-memory situation.  The oom_adj value is a characteristic
 | 
				
			||||||
increase the likelihood of this process being killed by the oom-killer.  Valid
 | 
					of the task's mm, so all threads that share an mm with pid will have the same
 | 
				
			||||||
values are in the range -16 to +15, plus the special value -17, which disables
 | 
					oom_adj value.  A high value will increase the likelihood of this process being
 | 
				
			||||||
oom-killing altogether for this process.
 | 
					killed by the oom-killer.  Valid values are in the range -16 to +15 as
 | 
				
			||||||
 | 
					explained below and a special value of -17, which disables oom-killing
 | 
				
			||||||
 | 
					altogether for threads sharing pid's mm.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
The process to be killed in an out-of-memory situation is selected among all others
 | 
					The process to be killed in an out-of-memory situation is selected among all others
 | 
				
			||||||
based on its badness score. This value equals the original memory size of the process
 | 
					based on its badness score. This value equals the original memory size of the process
 | 
				
			||||||
| 
						 | 
					@ -1021,6 +1023,9 @@ the parent's score if they do not share the same memory. Thus forking servers
 | 
				
			||||||
are the prime candidates to be killed. Having only one 'hungry' child will make
 | 
					are the prime candidates to be killed. Having only one 'hungry' child will make
 | 
				
			||||||
parent less preferable than the child.
 | 
					parent less preferable than the child.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/proc/<pid>/oom_adj cannot be changed for kthreads since they are immune from
 | 
				
			||||||
 | 
					oom-killing already.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/proc/<pid>/oom_score shows process' current badness score.
 | 
					/proc/<pid>/oom_score shows process' current badness score.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
The following heuristics are then applied:
 | 
					The following heuristics are then applied:
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1006,7 +1006,12 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf,
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	if (!task)
 | 
						if (!task)
 | 
				
			||||||
		return -ESRCH;
 | 
							return -ESRCH;
 | 
				
			||||||
	oom_adjust = task->oomkilladj;
 | 
						task_lock(task);
 | 
				
			||||||
 | 
						if (task->mm)
 | 
				
			||||||
 | 
							oom_adjust = task->mm->oom_adj;
 | 
				
			||||||
 | 
						else
 | 
				
			||||||
 | 
							oom_adjust = OOM_DISABLE;
 | 
				
			||||||
 | 
						task_unlock(task);
 | 
				
			||||||
	put_task_struct(task);
 | 
						put_task_struct(task);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust);
 | 
						len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust);
 | 
				
			||||||
| 
						 | 
					@ -1035,11 +1040,19 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 | 
				
			||||||
	task = get_proc_task(file->f_path.dentry->d_inode);
 | 
						task = get_proc_task(file->f_path.dentry->d_inode);
 | 
				
			||||||
	if (!task)
 | 
						if (!task)
 | 
				
			||||||
		return -ESRCH;
 | 
							return -ESRCH;
 | 
				
			||||||
	if (oom_adjust < task->oomkilladj && !capable(CAP_SYS_RESOURCE)) {
 | 
						task_lock(task);
 | 
				
			||||||
 | 
						if (!task->mm) {
 | 
				
			||||||
 | 
							task_unlock(task);
 | 
				
			||||||
 | 
							put_task_struct(task);
 | 
				
			||||||
 | 
							return -EINVAL;
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
						if (oom_adjust < task->mm->oom_adj && !capable(CAP_SYS_RESOURCE)) {
 | 
				
			||||||
 | 
							task_unlock(task);
 | 
				
			||||||
		put_task_struct(task);
 | 
							put_task_struct(task);
 | 
				
			||||||
		return -EACCES;
 | 
							return -EACCES;
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
	task->oomkilladj = oom_adjust;
 | 
						task->mm->oom_adj = oom_adjust;
 | 
				
			||||||
 | 
						task_unlock(task);
 | 
				
			||||||
	put_task_struct(task);
 | 
						put_task_struct(task);
 | 
				
			||||||
	if (end - buffer == 0)
 | 
						if (end - buffer == 0)
 | 
				
			||||||
		return -EIO;
 | 
							return -EIO;
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -232,6 +232,8 @@ struct mm_struct {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
 | 
						unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						s8 oom_adj;	/* OOM kill score adjustment (bit shift) */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	cpumask_t cpu_vm_mask;
 | 
						cpumask_t cpu_vm_mask;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	/* Architecture-specific MM context */
 | 
						/* Architecture-specific MM context */
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1178,7 +1178,6 @@ struct task_struct {
 | 
				
			||||||
	 * a short time
 | 
						 * a short time
 | 
				
			||||||
	 */
 | 
						 */
 | 
				
			||||||
	unsigned char fpu_counter;
 | 
						unsigned char fpu_counter;
 | 
				
			||||||
	s8 oomkilladj; /* OOM kill score adjustment (bit shift). */
 | 
					 | 
				
			||||||
#ifdef CONFIG_BLK_DEV_IO_TRACE
 | 
					#ifdef CONFIG_BLK_DEV_IO_TRACE
 | 
				
			||||||
	unsigned int btrace_seq;
 | 
						unsigned int btrace_seq;
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -58,6 +58,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
 | 
				
			||||||
	unsigned long points, cpu_time, run_time;
 | 
						unsigned long points, cpu_time, run_time;
 | 
				
			||||||
	struct mm_struct *mm;
 | 
						struct mm_struct *mm;
 | 
				
			||||||
	struct task_struct *child;
 | 
						struct task_struct *child;
 | 
				
			||||||
 | 
						int oom_adj;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	task_lock(p);
 | 
						task_lock(p);
 | 
				
			||||||
	mm = p->mm;
 | 
						mm = p->mm;
 | 
				
			||||||
| 
						 | 
					@ -65,6 +66,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
 | 
				
			||||||
		task_unlock(p);
 | 
							task_unlock(p);
 | 
				
			||||||
		return 0;
 | 
							return 0;
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
						oom_adj = mm->oom_adj;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	/*
 | 
						/*
 | 
				
			||||||
	 * The memory size of the process is the basis for the badness.
 | 
						 * The memory size of the process is the basis for the badness.
 | 
				
			||||||
| 
						 | 
					@ -148,15 +150,15 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
 | 
				
			||||||
		points /= 8;
 | 
							points /= 8;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	/*
 | 
						/*
 | 
				
			||||||
	 * Adjust the score by oomkilladj.
 | 
						 * Adjust the score by oom_adj.
 | 
				
			||||||
	 */
 | 
						 */
 | 
				
			||||||
	if (p->oomkilladj) {
 | 
						if (oom_adj) {
 | 
				
			||||||
		if (p->oomkilladj > 0) {
 | 
							if (oom_adj > 0) {
 | 
				
			||||||
			if (!points)
 | 
								if (!points)
 | 
				
			||||||
				points = 1;
 | 
									points = 1;
 | 
				
			||||||
			points <<= p->oomkilladj;
 | 
								points <<= oom_adj;
 | 
				
			||||||
		} else
 | 
							} else
 | 
				
			||||||
			points >>= -(p->oomkilladj);
 | 
								points >>= -(oom_adj);
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#ifdef DEBUG
 | 
					#ifdef DEBUG
 | 
				
			||||||
| 
						 | 
					@ -251,8 +253,12 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
 | 
				
			||||||
			*ppoints = ULONG_MAX;
 | 
								*ppoints = ULONG_MAX;
 | 
				
			||||||
		}
 | 
							}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		if (p->oomkilladj == OOM_DISABLE)
 | 
							task_lock(p);
 | 
				
			||||||
 | 
							if (p->mm && p->mm->oom_adj == OOM_DISABLE) {
 | 
				
			||||||
 | 
								task_unlock(p);
 | 
				
			||||||
			continue;
 | 
								continue;
 | 
				
			||||||
 | 
							}
 | 
				
			||||||
 | 
							task_unlock(p);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		points = badness(p, uptime.tv_sec);
 | 
							points = badness(p, uptime.tv_sec);
 | 
				
			||||||
		if (points > *ppoints || !chosen) {
 | 
							if (points > *ppoints || !chosen) {
 | 
				
			||||||
| 
						 | 
					@ -304,8 +310,7 @@ static void dump_tasks(const struct mem_cgroup *mem)
 | 
				
			||||||
		}
 | 
							}
 | 
				
			||||||
		printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d     %3d %s\n",
 | 
							printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d     %3d %s\n",
 | 
				
			||||||
		       p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm,
 | 
							       p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm,
 | 
				
			||||||
		       get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj,
 | 
							       get_mm_rss(mm), (int)task_cpu(p), mm->oom_adj, p->comm);
 | 
				
			||||||
		       p->comm);
 | 
					 | 
				
			||||||
		task_unlock(p);
 | 
							task_unlock(p);
 | 
				
			||||||
	} while_each_thread(g, p);
 | 
						} while_each_thread(g, p);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
| 
						 | 
					@ -367,8 +372,12 @@ static int oom_kill_task(struct task_struct *p)
 | 
				
			||||||
	 * Don't kill the process if any threads are set to OOM_DISABLE
 | 
						 * Don't kill the process if any threads are set to OOM_DISABLE
 | 
				
			||||||
	 */
 | 
						 */
 | 
				
			||||||
	do_each_thread(g, q) {
 | 
						do_each_thread(g, q) {
 | 
				
			||||||
		if (q->mm == mm && q->oomkilladj == OOM_DISABLE)
 | 
							task_lock(q);
 | 
				
			||||||
 | 
							if (q->mm == mm && q->mm && q->mm->oom_adj == OOM_DISABLE) {
 | 
				
			||||||
 | 
								task_unlock(q);
 | 
				
			||||||
			return 1;
 | 
								return 1;
 | 
				
			||||||
 | 
							}
 | 
				
			||||||
 | 
							task_unlock(q);
 | 
				
			||||||
	} while_each_thread(g, q);
 | 
						} while_each_thread(g, q);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	__oom_kill_task(p, 1);
 | 
						__oom_kill_task(p, 1);
 | 
				
			||||||
| 
						 | 
					@ -393,10 +402,11 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 | 
				
			||||||
	struct task_struct *c;
 | 
						struct task_struct *c;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	if (printk_ratelimit()) {
 | 
						if (printk_ratelimit()) {
 | 
				
			||||||
		printk(KERN_WARNING "%s invoked oom-killer: "
 | 
					 | 
				
			||||||
			"gfp_mask=0x%x, order=%d, oomkilladj=%d\n",
 | 
					 | 
				
			||||||
			current->comm, gfp_mask, order, current->oomkilladj);
 | 
					 | 
				
			||||||
		task_lock(current);
 | 
							task_lock(current);
 | 
				
			||||||
 | 
							printk(KERN_WARNING "%s invoked oom-killer: "
 | 
				
			||||||
 | 
								"gfp_mask=0x%x, order=%d, oom_adj=%d\n",
 | 
				
			||||||
 | 
								current->comm, gfp_mask, order,
 | 
				
			||||||
 | 
								current->mm ? current->mm->oom_adj : OOM_DISABLE);
 | 
				
			||||||
		cpuset_print_task_mems_allowed(current);
 | 
							cpuset_print_task_mems_allowed(current);
 | 
				
			||||||
		task_unlock(current);
 | 
							task_unlock(current);
 | 
				
			||||||
		dump_stack();
 | 
							dump_stack();
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in a new issue