<!-- received="Thu Sep  9 01:15:08 1999 EET DST" -->
<!-- sent="Thu, 9 Sep 1999 00:10:49 +0200 (CEST)" -->
<!-- name="Andrea Arcangeli" -->
<!-- email="andrea@suse.de" -->
<!-- subject="[patch] oom fixes for 2.3.17" -->
<!-- id="" -->
<!-- inreplyto="" -->
<title>Linux-kernel mailing list archive 1999-36,: [patch] oom fixes for 2.3.17</title>
<body bgcolor="#FFFFFF"><font face="Arial,Helvetica">
<h1>[patch] oom fixes for 2.3.17</h1>
<b>Andrea Arcangeli</b> (<a href="mailto:andrea@suse.de"><i>andrea@suse.de</i></a>)<br>
<i>Thu, 9 Sep 1999 00:10:49 +0200 (CEST)</i>
<p>
<ul>
<li> <b>Messages sorted by:</b> <a href="date.html#607">[ date ]</a><a href="index.html#607">[ thread ]</a><a href="subject.html#607">[ subject ]</a><a href="author.html#607">[ author ]</a>
<!-- next="start" -->
<li> <b>Next message:</b> <a href="0608.html">Andre Hedrick: "Re: Re: config-menus"</a>
<li> <b>Previous message:</b> <a href="0606.html">David Weinehall: "Re: I vote for updated RAID and KNFSD"</a>
<!-- nextthread="start" -->
<!-- reply="end" -->
</ul>
<hr>
<!-- body="start" -->
I ported my 2.2.x-oom fixes to 2.3.17.<br>
<p>
Description of the patch:<br>
<p>
o	prevents init from getting a sigsegv from the signal code due to oom<br>
o	improves the oom detection in the signal-frame code<br>
o	sends a sigbus if we are trying to write beyond the end of<br>
	a shared mapping in the 386 case (__verify_write())<br>
o	on ia32, prevents the kernel from sigkilling a task running with iopl &gt; 0;<br>
	a sigterm is sent instead (so X _won't_ screw up the graphics card<br>
	anymore)<br>
o	makes the page-cache oom friendly by breaking the loop at the first GFP failure<br>
o	page-cache nopage operation fixed to correctly kill the process<br>
	instead of sending the SIGBUS when necessary<br>
o	sends a sigkill to the current task if it triggered an OOM condition<br>
	by reading the ptrace data. Theoretically, if an iopl application<br>
	went oom while reading a process MM, then it could get<br>
	a sigkill too, but I am not worried about that since debuggers<br>
	usually don't need iopl() privileges ;).<br>
o	removes the obsolete low-level oom() function and replaces it<br>
	with a sigkill where necessary<br>
<p>
diff -urN 2.3.17/arch/alpha/kernel/signal.c 2.3.17-oom/arch/alpha/kernel/signal.c<br>
--- 2.3.17/arch/alpha/kernel/signal.c	Sun Aug  1 18:11:07 1999<br>
+++ 2.3.17-oom/arch/alpha/kernel/signal.c	Wed Sep  8 19:50:32 1999<br>
@@ -437,6 +437,8 @@<br>
 		err |= __copy_to_user(frame-&gt;extramask, &amp;set-&gt;sig[1], <br>
 				      sizeof(frame-&gt;extramask));<br>
 	}<br>
+	if (err)<br>
+		goto give_sigsegv;<br>
 <br>
 	/* Set up to return from userspace.  If provided, use a stub<br>
 	   already in userspace.  */<br>
@@ -499,6 +501,8 @@<br>
 	err |= setup_sigcontext(&amp;frame-&gt;uc.uc_mcontext, regs, sw,<br>
 				set-&gt;sig[0], oldsp);<br>
 	err |= __copy_to_user(&amp;frame-&gt;uc.uc_sigmask, set, sizeof(*set));<br>
+	if (err)<br>
+		goto give_sigsegv;<br>
 <br>
 	/* Set up to return from userspace.  If provided, use a stub<br>
 	   already in userspace.  */<br>
diff -urN 2.3.17/arch/alpha/mm/fault.c 2.3.17-oom/arch/alpha/mm/fault.c<br>
--- 2.3.17/arch/alpha/mm/fault.c	Fri Aug 20 17:42:19 1999<br>
+++ 2.3.17-oom/arch/alpha/mm/fault.c	Wed Sep  8 20:43:28 1999<br>
@@ -130,13 +130,13 @@<br>
 	 * make sure we exit gracefully rather than endlessly redo<br>
 	 * the fault.<br>
 	 */<br>
+survive:<br>
 	fault = handle_mm_fault(current, vma, address, cause &gt; 0);<br>
-	up(&amp;mm-&gt;mmap_sem);<br>
-<br>
 	if (fault &lt; 0)<br>
 		goto out_of_memory;<br>
 	if (fault == 0)<br>
 		goto do_sigbus;<br>
+	up(&amp;mm-&gt;mmap_sem);<br>
 <br>
 	return;<br>
 <br>
@@ -177,13 +177,23 @@<br>
  * us unable to handle the page fault gracefully.<br>
  */<br>
 out_of_memory:<br>
-	printk(KERN_ALERT "VM: killing process %s(%d)\n",<br>
-	       current-&gt;comm, current-&gt;pid);<br>
-	if (!user_mode(regs))<br>
-		goto no_context;<br>
-	do_exit(SIGKILL);<br>
+	if (current-&gt;pid == 1)<br>
+	{<br>
+		current-&gt;policy |= SCHED_YIELD;<br>
+		schedule();<br>
+		goto survive;<br>
+	}<br>
+	up(&amp;mm-&gt;mmap_sem);<br>
+	if (user_mode(regs))<br>
+	{<br>
+		printk(KERN_ALERT "VM: killing process %s(%d)\n",<br>
+		       current-&gt;comm, current-&gt;pid);<br>
+		do_exit(SIGKILL);<br>
+	}<br>
+	goto no_context;<br>
 <br>
 do_sigbus:<br>
+	up(&amp;mm-&gt;mmap_sem);<br>
 	/*<br>
 	 * Send a sigbus, regardless of whether we were in kernel<br>
 	 * or user mode.<br>
diff -urN 2.3.17/arch/i386/kernel/signal.c 2.3.17-oom/arch/i386/kernel/signal.c<br>
--- 2.3.17/arch/i386/kernel/signal.c	Sun Aug  1 18:11:08 1999<br>
+++ 2.3.17-oom/arch/i386/kernel/signal.c	Wed Sep  8 19:58:06 1999<br>
@@ -419,13 +419,19 @@<br>
 		           ? current-&gt;exec_domain-&gt;signal_invmap[sig]<br>
 		           : sig),<br>
 		          &amp;frame-&gt;sig);<br>
+	if (err)<br>
+		goto give_sigsegv;<br>
 <br>
 	err |= setup_sigcontext(&amp;frame-&gt;sc, &amp;frame-&gt;fpstate, regs, set-&gt;sig[0]);<br>
+	if (err)<br>
+		goto give_sigsegv;<br>
 <br>
 	if (_NSIG_WORDS &gt; 1) {<br>
 		err |= __copy_to_user(frame-&gt;extramask, &amp;set-&gt;sig[1],<br>
 				      sizeof(frame-&gt;extramask));<br>
 	}<br>
+	if (err)<br>
+		goto give_sigsegv;<br>
 <br>
 	/* Set up to return from userspace.  If provided, use a stub<br>
 	   already in userspace.  */<br>
@@ -486,6 +492,8 @@<br>
 	err |= __put_user(&amp;frame-&gt;info, &amp;frame-&gt;pinfo);<br>
 	err |= __put_user(&amp;frame-&gt;uc, &amp;frame-&gt;puc);<br>
 	err |= __copy_to_user(&amp;frame-&gt;info, info, sizeof(*info));<br>
+	if (err)<br>
+		goto give_sigsegv;<br>
 <br>
 	/* Create the ucontext.  */<br>
 	err |= __put_user(0, &amp;frame-&gt;uc.uc_flags);<br>
@@ -497,6 +505,8 @@<br>
 	err |= setup_sigcontext(&amp;frame-&gt;uc.uc_mcontext, &amp;frame-&gt;fpstate,<br>
 			        regs, set-&gt;sig[0]);<br>
 	err |= __copy_to_user(&amp;frame-&gt;uc.uc_sigmask, set, sizeof(*set));<br>
+	if (err)<br>
+		goto give_sigsegv;<br>
 <br>
 	/* Set up to return from userspace.  If provided, use a stub<br>
 	   already in userspace.  */<br>
diff -urN 2.3.17/arch/i386/mm/fault.c 2.3.17-oom/arch/i386/mm/fault.c<br>
--- 2.3.17/arch/i386/mm/fault.c	Thu Aug 12 02:53:18 1999<br>
+++ 2.3.17-oom/arch/i386/mm/fault.c	Wed Sep  8 20:05:05 1999<br>
@@ -31,6 +31,7 @@<br>
 {<br>
 	struct vm_area_struct * vma;<br>
 	unsigned long start = (unsigned long) addr;<br>
+	int fault;<br>
 <br>
 	if (!size)<br>
 		return 1;<br>
@@ -50,8 +51,12 @@<br>
 	start &amp;= PAGE_MASK;<br>
 <br>
 	for (;;) {<br>
-		if (handle_mm_fault(current, vma, start, 1) &lt;= 0)<br>
-			goto bad_area;<br>
+survive:<br>
+		fault =  handle_mm_fault(current, vma, start, 1);<br>
+		if (!fault)<br>
+			goto do_sigbus;<br>
+		if (fault &lt; 0)<br>
+			goto out_of_memory;<br>
 		if (!size)<br>
 			break;<br>
 		size--;<br>
@@ -74,6 +79,19 @@<br>
 <br>
 bad_area:<br>
 	return 0;<br>
+<br>
+do_sigbus:<br>
+	force_sig(SIGBUS, current);<br>
+	goto bad_area;<br>
+<br>
+out_of_memory:<br>
+	if (current-&gt;pid == 1)<br>
+	{<br>
+		current-&gt;policy |= SCHED_YIELD;<br>
+		schedule();<br>
+		goto survive;<br>
+	}<br>
+	goto bad_area;<br>
 }<br>
 <br>
 asmlinkage void do_invalid_op(struct pt_regs *, unsigned long);<br>
@@ -163,6 +181,7 @@<br>
 	 * make sure we exit gracefully rather than endlessly redo<br>
 	 * the fault.<br>
 	 */<br>
+survive:<br>
 	{<br>
 		int fault = handle_mm_fault(tsk, vma, address, write);<br>
 		if (fault &lt; 0)<br>
@@ -262,10 +281,33 @@<br>
  * us unable to handle the page fault gracefully.<br>
  */<br>
 out_of_memory:<br>
+	if (tsk-&gt;pid == 1)<br>
+	{<br>
+		tsk-&gt;policy |= SCHED_YIELD;<br>
+		schedule();<br>
+		goto survive;<br>
+	}<br>
 	up(&amp;mm-&gt;mmap_sem);<br>
-	printk("VM: killing process %s\n", tsk-&gt;comm);<br>
 	if (error_code &amp; 4)<br>
-		do_exit(SIGKILL);<br>
+	{<br>
+		if (!((regs-&gt;eflags &gt;&gt; 12) &amp; 3))<br>
+		{<br>
+			printk(KERN_ALERT "VM: killing process %s\n",<br>
+			       tsk-&gt;comm);<br>
+			do_exit(SIGKILL);<br>
+		}<br>
+		else<br>
+		{<br>
+			/*<br>
+			 * The task is running with privilegies and so we<br>
+			 * trust it and we give it a chance to die gracefully.<br>
+			 */<br>
+			printk(KERN_ALERT "VM: terminating process %s\n",<br>
+			       tsk-&gt;comm);<br>
+			force_sig(SIGTERM, current);<br>
+			return;<br>
+		}<br>
+	}<br>
 	goto no_context;<br>
 <br>
 do_sigbus:<br>
diff -urN 2.3.17/include/linux/mm.h 2.3.17-oom/include/linux/mm.h<br>
--- 2.3.17/include/linux/mm.h	Wed Sep  8 18:18:56 1999<br>
+++ 2.3.17-oom/include/linux/mm.h	Wed Sep  8 22:30:54 1999<br>
@@ -325,7 +325,6 @@<br>
 extern unsigned long paging_init(unsigned long start_mem, unsigned long end_mem);<br>
 extern void mem_init(unsigned long start_mem, unsigned long end_mem);<br>
 extern void show_mem(void);<br>
-extern void oom(struct task_struct * tsk);<br>
 extern void si_meminfo(struct sysinfo * val);<br>
 extern void swapin_readahead(unsigned long);<br>
 <br>
diff -urN 2.3.17/kernel/ptrace.c 2.3.17-oom/kernel/ptrace.c<br>
--- 2.3.17/kernel/ptrace.c	Wed Sep  8 00:26:08 1999<br>
+++ 2.3.17-oom/kernel/ptrace.c	Wed Sep  8 19:36:04 1999<br>
@@ -24,6 +24,7 @@<br>
 	pmd_t * pgmiddle;<br>
 	pte_t * pgtable;<br>
 	unsigned long page;<br>
+	int fault;<br>
 <br>
 repeat:<br>
 	pgdir = pgd_offset(vma-&gt;vm_mm, addr);<br>
@@ -64,8 +65,12 @@<br>
 <br>
 fault_in_page:<br>
 	/* -1: out of memory. 0 - unmapped page */<br>
-	if (handle_mm_fault(tsk, vma, addr, write) &gt; 0)<br>
+	fault = handle_mm_fault(tsk, vma, addr, write);<br>
+	if (fault &gt; 0)<br>
 		goto repeat;<br>
+	if (fault &lt; 0)<br>
+		/* the out of memory is been triggered by the current task. */<br>
+		force_sig(SIGKILL, current);<br>
 	return 0;<br>
 <br>
 bad_pgd:<br>
diff -urN 2.3.17/mm/filemap.c 2.3.17-oom/mm/filemap.c<br>
--- 2.3.17/mm/filemap.c	Wed Sep  8 18:18:57 1999<br>
+++ 2.3.17-oom/mm/filemap.c	Wed Sep  8 23:18:53 1999<br>
@@ -530,7 +530,7 @@<br>
  * This adds the requested page to the page cache if it isn't already there,<br>
  * and schedules an I/O to read in its contents from disk.<br>
  */<br>
-static inline void page_cache_read(struct file * file, unsigned long offset) <br>
+static inline int page_cache_read(struct file * file, unsigned long offset) <br>
 {<br>
 	unsigned long new_page;<br>
 	struct inode *inode = file-&gt;f_dentry-&gt;d_inode;<br>
@@ -541,17 +541,17 @@<br>
 	page = __find_page_nolock(inode, offset, *hash); <br>
 	spin_unlock(&amp;pagecache_lock);<br>
 	if (page)<br>
-		return;<br>
+		return 1;<br>
 <br>
 	new_page = page_cache_alloc();<br>
 	if (!new_page)<br>
-		return;<br>
+		return 0;<br>
 	page = page_cache_entry(new_page);<br>
 <br>
 	if (!add_to_page_cache_unique(page, inode, offset, hash)) {<br>
 		inode-&gt;i_op-&gt;readpage(file, page);<br>
 		page_cache_release(page);<br>
-		return;<br>
+		return 1;<br>
 	}<br>
 <br>
 	/*<br>
@@ -559,14 +559,14 @@<br>
 	 * raced with us and added our page to the cache first.<br>
 	 */<br>
 	page_cache_free(new_page);<br>
-	return;<br>
+	return 1;<br>
 }<br>
 <br>
 /*<br>
  * Read in an entire cluster at once.  A cluster is usually a 64k-<br>
  * aligned block that includes the address requested in "offset."<br>
  */<br>
-static void read_cluster_nonblocking(struct file * file,<br>
+static int read_cluster_nonblocking(struct file * file,<br>
 	unsigned long offset)<br>
 {<br>
 	off_t filesize = file-&gt;f_dentry-&gt;d_inode-&gt;i_size;<br>
@@ -574,11 +574,12 @@<br>
 <br>
 	offset = CLUSTER_OFFSET(offset);<br>
 	while ((pages-- &gt; 0) &amp;&amp; (offset &lt; filesize)) {<br>
-		page_cache_read(file, offset);<br>
+		if (!page_cache_read(file, offset))<br>
+			return 0;<br>
 		offset += PAGE_CACHE_SIZE;<br>
 	}<br>
 <br>
-	return;<br>
+	return 1;<br>
 }<br>
 <br>
 /* <br>
@@ -912,7 +913,8 @@<br>
 	ahead = 0;<br>
 	while (ahead &lt; max_ahead) {<br>
 		ahead += PAGE_CACHE_SIZE;<br>
-		page_cache_read(filp, raend + ahead);<br>
+		if (!page_cache_read(filp, raend + ahead))<br>
+			break;<br>
 	}<br>
 /*<br>
  * If we tried to read ahead some pages,<br>
@@ -1347,7 +1349,7 @@<br>
 			flush_page_to_ram(new_page);<br>
 		}<br>
 		page_cache_release(page);<br>
-		return new_page;<br>
+		return new_page ? : -1;<br>
 	}<br>
 		<br>
 	flush_page_to_ram(old_page);<br>
@@ -1361,10 +1363,13 @@<br>
 	 * Otherwise, we're off the end of a privately mapped file,<br>
 	 * so we need to map a zero page.<br>
 	 */<br>
-	if (offset &lt; inode-&gt;i_size)<br>
-		read_cluster_nonblocking(file, offset);<br>
-	else<br>
-		page_cache_read(file, offset);<br>
+	if (offset &lt; inode-&gt;i_size) {<br>
+		if (!read_cluster_nonblocking(file, offset))<br>
+			return -1;<br>
+	} else {<br>
+		if (!page_cache_read(file, offset))<br>
+			return -1;<br>
+	}<br>
 <br>
 	/*<br>
 	 * The page we want has now been added to the page cache.<br>
diff -urN 2.3.17/mm/memory.c 2.3.17-oom/mm/memory.c<br>
--- 2.3.17/mm/memory.c	Wed Sep  8 00:26:08 1999<br>
+++ 2.3.17-oom/mm/memory.c	Wed Sep  8 23:03:06 1999<br>
@@ -69,16 +69,6 @@<br>
 mem_map_t * mem_map = NULL;<br>
 <br>
 /*<br>
- * oom() prints a message (so that the user knows why the process died),<br>
- * and gives the process an untrappable SIGKILL.<br>
- */<br>
-void oom(struct task_struct * task)<br>
-{<br>
-	printk("\nOut of memory for %s.\n", task-&gt;comm);<br>
-	force_sig(SIGKILL, task);<br>
-}<br>
-<br>
-/*<br>
  * Note: this doesn't free the actual pages themselves. That<br>
  * has been handled earlier when unmapping all the memory regions.<br>
  */<br>
@@ -732,13 +722,13 @@<br>
 	pmd = pmd_alloc(pgd, address);<br>
 	if (!pmd) {<br>
 		free_page(page);<br>
-		oom(tsk);<br>
+		force_sig(SIGKILL, tsk);<br>
 		return 0;<br>
 	}<br>
 	pte = pte_alloc(pmd, address);<br>
 	if (!pte) {<br>
 		free_page(page);<br>
-		oom(tsk);<br>
+		force_sig(SIGKILL, tsk);<br>
 		return 0;<br>
 	}<br>
 	if (!pte_none(*pte)) {<br>
<p>
<p>
<p>
o	swap_out() is now allowed to refresh the swap counter only<br>
	once per loop. max_cnt is now an unsigned long.<br>
<p>
diff -urN 2.3.17/mm/vmscan.c 2.3.17-oom/mm/vmscan.c<br>
--- 2.3.17/mm/vmscan.c	Wed Sep  8 00:26:08 1999<br>
+++ 2.3.17-oom/mm/vmscan.c	Wed Sep  8 23:12:07 1999<br>
@@ -327,6 +327,7 @@<br>
 	struct task_struct * p;<br>
 	int counter;<br>
 	int __ret = 0;<br>
+	int assign = 0;<br>
 <br>
 	lock_kernel();<br>
 	/* <br>
@@ -346,12 +347,9 @@<br>
 	counter = nr_threads / (priority+1);<br>
 	if (counter &lt; 1)<br>
 		counter = 1;<br>
-	if (counter &gt; nr_threads)<br>
-		counter = nr_threads;<br>
 <br>
 	for (; counter &gt;= 0; counter--) {<br>
-		int assign = 0;<br>
-		int max_cnt = 0;<br>
+		unsigned long max_cnt = 0;<br>
 		struct mm_struct *best = NULL;<br>
 		int pid = 0;<br>
 	select:<br>
@@ -364,7 +362,7 @@<br>
 	 		if (mm-&gt;rss &lt;= 0)<br>
 				continue;<br>
 			/* Refresh swap_cnt? */<br>
-			if (assign)<br>
+			if (assign == 1)<br>
 				mm-&gt;swap_cnt = mm-&gt;rss;<br>
 			if (mm-&gt;swap_cnt &gt; max_cnt) {<br>
 				max_cnt = mm-&gt;swap_cnt;<br>
@@ -373,6 +371,8 @@<br>
 			}<br>
 		}<br>
 		read_unlock(&amp;tasklist_lock);<br>
+		if (assign == 1)<br>
+			assign = 2;<br>
 		if (!best) {<br>
 			if (!assign) {<br>
 				assign = 1;<br>
<p>
Andrea<br>
<p>
<p>
<p>
-<br>
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in<br>
the body of a message to majordomo@vger.rutgers.edu<br>
Please read the FAQ at <a href="http://www.tux.org/lkml/">http://www.tux.org/lkml/</a><br>
<!-- body="end" -->
<hr>
<p>
<ul>
<!-- next="start" -->
<li> <b>Next message:</b> <a href="0608.html">Andre Hedrick: "Re: Re: config-menus"</a>
<li> <b>Previous message:</b> <a href="0606.html">David Weinehall: "Re: I vote for updated RAID and KNFSD"</a>
<!-- nextthread="start" -->
<!-- reply="end" -->
</ul>
</font></body>
