diff -C3 -BN -r old/Documentation/oom_killer/embedded.txt new/Documentation/oom_killer/embedded.txt
*** old/Documentation/oom_killer/embedded.txt	1970-01-01 01:00:00.000000000 +0100
--- new/Documentation/oom_killer/embedded.txt	2006-10-25 14:45:38.000000000 +0200
***************
*** 0 ****
--- 1,74 ----
+ OOM Killer for embedded systems
+ Documentation for oom.c 
+ Last update: $Date: 2006/10/24 10:15:59 $
+ 
+ *  What is oom killer?
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ 
+ It is a way to recover memory when the system runs out of it.
+ The only way the kernel can find is to kill a process to recover
+ the memory.
+ A routine, oom_kill, is called from memory management to choose
+ and kill a process.
+ A routine, badness(), is called to rate how eligible a process is to be killed.
+ 
+ 
+ *  Problem with embedded systems
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ 
+ On embedded systems, as on real-time systems, we want to be deterministic
+ and may want to implement some kind of degraded mode or reconfiguration.
+ The current implementation of the oom killer offers a way to reduce the
+ badness of a process, but it is hard to use deterministically (for
+ example to rank groups of processes with predefined badness) because
+ the badness changes during the lifetime of the system.
+ 
+ 
+ *  Proposed solution for embedded systems
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ 
+ The solution I propose is to add a new /proc/<pid>/oom_rank value to be
+ modified by a reconfiguration process.
+ 
+ The implementation is enabled by CONFIG_OOM_EMBEDDED.
+ The kernel hook is installed at the beginning of
+ select_bad_process().
+ I implemented the change with:
+ Overall ranking: a threshold (oom_rank_threshold) giving the minimal
+ ranking a process must have to be killable.
+ 
+ /* CODE */
+ 
+ if (p->oomranking < oom_rank_threshold)
+ 	continue;
+ 
+ /* END CODE */
+ 
+ The difference from using (p->oomkilladj == OOM_DISABLE) matters for
+ degraded-mode management and reconfiguration:
+ One can define different kinds of processes:
+ - Unkillable: oomkilladj == OOM_DISABLE
+ - Protected : oomranking < oom_rank_threshold
+ - Eligible  : worst ranked by badness()
+ 
+ Another variable, oom_reconfigure_wanted, is incremented each time
+ the oom killer has been invoked.
+ This lets the configuration manager take the appropriate
+ decision to reconfigure the system.
+ 
+ Last change: optionally, a reboot is forced just before the call to
+ panic(); this is enabled by CONFIG_OOM_EMBEDDED_REBOOT:
+ emergency_restart()
+ 
+ If you do not use this option you can use panic_timeout to reboot the
+ system; this lets you reboot more cleanly (if possible) and analyse
+ the crash with screen information or a crash handler.
+ 
+ Changed files:
+ include/linux/sysctl.h 
+ include/linux/sched.h
+ mm/oom_kill.c
+ kernel/sysctl.c
+ fs/proc/base.c
+ mm/Kconfig
+ Documentation/oom_killer/embedded.txt 
diff -C3 -BN -r old/fs/proc/base.c new/fs/proc/base.c
*** old/fs/proc/base.c	2006-10-16 19:06:06.000000000 +0200
--- new/fs/proc/base.c	2006-10-25 15:44:43.000000000 +0200
***************
*** 127,132 ****
--- 127,135 ----
  #endif
  	PROC_TGID_OOM_SCORE,
  	PROC_TGID_OOM_ADJUST,
+ #ifdef CONFIG_OOM_EMBEDDED
+ 	PROC_TGID_OOM_RANK,
+ #endif
  	PROC_TID_INO,
  	PROC_TID_STATUS,
  	PROC_TID_MEM,
***************
*** 168,173 ****
--- 171,179 ----
  #endif
  	PROC_TID_OOM_SCORE,
  	PROC_TID_OOM_ADJUST,
+ #ifdef CONFIG_OOM_EMBEDDED
+ 	PROC_TID_OOM_RANK,
+ #endif
  
  	/* Add new entries before this */
  	PROC_TID_FD_DIR = 0x8000,	/* 0x8000-0xffff */
***************
*** 221,226 ****
--- 227,235 ----
  #endif
  	E(PROC_TGID_OOM_SCORE, "oom_score",S_IFREG|S_IRUGO),
  	E(PROC_TGID_OOM_ADJUST,"oom_adj", S_IFREG|S_IRUGO|S_IWUSR),
+ #ifdef CONFIG_OOM_EMBEDDED
+ 	E(PROC_TGID_OOM_RANK,"oom_rank", S_IFREG|S_IRUGO|S_IWUSR),
+ #endif
  #ifdef CONFIG_AUDITSYSCALL
  	E(PROC_TGID_LOGINUID, "loginuid", S_IFREG|S_IWUSR|S_IRUGO),
  #endif
***************
*** 263,268 ****
--- 272,280 ----
  #endif
  	E(PROC_TID_OOM_SCORE,  "oom_score",S_IFREG|S_IRUGO),
  	E(PROC_TID_OOM_ADJUST, "oom_adj", S_IFREG|S_IRUGO|S_IWUSR),
+ #ifdef CONFIG_OOM_EMBEDDED
+ 	E(PROC_TID_OOM_RANK, "oom_rank", S_IFREG|S_IRUGO|S_IWUSR),
+ #endif
  #ifdef CONFIG_AUDITSYSCALL
  	E(PROC_TID_LOGINUID, "loginuid", S_IFREG|S_IWUSR|S_IRUGO),
  #endif
***************
*** 956,961 ****
--- 968,1024 ----
  	.open		= mem_open,
  };
  
+ #ifdef CONFIG_OOM_EMBEDDED
+ /* Read handler for the "oom_rank" proc file: task->oomranking as text. */
+ static ssize_t oom_ranking_read(struct file *file, char __user *buf,
+ 				size_t count, loff_t *ppos)
+ {
+ 	struct task_struct *task = proc_task(file->f_dentry->d_inode);
+ 	char buffer[16];	/* room for any 32-bit int, '\n' and NUL */
+ 	loff_t __ppos = *ppos;
+ 	size_t len;
+ 
+ 	len = scnprintf(buffer, sizeof(buffer), "%i\n", task->oomranking);
+ 	if (__ppos >= len)	/* reader is already past the text: EOF */
+ 		return 0;
+ 	if (count > len-__ppos)
+ 		count = len-__ppos;
+ 	if (copy_to_user(buf, buffer + __ppos, count))
+ 		return -EFAULT;
+ 	*ppos = __ppos + count;
+ 	return count;
+ }
+ 
+ /* Write handler for the "oom_rank" proc file; needs CAP_SYS_RESOURCE. */
+ static ssize_t oom_ranking_write(struct file *file, const char __user *buf,
+ 				size_t count, loff_t *ppos)
+ {
+ 	struct task_struct *task = proc_task(file->f_dentry->d_inode);
+ 	char buffer[8], *end;
+ 	int oom_ranking;
+ 
+ 	if (!capable(CAP_SYS_RESOURCE))
+ 		return -EPERM;
+ 	memset(buffer, 0, sizeof(buffer));
+ 	count = min_t(size_t, count, sizeof(buffer) - 2);	/* at most 6 chars */
+ 	if (copy_from_user(buffer, buf, count))
+ 		return -EFAULT;
+ 	oom_ranking = simple_strtol(buffer, &end, 0);
+ 	if (*end == '\n')
+ 		end++;
+ 	if (end - buffer == 0)	/* nothing consumed: keep the old rank */
+ 		return -EIO;
+ 	task->oomranking = oom_ranking;
+ 	return end - buffer;
+ }
+ 
+ static struct file_operations proc_oom_ranking_operations = {
+ 	.read		= oom_ranking_read,	/* prints task->oomranking */
+ 	.write		= oom_ranking_write,	/* needs CAP_SYS_RESOURCE */
+ };
+ #endif
+ 
+ 
  static ssize_t oom_adjust_read(struct file *file, char __user *buf,
  				size_t count, loff_t *ppos)
  {
***************
*** 1860,1865 ****
--- 1923,1934 ----
  		case PROC_TGID_OOM_ADJUST:
  			inode->i_fop = &proc_oom_adjust_operations;
  			break;
+ #ifdef CONFIG_OOM_EMBEDDED
+ 		case PROC_TID_OOM_RANK:
+ 		case PROC_TGID_OOM_RANK:
+ 			inode->i_fop = &proc_oom_ranking_operations;
+ 			break;
+ #endif
  #ifdef CONFIG_AUDITSYSCALL
  		case PROC_TID_LOGINUID:
  		case PROC_TGID_LOGINUID:
diff -C3 -BN -r old/include/linux/sched.h new/include/linux/sched.h
*** old/include/linux/sched.h	2006-06-18 03:49:35.000000000 +0200
--- new/include/linux/sched.h	2006-10-25 14:38:12.000000000 +0200
***************
*** 791,796 ****
--- 791,799 ----
  	struct key *thread_keyring;	/* keyring private to this thread */
  	unsigned char jit_keyring;	/* default keyring to attach requested keys to */
  #endif
+ #ifdef CONFIG_OOM_EMBEDDED
+ 	int oomranking;
+ #endif
  	int oomkilladj; /* OOM kill score adjustment (bit shift). */
  	char comm[TASK_COMM_LEN]; /* executable name excluding path
  				     - access with [gs]et_task_comm (which lock
diff -C3 -BN -r old/include/linux/sysctl.h new/include/linux/sysctl.h
*** old/include/linux/sysctl.h	2006-06-18 03:49:35.000000000 +0200
--- new/include/linux/sysctl.h	2006-10-25 15:08:16.000000000 +0200
***************
*** 148,153 ****
--- 148,155 ----
  	KERN_SPIN_RETRY=70,	/* int: number of spinlock retries */
  	KERN_ACPI_VIDEO_FLAGS=71, /* int: flags for setting up video after ACPI sleep */
  	KERN_IA64_UNALIGNED=72, /* int: ia64 unaligned userland trap enable */
+ 	KERN_OOM_EMBEDDED=73,
+ 	KERN_OOM_EMBEDDED_RECONFIGURE=74,
  };
  
  
diff -C3 -BN -r old/kernel/sysctl.c new/kernel/sysctl.c
*** old/kernel/sysctl.c	2006-06-18 03:49:35.000000000 +0200
--- new/kernel/sysctl.c	2006-10-25 15:53:59.000000000 +0200
***************
*** 73,78 ****
--- 73,83 ----
  extern int sysctl_drop_caches;
  extern int percpu_pagelist_fraction;
  
+ #ifdef CONFIG_OOM_EMBEDDED
+ extern int oom_rank_threshold;
+ extern int oom_reconfigure_wanted;
+ #endif
+ 
  #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
  int unknown_nmi_panic;
  extern int proc_unknown_nmi_panic(ctl_table *, int, struct file *,
***************
*** 398,403 ****
--- 403,426 ----
  		.strategy	= &sysctl_string,
  	},
  #endif
+ #ifdef CONFIG_OOM_EMBEDDED
+ 	{
+ 		.ctl_name	= KERN_OOM_EMBEDDED_RECONFIGURE,
+ 		.procname	= "oom_reconfigure_wanted",
+ 		.data		= &oom_reconfigure_wanted,
+ 		.maxlen		= sizeof (int),
+ 		.mode		= 0444,
+ 		.proc_handler	= &proc_dointvec,
+ 	},
+ 	{
+ 		.ctl_name	= KERN_OOM_EMBEDDED,
+ 		.procname	= "oom_rank_threshold",
+ 		.data		= &oom_rank_threshold,
+ 		.maxlen		= sizeof (int),
+ 		.mode		= 0644,
+ 		.proc_handler	= &proc_dointvec,
+ 	},
+ #endif
  #ifdef CONFIG_HOTPLUG
  	{
  		.ctl_name	= KERN_HOTPLUG,
diff -C3 -BN -r old/mm/Kconfig new/mm/Kconfig
*** old/mm/Kconfig	2006-10-16 19:06:07.000000000 +0200
--- new/mm/Kconfig	2006-10-25 15:21:42.000000000 +0200
***************
*** 145,147 ****
--- 145,177 ----
  	  while the virtual addresses are not changed. This is useful for
  	  example on NUMA systems to put pages nearer to the processors accessing
  	  the page.
+ 
+ #
+ # support for EMBEDDED specific OOM
+ #
+ config OOM_EMBEDDED
+ 	bool "Deterministic OOM"
+ 	def_bool y
+ 	depends on SYSCTL
+ 	help
+ 	  Allows a deterministic usage of the OOM Killer.
+ 	  You have to set /proc/sys/kernel/oom_rank_threshold to enable
+ 	  the threshold. Each process needs its /proc/<pid>/oom_rank
+ 	  file set to a chosen value.
+ 	  The OOM killer will never kill a process whose oom_rank is below oom_rank_threshold.
+ 	  A configuration manager process can monitor the system by reading
+ 	  /proc/sys/kernel/oom_reconfigure_wanted.
+ 
+ 	  Say yes if you want to have a deterministic OOM. Mostly wanted by RTE or embedded systems.
+ 
+ config OOM_EMBEDDED_REBOOT
+ 	bool "Deterministic OOM force reboot"
+ 	def_bool n
+ 	depends on OOM_EMBEDDED
+ 	help
+ 	  Say yes here if you prefer to force a reboot instead of a panic
+ 	  in case of an OOM.
+ 	  Note that panic+panic_timeout would do a screen print and call
+ 	  the crash handler, if you have one.
+ 	  This option won't do that, which avoids the risk of hanging on a
+ 	  production system.
diff -C3 -BN -r old/mm/oom_kill.c new/mm/oom_kill.c
*** old/mm/oom_kill.c	2006-06-18 03:49:35.000000000 +0200
--- new/mm/oom_kill.c	2006-10-25 14:54:09.000000000 +0200
***************
*** 22,27 ****
--- 22,32 ----
  #include <linux/jiffies.h>
  #include <linux/cpuset.h>
  
+ #ifdef CONFIG_OOM_EMBEDDED
+ 	int oom_reconfigure_wanted = 0;
+ 	int	oom_rank_threshold = 0;
+ #endif
+ 
  /* #define DEBUG */
  
  /**
***************
*** 195,200 ****
--- 200,209 ----
  			continue;
  		if (p->oomkilladj == OOM_DISABLE)
  			continue;
+ #ifdef CONFIG_OOM_EMBEDDED
+ 		if (p->oomranking < oom_rank_threshold )
+ 			continue;
+ #endif
  		/* If p's nodes don't overlap ours, it won't help to kill p. */
  		if (!cpuset_excl_nodes_overlap(p))
  			continue;
***************
*** 358,363 ****
--- 367,375 ----
  		if (!p) {
  			read_unlock(&tasklist_lock);
  			cpuset_unlock();
+ #ifdef CONFIG_OOM_EMBEDDED_REBOOT
+ 			emergency_restart();
+ #endif
  			panic("Out of memory and no killable processes...\n");
  		}
  
***************
*** 368,373 ****
--- 380,388 ----
  	}
  
  out:
+ #ifdef CONFIG_OOM_EMBEDDED
+ 	oom_reconfigure_wanted++;
+ #endif
  	read_unlock(&tasklist_lock);
  	cpuset_unlock();
  
