
 Documentation/filesystems/proc.txt |   87 +++++++++++++++++++++++++++++++++++++
 include/linux/sysctl.h             |   15 +++++-
 kernel/sched.c                     |   57 ++++++++++++------------
 kernel/sysctl.c                    |   35 ++++++++++++++
 4 files changed, 165 insertions(+), 29 deletions(-)


diff -urN linux-2.5.52/Documentation/filesystems/proc.txt linux/Documentation/filesystems/proc.txt
--- linux-2.5.52/Documentation/filesystems/proc.txt	2002-12-15 21:07:51.000000000 -0500
+++ linux/Documentation/filesystems/proc.txt	2002-12-15 22:51:09.000000000 -0500
@@ -37,6 +37,7 @@
   2.8	/proc/sys/net/ipv4 - IPV4 settings
   2.9	Appletalk
   2.10	IPX
+  2.11  /proc/sys/sched - scheduler tunables
 
 ------------------------------------------------------------------------------
 Preface
@@ -1659,6 +1660,92 @@
 gives the  destination  network, the router node (or Directly) and the network
 address of the router (or Connected) for internal networks.
 
+2.11 /proc/sys/sched - scheduler tunables
+-----------------------------------------
+
+Useful knobs for tuning the scheduler live in /proc/sys/sched.
+
+child_penalty
+-------------
+
+Percentage of the parent's sleep_avg that children inherit.  sleep_avg is
+a running average of the time a process spends sleeping.  Tasks with high
+sleep_avg values are considered interactive and given a higher dynamic
+priority and a larger timeslice.  You typically want this some value just
+under 100.
+
+exit_weight
+-----------
+
+When a CPU hog task exits, its parent's sleep_avg is reduced by a factor of
+exit_weight against the exiting task's sleep_avg.
+
+interactive_delta
+-----------------
+
+If a task is "interactive" it is reinserted into the active array after it
+has expired its timeslice, instead of being inserted into the expired array.
+How "interactive" a task must be in order to be deemed interactive is a
+function of its nice value.  This interactive limit is scaled linearly by nice
+value and is offset by the interactive_delta.
+
+max_sleep_avg
+-------------
+
+max_sleep_avg is the largest value (in ms) stored for a task's running sleep
+average.  The larger this value, the longer a task needs to sleep to be
+considered interactive (maximum interactive bonus is a function of
+max_sleep_avg).
+
+max_timeslice
+-------------
+
+Maximum timeslice, in milliseconds.  This is the value given to tasks of the
+highest dynamic priority.
+
+min_timeslice
+-------------
+
+Minimum timeslice, in milliseconds.  This is the value given to tasks of the
+lowest dynamic priority.  Every task gets at least this slice of the processor
+per array switch.
+
+parent_penalty
+--------------
+
+Percentage of the parent's sleep_avg that it retains across a fork().
+sleep_avg is a running average of the time a process spends sleeping.  Tasks
+with high sleep_avg values are considered interactive and given a higher
+dynamic priority and a larger timeslice.  Normally, this value is 100 and thus
+task's retain their sleep_avg on fork.  If you want to punish interactive
+tasks for forking, set this below 100.
+
+prio_bonus_ratio
+----------------
+
+Middle percentage of the priority range that tasks can receive as a dynamic
+priority.  The default value of 25% ensures that nice values at the
+extremes are still enforced.  For example, nice +19 interactive tasks will
+never be able to preempt a nice 0 CPU hog.  Setting this higher will increase
+the size of the priority range the tasks can receive as a bonus.  Setting
+this lower will decrease this range, making the interactivity bonus less
+apparent and user nice values more applicable.
+
+starvation_limit
+----------------
+
+Sufficiently interactive tasks are reinserted into the active array when they
+run out of timeslice.  Normally, tasks are inserted into the expired array.
+Reinserting interactive tasks into the active array allows them to remain
+runnable, which is important to interactive performance.  This could starve
+expired tasks, however, since the interactive task could prevent the array
+switch.  To prevent starving the tasks on the expired array for too long. the
+starvation_limit is the longest (in ms) we will let the expired array starve
+at the expense of reinserting interactive tasks back into active.  Higher
+values here give more preferance to running interactive tasks, at the expense
+of expired tasks.  Lower values provide more fair scheduling behavior, at the
+expense of interactivity.  The units are in milliseconds.
+
 ------------------------------------------------------------------------------
 Summary
 ------------------------------------------------------------------------------
diff -urN linux-2.5.52/include/linux/sysctl.h linux/include/linux/sysctl.h
--- linux-2.5.52/include/linux/sysctl.h	2002-12-15 21:08:09.000000000 -0500
+++ linux/include/linux/sysctl.h	2002-12-15 22:51:09.000000000 -0500
@@ -66,7 +66,8 @@
 	CTL_DEV=7,		/* Devices */
 	CTL_BUS=8,		/* Busses */
 	CTL_ABI=9,		/* Binary emulation */
-	CTL_CPU=10		/* CPU stuff (speed scaling, etc) */
+	CTL_CPU=10,		/* CPU stuff (speed scaling, etc) */
+	CTL_SCHED=11,		/* scheduler tunables */
 };
 
 /* CTL_BUS names: */
@@ -157,6 +158,18 @@
 	VM_LOWER_ZONE_PROTECTION=20,/* Amount of protection of lower zones */
 };
 
+/* Tunable scheduler parameters in /proc/sys/sched/ */
+enum {
+	SCHED_MIN_TIMESLICE=1,		/* minimum process timeslice */
+	SCHED_MAX_TIMESLICE=2,		/* maximum process timeslice */
+	SCHED_CHILD_PENALTY=3,		/* penalty on fork to child */
+	SCHED_PARENT_PENALTY=4,		/* penalty on fork to parent */
+	SCHED_EXIT_WEIGHT=5,		/* penalty to parent of CPU hog child */
+	SCHED_PRIO_BONUS_RATIO=6,	/* percent of max prio given as bonus */
+	SCHED_INTERACTIVE_DELTA=7,	/* delta used to scale interactivity */
+	SCHED_MAX_SLEEP_AVG=8,		/* maximum sleep avg attainable */
+	SCHED_STARVATION_LIMIT=9,	/* no re-active if expired is starved */
+};
 
 /* CTL_NET names: */
 enum
diff -urN linux-2.5.52/kernel/sched.c linux/kernel/sched.c
--- linux-2.5.52/kernel/sched.c	2002-12-15 21:08:14.000000000 -0500
+++ linux/kernel/sched.c	2002-12-15 22:55:14.000000000 -0500
@@ -57,16 +57,19 @@
  * Minimum timeslice is 10 msecs, default timeslice is 150 msecs,
  * maximum timeslice is 300 msecs. Timeslices get refilled after
  * they expire.
+ *
+ * They are configurable via /proc/sys/sched
+ * See Documentation/filesystems/proc.txt for descriptions
  */
-#define MIN_TIMESLICE		( 10 * HZ / 1000)
-#define MAX_TIMESLICE		(300 * HZ / 1000)
-#define CHILD_PENALTY		95
-#define PARENT_PENALTY		100
-#define EXIT_WEIGHT		3
-#define PRIO_BONUS_RATIO	25
-#define INTERACTIVE_DELTA	2
-#define MAX_SLEEP_AVG		(2*HZ)
-#define STARVATION_LIMIT	(2*HZ)
+int min_timeslice = (10 * HZ) / 1000;
+int max_timeslice = (300 * HZ) / 1000;
+int child_penalty = 95;
+int parent_penalty = 100;
+int exit_weight = 3;
+int prio_bonus_ratio = 25;
+int interactive_delta = 2;
+int max_sleep_avg = 2 * HZ;
+int starvation_limit = 2 * HZ;
 
 /*
  * If a task is 'interactive' then we reinsert it in the active
@@ -76,7 +79,7 @@
  *
  * This part scales the interactivity limit depending on niceness.
  *
- * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
+ * We scale it linearly, offset by the interactive_delta delta.
  * Here are a few examples of different nice levels:
  *
  *  TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
@@ -100,8 +103,8 @@
 	(v1) * (v2_max) / (v1_max)
 
 #define DELTA(p) \
-	(SCALE(TASK_NICE(p), 40, MAX_USER_PRIO*PRIO_BONUS_RATIO/100) + \
-		INTERACTIVE_DELTA)
+	(SCALE(TASK_NICE(p), 40, MAX_USER_PRIO*prio_bonus_ratio/100) + \
+		interactive_delta)
 
 #define TASK_INTERACTIVE(p) \
 	((p)->prio <= (p)->static_prio - DELTA(p))
@@ -112,13 +115,13 @@
  *
  * The higher a thread's priority, the bigger timeslices
  * it gets during one round of execution. But even the lowest
- * priority thread gets MIN_TIMESLICE worth of execution time.
+ * priority thread gets min_timeslice worth of execution time.
  *
  * task_timeslice() is the interface that is used by the scheduler.
  */
 
-#define BASE_TIMESLICE(p) (MIN_TIMESLICE + \
-	((MAX_TIMESLICE - MIN_TIMESLICE) * (MAX_PRIO-1-(p)->static_prio)/(MAX_USER_PRIO - 1)))
+#define BASE_TIMESLICE(p) (min_timeslice + \
+	((max_timeslice - min_timeslice) * (MAX_PRIO-1-(p)->static_prio)/(MAX_USER_PRIO - 1)))
 
 static inline unsigned int task_timeslice(task_t *p)
 {
@@ -244,7 +247,7 @@
  * effective_prio - return the priority that is based on the static
  * priority but is modified by bonuses/penalties.
  *
- * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
+ * We scale the actual sleep average [0 .... max_sleep_avg]
  * into the -5 ... 0 ... +5 bonus/penalty range.
  *
  * We use 25% of the full 0...39 priority range so that:
@@ -258,8 +261,8 @@
 {
 	int bonus, prio;
 
-	bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*p->sleep_avg/MAX_SLEEP_AVG/100 -
-			MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2;
+	bonus = MAX_USER_PRIO*prio_bonus_ratio*p->sleep_avg/max_sleep_avg/100 -
+			MAX_USER_PRIO*prio_bonus_ratio/100/2;
 
 	prio = p->static_prio - bonus;
 	if (prio < MAX_RT_PRIO)
@@ -289,8 +292,8 @@
 		 * boost gets as well.
 		 */
 		p->sleep_avg += sleep_time;
-		if (p->sleep_avg > MAX_SLEEP_AVG)
-			p->sleep_avg = MAX_SLEEP_AVG;
+		if (p->sleep_avg > max_sleep_avg)
+			p->sleep_avg = max_sleep_avg;
 		p->prio = effective_prio(p);
 	}
 	enqueue_task(p, array);
@@ -460,8 +463,8 @@
 		 * and children as well, to keep max-interactive tasks
 		 * from forking tasks that are max-interactive.
 		 */
-		current->sleep_avg = current->sleep_avg * PARENT_PENALTY / 100;
-		p->sleep_avg = p->sleep_avg * CHILD_PENALTY / 100;
+		current->sleep_avg = current->sleep_avg * parent_penalty / 100;
+		p->sleep_avg = p->sleep_avg * child_penalty / 100;
 		p->prio = effective_prio(p);
 	}
 	set_task_cpu(p, smp_processor_id());
@@ -486,8 +489,8 @@
 	local_irq_save(flags);
 	if (p->first_time_slice) {
 		p->parent->time_slice += p->time_slice;
-		if (unlikely(p->parent->time_slice > MAX_TIMESLICE))
-			p->parent->time_slice = MAX_TIMESLICE;
+		if (unlikely(p->parent->time_slice > max_timeslice))
+			p->parent->time_slice = max_timeslice;
 	}
 	local_irq_restore(flags);
 	/*
@@ -495,8 +498,8 @@
 	 * the sleep_avg of the parent as well.
 	 */
 	if (p->sleep_avg < p->parent->sleep_avg)
-		p->parent->sleep_avg = (p->parent->sleep_avg * EXIT_WEIGHT +
-			p->sleep_avg) / (EXIT_WEIGHT + 1);
+		p->parent->sleep_avg = (p->parent->sleep_avg * exit_weight +
+			p->sleep_avg) / (exit_weight + 1);
 }
 
 /**
@@ -870,7 +873,7 @@
 #define EXPIRED_STARVING(rq) \
 		((rq)->expired_timestamp && \
 		(jiffies - (rq)->expired_timestamp >= \
-			STARVATION_LIMIT * ((rq)->nr_running) + 1))
+			starvation_limit * ((rq)->nr_running) + 1))
 
 /*
  * This function gets called by the timer code, with HZ frequency.
diff -urN linux-2.5.52/kernel/sysctl.c linux/kernel/sysctl.c
--- linux-2.5.52/kernel/sysctl.c	2002-12-15 21:07:44.000000000 -0500
+++ linux/kernel/sysctl.c	2002-12-15 22:51:09.000000000 -0500
@@ -55,6 +55,15 @@
 extern int cad_pid;
 extern int pid_max;
 extern int sysctl_lower_zone_protection;
+extern int min_timeslice;
+extern int max_timeslice;
+extern int child_penalty;
+extern int parent_penalty;
+extern int exit_weight;
+extern int prio_bonus_ratio;
+extern int interactive_delta;
+extern int max_sleep_avg;
+extern int starvation_limit;
 
 /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
 static int maxolduid = 65535;
@@ -112,6 +121,7 @@
 
 static ctl_table kern_table[];
 static ctl_table vm_table[];
+static ctl_table sched_table[];
 #ifdef CONFIG_NET
 extern ctl_table net_table[];
 #endif
@@ -156,6 +166,7 @@
 	{CTL_FS, "fs", NULL, 0, 0555, fs_table},
 	{CTL_DEBUG, "debug", NULL, 0, 0555, debug_table},
         {CTL_DEV, "dev", NULL, 0, 0555, dev_table},
+	{CTL_SCHED, "sched", NULL, 0, 0555, sched_table},
 	{0}
 };
 
@@ -358,7 +369,29 @@
 
 static ctl_table dev_table[] = {
 	{0}
-};  
+};
+
+static ctl_table sched_table[] = {
+	{SCHED_MAX_TIMESLICE, "max_timeslice",
+	&max_timeslice, sizeof(int), 0644, NULL, &proc_dointvec},
+	{SCHED_MIN_TIMESLICE, "min_timeslice",
+	&min_timeslice, sizeof(int), 0644, NULL, &proc_dointvec},
+	{SCHED_CHILD_PENALTY, "child_penalty",
+	&child_penalty, sizeof(int), 0644, NULL, &proc_dointvec},
+	{SCHED_PARENT_PENALTY, "parent_penalty",
+	&parent_penalty, sizeof(int), 0644, NULL, &proc_dointvec},
+	{SCHED_EXIT_WEIGHT, "exit_weight",
+	&exit_weight, sizeof(int), 0644, NULL, &proc_dointvec},
+	{SCHED_PRIO_BONUS_RATIO, "prio_bonus_ratio",
+	&prio_bonus_ratio, sizeof(int), 0644, NULL, &proc_dointvec},
+	{SCHED_INTERACTIVE_DELTA, "interactive_delta",
+	&interactive_delta, sizeof(int), 0644, NULL, &proc_dointvec},
+	{SCHED_MAX_SLEEP_AVG, "max_sleep_avg",
+	&max_sleep_avg, sizeof(int), 0644, NULL, &proc_dointvec},
+	{SCHED_STARVATION_LIMIT, "starvation_limit",
+	&starvation_limit, sizeof(int), 0644, NULL, &proc_dointvec},
+	{0}
+};
 
 extern void init_irq_proc (void);
 
