Index: sched_ule.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/sched_ule.c,v
retrieving revision 1.216
diff -u -r1.216 sched_ule.c
--- sched_ule.c	23 Oct 2007 00:52:24 -0000	1.216
+++ sched_ule.c	3 Nov 2007 21:36:27 -0000
@@ -88,7 +88,8 @@
 	short		ts_flags;	/* TSF_* flags. */
 	u_char		ts_rqindex;	/* Run queue index. */
 	u_char		ts_cpu;		/* CPU that we have affinity for. */
-	int		ts_slice;	/* Ticks of slice remaining. */
+	int		ts_slice;	/* Ticks of slice used. */
+	int		ts_score;	/* Interactivity score. */
 	u_int		ts_slptime;	/* Number of ticks we vol. slept */
 	u_int		ts_runtime;	/* Number of ticks we were running */
 	/* The following variables are only used for pctcpu calculation */
@@ -102,6 +103,7 @@
 /* flags kept in ts_flags */
 #define	TSF_BOUND	0x0001	/* Thread can not migrate. */
 #define	TSF_XFERABLE	0x0002	/* Thread was added as transferable. */
+#define	TSF_INTERLOAD	0x0004	/* Interactive load on runq. */

 static struct td_sched td_sched0;

@@ -167,13 +169,15 @@
  * the shift factor. Without the shift the error rate
  * due to rounding would be unacceptably high.
  * realstathz:		stathz is sometimes 0 and run off of hz.
- * sched_slice:		Runtime of each thread before rescheduling.
+ * sched_slice_max:	Maximum runtime of each thread before rescheduling.
+ * sched_slice_min:	Minimum runtime of each thread before rescheduling.
  * preempt_thresh:	Priority threshold for preemption and remote IPIs.
  */
 static int sched_interact = SCHED_INTERACT_THRESH;
 static int realstathz;
 static int tickincr;
-static int sched_slice;
+static int sched_slice_max = 1;
+static int sched_slice_min = 1;
 #ifdef PREEMPTION
 #ifdef FULL_PREEMPTION
 static int preempt_thresh = PRI_MAX_IDLE;
@@ -194,6 +198,7 @@
 	struct runq	tdq_realtime;	/* real-time run queue. */
 	struct runq	tdq_timeshare;	/* timeshare run queue. */
 	struct runq	tdq_idle;	/* Queue of IDLE threads. */
+	unsigned int	tdq_interload;	/* Interactive load. */
 	int		tdq_load;	/* Aggregate load. */
 	u_char		tdq_idx;	/* Current insert index. */
 	u_char		tdq_ridx;	/* Current removal index. */
@@ -239,7 +244,6 @@
 static int balance_interval = 128;	/* Default set in sched_initticks(). */
 static int pick_pri = 1;
 static int affinity;
-static int tryself = 1;
 static int steal_htt = 1;
 static int steal_idle = 1;
 static int steal_thresh = 2;
@@ -288,10 +292,12 @@
 static void tdq_setup(struct tdq *);
 static void tdq_load_add(struct tdq *, struct td_sched *);
 static void tdq_load_rem(struct tdq *, struct td_sched *);
+static int tdq_slice(struct tdq *);
 static __inline void tdq_runq_add(struct tdq *, struct td_sched *, int);
 static __inline void tdq_runq_rem(struct tdq *, struct td_sched *);
-void tdq_print(int cpu);
-static void runq_print(struct runq *rq);
+void tdq_print(int);
+void sched_print(struct thread *);
+static void runq_print(struct runq *);
 static void tdq_add(struct tdq *, struct thread *, int);
 #ifdef SMP
 static void tdq_move(struct tdq *, struct tdq *);
@@ -345,6 +351,26 @@
 	}
 }

+void
+sched_print(struct thread *td)
+{
+	struct td_sched *ts;
+
+	if (td == NULL)
+		td = curthread;
+	ts = td->td_sched;
+	printf("flags: 0x%X\n", ts->ts_flags);
+	printf("rqindex: %d\n", ts->ts_rqindex);
+	printf("cpu: %d\n", ts->ts_cpu);
+	printf("slice: %d\n", ts->ts_slice);
+	printf("score: %d\n", ts->ts_score);
+	printf("slptime: %d\n", ts->ts_slptime);
+	printf("runtime: %d\n", ts->ts_runtime);
+	printf("ltick: %d\n", ts->ts_ltick);
+	printf("ftick: %d\n", ts->ts_ftick);
+	printf("ticks: %d\n", ts->ts_ticks);
+}
+
 /*
  * Print the status of a per-cpu thread queue. Should be a ddb show cmd.
  */
@@ -357,7 +383,9 @@

 	printf("tdq %d:\n", TDQ_ID(tdq));
 	printf("\tlockptr %p\n", TDQ_LOCKPTR(tdq));
+	printf("\tinterload: %d\n", tdq->tdq_interload);
 	printf("\tload: %d\n", tdq->tdq_load);
+	printf("\tslice: %d\n", tdq_slice(tdq));
 	printf("\ttimeshare idx: %d\n", tdq->tdq_idx);
 	printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx);
 	printf("\trealtime runq:\n");
@@ -383,8 +411,12 @@
 static __inline void
 tdq_runq_add(struct tdq *tdq, struct td_sched *ts, int flags)
 {
+	u_char pri;
+
+	pri = ts->ts_thread->td_priority;
 	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
 	THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED);
+	TD_SET_RUNQ(ts->ts_thread);
 #ifdef SMP
 	if (THREAD_CAN_MIGRATE(ts->ts_thread)) {
 		tdq->tdq_transferable++;
@@ -392,15 +424,15 @@
 		ts->ts_flags |= TSF_XFERABLE;
 	}
 #endif
-	if (ts->ts_runq == &tdq->tdq_timeshare) {
-		u_char pri;
-
-		pri = ts->ts_thread->td_priority;
+	if (pri <= PRI_MAX_REALTIME) {
+		ts->ts_runq = &tdq->tdq_realtime;
+	} else if (pri <= PRI_MAX_TIMESHARE) {
+		ts->ts_runq = &tdq->tdq_timeshare;
 		KASSERT(pri <= PRI_MAX_TIMESHARE && pri >= PRI_MIN_TIMESHARE,
 		    ("Invalid priority %d on timeshare runq", pri));
 		/*
 		 * This queue contains only priorities between MIN and MAX
-		 * realtime. Use the whole queue to represent these values.
+		 * timeshare. Use the whole queue to represent these values.
 		 */
 		if ((flags & (SRQ_BORROWING|SRQ_PREEMPTED)) == 0) {
 			pri = (pri - PRI_MIN_TIMESHARE) / TS_RQ_PPQ;
@@ -416,8 +448,10 @@
 		} else
 			pri = tdq->tdq_ridx;
 		runq_add_pri(ts->ts_runq, ts, pri, flags);
+		return;
 	} else
-		runq_add(ts->ts_runq, ts, flags);
+		ts->ts_runq = &tdq->tdq_idle;
+	runq_add(ts->ts_runq, ts, flags);
 }

 /*
@@ -443,13 +477,6 @@
 			runq_remove_idx(ts->ts_runq, ts, &tdq->tdq_ridx);
 		else
 			runq_remove_idx(ts->ts_runq, ts, NULL);
-		/*
-		 * For timeshare threads we update the priority here so
-		 * the priority reflects the time we've been sleeping.
-		 */
-		ts->ts_ltick = ticks;
-		sched_pctcpu_update(ts);
-		sched_priority(ts->ts_thread);
 	} else
 		runq_remove(ts->ts_runq, ts);
 }
@@ -466,6 +493,8 @@
 	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
 	THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED);
 	class = PRI_BASE(ts->ts_thread->td_pri_class);
+	tdq->tdq_interload += ts->ts_score;
+	ts->ts_flags |= TSF_INTERLOAD;
 	tdq->tdq_load++;
 	CTR2(KTR_SCHED, "cpu %d load: %d", TDQ_ID(tdq), tdq->tdq_load);
 	if (class != PRI_ITHD &&
@@ -498,9 +527,37 @@
 #endif
 	KASSERT(tdq->tdq_load != 0,
 	    ("tdq_load_rem: Removing with 0 load on queue %d", TDQ_ID(tdq)));
+	ts->ts_flags &= ~TSF_INTERLOAD;
+	ts->ts_runq = NULL;
+	tdq->tdq_interload -= ts->ts_score;
 	tdq->tdq_load--;
 	CTR1(KTR_SCHED, "load: %d", tdq->tdq_load);
-	ts->ts_runq = NULL;
+}
+
+/*
+ * Compute the maximum slice when the interload changes. This gives a soft
+ * upper bound on latency as the load increases.
+ */
+static int
+tdq_slice(struct tdq *tdq)
+{
+	int slice;
+	int load;
+
+	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+	if (tdq->tdq_interload == 0)
+		return (sched_slice_max);
+	/*
+	 * An interload of 100 is roughly equivalent to 100% cpu utilization
+	 * requested. Calculate how many times overloaded we are and then
+	 * divide the latency target by this number. None of this is precise
+	 * but it does yield decreasing slice values within the [min, max]
+	 * range as load increases.
+	 */
+	load = (tdq->tdq_interload + 99) / 100;
+	slice = sched_slice_max / load;
+	slice = max(slice, sched_slice_min);
+	return (slice);
 }

 #ifdef SMP
@@ -1070,14 +1127,6 @@
 	cpu = self = PCPU_GET(cpuid);
 	if (smp_started == 0)
 		return (self);
-	/*
-	 * Don't migrate a running thread from sched_switch().
-	 */
-	if (flags & SRQ_OURSELF) {
-		CTR1(KTR_ULE, "YIELDING %d",
-		    curthread->td_priority);
-		return (self);
-	}
 	pri = ts->ts_thread->td_priority;
 	cpu = ts->ts_cpu;
 	/*
@@ -1094,7 +1143,8 @@
 	/*
 	 * If we have affinity, try to place it on the cpu we last ran on.
 	 */
-	if (SCHED_AFFINITY(ts) && tdq->tdq_lowpri > pri) {
+	if (SCHED_AFFINITY(ts) &&
+	    (tdq->tdq_lowpri > pri || tdq->tdq_interload < 10)) {
 		CTR5(KTR_ULE,
 		    "affinity for %d, ltick %d ticks %d pri %d curthread %d",
 		    ts->ts_cpu, ts->ts_rltick, ticks, pri,
@@ -1113,9 +1163,9 @@
 	 * This may improve locality among sleepers and wakers when there
 	 * is shared data.
 	 */
-	if (tryself && pri < curthread->td_priority) {
-		CTR1(KTR_ULE, "tryself %d",
-		    curthread->td_priority);
+	tdq = TDQ_CPU(self);
+	if (pri < curthread->td_priority || tdq->tdq_interload < 10) {
+		CTR1(KTR_ULE, "tryself %d", curthread->td_priority);
 		return (self);
 	}
 	/*
@@ -1175,6 +1225,7 @@
 	runq_init(&tdq->tdq_timeshare);
 	runq_init(&tdq->tdq_idle);
 	tdq->tdq_load = 0;
+	tdq->tdq_interload = 0;
 }

 #ifdef SMP
@@ -1324,12 +1375,12 @@
 	 * in case which sched_clock() called before sched_initticks().
 	 */
 	realstathz = hz;
-	sched_slice = (realstathz/10);	/* ~100ms */
 	tickincr = 1 << SCHED_TICK_SHIFT;

 	/* Add thread0's load since it's running. */
 	TDQ_LOCK(tdq);
 	thread0.td_lock = TDQ_LOCKPTR(TDQ_SELF());
+	td_sched0.ts_score = 0;
 	tdq_load_add(tdq, &td_sched0);
 	TDQ_UNLOCK(tdq);
 }
@@ -1344,7 +1395,8 @@
 	int incr;

 	realstathz = stathz ? stathz : hz;
-	sched_slice = (realstathz/10);	/* ~100ms */
+	sched_slice_max = realstathz / 15;	/* ~66ms */
+	sched_slice_min = realstathz / 50;	/* ~20ms */

 	/*
 	 * tickincr is shifted out by 10 to avoid rounding errors due to
@@ -1374,7 +1426,6 @@
 #endif
 }

-
 /*
  * This is the core of the interactivity algorithm. Determines a score based
  * on past behavior. It is the ratio of sleep time to run time scaled to
@@ -1389,15 +1440,6 @@
 	int div;

 	ts = td->td_sched;
-	/*
-	 * The score is only needed if this is likely to be an interactive
-	 * task. Don't go through the expense of computing it if there's
-	 * no chance.
-	 */
-	if (sched_interact <= SCHED_INTERACT_HALF &&
-	    ts->ts_runtime >= ts->ts_slptime)
-		return (SCHED_INTERACT_HALF);
-
 	if (ts->ts_runtime > ts->ts_slptime) {
 		div = max(1, ts->ts_runtime / SCHED_INTERACT_HALF);
 		return (SCHED_INTERACT_HALF +
@@ -1443,7 +1485,7 @@
 	 * score. Negative nice values make it easier for a thread to be
 	 * considered interactive.
 	 */
-	score = imax(0, sched_interact_score(td) - td->td_proc->p_nice);
+	score = imax(0, td->td_sched->ts_score - td->td_proc->p_nice);
 	if (score < sched_interact) {
 		pri = PRI_MIN_REALTIME;
 		pri += ((PRI_MAX_REALTIME - PRI_MIN_REALTIME) / sched_interact)
@@ -1477,12 +1519,15 @@
 sched_interact_update(struct thread *td)
 {
 	struct td_sched *ts;
+	struct tdq *tdq;
 	u_int sum;
+	int score;

+	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	ts = td->td_sched;
 	sum = ts->ts_runtime + ts->ts_slptime;
 	if (sum < SCHED_SLP_RUN_MAX)
-		return;
+		goto score;
 	/*
 	 * This only happens from two places:
 	 * 1) We have added an unusual amount of run time from fork_exit.
@@ -1490,13 +1535,13 @@
 	 */
 	if (sum > SCHED_SLP_RUN_MAX * 2) {
 		if (ts->ts_runtime > ts->ts_slptime) {
-			ts->ts_runtime = SCHED_SLP_RUN_MAX;
+			ts->ts_runtime = SCHED_SLP_RUN_MAX / 2;
 			ts->ts_slptime = 1;
 		} else {
-			ts->ts_slptime = SCHED_SLP_RUN_MAX;
+			ts->ts_slptime = SCHED_SLP_RUN_MAX / 2;
 			ts->ts_runtime = 1;
 		}
-		return;
+		goto score;
 	}
 	/*
 	 * If we have exceeded by more than 1/5th then the algorithm below
@@ -1506,10 +1551,19 @@
 	if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) {
 		ts->ts_runtime /= 2;
 		ts->ts_slptime /= 2;
-		return;
+		goto score;
 	}
 	ts->ts_runtime = (ts->ts_runtime / 5) * 4;
 	ts->ts_slptime = (ts->ts_slptime / 5) * 4;
+score:
+	score = sched_interact_score(td);
+	if (ts->ts_flags & TSF_INTERLOAD) {
+		tdq = TDQ_CPU(ts->ts_cpu);
+		TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+		tdq->tdq_interload -= ts->ts_score;
+		tdq->tdq_interload += score;
+	}
+	ts->ts_score = score;
 }

 /*
@@ -1559,7 +1613,7 @@
 {

 	/* Convert sched_slice to hz */
-	return (hz/(realstathz/sched_slice));
+	return (hz/(realstathz/sched_slice_max));
 }

 /*
@@ -1598,16 +1652,19 @@
 sched_thread_priority(struct thread *td, u_char prio)
 {
 	struct td_sched *ts;
+	struct tdq *tdq;

 	CTR6(KTR_SCHED, "sched_prio: %p(%s) prio %d newprio %d by %p(%s)",
 	    td, td->td_proc->p_comm, td->td_priority, prio, curthread,
 	    curthread->td_proc->p_comm);
 	ts = td->td_sched;
+	tdq = TDQ_CPU(ts->ts_cpu);
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
-	if (td->td_priority == prio)
+	if (td->td_priority <= prio) {
+		td->td_priority = prio;
 		return;
-
-	if (TD_ON_RUNQ(td) && prio < td->td_priority) {
+	}
+	if (TD_ON_RUNQ(td)) {
 		/*
 		 * If the priority has been elevated due to priority
 		 * propagation, we may have to move ourselves to a new
@@ -1617,16 +1674,14 @@
 		sched_rem(td);
 		td->td_priority = prio;
 		sched_add(td, SRQ_BORROWING);
-	} else {
 #ifdef SMP
-		struct tdq *tdq;
-
-		tdq = TDQ_CPU(ts->ts_cpu);
+	} else if (TD_IS_RUNNING(td)) {
 		if (prio < tdq->tdq_lowpri)
 			tdq->tdq_lowpri = prio;
+		td->td_priority = prio;
 #endif
+	} else
 		td->td_priority = prio;
-	}
 }

 /*
@@ -1772,6 +1827,8 @@

 	tdn = TDQ_CPU(td->td_sched->ts_cpu);
 #ifdef SMP
+	/* The load is being removed from the current cpu. */
+	tdq_load_rem(tdq, td->td_sched);
 	/*
 	 * Do the lock dance required to avoid LOR. We grab an extra
 	 * spinlock nesting to prevent preemption while we're
@@ -1863,12 +1920,11 @@
 		TD_SET_CAN_RUN(td);
 	} else if (TD_IS_RUNNING(td)) {
 		MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
-		tdq_load_rem(tdq, ts);
 		srqflag = (flags & SW_PREEMPT) ?
 		    SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
 		    SRQ_OURSELF|SRQ_YIELDING;
 		if (ts->ts_cpu == cpuid)
-			tdq_add(tdq, td, srqflag);
+			tdq_runq_add(tdq, ts, srqflag);
 		else
 			mtx = sched_switch_migrate(tdq, td, srqflag);
 	} else {
@@ -1970,22 +2026,18 @@
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	ts = td->td_sched;
 	/*
-	 * If we slept for more than a tick update our interactivity and
-	 * priority.
+	 * Update interactivity and priority after a sleep.
 	 */
 	slptick = td->td_slptick;
 	td->td_slptick = 0;
-	if (slptick && slptick != ticks) {
-		u_int hzticks;
-
-		hzticks = (ticks - slptick) << SCHED_TICK_SHIFT;
-		ts->ts_slptime += hzticks;
+	if (slptick && slptick != ticks) {
+		ts->ts_slptime += (ticks - slptick) << SCHED_TICK_SHIFT;
 		sched_interact_update(td);
 		sched_pctcpu_update(ts);
 		sched_priority(td);
+		/* Reset the slice value after we sleep. */
+		ts->ts_slice = 0;
 	}
-	/* Reset the slice value after we sleep. */
-	ts->ts_slice = sched_slice;
 	sched_add(td, SRQ_BORING);
 }

@@ -2040,7 +2092,6 @@
 	 */
 	ts2->ts_slptime = ts->ts_slptime;
 	ts2->ts_runtime = ts->ts_runtime;
-	ts2->ts_slice = 1;	/* Attempt to quickly learn interactivity. */
 }

 /*
@@ -2188,25 +2239,26 @@
 	}
 	ts = td->td_sched;
 	/*
-	 * We only do slicing code for TIMESHARE threads.
-	 */
-	if (td->td_pri_class != PRI_TIMESHARE)
-		return;
-	/*
 	 * We used a tick; charge it to the thread so that we can compute our
 	 * interactivity.
 	 */
 	td->td_sched->ts_runtime += tickincr;
 	sched_interact_update(td);
 	/*
+	 * We only do slicing code for TIMESHARE threads.
+	 */
+	if (td->td_pri_class != PRI_TIMESHARE)
+		return;
+	sched_priority(td);
+	/*
 	 * We used up one time slice.
 	 */
-	if (--ts->ts_slice > 0)
+	if (++ts->ts_slice < tdq_slice(tdq))
 		return;
 	/*
-	 * We're out of time, recompute priorities and requeue.
+	 * We're out of time, force a requeue later.
 	 */
-	sched_priority(td);
+	ts->ts_slice = 0;
 	td->td_flags |= TDF_NEEDRESCHED;
 }

@@ -2328,11 +2380,10 @@
 tdq_add(struct tdq *tdq, struct thread *td, int flags)
 {
 	struct td_sched *ts;
-	int class;
 #ifdef SMP
+	int class;
 	int cpumask;
 #endif
-
 	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
 	KASSERT((td->td_inhibitors == 0),
 	    ("sched_add: trying to run inhibited thread"));
@@ -2342,20 +2393,10 @@
 	    ("sched_add: thread swapped out"));

 	ts = td->td_sched;
-	class = PRI_BASE(td->td_pri_class);
-	TD_SET_RUNQ(td);
-	if (ts->ts_slice == 0)
-		ts->ts_slice = sched_slice;
-	/*
-	 * Pick the run queue based on priority.
-	 */
-	if (td->td_priority <= PRI_MAX_REALTIME)
-		ts->ts_runq = &tdq->tdq_realtime;
-	else if (td->td_priority <= PRI_MAX_TIMESHARE)
-		ts->ts_runq = &tdq->tdq_timeshare;
-	else
-		ts->ts_runq = &tdq->tdq_idle;
+	tdq_runq_add(tdq, ts, flags);
+	tdq_load_add(tdq, ts);
 #ifdef SMP
+	class = PRI_BASE(td->td_pri_class);
 	cpumask = 1 << ts->ts_cpu;
 	/*
 	 * If we had been idle, clear our bit in the group and potentially
@@ -2378,8 +2419,6 @@
 	if (td->td_priority < tdq->tdq_lowpri)
 		tdq->tdq_lowpri = td->td_priority;
 #endif
-	tdq_runq_add(tdq, ts, flags);
-	tdq_load_add(tdq, ts);
 }

 /*
@@ -2660,8 +2699,10 @@
     "Scheduler");
 SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ULE", 0,
     "Scheduler name");
-SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0,
-    "Slice size for timeshare threads");
+SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice_max, 0,
+    "Maximum slice size for timeshare threads");
+SYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &sched_slice_min, 0,
+    "Minimum slice size for timeshare threads");
 SYSCTL_INT(_kern_sched, OID_AUTO, interact, CTLFLAG_RW, &sched_interact, 0,
     "Interactivity score threshold");
 SYSCTL_INT(_kern_sched, OID_AUTO, preempt_thresh, CTLFLAG_RW, &preempt_thresh,
@@ -2671,7 +2712,6 @@
     "Pick the target cpu based on priority rather than load.");
 SYSCTL_INT(_kern_sched, OID_AUTO, affinity, CTLFLAG_RW, &affinity, 0,
     "Number of hz ticks to keep thread affinity for");
-SYSCTL_INT(_kern_sched, OID_AUTO, tryself, CTLFLAG_RW, &tryself, 0, "");
 SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0,
     "Enables the long-term load balancer");
 SYSCTL_INT(_kern_sched, OID_AUTO, balance_interval, CTLFLAG_RW,
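For reference, below is a small standalone C sketch (not part of the diff above) that mirrors the arithmetic the patch introduces, so the slice scaling and the sleep/run interactivity ratio can be checked in userland. The constants are assumptions matching the patch's defaults at stathz = 1000 and the stock SCHED_INTERACT_MAX of 100; interact_score() paraphrases sched_interact_score(), parts of which fall outside the hunks shown, and the helper names are illustrative only.

#include <stdio.h>

#define	SCHED_INTERACT_MAX	100
#define	SCHED_INTERACT_HALF	(SCHED_INTERACT_MAX / 2)

static int sched_slice_max = 1000 / 15;	/* ~66 ticks at stathz = 1000 */
static int sched_slice_min = 1000 / 50;	/* ~20 ticks at stathz = 1000 */

/* Same shape as tdq_slice(): divide the latency target by the overload. */
static int
slice_for_interload(int interload)
{
	int load, slice;

	if (interload == 0)
		return (sched_slice_max);
	load = (interload + 99) / 100;	/* how many times overloaded */
	slice = sched_slice_max / load;
	return (slice > sched_slice_min ? slice : sched_slice_min);
}

/* Sleep/run ratio scaled to SCHED_INTERACT_MAX; 0 is interactive, 100 is a hog. */
static int
interact_score(int runtime, int slptime)
{
	int div;

	if (runtime > slptime) {
		div = runtime / SCHED_INTERACT_HALF;
		if (div < 1)
			div = 1;
		return (SCHED_INTERACT_HALF + (SCHED_INTERACT_HALF - slptime / div));
	}
	if (slptime > runtime) {
		div = slptime / SCHED_INTERACT_HALF;
		if (div < 1)
			div = 1;
		return (runtime / div);
	}
	return (runtime ? SCHED_INTERACT_HALF : 0);
}

int
main(void)
{
	int interload;

	/* The slice shrinks from the max toward the min as interload grows. */
	for (interload = 0; interload <= 500; interload += 100)
		printf("interload %3d -> slice %d ticks\n", interload,
		    slice_for_interload(interload));
	/* A mostly sleeping thread scores low (interactive)... */
	printf("score(run=10, slp=90) = %d\n", interact_score(10, 90));
	/* ...and a cpu hog scores high (batch). */
	printf("score(run=90, slp=10) = %d\n", interact_score(90, 10));
	return (0);
}

With these numbers the slice stays at the ~66-tick maximum until the summed interload exceeds 100, drops to 33 ticks up to 200, and is clamped at the 20-tick minimum once the queue is roughly four times overloaded, which is the soft latency bound the tdq_slice() comment describes.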