Index: sched_ule.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/sched_ule.c,v
retrieving revision 1.216
diff -u -r1.216 sched_ule.c
--- sched_ule.c	23 Oct 2007 00:52:24 -0000	1.216
+++ sched_ule.c	3 Nov 2007 21:36:27 -0000
@@ -88,7 +88,8 @@
 	short		ts_flags;	/* TSF_* flags. */
 	u_char		ts_rqindex;	/* Run queue index. */
 	u_char		ts_cpu;		/* CPU that we have affinity for. */
-	int		ts_slice;	/* Ticks of slice remaining. */
+	int		ts_slice;	/* Ticks of slice used. */
+	int		ts_score;	/* Interactivity score. */
 	u_int		ts_slptime;	/* Number of ticks we vol. slept */
 	u_int		ts_runtime;	/* Number of ticks we were running */
 	/* The following variables are only used for pctcpu calculation */
@@ -102,6 +103,7 @@
 /* flags kept in ts_flags */
 #define	TSF_BOUND	0x0001	/* Thread can not migrate. */
 #define	TSF_XFERABLE	0x0002	/* Thread was added as transferable. */
+#define	TSF_INTERLOAD	0x0004	/* Interactive load on runq. */

 static struct td_sched td_sched0;

@@ -167,13 +169,15 @@
  * the shift factor. Without the shift the error rate
  * due to rounding would be unacceptably high.
  * realstathz:		stathz is sometimes 0 and run off of hz.
- * sched_slice:		Runtime of each thread before rescheduling.
+ * sched_slice_max:	Maximum runtime of each thread before rescheduling.
+ * sched_slice_min:	Minimum runtime of each thread before rescheduling.
  * preempt_thresh:	Priority threshold for preemption and remote IPIs.
  */
 static int sched_interact = SCHED_INTERACT_THRESH;
 static int realstathz;
 static int tickincr;
-static int sched_slice;
+static int sched_slice_max = 1;
+static int sched_slice_min = 1;
 #ifdef PREEMPTION
 #ifdef FULL_PREEMPTION
 static int preempt_thresh = PRI_MAX_IDLE;
@@ -194,6 +198,7 @@
 	struct runq	tdq_realtime;	/* real-time run queue. */
 	struct runq	tdq_timeshare;	/* timeshare run queue. */
 	struct runq	tdq_idle;	/* Queue of IDLE threads. */
+	unsigned int	tdq_interload;	/* Interactive load. */
 	int		tdq_load;	/* Aggregate load. */
 	u_char		tdq_idx;	/* Current insert index. */
 	u_char		tdq_ridx;	/* Current removal index. */
@@ -239,7 +244,6 @@
 static int balance_interval = 128;	/* Default set in sched_initticks(). */
 static int pick_pri = 1;
 static int affinity;
-static int tryself = 1;
 static int steal_htt = 1;
 static int steal_idle = 1;
 static int steal_thresh = 2;
@@ -288,10 +292,12 @@
 static void tdq_setup(struct tdq *);
 static void tdq_load_add(struct tdq *, struct td_sched *);
 static void tdq_load_rem(struct tdq *, struct td_sched *);
+static int tdq_slice(struct tdq *);
 static __inline void tdq_runq_add(struct tdq *, struct td_sched *, int);
 static __inline void tdq_runq_rem(struct tdq *, struct td_sched *);
-void tdq_print(int cpu);
-static void runq_print(struct runq *rq);
+void tdq_print(int);
+void sched_print(struct thread *);
+static void runq_print(struct runq *);
 static void tdq_add(struct tdq *, struct thread *, int);
 #ifdef SMP
 static void tdq_move(struct tdq *, struct tdq *);
@@ -345,6 +351,26 @@
 	}
 }

+void
+sched_print(struct thread *td)
+{
+	struct td_sched *ts;
+
+	if (td == NULL)
+		td = curthread;
+	ts = td->td_sched;
+	printf("flags: 0x%X\n", ts->ts_flags);
+	printf("rqindex: %d\n", ts->ts_rqindex);
+	printf("cpu: %d\n", ts->ts_cpu);
+	printf("slice: %d\n", ts->ts_slice);
+	printf("score: %d\n", ts->ts_score);
+	printf("slptime: %d\n", ts->ts_slptime);
+	printf("runtime: %d\n", ts->ts_runtime);
+	printf("ltick: %d\n", ts->ts_ltick);
+	printf("ftick: %d\n", ts->ts_ftick);
+	printf("ticks: %d\n", ts->ts_ticks);
+}
+
 /*
  * Print the status of a per-cpu thread queue. Should be a ddb show cmd.
  */
@@ -357,7 +383,9 @@

 	printf("tdq %d:\n", TDQ_ID(tdq));
 	printf("\tlockptr %p\n", TDQ_LOCKPTR(tdq));
+	printf("\tinterload: %d\n", tdq->tdq_interload);
 	printf("\tload: %d\n", tdq->tdq_load);
+	printf("\tslice: %d\n", tdq_slice(tdq));
 	printf("\ttimeshare idx: %d\n", tdq->tdq_idx);
 	printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx);
 	printf("\trealtime runq:\n");
@@ -383,8 +411,12 @@
 static __inline void
 tdq_runq_add(struct tdq *tdq, struct td_sched *ts, int flags)
 {
+	u_char pri;
+
+	pri = ts->ts_thread->td_priority;
 	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
 	THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED);
+	TD_SET_RUNQ(ts->ts_thread);
 #ifdef SMP
 	if (THREAD_CAN_MIGRATE(ts->ts_thread)) {
 		tdq->tdq_transferable++;
@@ -392,15 +424,15 @@
 		ts->ts_flags |= TSF_XFERABLE;
 	}
 #endif
-	if (ts->ts_runq == &tdq->tdq_timeshare) {
-		u_char pri;
-
-		pri = ts->ts_thread->td_priority;
+	if (pri <= PRI_MAX_REALTIME) {
+		ts->ts_runq = &tdq->tdq_realtime;
+	} else if (pri <= PRI_MAX_TIMESHARE) {
+		ts->ts_runq = &tdq->tdq_timeshare;
 		KASSERT(pri <= PRI_MAX_TIMESHARE && pri >= PRI_MIN_TIMESHARE,
 		    ("Invalid priority %d on timeshare runq", pri));
 		/*
 		 * This queue contains only priorities between MIN and MAX
-		 * realtime. Use the whole queue to represent these values.
+		 * timeshare. Use the whole queue to represent these values.
 		 */
 		if ((flags & (SRQ_BORROWING|SRQ_PREEMPTED)) == 0) {
 			pri = (pri - PRI_MIN_TIMESHARE) / TS_RQ_PPQ;
@@ -416,8 +448,10 @@
 		} else
 			pri = tdq->tdq_ridx;
 		runq_add_pri(ts->ts_runq, ts, pri, flags);
+		return;
 	} else
-		runq_add(ts->ts_runq, ts, flags);
+		ts->ts_runq = &tdq->tdq_idle;
+	runq_add(ts->ts_runq, ts, flags);
 }

 /*
@@ -443,13 +477,6 @@
 			runq_remove_idx(ts->ts_runq, ts, &tdq->tdq_ridx);
 		else
 			runq_remove_idx(ts->ts_runq, ts, NULL);
-		/*
-		 * For timeshare threads we update the priority here so
-		 * the priority reflects the time we've been sleeping.
-		 */
-		ts->ts_ltick = ticks;
-		sched_pctcpu_update(ts);
-		sched_priority(ts->ts_thread);
 	} else
 		runq_remove(ts->ts_runq, ts);
 }
@@ -466,6 +493,8 @@
 	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
 	THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED);
 	class = PRI_BASE(ts->ts_thread->td_pri_class);
+	tdq->tdq_interload += ts->ts_score;
+	ts->ts_flags |= TSF_INTERLOAD;
 	tdq->tdq_load++;
 	CTR2(KTR_SCHED, "cpu %d load: %d", TDQ_ID(tdq), tdq->tdq_load);
 	if (class != PRI_ITHD &&
@@ -498,9 +527,37 @@
 #endif
 	KASSERT(tdq->tdq_load != 0,
 	    ("tdq_load_rem: Removing with 0 load on queue %d", TDQ_ID(tdq)));
+	ts->ts_flags &= ~TSF_INTERLOAD;
+	ts->ts_runq = NULL;
+	tdq->tdq_interload -= ts->ts_score;
 	tdq->tdq_load--;
 	CTR1(KTR_SCHED, "load: %d", tdq->tdq_load);
-	ts->ts_runq = NULL;
+}
+
+/*
+ * Compute the maximum slice when the interload changes. This gives a soft
+ * upper bound on latency as the load increases.
+ */
+static int
+tdq_slice(struct tdq *tdq)
+{
+	int slice;
+	int load;
+
+	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+	if (tdq->tdq_interload == 0)
+		return (sched_slice_max);
+	/*
+	 * An interload of 100 is roughly equivalent to 100% cpu utilization
+	 * requested. Calculate how many times overloaded we are and then
+	 * divide the latency target by this number. None of this is precise
+	 * but it does yield decreasing slice values within the [min, max]
+	 * range as load increases.
+	 */
+	load = (tdq->tdq_interload + 99) / 100;
+	slice = sched_slice_max / load;
+	slice = max(slice, sched_slice_min);
+	return (slice);
 }

 #ifdef SMP
@@ -1070,14 +1127,6 @@
 	cpu = self = PCPU_GET(cpuid);
 	if (smp_started == 0)
 		return (self);
-	/*
-	 * Don't migrate a running thread from sched_switch().
-	 */
-	if (flags & SRQ_OURSELF) {
-		CTR1(KTR_ULE, "YIELDING %d",
-		    curthread->td_priority);
-		return (self);
-	}
 	pri = ts->ts_thread->td_priority;
 	cpu = ts->ts_cpu;
 	/*
@@ -1094,7 +1143,8 @@
 	/*
 	 * If we have affinity, try to place it on the cpu we last ran on.
 	 */
-	if (SCHED_AFFINITY(ts) && tdq->tdq_lowpri > pri) {
+	if (SCHED_AFFINITY(ts) &&
+	    (tdq->tdq_lowpri > pri || tdq->tdq_interload < 10)) {
 		CTR5(KTR_ULE,
 		    "affinity for %d, ltick %d ticks %d pri %d curthread %d",
 		    ts->ts_cpu, ts->ts_rltick, ticks, pri,
@@ -1113,9 +1163,9 @@
 	 * This may improve locality among sleepers and wakers when there
 	 * is shared data.
 	 */
-	if (tryself && pri < curthread->td_priority) {
-		CTR1(KTR_ULE, "tryself %d",
-		    curthread->td_priority);
+	tdq = TDQ_CPU(self);
+	if (pri < curthread->td_priority || tdq->tdq_interload < 10) {
+		CTR1(KTR_ULE, "tryself %d", curthread->td_priority);
 		return (self);
 	}
 	/*
@@ -1175,6 +1225,7 @@
 	runq_init(&tdq->tdq_timeshare);
 	runq_init(&tdq->tdq_idle);
 	tdq->tdq_load = 0;
+	tdq->tdq_interload = 0;
 }

 #ifdef SMP
@@ -1324,12 +1375,12 @@
 	 * in case which sched_clock() called before sched_initticks().
 	 */
 	realstathz = hz;
-	sched_slice = (realstathz/10);	/* ~100ms */
 	tickincr = 1 << SCHED_TICK_SHIFT;

 	/* Add thread0's load since it's running. */
 	TDQ_LOCK(tdq);
 	thread0.td_lock = TDQ_LOCKPTR(TDQ_SELF());
+	td_sched0.ts_score = 0;
 	tdq_load_add(tdq, &td_sched0);
 	TDQ_UNLOCK(tdq);
 }
@@ -1344,7 +1395,8 @@
 	int incr;

 	realstathz = stathz ? stathz : hz;
-	sched_slice = (realstathz/10);	/* ~100ms */
+	sched_slice_max = realstathz / 15;	/* ~66ms */
+	sched_slice_min = realstathz / 50;	/* ~20ms */

 	/*
 	 * tickincr is shifted out by 10 to avoid rounding errors due to
@@ -1374,7 +1426,6 @@
 #endif
 }

-
 /*
  * This is the core of the interactivity algorithm. Determines a score based
  * on past behavior. It is the ratio of sleep time to run time scaled to
@@ -1389,15 +1440,6 @@
 	int div;

 	ts = td->td_sched;
-	/*
-	 * The score is only needed if this is likely to be an interactive
-	 * task. Don't go through the expense of computing it if there's
-	 * no chance.
-	 */
-	if (sched_interact <= SCHED_INTERACT_HALF &&
-	    ts->ts_runtime >= ts->ts_slptime)
-		return (SCHED_INTERACT_HALF);
-
 	if (ts->ts_runtime > ts->ts_slptime) {
 		div = max(1, ts->ts_runtime / SCHED_INTERACT_HALF);
 		return (SCHED_INTERACT_HALF +
@@ -1443,7 +1485,7 @@
 	 * score. Negative nice values make it easier for a thread to be
 	 * considered interactive.
 	 */
-	score = imax(0, sched_interact_score(td) - td->td_proc->p_nice);
+	score = imax(0, td->td_sched->ts_score - td->td_proc->p_nice);
 	if (score < sched_interact) {
 		pri = PRI_MIN_REALTIME;
 		pri += ((PRI_MAX_REALTIME - PRI_MIN_REALTIME) / sched_interact)
@@ -1477,12 +1519,15 @@
 sched_interact_update(struct thread *td)
 {
 	struct td_sched *ts;
+	struct tdq *tdq;
 	u_int sum;
+	int score;

+	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	ts = td->td_sched;
 	sum = ts->ts_runtime + ts->ts_slptime;
 	if (sum < SCHED_SLP_RUN_MAX)
-		return;
+		goto score;
 	/*
 	 * This only happens from two places:
 	 * 1) We have added an unusual amount of run time from fork_exit.
@@ -1490,13 +1535,13 @@
 	 */
 	if (sum > SCHED_SLP_RUN_MAX * 2) {
 		if (ts->ts_runtime > ts->ts_slptime) {
-			ts->ts_runtime = SCHED_SLP_RUN_MAX;
+			ts->ts_runtime = SCHED_SLP_RUN_MAX / 2;
 			ts->ts_slptime = 1;
 		} else {
-			ts->ts_slptime = SCHED_SLP_RUN_MAX;
+			ts->ts_slptime = SCHED_SLP_RUN_MAX / 2;
 			ts->ts_runtime = 1;
 		}
-		return;
+		goto score;
 	}
 	/*
 	 * If we have exceeded by more than 1/5th then the algorithm below
@@ -1506,10 +1551,19 @@
 	if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) {
 		ts->ts_runtime /= 2;
 		ts->ts_slptime /= 2;
-		return;
+		goto score;
 	}
 	ts->ts_runtime = (ts->ts_runtime / 5) * 4;
 	ts->ts_slptime = (ts->ts_slptime / 5) * 4;
+score:
+	score = sched_interact_score(td);
+	if (ts->ts_flags & TSF_INTERLOAD) {
+		tdq = TDQ_CPU(ts->ts_cpu);
+		TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+		tdq->tdq_interload -= ts->ts_score;
+		tdq->tdq_interload += score;
+	}
+	ts->ts_score = score;
 }

 /*
@@ -1559,7 +1613,7 @@
 {

 	/* Convert sched_slice to hz */
-	return (hz/(realstathz/sched_slice));
+	return (hz/(realstathz/sched_slice_max));
 }

 /*
@@ -1598,16 +1652,19 @@
 sched_thread_priority(struct thread *td, u_char prio)
 {
 	struct td_sched *ts;
+	struct tdq *tdq;

 	CTR6(KTR_SCHED, "sched_prio: %p(%s) prio %d newprio %d by %p(%s)",
 	    td, td->td_proc->p_comm, td->td_priority, prio, curthread,
 	    curthread->td_proc->p_comm);
 	ts = td->td_sched;
+	tdq = TDQ_CPU(ts->ts_cpu);
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
-	if (td->td_priority == prio)
+	if (td->td_priority <= prio) {
+		td->td_priority = prio;
 		return;
-
-	if (TD_ON_RUNQ(td) && prio < td->td_priority) {
+	}
+	if (TD_ON_RUNQ(td)) {
 		/*
 		 * If the priority has been elevated due to priority
 		 * propagation, we may have to move ourselves to a new
@@ -1617,16 +1674,14 @@
 		sched_rem(td);
 		td->td_priority = prio;
 		sched_add(td, SRQ_BORROWING);
-	} else {
 #ifdef SMP
-		struct tdq *tdq;
-
-		tdq = TDQ_CPU(ts->ts_cpu);
+	} else if (TD_IS_RUNNING(td)) {
 		if (prio < tdq->tdq_lowpri)
 			tdq->tdq_lowpri = prio;
+		td->td_priority = prio;
 #endif
+	} else
 		td->td_priority = prio;
-	}
 }

 /*
@@ -1772,6 +1827,8 @@

 	tdn = TDQ_CPU(td->td_sched->ts_cpu);
 #ifdef SMP
+	/* The load is being removed from the current cpu. */
+	tdq_load_rem(tdq, td->td_sched);
 	/*
 	 * Do the lock dance required to avoid LOR. We grab an extra
 	 * spinlock nesting to prevent preemption while we're
@@ -1863,12 +1920,11 @@
 		TD_SET_CAN_RUN(td);
 	} else if (TD_IS_RUNNING(td)) {
 		MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
-		tdq_load_rem(tdq, ts);
 		srqflag = (flags & SW_PREEMPT) ?
 		    SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
 		    SRQ_OURSELF|SRQ_YIELDING;
 		if (ts->ts_cpu == cpuid)
-			tdq_add(tdq, td, srqflag);
+			tdq_runq_add(tdq, ts, srqflag);
 		else
 			mtx = sched_switch_migrate(tdq, td, srqflag);
 	} else {
@@ -1970,22 +2026,18 @@
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	ts = td->td_sched;
 	/*
-	 * If we slept for more than a tick update our interactivity and
-	 * priority.
+	 * Update interactivity and priority after a sleep.
 	 */
 	slptick = td->td_slptick;
 	td->td_slptick = 0;
-	if (slptick && slptick != ticks) {
-		u_int hzticks;
-
-		hzticks = (ticks - slptick) << SCHED_TICK_SHIFT;
-		ts->ts_slptime += hzticks;
+	if (slptick && slptick != ticks) {
+		ts->ts_slptime += (ticks - slptick) << SCHED_TICK_SHIFT;
 		sched_interact_update(td);
 		sched_pctcpu_update(ts);
 		sched_priority(td);
+		/* Reset the slice value after we sleep. */
+		ts->ts_slice = 0;
 	}
-	/* Reset the slice value after we sleep. */
-	ts->ts_slice = sched_slice;
 	sched_add(td, SRQ_BORING);
 }

@@ -2040,7 +2092,6 @@
 	 */
 	ts2->ts_slptime = ts->ts_slptime;
 	ts2->ts_runtime = ts->ts_runtime;
-	ts2->ts_slice = 1;	/* Attempt to quickly learn interactivity. */
 }

 /*
@@ -2188,25 +2239,26 @@
 	}
 	ts = td->td_sched;
 	/*
-	 * We only do slicing code for TIMESHARE threads.
-	 */
-	if (td->td_pri_class != PRI_TIMESHARE)
-		return;
-	/*
 	 * We used a tick; charge it to the thread so that we can compute our
 	 * interactivity.
 	 */
 	td->td_sched->ts_runtime += tickincr;
 	sched_interact_update(td);
 	/*
+	 * We only do slicing code for TIMESHARE threads.
+	 */
+	if (td->td_pri_class != PRI_TIMESHARE)
+		return;
+	sched_priority(td);
+	/*
 	 * We used up one time slice.
 	 */
-	if (--ts->ts_slice > 0)
+	if (++ts->ts_slice < tdq_slice(tdq))
 		return;
 	/*
-	 * We're out of time, recompute priorities and requeue.
+	 * We're out of time, force a requeue later.
 	 */
-	sched_priority(td);
+	ts->ts_slice = 0;
 	td->td_flags |= TDF_NEEDRESCHED;
 }

@@ -2328,11 +2380,10 @@
 tdq_add(struct tdq *tdq, struct thread *td, int flags)
 {
 	struct td_sched *ts;
-	int class;
 #ifdef SMP
+	int class;
 	int cpumask;
 #endif
-
 	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
 	KASSERT((td->td_inhibitors == 0),
 	    ("sched_add: trying to run inhibited thread"));
@@ -2342,20 +2393,10 @@
 	    ("sched_add: thread swapped out"));

 	ts = td->td_sched;
-	class = PRI_BASE(td->td_pri_class);
-	TD_SET_RUNQ(td);
-	if (ts->ts_slice == 0)
-		ts->ts_slice = sched_slice;
-	/*
-	 * Pick the run queue based on priority.
-	 */
-	if (td->td_priority <= PRI_MAX_REALTIME)
-		ts->ts_runq = &tdq->tdq_realtime;
-	else if (td->td_priority <= PRI_MAX_TIMESHARE)
-		ts->ts_runq = &tdq->tdq_timeshare;
-	else
-		ts->ts_runq = &tdq->tdq_idle;
+	tdq_runq_add(tdq, ts, flags);
+	tdq_load_add(tdq, ts);
 #ifdef SMP
+	class = PRI_BASE(td->td_pri_class);
 	cpumask = 1 << ts->ts_cpu;
 	/*
 	 * If we had been idle, clear our bit in the group and potentially
@@ -2378,8 +2419,6 @@
 	if (td->td_priority < tdq->tdq_lowpri)
 		tdq->tdq_lowpri = td->td_priority;
 #endif
-	tdq_runq_add(tdq, ts, flags);
-	tdq_load_add(tdq, ts);
 }

 /*
@@ -2660,8 +2699,10 @@
     "Scheduler");
 SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ULE", 0,
     "Scheduler name");
-SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0,
-    "Slice size for timeshare threads");
+SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice_max, 0,
+    "Maximum slice size for timeshare threads");
+SYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &sched_slice_min, 0,
+    "Minimum slice size for timeshare threads");
 SYSCTL_INT(_kern_sched, OID_AUTO, interact, CTLFLAG_RW, &sched_interact, 0,
     "Interactivity score threshold");
 SYSCTL_INT(_kern_sched, OID_AUTO, preempt_thresh, CTLFLAG_RW, &preempt_thresh,
@@ -2671,7 +2712,6 @@
     "Pick the target cpu based on priority rather than load.");
 SYSCTL_INT(_kern_sched, OID_AUTO, affinity, CTLFLAG_RW, &affinity, 0,
     "Number of hz ticks to keep thread affinity for");
-SYSCTL_INT(_kern_sched, OID_AUTO, tryself, CTLFLAG_RW, &tryself, 0, "");
 SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0,
     "Enables the long-term load balancer");
 SYSCTL_INT(_kern_sched, OID_AUTO, balance_interval, CTLFLAG_RW,
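For reference, below is a small standalone C sketch (not part of the diff above) that mirrors the arithmetic the patch introduces, so the slice scaling and the sleep/run interactivity ratio can be checked in userland. The constants are assumptions matching the patch's defaults at stathz = 1000 and the stock SCHED_INTERACT_MAX of 100; interact_score() paraphrases sched_interact_score(), parts of which fall outside the hunks shown, and the helper names are illustrative only.

#include <stdio.h>

#define	SCHED_INTERACT_MAX	100
#define	SCHED_INTERACT_HALF	(SCHED_INTERACT_MAX / 2)

static int sched_slice_max = 1000 / 15;	/* ~66 ticks at stathz = 1000 */
static int sched_slice_min = 1000 / 50;	/* ~20 ticks at stathz = 1000 */

/* Same shape as tdq_slice(): divide the latency target by the overload. */
static int
slice_for_interload(int interload)
{
	int load, slice;

	if (interload == 0)
		return (sched_slice_max);
	load = (interload + 99) / 100;	/* how many times overloaded */
	slice = sched_slice_max / load;
	return (slice > sched_slice_min ? slice : sched_slice_min);
}

/* Sleep/run ratio scaled to SCHED_INTERACT_MAX; 0 is interactive, 100 is a hog. */
static int
interact_score(int runtime, int slptime)
{
	int div;

	if (runtime > slptime) {
		div = runtime / SCHED_INTERACT_HALF;
		if (div < 1)
			div = 1;
		return (SCHED_INTERACT_HALF + (SCHED_INTERACT_HALF - slptime / div));
	}
	if (slptime > runtime) {
		div = slptime / SCHED_INTERACT_HALF;
		if (div < 1)
			div = 1;
		return (runtime / div);
	}
	return (runtime ? SCHED_INTERACT_HALF : 0);
}

int
main(void)
{
	int interload;

	/* The slice shrinks from the max toward the min as interload grows. */
	for (interload = 0; interload <= 500; interload += 100)
		printf("interload %3d -> slice %d ticks\n", interload,
		    slice_for_interload(interload));
	/* A mostly sleeping thread scores low (interactive)... */
	printf("score(run=10, slp=90) = %d\n", interact_score(10, 90));
	/* ...and a cpu hog scores high (batch). */
	printf("score(run=90, slp=10) = %d\n", interact_score(90, 10));
	return (0);
}

With these numbers the slice stays at the ~66-tick maximum until the summed interload exceeds 100, drops to 33 ticks up to 200, and is clamped at the 20-tick minimum once the queue is roughly four times overloaded, which is the soft latency bound the tdq_slice() comment describes.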