The patch can be found inline below and also here:
https://people.freebsd.org/~mjg/lock_backoff_complete.diff

The previous version of the patch was posted here:
https://lists.freebsd.org/pipermail/freebsd-current/2016-July/062320.html

This time around, instead of a total hack, I have something of
reasonable quality. The problem description and the way to fix it still
stand, to quote:

> If the lock is contended, primitives like __mtx_lock_sleep will spin
> checking if the owner is running or the lock was freed. The problem is
> that once it is discovered that the lock is free, multiple CPUs are
> likely to try to do the atomic op which will make it more costly for
> everyone and throughput suffers.
> The standard thing to do is to have some sort of a randomized delay so
> that this kind of behaviour is reduced.

So, this time around the lock_delay primitive accepts a configuration
consisting of a minimum and maximum spin count per iteration, an
increment step applied on each lock_delay invocation, and a cap on the
total number of spins during one execution of the locking primitive.
The code "autotunes" based on mp_ncpus at boot. The current values were
determined by testing the code on an 80-way and a 40-way system. There
is definitely room for improvement, but the gain as it stands is
sufficient to justify getting the patch in. Parameters are settable per
lock type.

The converted locks are mtx, sx and rwlocks. Only the trivial loops
that spin as long as the lock owner is running were modified; the rest
was left untouched to keep the patch simpler. The following locks were
not touched: spin mutexes, thread locks and lockmgr. The first two
could be converted, but I don't have any suitable benchmarks for them.
lockmgr has support for spinning, but it is compiled out.

In terms of results, this time around I got my hands on an 80-way
machine (pig1 in the testing cluster). Vanilla kernel performance drops
significantly with increased contention. For instance, the following is
make -j 80 buildkernel of an 11.0 tree on tmpfs:

7681.08s user 17040.02s system 7014% cpu 5:52.43 total

And this is with the patched kernel:

4585.85s user 2365.95s system 5675% cpu 2:02.48 total

Both results are reproducible. I would like to get this in as fast as
possible.

Note: lock_delay has a .cap parameter which is currently set to 0 and
has no sysctl to control it. It is there for later, when it will
replace the unmodified spinning code in sx and rwlocks.
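To make the diff below easier to follow, here is the calling convention
in isolation. This is only a sketch of the pattern used by the converted
spin loops (the mutex case, mirroring the kern_mutex.c hunk below;
rwlocks and sx pass their own lock_delay_config):

	/*
	 * Per-acquisition state: both counters start at zero each time
	 * the slow path is entered. lock_delay() picks a randomized
	 * backoff between the configured min and the current delay,
	 * spins that many cpu_spinwait() rounds and advances both
	 * counters.
	 */
	uint64_t delay = 0, spin_cnt = 0;

	while (mtx_owner(m) == owner && TD_IS_RUNNING(owner))
		lock_delay(&delay, &mtx_delay, &spin_cnt);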
diff --git a/sys/kern/kern_mutex.c b/sys/kern/kern_mutex.c
index 453add4..ca9319c 100644
--- a/sys/kern/kern_mutex.c
+++ b/sys/kern/kern_mutex.c
@@ -55,6 +55,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/sbuf.h>
+#include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/turnstile.h>
 #include <sys/vmmeter.h>
@@ -138,6 +139,37 @@ struct lock_class lock_class_mtx_spin = {
 #endif
 };
 
+#ifdef ADAPTIVE_MUTEXES
+static SYSCTL_NODE(_debug, OID_AUTO, mtx, CTLFLAG_RD, NULL, "mtx debugging");
+
+static struct lock_delay_config mtx_delay = {
+	.initial = 1000,
+	.step = 500,
+	.cap = 0,
+	.min = 100,
+	.max = 5000,
+};
+
+SYSCTL_INT(_debug_mtx, OID_AUTO, delay_initial, CTLFLAG_RW, &mtx_delay.initial,
+    0, "");
+SYSCTL_INT(_debug_mtx, OID_AUTO, delay_step, CTLFLAG_RW, &mtx_delay.step,
+    0, "");
+SYSCTL_INT(_debug_mtx, OID_AUTO, delay_min, CTLFLAG_RW, &mtx_delay.min,
+    0, "");
+SYSCTL_INT(_debug_mtx, OID_AUTO, delay_max, CTLFLAG_RW, &mtx_delay.max,
+    0, "");
+
+static void
+mtx_delay_sysinit(void *dummy)
+{
+
+	mtx_delay.initial = mp_ncpus * 25;
+	mtx_delay.min = mp_ncpus * 5;
+	mtx_delay.max = mp_ncpus * 25 * 10;
+}
+LOCK_DELAY_SYSINIT(mtx_delay_sysinit);
+#endif
+
 /*
  * System-wide mutexes
  */
@@ -408,8 +440,11 @@ __mtx_lock_sleep(volatile uintptr_t *c, uintptr_t tid, int opts,
 	int contested = 0;
 	uint64_t waittime = 0;
 #endif
-#ifdef KDTRACE_HOOKS
+#if defined(ADAPTIVE_MUTEXES) || defined(KDTRACE_HOOKS)
 	uint64_t spin_cnt = 0;
+	uint64_t delay = 0;
+#endif
+#ifdef KDTRACE_HOOKS
 	uint64_t sleep_cnt = 0;
 	int64_t sleep_time = 0;
 	int64_t all_time = 0;
@@ -471,12 +506,8 @@ __mtx_lock_sleep(volatile uintptr_t *c, uintptr_t tid, int opts,
 			    "spinning", "lockname:\"%s\"",
 			    m->lock_object.lo_name);
 			while (mtx_owner(m) == owner &&
-			    TD_IS_RUNNING(owner)) {
-				cpu_spinwait();
-#ifdef KDTRACE_HOOKS
-				spin_cnt++;
-#endif
-			}
+			    TD_IS_RUNNING(owner))
+				lock_delay(&delay, &mtx_delay, &spin_cnt);
 			KTR_STATE0(KTR_SCHED, "thread",
 			    sched_tdname((struct thread *)tid),
 			    "running");
diff --git a/sys/kern/kern_rwlock.c b/sys/kern/kern_rwlock.c
index 496738d..cd93520 100644
--- a/sys/kern/kern_rwlock.c
+++ b/sys/kern/kern_rwlock.c
@@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/proc.h>
 #include <sys/rwlock.h>
 #include <sys/sched.h>
+#include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/turnstile.h>
@@ -65,15 +66,6 @@ PMC_SOFT_DECLARE( , , lock, failed);
  */
 #define	rwlock2rw(c)	(__containerof(c, struct rwlock, rw_lock))
 
-#ifdef ADAPTIVE_RWLOCKS
-static int rowner_retries = 10;
-static int rowner_loops = 10000;
-static SYSCTL_NODE(_debug, OID_AUTO, rwlock, CTLFLAG_RD, NULL,
-    "rwlock debugging");
-SYSCTL_INT(_debug_rwlock, OID_AUTO, retry, CTLFLAG_RW, &rowner_retries, 0, "");
-SYSCTL_INT(_debug_rwlock, OID_AUTO, loops, CTLFLAG_RW, &rowner_loops, 0, "");
-#endif
-
 #ifdef DDB
 #include <ddb/ddb.h>
 
@@ -100,6 +92,42 @@ struct lock_class lock_class_rw = {
 #endif
 };
 
+#ifdef ADAPTIVE_RWLOCKS
+static int rowner_retries = 10;
+static int rowner_loops = 10000;
+static SYSCTL_NODE(_debug, OID_AUTO, rwlock, CTLFLAG_RD, NULL,
+    "rwlock debugging");
+SYSCTL_INT(_debug_rwlock, OID_AUTO, retry, CTLFLAG_RW, &rowner_retries, 0, "");
+SYSCTL_INT(_debug_rwlock, OID_AUTO, loops, CTLFLAG_RW, &rowner_loops, 0, "");
+
+static struct lock_delay_config rw_delay = {
+	.initial = 1000,
+	.step = 500,
+	.cap = 0,
+	.min = 100,
+	.max = 5000,
+};
+
+SYSCTL_INT(_debug_rwlock, OID_AUTO, delay_initial, CTLFLAG_RW, &rw_delay.initial,
+    0, "");
+SYSCTL_INT(_debug_rwlock, OID_AUTO, delay_step, CTLFLAG_RW, &rw_delay.step,
+    0, "");
+SYSCTL_INT(_debug_rwlock, OID_AUTO, delay_min, CTLFLAG_RW, &rw_delay.min,
+    0, "");
+SYSCTL_INT(_debug_rwlock, OID_AUTO, delay_max, CTLFLAG_RW, &rw_delay.max,
+    0, "");
+
+static void
+rw_delay_sysinit(void *dummy)
+{
+
+	rw_delay.initial = mp_ncpus * 25;
+	rw_delay.min = mp_ncpus * 5;
+	rw_delay.max = mp_ncpus * 25 * 10;
+}
+LOCK_DELAY_SYSINIT(rw_delay_sysinit);
+#endif
+
 /*
  * Return a pointer to the owning thread if the lock is write-locked or
  * NULL if the lock is unlocked or read-locked.
@@ -355,9 +383,12 @@ __rw_rlock(volatile uintptr_t *c, const char *file, int line)
 	int contested = 0;
 #endif
 	uintptr_t v;
+#if defined(ADAPTIVE_RWLOCKS) || defined(KDTRACE_HOOKS)
+	uint64_t spin_cnt = 0;
+	uint64_t delay = 0;
+#endif
 #ifdef KDTRACE_HOOKS
 	uintptr_t state;
-	uint64_t spin_cnt = 0;
 	uint64_t sleep_cnt = 0;
 	int64_t sleep_time = 0;
 	int64_t all_time = 0;
@@ -437,12 +468,8 @@ __rw_rlock(volatile uintptr_t *c, const char *file, int line)
 				    sched_tdname(curthread), "spinning",
 				    "lockname:\"%s\"", rw->lock_object.lo_name);
 				while ((struct thread*)RW_OWNER(rw->rw_lock) ==
-				    owner && TD_IS_RUNNING(owner)) {
-					cpu_spinwait();
-#ifdef KDTRACE_HOOKS
-					spin_cnt++;
-#endif
-				}
+				    owner && TD_IS_RUNNING(owner))
+					lock_delay(&delay, &rw_delay, &spin_cnt);
 				KTR_STATE0(KTR_SCHED, "thread",
 				    sched_tdname(curthread), "running");
 				continue;
@@ -740,9 +767,12 @@ __rw_wlock_hard(volatile uintptr_t *c, uintptr_t tid, const char *file,
 	uint64_t waittime = 0;
 	int contested = 0;
 #endif
+#if defined(ADAPTIVE_RWLOCKS) || defined(KDTRACE_HOOKS)
+	uint64_t spin_cnt = 0;
+	uint64_t delay = 0;
+#endif
 #ifdef KDTRACE_HOOKS
 	uintptr_t state;
-	uint64_t spin_cnt = 0;
 	uint64_t sleep_cnt = 0;
 	int64_t sleep_time = 0;
 	int64_t all_time = 0;
@@ -798,12 +828,8 @@ __rw_wlock_hard(volatile uintptr_t *c, uintptr_t tid, const char *file,
 			    "spinning", "lockname:\"%s\"",
 			    rw->lock_object.lo_name);
 			while ((struct thread*)RW_OWNER(rw->rw_lock) == owner &&
-			    TD_IS_RUNNING(owner)) {
-				cpu_spinwait();
-#ifdef KDTRACE_HOOKS
-				spin_cnt++;
-#endif
-			}
+			    TD_IS_RUNNING(owner))
+				lock_delay(&delay, &rw_delay, &spin_cnt);
 			KTR_STATE0(KTR_SCHED, "thread",
 			    sched_tdname(curthread), "running");
 			continue;
diff --git a/sys/kern/kern_sx.c b/sys/kern/kern_sx.c
index a84df52..9a3465f 100644
--- a/sys/kern/kern_sx.c
+++ b/sys/kern/kern_sx.c
@@ -53,6 +53,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/sched.h>
 #include <sys/sleepqueue.h>
 #include <sys/sx.h>
+#include <sys/smp.h>
 #include <sys/sysctl.h>
 
 #if defined(SMP) && !defined(NO_ADAPTIVE_SX)
@@ -145,6 +146,33 @@ static u_int asx_loops = 10000;
 static SYSCTL_NODE(_debug, OID_AUTO, sx, CTLFLAG_RD, NULL, "sxlock debugging");
 SYSCTL_UINT(_debug_sx, OID_AUTO, retries, CTLFLAG_RW, &asx_retries, 0, "");
 SYSCTL_UINT(_debug_sx, OID_AUTO, loops, CTLFLAG_RW, &asx_loops, 0, "");
+
+static struct lock_delay_config sx_delay = {
+	.initial = 1000,
+	.step = 500,
+	.cap = 0,
+	.min = 100,
+	.max = 5000,
+};
+
+SYSCTL_INT(_debug_sx, OID_AUTO, delay_initial, CTLFLAG_RW, &sx_delay.initial,
+    0, "");
+SYSCTL_INT(_debug_sx, OID_AUTO, delay_step, CTLFLAG_RW, &sx_delay.step,
+    0, "");
+SYSCTL_INT(_debug_sx, OID_AUTO, delay_min, CTLFLAG_RW, &sx_delay.min,
+    0, "");
+SYSCTL_INT(_debug_sx, OID_AUTO, delay_max, CTLFLAG_RW, &sx_delay.max,
+    0, "");
+
+static void
+sx_delay_sysinit(void *dummy)
+{
+
+	sx_delay.initial = mp_ncpus * 25;
+	sx_delay.min = mp_ncpus * 5;
+	sx_delay.max = mp_ncpus * 25 * 10;
+}
+LOCK_DELAY_SYSINIT(sx_delay_sysinit);
 #endif
 
 void
@@ -513,9 +541,12 @@ _sx_xlock_hard(struct sx *sx, uintptr_t tid, int opts, const char *file,
 	int contested = 0;
 #endif
 	int error = 0;
+#if defined(ADAPTIVE_SX) || defined(KDTRACE_HOOKS)
+	uint64_t spin_cnt = 0;
+	uint64_t delay = 0;
+#endif
 #ifdef KDTRACE_HOOKS
 	uintptr_t state;
-	uint64_t spin_cnt = 0;
 	uint64_t sleep_cnt = 0;
 	int64_t sleep_time = 0;
 	int64_t all_time = 0;
@@ -578,12 +609,8 @@ _sx_xlock_hard(struct sx *sx, uintptr_t tid, int opts, const char *file,
 				    sx->lock_object.lo_name);
 				GIANT_SAVE();
 				while (SX_OWNER(sx->sx_lock) == x &&
-				    TD_IS_RUNNING(owner)) {
-					cpu_spinwait();
-#ifdef KDTRACE_HOOKS
-					spin_cnt++;
-#endif
-				}
+				    TD_IS_RUNNING(owner))
+					lock_delay(&delay, &sx_delay, &spin_cnt);
 				KTR_STATE0(KTR_SCHED, "thread",
 				    sched_tdname(curthread), "running");
 				continue;
@@ -818,9 +845,12 @@ _sx_slock_hard(struct sx *sx, int opts, const char *file, int line)
 #endif
 	uintptr_t x;
 	int error = 0;
+#if defined(ADAPTIVE_SX) || defined(KDTRACE_HOOKS)
+	uint64_t spin_cnt = 0;
+	uint64_t delay = 0;
+#endif
 #ifdef KDTRACE_HOOKS
 	uintptr_t state;
-	uint64_t spin_cnt = 0;
 	uint64_t sleep_cnt = 0;
 	int64_t sleep_time = 0;
 	int64_t all_time = 0;
@@ -888,12 +918,8 @@ _sx_slock_hard(struct sx *sx, int opts, const char *file, int line)
 				    "lockname:\"%s\"", sx->lock_object.lo_name);
 				GIANT_SAVE();
 				while (SX_OWNER(sx->sx_lock) == x &&
-				    TD_IS_RUNNING(owner)) {
-					cpu_spinwait();
-#ifdef KDTRACE_HOOKS
-					spin_cnt++;
-#endif
-				}
+				    TD_IS_RUNNING(owner))
+					lock_delay(&delay, &sx_delay, &spin_cnt);
 				KTR_STATE0(KTR_SCHED, "thread",
 				    sched_tdname(curthread), "running");
 				continue;
diff --git a/sys/kern/subr_lock.c b/sys/kern/subr_lock.c
index e78d5a9..546881f 100644
--- a/sys/kern/subr_lock.c
+++ b/sys/kern/subr_lock.c
@@ -103,6 +103,37 @@ lock_destroy(struct lock_object *lock)
 	lock->lo_flags &= ~LO_INITIALIZED;
 }
 
+uint64_t
+lock_delay(uint64_t *delayp, struct lock_delay_config *l, uint64_t *spin_cntp)
+{
+	uint64_t i, delay, backoff, min, max, cap;
+
+	delay = *delayp;
+
+	if (delay == 0)
+		delay = l->initial;
+	else {
+		delay += l->step;
+		max = l->max;
+		if (delay > max)
+			delay = max;
+	}
+
+	backoff = cpu_ticks() % delay;
+	min = l->min;
+	if (backoff < min)
+		backoff = min;
+	cap = l->cap;
+	if (cap > 0 && backoff > cap - *spin_cntp)
+		backoff = cap - *spin_cntp;
+	for (i = 0; i < backoff; i++)
+		cpu_spinwait();
+
+	*delayp = delay;
+	*spin_cntp += backoff;
+	return (cap - *spin_cntp);
+}
+
 #ifdef DDB
 DB_SHOW_COMMAND(lock, db_show_lock)
 {
diff --git a/sys/sys/lock.h b/sys/sys/lock.h
index 8d7a068..ffac5ba 100644
--- a/sys/sys/lock.h
+++ b/sys/sys/lock.h
@@ -201,9 +201,23 @@ extern struct lock_class lock_class_lockmgr;
 
 extern struct lock_class *lock_classes[];
 
+extern int lock_delay_enabled;
+
+struct lock_delay_config {
+	u_int initial;
+	u_int step;
+	u_int cap;
+	u_int min;
+	u_int max;
+};
+
+#define	LOCK_DELAY_SYSINIT(func)	\
+	SYSINIT(func##_ld, SI_SUB_LOCK, SI_ORDER_ANY, func, NULL)
+
 void	lock_init(struct lock_object *, struct lock_class *, const char *,
 	    const char *, int);
 void	lock_destroy(struct lock_object *);
+uint64_t lock_delay(uint64_t *, struct lock_delay_config *, uint64_t *);
 void	spinlock_enter(void);
 void	spinlock_exit(void);
 void	witness_init(struct lock_object *, const char *);
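For completeness: the new tunables can be inspected and changed at
runtime via sysctl(8) under the per-lock debug nodes added above. The
session below is illustrative rather than captured output; the values
are what the boot-time autotuning would compute on a hypothetical 8-CPU
machine (initial = mp_ncpus * 25, min = mp_ncpus * 5, max =
mp_ncpus * 250, while step keeps its static default):

	# sysctl debug.mtx
	debug.mtx.delay_initial: 200
	debug.mtx.delay_step: 500
	debug.mtx.delay_min: 40
	debug.mtx.delay_max: 2000
	# sysctl debug.sx.delay_initial=400
	debug.sx.delay_initial: 200 -> 400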