[PATCH] randomized delay in locking primitives, take 2

From: Mateusz Guzik <mjguzik@gmail.com>
Date: Sun, 31 Jul 2016 11:57:06 +0200

The patch can be found inline below and also here:
https://people.freebsd.org/~mjg/lock_backoff_complete.diff

The previous version of the patch was posted here:
https://lists.freebsd.org/pipermail/freebsd-current/2016-July/062320.html

This time around, instead of a total hack, I have something of
reasonable quality.

The problem description and the way to fix it still stand; to quote:
> If the lock is contended, primitives like __mtx_lock_sleep will spin
> checking if the owner is running or the lock was freed. The problem is
> that once it is discovered that the lock is free, multiple CPUs are
> likely to try to do the atomic op which will make it more costly for
> everyone and throughput suffers.

> The standard thing to do is to have some sort of a randomized delay so
> that this kind of behaviour is reduced.
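
For illustration, the wait loop in the unpatched primitives boils down
to the following (a simplified sketch of the mtx case; the real loop is
in __mtx_lock_sleep):

  /*
   * All waiters poll in lockstep.  The moment the owner drops the
   * lock, every spinning CPU leaves this loop at roughly the same
   * time and retries the atomic compare-and-set, bouncing the lock
   * cacheline between all of them.
   */
  while (mtx_owner(m) == owner && TD_IS_RUNNING(owner))
          cpu_spinwait();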

So, this time around the lock_delay primitive accepts a configuration
including the min/max spin count per iteration, the increment step per
lock_delay invocation, and the total number of spins allowed during one
execution of the locking primitive. The code "autotunes" based on
mp_ncpus on boot. The current values were determined after testing the
code on an 80-way and a 40-way system. There is definitely room for
improvement there, but the gain as it stands is sufficient to get the
patch in.
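
To make the autotuning concrete, with the multipliers used in the
*_delay_sysinit routines below, the 80-way machine ends up with:

  initial = 80 * 25      = 2000
  min     = 80 * 5       =  400
  max     = 80 * 25 * 10 = 20000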

Parameters are settable by lock type.

Converted locks are: mtx, sx, rwlocks.

Only the trivial loops that spin as long as the lock owner is running
are modified. The rest was left untouched in order to keep the patch
simpler.

The following locks were not touched: spin mutexes, thread locks and
lockmgr. The first two could be, but I don't have any suitable
benchmarks for them. lockmgr has support for spinning, but it is
compiled out.

In terms of results, this time around I got my hands on an 80-way
machine (pig1 in the testing cluster). Vanilla kernel performance drops
significantly with increased contention. For instance, the following is
make -j 80 buildkernel of the 11.0 tree on a tmpfs:
  7681.08s user 17040.02s system 7014% cpu 5:52.43 total

And this is with the patched kernel:
  4585.85s user 2365.95s system 5675% cpu 2:02.48 total

Both results are reproducible.

I would like to get this in as soon as possible.

Note: lock_delay has a .cap parameter, which is currently set to 0 and
has no sysctl to control it. It is there for later, when lock_delay
will replace the remaining unmodified spinning code in sx and rwlocks.
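
To sketch the intended use (a hypothetical caller, not part of this
patch): with a non-zero .cap, lock_delay truncates each backoff so that
the accumulated spin count never exceeds the cap and returns the
remaining budget, so a capped spin loop could look like:

  uint64_t delay = 0, spin_cnt = 0;

  /* lock_is_free() stands in for the real availability check. */
  while (!lock_is_free(lock)) {
          if (lock_delay(&delay, &sx_delay, &spin_cnt) == 0)
                  break;  /* budget exhausted, fall back to sleeping */
  }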

diff --git a/sys/kern/kern_mutex.c b/sys/kern/kern_mutex.c
index 453add4..ca9319c 100644
--- a/sys/kern/kern_mutex.c
+++ b/sys/kern/kern_mutex.c
@@ -55,6 +55,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/sbuf.h>
+#include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/turnstile.h>
 #include <sys/vmmeter.h>
@@ -138,6 +139,37 @@ struct lock_class lock_class_mtx_spin = {
 #endif
 };
 
+#ifdef ADAPTIVE_MUTEXES
+static SYSCTL_NODE(_debug, OID_AUTO, mtx, CTLFLAG_RD, NULL, "mtx debugging");
+
+static struct lock_delay_config mtx_delay = {
+	.initial	= 1000,
+	.step		= 500,
+	.cap		= 0,
+	.min		= 100,
+	.max		= 5000,
+};
+
+SYSCTL_INT(_debug_mtx, OID_AUTO, delay_initial, CTLFLAG_RW, &mtx_delay.initial,
+    0, "");
+SYSCTL_INT(_debug_mtx, OID_AUTO, delay_step, CTLFLAG_RW, &mtx_delay.step,
+    0, "");
+SYSCTL_INT(_debug_mtx, OID_AUTO, delay_min, CTLFLAG_RW, &mtx_delay.min,
+    0, "");
+SYSCTL_INT(_debug_mtx, OID_AUTO, delay_max, CTLFLAG_RW, &mtx_delay.max,
+    0, "");
+
+static void
+mtx_delay_sysinit(void *dummy)
+{
+
+	mtx_delay.initial = mp_ncpus * 25;
+	mtx_delay.min = mp_ncpus * 5;
+	mtx_delay.max = mp_ncpus * 25 * 10;
+}
+LOCK_DELAY_SYSINIT(mtx_delay_sysinit);
+#endif
+
 /*
  * System-wide mutexes
  */
@@ -408,8 +440,11 @@ __mtx_lock_sleep(volatile uintptr_t *c, uintptr_t tid, int opts,
 	int contested = 0;
 	uint64_t waittime = 0;
 #endif
-#ifdef KDTRACE_HOOKS
+#if defined(ADAPTIVE_MUTEXES) || defined(KDTRACE_HOOKS)
 	uint64_t spin_cnt = 0;
+	uint64_t delay = 0;
+#endif
+#ifdef KDTRACE_HOOKS
 	uint64_t sleep_cnt = 0;
 	int64_t sleep_time = 0;
 	int64_t all_time = 0;
@@ -471,12 +506,8 @@ __mtx_lock_sleep(volatile uintptr_t *c, uintptr_t tid, int opts,
 				    "spinning", "lockname:\"%s\"",
 				    m->lock_object.lo_name);
 				while (mtx_owner(m) == owner &&
-				    TD_IS_RUNNING(owner)) {
-					cpu_spinwait();
-#ifdef KDTRACE_HOOKS
-					spin_cnt++;
-#endif
-				}
+				    TD_IS_RUNNING(owner))
+					lock_delay(&delay, &mtx_delay, &spin_cnt);
 				KTR_STATE0(KTR_SCHED, "thread",
 				    sched_tdname((struct thread *)tid),
 				    "running");
diff --git a/sys/kern/kern_rwlock.c b/sys/kern/kern_rwlock.c
index 496738d..cd93520 100644
--- a/sys/kern/kern_rwlock.c
+++ b/sys/kern/kern_rwlock.c
@@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/proc.h>
 #include <sys/rwlock.h>
 #include <sys/sched.h>
+#include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/turnstile.h>
@@ -65,15 +66,6 @@ PMC_SOFT_DECLARE( , , lock, failed);
  */
 #define	rwlock2rw(c)	(__containerof(c, struct rwlock, rw_lock))
 
-#ifdef ADAPTIVE_RWLOCKS
-static int rowner_retries = 10;
-static int rowner_loops = 10000;
-static SYSCTL_NODE(_debug, OID_AUTO, rwlock, CTLFLAG_RD, NULL,
-    "rwlock debugging");
-SYSCTL_INT(_debug_rwlock, OID_AUTO, retry, CTLFLAG_RW, &rowner_retries, 0, "");
-SYSCTL_INT(_debug_rwlock, OID_AUTO, loops, CTLFLAG_RW, &rowner_loops, 0, "");
-#endif
-
 #ifdef DDB
 #include <ddb/ddb.h>
 
@@ -100,6 +92,42 @@ struct lock_class lock_class_rw = {
 #endif
 };
 
+#ifdef ADAPTIVE_RWLOCKS
+static int rowner_retries = 10;
+static int rowner_loops = 10000;
+static SYSCTL_NODE(_debug, OID_AUTO, rwlock, CTLFLAG_RD, NULL,
+    "rwlock debugging");
+SYSCTL_INT(_debug_rwlock, OID_AUTO, retry, CTLFLAG_RW, &rowner_retries, 0, "");
+SYSCTL_INT(_debug_rwlock, OID_AUTO, loops, CTLFLAG_RW, &rowner_loops, 0, "");
+
+static struct lock_delay_config rw_delay = {
+	.initial	= 1000,
+	.step		= 500,
+	.cap		= 0,
+	.min		= 100,
+	.max		= 5000,
+};
+
+SYSCTL_INT(_debug_rwlock, OID_AUTO, delay_initial, CTLFLAG_RW, &rw_delay.initial,
+    0, "");
+SYSCTL_INT(_debug_rwlock, OID_AUTO, delay_step, CTLFLAG_RW, &rw_delay.step,
+    0, "");
+SYSCTL_INT(_debug_rwlock, OID_AUTO, delay_min, CTLFLAG_RW, &rw_delay.min,
+    0, "");
+SYSCTL_INT(_debug_rwlock, OID_AUTO, delay_max, CTLFLAG_RW, &rw_delay.max,
+    0, "");
+
+static void
+rw_delay_sysinit(void *dummy)
+{
+
+	rw_delay.initial = mp_ncpus * 25;
+	rw_delay.min = mp_ncpus * 5;
+	rw_delay.max = mp_ncpus * 25 * 10;
+}
+LOCK_DELAY_SYSINIT(rw_delay_sysinit);
+#endif
+
 /*
  * Return a pointer to the owning thread if the lock is write-locked or
  * NULL if the lock is unlocked or read-locked.
@@ -355,9 +383,12 @@ __rw_rlock(volatile uintptr_t *c, const char *file, int line)
 	int contested = 0;
 #endif
 	uintptr_t v;
+#if defined(ADAPTIVE_RWLOCKS) || defined(KDTRACE_HOOKS)
+	uint64_t spin_cnt = 0;
+	uint64_t delay = 0;
+#endif
 #ifdef KDTRACE_HOOKS
 	uintptr_t state;
-	uint64_t spin_cnt = 0;
 	uint64_t sleep_cnt = 0;
 	int64_t sleep_time = 0;
 	int64_t all_time = 0;
@@ -437,12 +468,8 @@ __rw_rlock(volatile uintptr_t *c, const char *file, int line)
 				    sched_tdname(curthread), "spinning",
 				    "lockname:\"%s\"", rw->lock_object.lo_name);
 				while ((struct thread*)RW_OWNER(rw->rw_lock) ==
-				    owner && TD_IS_RUNNING(owner)) {
-					cpu_spinwait();
-#ifdef KDTRACE_HOOKS
-					spin_cnt++;
-#endif
-				}
+				    owner && TD_IS_RUNNING(owner))
+					lock_delay(&delay, &rw_delay, &spin_cnt);
 				KTR_STATE0(KTR_SCHED, "thread",
 				    sched_tdname(curthread), "running");
 				continue;
@@ -740,9 +767,12 @@ __rw_wlock_hard(volatile uintptr_t *c, uintptr_t tid, const char *file,
 	uint64_t waittime = 0;
 	int contested = 0;
 #endif
+#if defined(ADAPTIVE_RWLOCKS) || defined(KDTRACE_HOOKS)
+	uint64_t spin_cnt = 0;
+	uint64_t delay = 0;
+#endif
 #ifdef KDTRACE_HOOKS
 	uintptr_t state;
-	uint64_t spin_cnt = 0;
 	uint64_t sleep_cnt = 0;
 	int64_t sleep_time = 0;
 	int64_t all_time = 0;
@@ -798,12 +828,8 @@ __rw_wlock_hard(volatile uintptr_t *c, uintptr_t tid, const char *file,
 			    "spinning", "lockname:\"%s\"",
 			    rw->lock_object.lo_name);
 			while ((struct thread*)RW_OWNER(rw->rw_lock) == owner &&
-			    TD_IS_RUNNING(owner)) {
-				cpu_spinwait();
-#ifdef KDTRACE_HOOKS
-				spin_cnt++;
-#endif
-			}
+			    TD_IS_RUNNING(owner))
+				lock_delay(&delay, &rw_delay, &spin_cnt);
 			KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread),
 			    "running");
 			continue;
diff --git a/sys/kern/kern_sx.c b/sys/kern/kern_sx.c
index a84df52..9a3465f 100644
--- a/sys/kern/kern_sx.c
+++ b/sys/kern/kern_sx.c
@@ -53,6 +53,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/sched.h>
 #include <sys/sleepqueue.h>
 #include <sys/sx.h>
+#include <sys/smp.h>
 #include <sys/sysctl.h>
 
 #if defined(SMP) && !defined(NO_ADAPTIVE_SX)
@@ -145,6 +146,33 @@ static u_int asx_loops = 10000;
 static SYSCTL_NODE(_debug, OID_AUTO, sx, CTLFLAG_RD, NULL, "sxlock debugging");
 SYSCTL_UINT(_debug_sx, OID_AUTO, retries, CTLFLAG_RW, &asx_retries, 0, "");
 SYSCTL_UINT(_debug_sx, OID_AUTO, loops, CTLFLAG_RW, &asx_loops, 0, "");
+
+static struct lock_delay_config sx_delay = {
+	.initial	= 1000,
+	.step           = 500,
+	.cap		= 0,
+	.min		= 100,
+	.max		= 5000,
+};
+
+SYSCTL_INT(_debug_sx, OID_AUTO, delay_initial, CTLFLAG_RW, &sx_delay.initial,
+    0, "");
+SYSCTL_INT(_debug_sx, OID_AUTO, delay_step, CTLFLAG_RW, &sx_delay.step,
+    0, "");
+SYSCTL_INT(_debug_sx, OID_AUTO, delay_min, CTLFLAG_RW, &sx_delay.min,
+    0, "");
+SYSCTL_INT(_debug_sx, OID_AUTO, delay_max, CTLFLAG_RW, &sx_delay.max,
+    0, "");
+
+static void
+sx_delay_sysinit(void *dummy)
+{
+
+	sx_delay.initial = mp_ncpus * 25;
+	sx_delay.min = mp_ncpus * 5;
+	sx_delay.max = mp_ncpus * 25 * 10;
+}
+LOCK_DELAY_SYSINIT(sx_delay_sysinit);
 #endif
 
 void
@@ -513,9 +541,12 @@ _sx_xlock_hard(struct sx *sx, uintptr_t tid, int opts, const char *file,
 	int contested = 0;
 #endif
 	int error = 0;
+#if defined(ADAPTIVE_SX) || defined(KDTRACE_HOOKS)
+	uint64_t spin_cnt = 0;
+	uint64_t delay = 0;
+#endif
 #ifdef	KDTRACE_HOOKS
 	uintptr_t state;
-	uint64_t spin_cnt = 0;
 	uint64_t sleep_cnt = 0;
 	int64_t sleep_time = 0;
 	int64_t all_time = 0;
@@ -578,12 +609,8 @@ _sx_xlock_hard(struct sx *sx, uintptr_t tid, int opts, const char *file,
 					    sx->lock_object.lo_name);
 					GIANT_SAVE();
 					while (SX_OWNER(sx->sx_lock) == x &&
-					    TD_IS_RUNNING(owner)) {
-						cpu_spinwait();
-#ifdef KDTRACE_HOOKS
-						spin_cnt++;
-#endif
-					}
+					    TD_IS_RUNNING(owner))
+						lock_delay(&delay, &sx_delay, &spin_cnt);
 					KTR_STATE0(KTR_SCHED, "thread",
 					    sched_tdname(curthread), "running");
 					continue;
@@ -818,9 +845,12 @@ _sx_slock_hard(struct sx *sx, int opts, const char *file, int line)
 #endif
 	uintptr_t x;
 	int error = 0;
+#if defined(ADAPTIVE_SX) || defined(KDTRACE_HOOKS)
+	uint64_t spin_cnt = 0;
+	uint64_t delay = 0;
+#endif
 #ifdef KDTRACE_HOOKS
 	uintptr_t state;
-	uint64_t spin_cnt = 0;
 	uint64_t sleep_cnt = 0;
 	int64_t sleep_time = 0;
 	int64_t all_time = 0;
@@ -888,12 +918,8 @@ _sx_slock_hard(struct sx *sx, int opts, const char *file, int line)
 				    "lockname:\"%s\"", sx->lock_object.lo_name);
 				GIANT_SAVE();
 				while (SX_OWNER(sx->sx_lock) == x &&
-				    TD_IS_RUNNING(owner)) {
-					cpu_spinwait();
-#ifdef KDTRACE_HOOKS
-					spin_cnt++;
-#endif
-				}
+				    TD_IS_RUNNING(owner))
+					lock_delay(&delay, &sx_delay, &spin_cnt);
 				KTR_STATE0(KTR_SCHED, "thread",
 				    sched_tdname(curthread), "running");
 				continue;
diff --git a/sys/kern/subr_lock.c b/sys/kern/subr_lock.c
index e78d5a9..546881f 100644
--- a/sys/kern/subr_lock.c
+++ b/sys/kern/subr_lock.c
@@ -103,6 +103,37 @@ lock_destroy(struct lock_object *lock)
 	lock->lo_flags &= ~LO_INITIALIZED;
 }
 
+uint64_t
+lock_delay(uint64_t *delayp, struct lock_delay_config *l, uint64_t *spin_cntp)
+{
+	uint64_t i, delay, backoff, min, max, cap;
+
+	delay = *delayp;
+
+	if (delay == 0)
+		delay = l->initial;
+	else {
+		delay += l->step;
+		max = l->max;
+		if (delay > max)
+			delay = max;
+	}
+
+	backoff = cpu_ticks() % delay;
+	min = l->min;
+	if (backoff < min)
+		backoff = min;
+	cap = l->cap;
+	if (cap > 0 && backoff > cap - *spin_cntp)
+		backoff = cap - *spin_cntp;
+	for (i = 0; i < backoff; i++)
+		cpu_spinwait();
+
+	*delayp = delay;
+	*spin_cntp += backoff;
+	return (cap - *spin_cntp);
+}
+
 #ifdef DDB
 DB_SHOW_COMMAND(lock, db_show_lock)
 {
diff --git a/sys/sys/lock.h b/sys/sys/lock.h
index 8d7a068..ffac5ba 100644
--- a/sys/sys/lock.h
+++ b/sys/sys/lock.h
@@ -201,9 +201,23 @@ extern struct lock_class lock_class_lockmgr;
 
 extern struct lock_class *lock_classes[];
 
+extern int lock_delay_enabled;
+
+struct lock_delay_config {
+	u_int initial;
+	u_int step;
+	u_int cap;
+	u_int min;
+	u_int max;
+};
+
+#define	LOCK_DELAY_SYSINIT(func) \
+	SYSINIT(func##_ld, SI_SUB_LOCK, SI_ORDER_ANY, func, NULL)
+
 void	lock_init(struct lock_object *, struct lock_class *,
 	    const char *, const char *, int);
 void	lock_destroy(struct lock_object *);
+uint64_t	lock_delay(uint64_t *, struct lock_delay_config *, uint64_t *);
 void	spinlock_enter(void);
 void	spinlock_exit(void);
 void	witness_init(struct lock_object *, const char *);