[PATCH] microoptimize locking primitives by introducing randomized delay between atomic ops

From: Mateusz Guzik <mjguzik@gmail.com>
Date: Sun, 10 Jul 2016 13:13:26 +0200

If the lock is contended, primitives like __mtx_lock_sleep will spin,
checking whether the owner is still running or the lock has been freed.
The problem is that once the lock is seen to be free, multiple CPUs are
likely to attempt the atomic op at roughly the same time, which makes it
more expensive for everyone and throughput suffers.

The standard remedy is to introduce some sort of randomized delay before
retrying, so that the attempts are spread out and this kind of collision
is reduced.
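As a standalone illustration (userland C, x86-only; __rdtsc() stands in
for the kernel's cpu_ticks(), _mm_pause() for cpu_spinwait(), and the
lock itself is a made-up test-and-set, not the kernel code), the pattern
looks roughly like this:

#include <stdatomic.h>
#include <stdint.h>
#include <x86intrin.h>	/* __rdtsc(), _mm_pause(); gcc/clang on x86 */

static atomic_uintptr_t lock_word;

/* Wait a pseudo-random number of spin cycles before the next attempt. */
static void
spin_delay(void)
{
	uint64_t i, delay;

	delay = __rdtsc() % 2048;
	for (i = 0; i < delay; i++)
		_mm_pause();
}

static void
spin_lock(uintptr_t tid)
{
	uintptr_t v;

	for (;;) {
		v = 0;
		/* Only one atomic attempt per backoff period. */
		if (atomic_compare_exchange_weak(&lock_word, &v, tid))
			return;
		spin_delay();
	}
}

static void
spin_unlock(void)
{

	atomic_store(&lock_word, 0);
}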

As such, below is a trivial hack which reads cpu_ticks() and takes the
result modulo 2048 to derive the number of spins to wait, which in my
testing gives reasonably good results.

Please note there is definitely way more room for improvement in general.
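For instance, one hypothetical refinement (sketched in the same userland
setting as above, not part of this patch) would be to widen the random
window on each failed attempt, i.e. capped exponential backoff; the 64
and 32768 bounds below are arbitrary placeholders:

/*
 * Capped exponential backoff: early retries stay cheap, persistent
 * contention backs off further.  The caller resets *window to 64
 * after a successful acquisition.
 */
static void
spin_delay_exp(uint64_t *window)
{
	uint64_t i, delay;

	delay = __rdtsc() % *window;
	for (i = 0; i < delay; i++)
		_mm_pause();
	if (*window < 32768)
		*window <<= 1;
}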

In terms of results, there was no statistically significant change in
either -j 40 buildworld or buildkernel.

However, a 40-way find on a ports tree placed on tmpfs yielded the following:

x vanilla            
+ patched
+----------------------------------------------------------------------------------------+
|     ++++                +                                         x         x x x      |
|+    ++++ +++    +  +  + ++       +       +     x               x  x  xxxxxxxx x x     x|
|    |_____M____A__________|                                      |________AM______|     |
+----------------------------------------------------------------------------------------+
    N           Min           Max        Median           Avg        Stddev
x  20        12.431        15.952        14.897       14.7444    0.74241657
+  20         8.103        11.863        9.0135       9.44565     1.0059484
Difference at 95.0% confidence
	-5.29875 +/- 0.565836
	-35.9374% +/- 3.83764%
	(Student's t, pooled s = 0.884057)

The patch:

diff --git a/sys/kern/kern_mutex.c b/sys/kern/kern_mutex.c
index 012cf7c..db3b950 100644
--- a/sys/kern/kern_mutex.c
+++ b/sys/kern/kern_mutex.c
@@ -444,7 +444,7 @@ __mtx_lock_sleep(volatile uintptr_t *c, uintptr_t tid, int opts,
 				    m->lock_object.lo_name);
 				while (mtx_owner(m) == owner &&
 				    TD_IS_RUNNING(owner)) {
-					cpu_spinwait();
+					lock_delay();
 #ifdef KDTRACE_HOOKS
 					spin_cnt++;
 #endif
diff --git a/sys/kern/kern_rwlock.c b/sys/kern/kern_rwlock.c
index 6a904d2..3b512b5 100644
--- a/sys/kern/kern_rwlock.c
+++ b/sys/kern/kern_rwlock.c
@@ -438,7 +438,7 @@ __rw_rlock(volatile uintptr_t *c, const char *file, int line)
 				    "lockname:\"%s\"", rw->lock_object.lo_name);
 				while ((struct thread*)RW_OWNER(rw->rw_lock) ==
 				    owner && TD_IS_RUNNING(owner)) {
-					cpu_spinwait();
+					lock_delay();
 #ifdef KDTRACE_HOOKS
 					spin_cnt++;
 #endif
@@ -799,7 +799,7 @@ __rw_wlock_hard(volatile uintptr_t *c, uintptr_t tid, const char *file,
 			    rw->lock_object.lo_name);
 			while ((struct thread*)RW_OWNER(rw->rw_lock) == owner &&
 			    TD_IS_RUNNING(owner)) {
-				cpu_spinwait();
+				lock_delay();
 #ifdef KDTRACE_HOOKS
 				spin_cnt++;
 #endif
diff --git a/sys/kern/kern_sx.c b/sys/kern/kern_sx.c
index 2a81c04..1bfd46f 100644
--- a/sys/kern/kern_sx.c
+++ b/sys/kern/kern_sx.c
@@ -579,7 +579,7 @@ _sx_xlock_hard(struct sx *sx, uintptr_t tid, int opts, const char *file,
 					GIANT_SAVE();
 					while (SX_OWNER(sx->sx_lock) == x &&
 					    TD_IS_RUNNING(owner)) {
-						cpu_spinwait();
+						lock_delay();
 #ifdef KDTRACE_HOOKS
 						spin_cnt++;
 #endif
@@ -889,10 +889,10 @@ _sx_slock_hard(struct sx *sx, int opts, const char *file, int line)
 				GIANT_SAVE();
 				while (SX_OWNER(sx->sx_lock) == x &&
 				    TD_IS_RUNNING(owner)) {
+					lock_delay();
 #ifdef KDTRACE_HOOKS
 					spin_cnt++;
 #endif
-					cpu_spinwait();
 				}
 				KTR_STATE0(KTR_SCHED, "thread",
 				    sched_tdname(curthread), "running");
diff --git a/sys/kern/subr_lock.c b/sys/kern/subr_lock.c
index e78d5a9..55a0fb7 100644
--- a/sys/kern/subr_lock.c
+++ b/sys/kern/subr_lock.c
@@ -103,6 +103,16 @@ lock_destroy(struct lock_object *lock)
 	lock->lo_flags &= ~LO_INITIALIZED;
 }
 
+void
+lock_delay(void)
+{
+	int i, delay;
+
+	delay = cpu_ticks() % 2048;
+	for (i = 0; i < delay; i++)
+		cpu_spinwait();
+}
+
 #ifdef DDB
 DB_SHOW_COMMAND(lock, db_show_lock)
 {
diff --git a/sys/sys/lock.h b/sys/sys/lock.h
index 8d7a068..d076687 100644
--- a/sys/sys/lock.h
+++ b/sys/sys/lock.h
@@ -204,6 +204,7 @@ extern struct lock_class *lock_classes[];
 void	lock_init(struct lock_object *, struct lock_class *,
 	    const char *, const char *, int);
 void	lock_destroy(struct lock_object *);
+void	lock_delay(void);
 void	spinlock_enter(void);
 void	spinlock_exit(void);
 void	witness_init(struct lock_object *, const char *);