Re: Is there a need to hold the vnode lock across getnewbuf( ) in getblk?

From: Kostik Belousov <kostikbel_at_gmail.com>
Date: Thu, 6 Nov 2008 22:17:50 +0200
On Thu, Nov 06, 2008 at 11:33:23AM -0800, Bangaru, Sailaja wrote:
> One of the problems I'm running into is a deadlock where I have threads
> blocked on newbuf (with vnode lock held) due to buffer shortage and
> bufdaemon unable to flush buffers because it can't get the exclusive
> lock for the vnode. From the code:
>  
> struct buf *
> getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int
> slptimeo,
>     int flags)
> {
> .
> .
> 
>                 /*
>                  * Buffer is not in-core, create new buffer.  The buffer
>                  * returned by getnewbuf() is locked.  Note that the
> returned
>                  * buffer is also considered valid (not marked B_INVAL).
>                  */
>                 BO_UNLOCK(bo);
> .
> .
>                 bp = getnewbuf(slpflag, slptimeo, size, maxsize,
>                                 vp_to_buf_index(vp));
>                 if (bp == NULL) {
> .
> .
>                 /*
>                  * This code is used to make sure that a buffer is not
>                  * created while the getnewbuf routine is blocked.
>                  * This can be a problem whether the vnode is locked or
> not.
>                  * If the buffer is created out from under us, we have
> to
>                  * throw away the one we just created.
>                  *
>                  * Note: this must occur before we associate the buffer
>                  * with the vp especially considering limitations in
>                  * the splay tree implementation when dealing with
> duplicate
>                  * lblkno's.
>                  */
>                 BO_LOCK(bo);
>                 if (gbincore(bo, blkno)) {
>                         BO_UNLOCK(bo);
>                         bp->b_flags |= B_INVAL;
>                         brelse(bp);
>                         goto loop;
>                 }
> 
> Given that this code rechecks if a buffer has already been created with
> the possibility that this thread might have blocked on getnewbuf (), Is
> there really 
> any need to hold the vnode lock across getnewbuf call?

Vnode lock is not needed for the fragment you cited. But dropping it
would allow for vnode to be reclaimed, that cause problems for the
callers. Callers actually do not expect the vnode to become VI_DOOMED
after call to bread().

Some time ago I developed patch that handles these bufdaemon deadlocks
with another strategy. I loaned threads that called getblk, to help
bufdaemon. The thread that is to blocked in getblk due to buffer or buffer
KVA shortage, instead does flushing of the buffers belonging to the
vnode locked by that thread.

Also, I fixed the situation where ffs_bufwrite, called to flush buffer
due to shortage, tries to allocate shadow buffer for cg block.

I did not committed this because there is a problem, at least for UFS.
Writing buffer while curthread may hold a buffer lock on the indirect
inode block might cause a panic with recursive lock attempt. This is
fixable, by changing UFS/FFS to never hold two buffer locks, but I
did not got to do it.

Peter Holm tested the patch, it eliminated "nbufkv" deadlocks for our
test scenarious.

diff --git a/sys/gnu/fs/xfs/FreeBSD/xfs_buf.c b/sys/gnu/fs/xfs/FreeBSD/xfs_buf.c
index 9759d5a..ff06865 100644
--- a/sys/gnu/fs/xfs/FreeBSD/xfs_buf.c
+++ b/sys/gnu/fs/xfs/FreeBSD/xfs_buf.c
_at__at_ -81,7 +81,7 _at__at_ xfs_buf_get_empty(size_t size,  xfs_buftarg_t *target)
 {
 	struct buf *bp;
 
-	bp = geteblk(0);
+	bp = geteblk(0, 0);
 	if (bp != NULL) {
 		bp->b_bufsize = size;
 		bp->b_bcount = size;
_at__at_ -100,7 +100,7 _at__at_ xfs_buf_get_noaddr(size_t len, xfs_buftarg_t *target)
 	if (len >= MAXPHYS)
 		return (NULL);
 
-	bp = geteblk(len);
+	bp = geteblk(len, 0);
 	if (bp != NULL) {
 		BUF_ASSERT_HELD(bp);
 
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index f75963d..b5a5154 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
_at__at_ -105,7 +105,8 _at__at_ static void vfs_setdirty_locked_object(struct buf *bp);
 static void vfs_vmio_release(struct buf *bp);
 static int vfs_bio_clcheck(struct vnode *vp, int size,
 		daddr_t lblkno, daddr_t blkno);
-static int flushbufqueues(int, int);
+static int buf_do_flush(struct vnode *vp);
+static int flushbufqueues(struct vnode *, int, int);
 static void buf_daemon(void);
 static void bremfreel(struct buf *bp);
 
_at__at_ -247,6 +248,7 _at__at_ static struct mtx nblock;
 #define QUEUE_DIRTY_GIANT 3	/* B_DELWRI buffers that need giant */
 #define QUEUE_EMPTYKVA	4	/* empty buffer headers w/KVA assignment */
 #define QUEUE_EMPTY	5	/* empty buffer headers */
+#define QUEUE_SENTINEL	1024	/* not an queue index, but mark for sentinel */
 
 /* Queues for free buffers with various properties */
 static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };
_at__at_ -1673,21 +1675,23 _at__at_ vfs_bio_awrite(struct buf *bp)
  */
 
 static struct buf *
-getnewbuf(int slpflag, int slptimeo, int size, int maxsize)
+getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize,
+    int gbflags)
 {
+	struct thread *td;
 	struct buf *bp;
 	struct buf *nbp;
 	int defrag = 0;
 	int nqindex;
 	static int flushingbufs;
 
+	td = curthread;
 	/*
 	 * We can't afford to block since we might be holding a vnode lock,
 	 * which may prevent system daemons from running.  We deal with
 	 * low-memory situations by proactively returning memory and running
 	 * async I/O rather then sync I/O.
 	 */
-
 	atomic_add_int(&getnewbufcalls, 1);
 	atomic_subtract_int(&getnewbufrestarts, 1);
 restart:
_at__at_ -1919,8 +1923,9 _at__at_ restart:
 	 */
 
 	if (bp == NULL) {
-		int flags;
+		int flags, norunbuf;
 		char *waitmsg;
+		int fl;
 
 		if (defrag) {
 			flags = VFS_BIO_NEED_BUFSPACE;
_at__at_ -1938,9 +1943,35 _at__at_ restart:
 		mtx_unlock(&bqlock);
 
 		bd_speedup();	/* heeeelp */
+		if (gbflags & GB_NOWAIT_BD)
+			return (NULL);
 
 		mtx_lock(&nblock);
 		while (needsbuffer & flags) {
+			if (vp != NULL && (td->td_pflags & TDP_BUFNEED) == 0) {
+				mtx_unlock(&nblock);
+				/*
+				 * getblk() is called with a vnode
+				 * locked, and some majority of the
+				 * dirty buffers may as well belong to
+				 * the vnode. Flushing the buffers
+				 * there would make a progress that
+				 * cannot be achieved by the
+				 * buf_daemon, that cannot lock the
+				 * vnode.
+				 */
+				norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) |
+				    (td->td_pflags & TDP_NORUNNINGBUF);
+				/* play bufdaemon */
+				td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
+				fl = buf_do_flush(vp);
+				td->td_pflags &= norunbuf;
+				mtx_lock(&nblock);
+				if (fl != 0)
+					continue;
+				if ((needsbuffer & flags) == 0)
+					break;
+			}
 			if (msleep(&needsbuffer, &nblock,
 			    (PRIBIO + 4) | slpflag, waitmsg, slptimeo)) {
 				mtx_unlock(&nblock);
_at__at_ -2009,6 +2040,35 _at__at_ static struct kproc_desc buf_kp = {
 };
 SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp);
 
+static int
+buf_do_flush(struct vnode *vp)
+{
+	int flushed;
+
+	flushed = flushbufqueues(vp, QUEUE_DIRTY, 0);
+	/* The list empty check here is slightly racy */
+	if (!TAILQ_EMPTY(&bufqueues[QUEUE_DIRTY_GIANT])) {
+		mtx_lock(&Giant);
+		flushed += flushbufqueues(vp, QUEUE_DIRTY_GIANT, 0);
+		mtx_unlock(&Giant);
+	}
+	if (flushed == 0) {
+		/*
+		 * Could not find any buffers without rollback
+		 * dependencies, so just write the first one
+		 * in the hopes of eventually making progress.
+		 */
+		flushbufqueues(vp, QUEUE_DIRTY, 1);
+		if (!TAILQ_EMPTY(
+			    &bufqueues[QUEUE_DIRTY_GIANT])) {
+			mtx_lock(&Giant);
+			flushbufqueues(vp, QUEUE_DIRTY_GIANT, 1);
+			mtx_unlock(&Giant);
+		}
+	}
+	return (flushed);
+}
+
 static void
 buf_daemon()
 {
_at__at_ -2022,7 +2082,7 _at__at_ buf_daemon()
 	/*
 	 * This process is allowed to take the buffer cache to the limit
 	 */
-	curthread->td_pflags |= TDP_NORUNNINGBUF;
+	curthread->td_pflags |= TDP_NORUNNINGBUF | TDP_BUFNEED;
 	mtx_lock(&bdlock);
 	for (;;) {
 		bd_request = 0;
_at__at_ -2037,30 +2097,8 _at__at_ buf_daemon()
 		 * normally would so they can run in parallel with our drain.
 		 */
 		while (numdirtybuffers > lodirtybuffers) {
-			int flushed;
-
-			flushed = flushbufqueues(QUEUE_DIRTY, 0);
-			/* The list empty check here is slightly racy */
-			if (!TAILQ_EMPTY(&bufqueues[QUEUE_DIRTY_GIANT])) {
-				mtx_lock(&Giant);
-				flushed += flushbufqueues(QUEUE_DIRTY_GIANT, 0);
-				mtx_unlock(&Giant);
-			}
-			if (flushed == 0) {
-				/*
-				 * Could not find any buffers without rollback
-				 * dependencies, so just write the first one
-				 * in the hopes of eventually making progress.
-				 */
-				flushbufqueues(QUEUE_DIRTY, 1);
-				if (!TAILQ_EMPTY(
-				    &bufqueues[QUEUE_DIRTY_GIANT])) {
-					mtx_lock(&Giant);
-					flushbufqueues(QUEUE_DIRTY_GIANT, 1);
-					mtx_unlock(&Giant);
-				}
+			if (buf_do_flush(NULL) == 0)
 				break;
-			}
 			uio_yield();
 		}
 
_at__at_ -2106,7 +2144,7 _at__at_ SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, &flushwithdeps,
     0, "Number of buffers flushed with dependecies that require rollbacks");
 
 static int
-flushbufqueues(int queue, int flushdeps)
+flushbufqueues(struct vnode *lvp, int queue, int flushdeps)
 {
 	struct buf sentinel;
 	struct vnode *vp;
_at__at_ -2121,15 +2159,29 _at__at_ flushbufqueues(int queue, int flushdeps)
 		target /= 2;
 	flushed = 0;
 	bp = NULL;
+	sentinel.b_qindex = QUEUE_SENTINEL;
 	mtx_lock(&bqlock);
-	TAILQ_INSERT_TAIL(&bufqueues[queue], &sentinel, b_freelist);
+	TAILQ_INSERT_HEAD(&bufqueues[queue], &sentinel, b_freelist);
 	while (flushed != target) {
-		bp = TAILQ_FIRST(&bufqueues[queue]);
-		if (bp == &sentinel)
+		bp = TAILQ_NEXT(&sentinel, b_freelist);
+		if (bp != NULL) {
+			TAILQ_REMOVE(&bufqueues[queue], &sentinel, b_freelist);
+			TAILQ_INSERT_AFTER(&bufqueues[queue], bp, &sentinel,
+			    b_freelist);
+		} else
 			break;
-		TAILQ_REMOVE(&bufqueues[queue], bp, b_freelist);
-		TAILQ_INSERT_TAIL(&bufqueues[queue], bp, b_freelist);
-
+		/*
+		 * Skip sentinels inserted by other invocations of the
+		 * flushbufqueues(), taking care to not reorder them.
+		 */
+		if (bp->b_qindex == QUEUE_SENTINEL)
+			continue;
+		/*
+		 * Only flush the buffers that belong to the
+		 * vnode locked by the curthread.
+		 */
+		if (lvp != NULL && bp->b_vp != lvp)
+			continue;
 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
 			continue;
 		if (bp->b_pin_count > 0) {
_at__at_ -2177,16 +2229,26 _at__at_ flushbufqueues(int queue, int flushdeps)
 			BUF_UNLOCK(bp);
 			continue;
 		}
-		if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
+		if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_CANRECURSE) == 0) {
 			mtx_unlock(&bqlock);
 			CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X",
 			    bp, bp->b_vp, bp->b_flags);
-			vfs_bio_awrite(bp);
+			if (curproc == bufdaemonproc)
+				vfs_bio_awrite(bp);
+			else {
+				bremfree(bp);
+				bwrite(bp);
+			}
 			vn_finished_write(mp);
 			VOP_UNLOCK(vp, 0);
 			flushwithdeps += hasdeps;
 			flushed++;
-			waitrunningbufspace();
+			/*
+			 * Sleeping on runningbufspace while holding
+			 * vnode lock leads to deadlock.
+			 */
+			if (curproc == bufdaemonproc)
+				waitrunningbufspace();
 			numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2);
 			mtx_lock(&bqlock);
 			continue;
_at__at_ -2568,7 +2630,7 _at__at_ loop:
 		maxsize = vmio ? size + (offset & PAGE_MASK) : size;
 		maxsize = imax(maxsize, bsize);
 
-		bp = getnewbuf(slpflag, slptimeo, size, maxsize);
+		bp = getnewbuf(vp, slpflag, slptimeo, size, maxsize, flags);
 		if (bp == NULL) {
 			if (slpflag || slptimeo)
 				return NULL;
_at__at_ -2643,14 +2705,17 _at__at_ loop:
  * set to B_INVAL.
  */
 struct buf *
-geteblk(int size)
+geteblk(int size, int flags)
 {
 	struct buf *bp;
 	int maxsize;
 
 	maxsize = (size + BKVAMASK) & ~BKVAMASK;
-	while ((bp = getnewbuf(0, 0, size, maxsize)) == 0)
-		continue;
+	while ((bp = getnewbuf(NULL, 0, 0, size, maxsize, flags)) == NULL) {
+		if ((flags & GB_NOWAIT_BD) &&
+		    (curthread->td_pflags & TDP_BUFNEED) != 0)
+			return (NULL);
+	}
 	allocbuf(bp, size);
 	bp->b_flags |= B_INVAL;	/* b_dep cleared by getnewbuf() */
 	BUF_ASSERT_HELD(bp);
diff --git a/sys/sys/buf.h b/sys/sys/buf.h
index e05b20c..09e5c03 100644
--- a/sys/sys/buf.h
+++ b/sys/sys/buf.h
_at__at_ -443,6 +443,7 _at__at_ buf_countdeps(struct buf *bp, int i)
  */
 #define	GB_LOCK_NOWAIT	0x0001		/* Fail if we block on a buf lock. */
 #define	GB_NOCREAT	0x0002		/* Don't create a buf if not found. */
+#define	GB_NOWAIT_BD	0x0004		/* Do not wait for bufdaemon */
 
 #ifdef _KERNEL
 extern int	nbuf;			/* The number of buffer headers */
_at__at_ -487,7 +488,7 _at__at_ struct buf *     getpbuf(int *);
 struct buf *incore(struct bufobj *, daddr_t);
 struct buf *gbincore(struct bufobj *, daddr_t);
 struct buf *getblk(struct vnode *, daddr_t, int, int, int, int);
-struct buf *geteblk(int);
+struct buf *geteblk(int, int);
 int	bufwait(struct buf *);
 int	bufwrite(struct buf *);
 void	bufdone(struct buf *);
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
index b326ba7..f38f0e4 100644
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
_at__at_ -344,7 +344,7 _at__at_ do {									\
 #define	TDP_OLDMASK	0x00000001 /* Need to restore mask after suspend. */
 #define	TDP_INKTR	0x00000002 /* Thread is currently in KTR code. */
 #define	TDP_INKTRACE	0x00000004 /* Thread is currently in KTRACE code. */
-#define	TDP_UNUSED8	0x00000008 /* available */
+#define	TDP_BUFNEED	0x00000008 /* Do not recurse into the buf flush */
 #define	TDP_COWINPROGRESS 0x00000010 /* Snapshot copy-on-write in progress. */
 #define	TDP_ALTSTACK	0x00000020 /* Have alternate signal stack. */
 #define	TDP_DEADLKTREAT	0x00000040 /* Lock aquisition - deadlock treatment. */
diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c
index 563473c..8118b80 100644
--- a/sys/ufs/ffs/ffs_vfsops.c
+++ b/sys/ufs/ffs/ffs_vfsops.c
_at__at_ -1824,7 +1824,9 _at__at_ ffs_bufwrite(struct buf *bp)
 		    ("bufwrite: needs chained iodone (%p)", bp->b_iodone));
 
 		/* get a new block */
-		newbp = geteblk(bp->b_bufsize);
+		newbp = geteblk(bp->b_bufsize, GB_NOWAIT_BD);
+		if (newbp == NULL)
+			goto normal_write;
 
 		/*
 		 * set it to be identical to the old block.  We have to
_at__at_ -1864,6 +1866,7 _at__at_ ffs_bufwrite(struct buf *bp)
 	}
 
 	/* Let the normal bufwrite do the rest for us */
+normal_write:
 	return (bufwrite(bp));
 }
 

Received on Thu Nov 06 2008 - 19:18:06 UTC

This archive was generated by hypermail 2.4.0 : Wed May 19 2021 - 11:39:37 UTC