Re: Giving up on x buffers - losing files

From: Don Lewis <truckman_at_FreeBSD.org> Date: Tue, 29 Jun 2004 03:29:59 -0700 (PDT) · This archive was generated by hypermail 2.4.0 : Wed May 19 2021 - 11:37:59 UTC

On 26 Jun, Don Lewis wrote:
> On 26 Jun, Arjan van Leeuwen wrote:
>> I've had this problem for a long time with 5.x, but it doesn't seem to  
>> happen that often. Today it bit me hard.
>> 
>> Sometimes, particularly after doing a lot of file writes (i.e. compiling a  
>> lot of ports, building world and mergemastering, etc), I get the 'Giving  
>> up on x buffers' message on shutdown, and my filesystems come up dirty  
>> when I restart.
>> 
>> This wouldn't be such an enormous problem, if it wouldn't always erase the  
>> files I changed most recently. The files are simply reduced to 0 bytes. My  
>> configuration files for Opera and KDE have been victim to this more than  
>> once (because Opera writes to the file on exit, for example), but today,  
>> it was /etc/master.passwd that was reduced to 0 bytes (because I had just  
>> changed something in it).
>> 
>> I understand that turning of write caching might improve the situation,  
>> but it also makes my system a lot slower, and I don't like that on my  
>> desktop system.
>> 
>> So, why does this happen? And how do I prevent it from happening? This  
>> definitely does _not_ sound like something I want my servers to do when  
>> 5.x goes -STABLE.
> 
> I've mentioned this a couple of times on this list in the last six
> months or so.  The last time was in the last couple of weeks.  I can
> reliably trigger this problem with mergemaster.
> 
> I'm pretty sure that the problem relates to soft updates and how the
> file system syncer is shut down, which leaves unresolved dependencies
> that keep a number of dirty blocks from being flushed to disk at the end
> of the system shutdown.
> 
> I have some ideas on how to fix the problem, but I haven't had the time
> to work on it and nobody else has stepped up with a fix.
> 
> I am able to reliably work around the problem by running the sync
> command and waiting a short while after running mergemaster and before
> shutting down or rebooting the machine.

I managed to get motivated and crank out the patch below.  I'm still not
sure why the sync command seems to prevent the problem, but the sync
code in boot() seems to be unable to flush the buffers to disk.

The way this patch works is that the syncer kernel thread is not
immediately terminated at kernel shutdown time.  Instead, the syncer
thread is told to speed up and run through its work queue a few times
before terminating.

One minor issue that I noticed but haven't yet figured out is that even
after the only items remaining on the syncer work list are the syncer
vnodes for each of the mount points, something is occasionally putting
the disk device vnode back on the work list.  I suspect that when the
file system is getting synced, it is dirtying the superblock or summary
info.

With this patch, the syncer takes noticeably longer to shut down, but
the final sync done by boot() finishes either immediately or within two
iterations, even after running mergemaster.

Index: sys/kern/vfs_subr.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/vfs_subr.c,v
retrieving revision 1.494
diff -u -r1.494 vfs_subr.c
--- sys/kern/vfs_subr.c	17 Jun 2004 17:16:49 -0000	1.494
+++ sys/kern/vfs_subr.c	29 Jun 2004 10:00:17 -0000
_at__at_ -82,6 +82,7 _at__at_
 static void	vlruvp(struct vnode *vp);
 static int	flushbuflist(struct buf *blist, int flags, struct vnode *vp,
 		    int slpflag, int slptimeo, int *errorp);
+static void	syncer_shutdown(void *arg, int howto);
 static int	vtryrecycle(struct vnode *vp);
 static void	vx_lock(struct vnode *vp);
 static void	vx_unlock(struct vnode *vp);
_at__at_ -199,8 +200,11 _at__at_
 /*
  * The sync_mtx protects:
  *	vp->v_synclist
+ *	sync_vnode_count
  *	syncer_delayno
+ *	syncer_shutdown_iter
  *	syncer_workitem_pending
+ *	syncer_worklist_len
  *	rushjob
  */
 static struct mtx sync_mtx;
_at__at_ -219,6 +223,21 _at__at_
 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");

 /*
+ * Tell the syncer to make three passes through the work list before
+ * shutting down (unless it runs out of work and shuts down sooner).
+ *
+ * Run at 8 times normal speed when shutting down the syncer.  With
+ * the default settings, the syncer will take approximately 12
+ * seconds to shut down, which is less than the default 60 timeout
+ * in kproc_shutdown().
+ */
+#define SYNCER_SHUTDOWN_ITER_LIMIT	(3*SYNCER_MAXDELAY)
+#define SYNCER_SHUTDOWN_SPEEDUP		7
+static int sync_vnode_count;
+static int syncer_shutdown_iter;
+static int syncer_worklist_len;
+
+/*
  * Number of vnodes we want to exist at any one time.  This is mostly used
  * to size hash tables in vnode-related code.  It is normally not used in
  * getnewvnode(), as wantfreevnodes is normally nonzero.)
_at__at_ -1430,6 +1449,7 _at__at_
 		vp->v_iflag &= ~VI_ONWORKLST;
 		mtx_lock(&sync_mtx);
 		LIST_REMOVE(vp, v_synclist);
+ 		syncer_worklist_len--;
 		mtx_unlock(&sync_mtx);
 	}
 	vdropl(vp);
_at__at_ -1452,8 +1472,10 _at__at_
 	mtx_lock(&sync_mtx);
 	if (vp->v_iflag & VI_ONWORKLST)
 		LIST_REMOVE(vp, v_synclist);
-	else
+	else {
 		vp->v_iflag |= VI_ONWORKLST;
+ 		syncer_worklist_len++;
+	}

 	if (delay > syncer_maxdelay - 2)
 		delay = syncer_maxdelay - 2;
_at__at_ -1487,19 +1509,30 _at__at_

 	mtx_lock(&Giant);

-	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, td->td_proc,
+	EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
 	    SHUTDOWN_PRI_LAST);

 	for (;;) {
-		kthread_suspend_check(td->td_proc);
-
+		mtx_lock(&sync_mtx);
+		/*
+		 * Make one more full pass through the work list after
+		 * the only vnodes remaining on the work list are the
+		 * syncer vnodes.
+		 */
+		if (syncer_shutdown_iter > SYNCER_MAXDELAY &&
+		    syncer_worklist_len == sync_vnode_count)
+			syncer_shutdown_iter = SYNCER_MAXDELAY;
+		if (syncer_shutdown_iter == 0) {
+			mtx_unlock(&sync_mtx);
+			kthread_suspend_check(td->td_proc);
+			mtx_lock(&sync_mtx);
+		}
 		starttime = time_second;

 		/*
 		 * Push files whose dirty time has expired.  Be careful
 		 * of interrupt race on slp queue.
 		 */
-		mtx_lock(&sync_mtx);
 		slp = &syncer_workitem_pending[syncer_delayno];
 		syncer_delayno += 1;
 		if (syncer_delayno == syncer_maxdelay)
_at__at_ -1545,6 +1578,8 _at__at_
 			VI_UNLOCK(vp);
 			mtx_lock(&sync_mtx);
 		}
+		if (syncer_shutdown_iter > 0)
+			syncer_shutdown_iter--;
 		mtx_unlock(&sync_mtx);

 		/*
_at__at_ -1568,7 +1603,8 _at__at_
 			rushjob -= 1;
 			mtx_unlock(&sync_mtx);
 			continue;
-		}
+		} else if (syncer_shutdown_iter > 0)
+			rushjob = SYNCER_SHUTDOWN_SPEEDUP;
 		mtx_unlock(&sync_mtx);
 		/*
 		 * If it has taken us less than a second to process the
_at__at_ -1607,6 +1643,25 _at__at_
 }

 /*
+ * Tell the syncer to speed up its work and run though its work
+ * list several times, then tell it to shut down.
+ */
+static void
+syncer_shutdown(void *arg, int howto)
+{
+	struct thread *td;
+
+	td = FIRST_THREAD_IN_PROC(updateproc);
+	sleepq_remove(td, &lbolt);
+	mtx_lock(&sync_mtx);
+	if (rushjob < SYNCER_SHUTDOWN_SPEEDUP)
+		rushjob = SYNCER_SHUTDOWN_SPEEDUP;
+	syncer_shutdown_iter = SYNCER_SHUTDOWN_ITER_LIMIT;
+	mtx_unlock(&sync_mtx);
+	kproc_shutdown(arg, howto);
+}
+
+/*
  * Associate a p-buffer with a vnode.
  *
  * Also sets B_PAGING flag to indicate that vnode is not fully associated
_at__at_ -1720,6 +1775,7 _at__at_
 		    TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
 			mtx_lock(&sync_mtx);
 			LIST_REMOVE(newvp, v_synclist);
+ 			syncer_worklist_len--;
 			mtx_unlock(&sync_mtx);
 			newvp->v_iflag &= ~VI_ONWORKLST;
 		}
_at__at_ -3297,6 +3353,10 _at__at_
 	}
 	VI_LOCK(vp);
 	vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
+	/* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
+	mtx_lock(&sync_mtx);
+	sync_vnode_count++;
+	mtx_unlock(&sync_mtx);
 	VI_UNLOCK(vp);
 	mp->mnt_syncer = vp;
 	return (0);
_at__at_ -3390,6 +3450,8 _at__at_
 	if (vp->v_iflag & VI_ONWORKLST) {
 		mtx_lock(&sync_mtx);
 		LIST_REMOVE(vp, v_synclist);
+ 		syncer_worklist_len--;
+		sync_vnode_count--;
 		mtx_unlock(&sync_mtx);
 		vp->v_iflag &= ~VI_ONWORKLST;
 	}