Re: 6.0 hangs (while building OOo)

From: Don Lewis <truckman_at_FreeBSD.org>
Date: Tue, 11 Oct 2005 02:43:07 -0700 (PDT)
On  6 Oct, Kris Kennaway wrote:

> There is code in -current that saves stack traces when lockmgr locks
> are acquired, when DEBUG_LOCKS is enabled - except it sometimes panics
> while trying to save the trace because of a code bug.  I remind jeffr
> about this on a more-or-less daily basis, but he hasn't had time to
> commit the fix he has yet.  It still may be useful if this is easily
> reproducible.
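
For context, the idea is roughly the following (a sketch built on the
stack(9) helpers, not jeffr's actual code; lock_trace_save() is a
made-up name):

#include <sys/types.h>
#include <sys/stack.h>

/* One of these would hang off each lockmgr lock. */
struct lock_trace {
	struct stack	lt_stack;	/* stack at last acquisition */
};

/* Called from the lock acquisition path when DEBUG_LOCKS is enabled. */
static void
lock_trace_save(struct lock_trace *lt)
{
	stack_zero(&lt->lt_stack);	/* discard the previous trace */
	stack_save(&lt->lt_stack);	/* record the current kernel stack */
}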

I haven't yet tried to backport this to RELENG_6, and I've been having
trouble reproducing the problem on my SMP machine, which looks like it
has some sort of hardware problem that causes it to lock up under load.

>> This problem appears to be some sort of vnode lock leak.
> 
> leaked lockmgr locks usually panic when the thread exits.

There's a KASSERT in userret() to catch this in the syscall return
path, but that check is only compiled in when the INVARIANTS kernel
option is enabled, which is not currently the case in RELENG_6.
Mikhail doesn't want to risk a panic on the machine in question, since
it is at a remote location.
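
The check in question looks roughly like this (paraphrased from
userret() in sys/kern/subr_trap.c; the exact message may differ, and
KASSERT() compiles to nothing without INVARIANTS):

	KASSERT(td->td_locks == 0,
	    ("userret: returning to user mode with %d locks held",
	    td->td_locks));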

He did reproduce the problem again with DEBUG_LOCKS enabled and we got
some more info.  Here's the locked vnode in question:

(kgdb) print *(struct vnode *)0xc2741ad4
$1 = {v_type = VDIR, v_tag = 0xc0729abe "ufs", v_op = 0xc076c5c0,
  v_data = 0xc2c03630, v_mount = 0xc1f72000, v_nmntvnodes = {
    tqe_next = 0xc2742c08, tqe_prev = 0xc2dfa880}, v_un = {vu_mount = 0x0,
    vu_socket = 0x0, vu_cdev = 0x0, vu_fifoinfo = 0x0}, v_hashlist = {
    le_next = 0xc2895604, le_prev = 0xc1f01f28}, v_hash = 218659,
  v_cache_src = {lh_first = 0x0}, v_cache_dst = {tqh_first = 0x0,
    tqh_last = 0xc2741b04}, v_dd = 0x0, v_cstart = 0, v_lasta = 0,
  v_lastw = 0, v_clen = 0, v_lock = {lk_interlock = 0xc0783b4c,
    lk_flags = 33816640, lk_sharecount = 0, lk_waitcount = 1,
    lk_exclusivecount = 1, lk_prio = 80, lk_wmesg = 0xc0729abe "ufs",
    lk_timo = 6, lk_lockholder = 0xc1fe2300, lk_newlock = 0x0,
    lk_filename = 0xc072b093 "/var/src/sys/kern/vfs_subr.c",
    lk_lockername = 0xc072aab4 "vop_stdlock", lk_lineno = 1951,
    lk_slockholder = 0xffffffff, lk_sfilename = 0xc071d296 "none",
    lk_slockername = 0xc0724a8d "never share locked", lk_slineno = 0},
  v_interlock = {mtx_object = {lo_class = 0xc075b224,
      lo_name = 0xc072b100 "vnode interlock",
      lo_type = 0xc072b100 "vnode interlock", lo_flags = 196608, lo_list = {
        tqe_next = 0x0, tqe_prev = 0x0}, lo_witness = 0x0}, mtx_lock = 4,
    mtx_recurse = 0}, v_vnlock = 0xc2741b2c,
  filename = 0xc07382b1 "/var/src/sys/ufs/ufs/ufs_lookup.c", line = 563,
  v_holdcnt = 5, v_usecount = 4, v_iflag = 0, v_vflag = 0, v_writecount = 0,
  v_freelist = {tqe_next = 0x0, tqe_prev = 0xc29b4948}, v_bufobj = {
    bo_mtx = 0xc2741b6c, bo_clean = {bv_hd = {tqh_first = 0xcc1cbd48,
        tqh_last = 0xcc1cbd80}, bv_root = 0xcc1cbd48, bv_cnt = 1}, bo_dirty = {
      bv_hd = {tqh_first = 0x0, tqh_last = 0xc2741bcc}, bv_root = 0x0,
      bv_cnt = 0}, bo_numoutput = 0, bo_flag = 0, bo_ops = 0xc0761b04,
    bo_bsize = 16384, bo_object = 0xc30cc39c, bo_synclist = {le_next = 0x0,
      le_prev = 0x0}, bo_private = 0xc2741ad4, __bo_vnode = 0xc2741ad4},
  v_pollinfo = 0x0, v_label = 0x0}

The culprit process knows that it is holding the lock:

(kgdb) print ((struct vnode *)0xc2741ad4)->v_lock->lk_lockholder->td_locks
$3 = 1
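
That counter is maintained by lockmgr itself; simplified, the
bookkeeping amounts to something like this (the helper names here are
made up for illustration, the real code is in sys/kern/kern_lock.c):

#include <sys/param.h>
#include <sys/proc.h>

static __inline void
lockmgr_note_acquire(struct thread *td)
{
	td->td_locks++;		/* bumped when a lockmgr lock is granted */
}

static __inline void
lockmgr_note_release(struct thread *td)
{
	td->td_locks--;		/* dropped when the lock is released */
}

So a td_locks value of 1 while sleeping in wait4() means the thread
itself agrees that it is still holding one lockmgr lock.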

And this is its stack trace, once again in wait4():

(kgdb) where
#0  sched_switch (td=0xc1fe2300, newtd=0xc1e5b300, flags=1)
    at /var/src/sys/kern/sched_4bsd.c:980
#1  0xc1fe2300 in ?? ()
#2  0xc0526218 in mi_switch (flags=1, newtd=0x0)
    at /var/src/sys/kern/kern_synch.c:356
#3  0xc0545127 in sleepq_switch (wchan=0x0)
    at /var/src/sys/kern/subr_sleepqueue.c:427
#4  0xc0545400 in sleepq_wait_sig (wchan=0x0)
    at /var/src/sys/kern/subr_sleepqueue.c:552
#5  0xc0525e67 in msleep (ident=0xc1fe520c, mtx=0xc1fe5274, priority=348,
    wmesg=0x0, timo=0) at /var/src/sys/kern/kern_synch.c:225
#6  0xc04feaec in kern_wait (td=0xc1fe2300, pid=-1, status=0xd4922c80,
    options=0, rusage=0x0) at /var/src/sys/kern/kern_exit.c:772
#7  0xc04fdeed in wait4 (td=0x0, uap=0xd4922d04)
    at /var/src/sys/kern/kern_exit.c:569
#8  0xc06ed702 in syscall (frame=
      {tf_fs = 59, tf_es = -1078001605, tf_ds = -1078001605,
       tf_edi = 134900352, tf_esi = 9195, tf_ebp = -1077951176,
       tf_isp = -728617628, tf_ebx = 672547876, tf_edx = 0,
       tf_ecx = 137943616, tf_eax = 7, tf_trapno = 12, tf_err = 2,
       tf_eip = 671979599, tf_cs = 51, tf_eflags = 534,
       tf_esp = -1077951204, tf_ss = 59})
    at /var/src/sys/i386/i386/trap.c:976
#9  0xc06d651f in Xint0x80_syscall () at /var/src/sys/i386/i386/exception.s:200
#10 0x00000033 in ?? ()
Previous frame inner to this frame (corrupt stack?)


The code referenced by the vnode filename and line members is this block
in ufs_lookup():

        if (flags & ISDOTDOT) {
                VOP_UNLOCK(pdp, 0, td); /* race to get the inode */
                error = VFS_VGET(pdp->v_mount, dp->i_ino,
                    cnp->cn_lkflags, &tdp);
-->             vn_lock(pdp, LK_EXCLUSIVE | LK_RETRY, td);
                if (error)
                        return (error);
                *vpp = tdp;
        } else ...
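
The filename and line members in the vnode dump point at the vn_lock()
call above because, with DEBUG_LOCKS, vn_lock() is a macro that passes
the caller's location down to be stamped into the vnode, roughly like
this (paraphrased from sys/vnode.h of that era; the helper name may
differ):

#ifdef DEBUG_LOCKS
/*
 * Each call site passes __FILE__/__LINE__, which debug_vn_lock()
 * records in the vnode before taking the lock -- that is how the
 * dump above ends up pointing at ufs_lookup.c:563.
 */
#define	vn_lock(vp, flags, td)					\
	debug_vn_lock((vp), (flags), (td), __FILE__, __LINE__)
#endif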


I don't see any obvious lock leaks in the code.  The LK_RETRY vn_lock()
re-locks pdp on both the success and error paths, so pdp leaves this
block locked exactly once either way.