Re: panic: mutex ncnegl not owned at /usr/src/sys/kern/vfs_cache.c:743 in 12.0-CURRENT r308447

From: Don Lewis <truckman_at_FreeBSD.org>
Date: Fri, 11 Nov 2016 23:29:48 -0800 (PST)
On 11 Nov, To: freebsd-current_at_FreeBSD.org wrote:
> Yesterday I upgraded my ports builder box from r306349 to r308477.  It
> crashed overnight during a poudriere run.
> 
> panic: mutex ncnegl not owned at /usr/src/sys/kern/vfs_cache.c:743
> cpuid = 2
> KDB: stack backtrace:
> db_trace_self_wrapper() at db_trace_self_wrapper+0x2b/frame 0xfffffe085c2a4100
> vpanic() at vpanic+0x182/frame 0xfffffe085c2a4180
> panic() at panic+0x43/frame 0xfffffe085c2a41e0
> __mtx_assert() at __mtx_assert+0xc1/frame 0xfffffe085c2a41f0
> cache_negative_remove() at cache_negative_remove+0x65/frame 0xfffffe085c2a4230
> cache_zap_locked() at cache_zap_locked+0x1ca/frame 0xfffffe085c2a4250
> cache_enter_time() at cache_enter_time+0x1c20/frame 0xfffffe085c2a4330
> tmpfs_lookup() at tmpfs_lookup+0x47a/frame 0xfffffe085c2a4390
> VOP_CACHEDLOOKUP_APV() at VOP_CACHEDLOOKUP_APV+0xda/frame 0xfffffe085c2a43c0
> vfs_cache_lookup() at vfs_cache_lookup+0xd6/frame 0xfffffe085c2a4420
> VOP_LOOKUP_APV() at VOP_LOOKUP_APV+0xda/frame 0xfffffe085c2a4450
> lookup() at lookup+0x6a2/frame 0xfffffe085c2a44e0
> namei() at namei+0x57e/frame 0xfffffe085c2a45a0
> vn_open_cred() at vn_open_cred+0x25b/frame 0xfffffe085c2a4710
> kern_openat() at kern_openat+0x25c/frame 0xfffffe085c2a48a0
> ia32_syscall() at ia32_syscall+0x330/frame 0xfffffe085c2a4a30
> Xint0x80_syscall() at Xint0x80_syscall+0x95/frame 0xfffffe085c2a4a30
> --- syscall (499, FreeBSD ELF32, sys_openat), rip = 0x97b2f57, rsp = 0xffff7698, rbp = 0xffff76b0 ---
> KDB: enter: panic
> 
> 
> I was able to get a crash dump.  This is the kgdb backtrace:
> 
> #0  __curthread () at ./machine/pcpu.h:222
> #1  doadump (textdump=1546271680) at /usr/src/sys/kern/kern_shutdown.c:298
> #2  0xffffffff80396db6 in db_fncall_generic (nargs=0, addr=<optimized out>, 
>     rv=<optimized out>, args=<optimized out>)
>     at /usr/src/sys/ddb/db_command.c:581
> #3  db_fncall (dummy1=<optimized out>, dummy2=<optimized out>, 
>     dummy3=<optimized out>, dummy4=<optimized out>)
>     at /usr/src/sys/ddb/db_command.c:629
> #4  0xffffffff80396919 in db_command (last_cmdp=<optimized out>, 
>     cmd_table=<optimized out>, dopager=<optimized out>)
>     at /usr/src/sys/ddb/db_command.c:453
> #5  0xffffffff80396674 in db_command_loop ()
>     at /usr/src/sys/ddb/db_command.c:506
> #6  0xffffffff8039972f in db_trap (type=<optimized out>, code=<optimized out>)
>     at /usr/src/sys/ddb/db_main.c:248
> #7  0xffffffff80aa0a13 in kdb_trap (type=<optimized out>, 
>     code=<optimized out>, tf=<optimized out>)
>     at /usr/src/sys/kern/subr_kdb.c:654
> #8  0xffffffff80ed8cc4 in trap (frame=0xfffffe085c2a4030)
>     at /usr/src/sys/amd64/amd64/trap.c:537
> #9  <signal handler called>
> #10 kdb_enter (why=0xffffffff8140e5d1 "panic", 
>     msg=0x80 <error: Cannot access memory at address 0x80>)
>     at /usr/src/sys/kern/subr_kdb.c:444
> #11 0xffffffff80a5eb1f in vpanic (fmt=<optimized out>, ap=0xfffffe085c2a41c0)
>     at /usr/src/sys/kern/kern_shutdown.c:752
> #12 0xffffffff80a5eb83 in panic (fmt=0xffffffff81bf1de0 <cnputs_mtx+24> "\004")
>     at /usr/src/sys/kern/kern_shutdown.c:690
> #13 0xffffffff80a40191 in __mtx_assert (c=<optimized out>, 
>     what=<optimized out>, file=0xfffffe085c2a3fe0 "", line=0)
>     at /usr/src/sys/kern/kern_mutex.c:937
> #14 0xffffffff80b0d3d5 in cache_negative_remove (ncp=0xfffff8062bb8dee0, 
>     neg_locked=true) at /usr/src/sys/kern/vfs_cache.c:743
> #15 0xffffffff80b0d87a in cache_zap_locked (ncp=0xfffff8062bb8dee0, 
>     neg_locked=true) at /usr/src/sys/kern/vfs_cache.c:886
> #16 0xffffffff80b0d210 in cache_negative_zap_one ()
>     at /usr/src/sys/kern/vfs_cache.c:835
> #17 cache_enter_time (dvp=<optimized out>, vp=<optimized out>, 
>     cnp=<optimized out>, tsp=<optimized out>, dtsp=<optimized out>)
>     at /usr/src/sys/kern/vfs_cache.c:1741
> #18 0xffffffff82893b3a in tmpfs_lookup (v=<optimized out>)
>     at /usr/src/sys/modules/tmpfs/../../fs/tmpfs/tmpfs_vnops.c:199
> #19 0xffffffff8103ee4a in VOP_CACHEDLOOKUP_APV (vop=<optimized out>, 
>     a=<optimized out>) at vnode_if.c:195
> #20 0xffffffff80b0e9d6 in VOP_CACHEDLOOKUP (dvp=0xfffff8023f3693b0, 
>     vpp=0xfffffe085c2a47d8, cnp=0xfffffe085c2a4800) at ./vnode_if.h:80
> #21 vfs_cache_lookup (ap=<optimized out>) at /usr/src/sys/kern/vfs_cache.c:2022
> #22 0xffffffff8103ecea in VOP_LOOKUP_APV (vop=<optimized out>, 
>     a=<optimized out>) at vnode_if.c:127
> #23 0xffffffff80b17df2 in VOP_LOOKUP (vpp=0xfffffe085c2a47d8, 
>     cnp=0xfffffe085c2a4800, dvp=<optimized out>) at ./vnode_if.h:54
> #24 lookup (ndp=<optimized out>) at /usr/src/sys/kern/vfs_lookup.c:830
> #25 0xffffffff80b173ae in namei (ndp=<optimized out>)
>     at /usr/src/sys/kern/vfs_lookup.c:397
> #26 0xffffffff80b333db in vn_open_cred (ndp=<optimized out>, 
>     flagp=0xfffffe085c2a485c, cmode=0, vn_open_flags=<optimized out>, 
>     cred=<optimized out>, fp=0x1d0) at /usr/src/sys/kern/vfs_vnops.c:277
> #27 0xffffffff80b2c72c in kern_openat (td=0xfffff8001d68d000, fd=-100, 
>     path=0x2c9e71e8 <error: Cannot access memory at address 0x2c9e71e8>, 
>     pathseg=UIO_USERSPACE, 
>     flags=<error reading variable: Cannot access memory at address 0x80>, 
>     mode=<optimized out>) at /usr/src/sys/kern/vfs_syscalls.c:1016
> #28 0xffffffff80ff88a0 in syscallenter (td=0xfffff8001d68d000, 
>     sa=<optimized out>)
>     at /usr/src/sys/amd64/ia32/../../kern/subr_syscall.c:135
> #29 ia32_syscall (frame=0xfffffe085c2a4a40)
>     at /usr/src/sys/amd64/ia32/ia32_syscall.c:187
> #30 <signal handler called>
> #31 0x00000000097b2f57 in ?? ()
> Backtrace stopped: Cannot access memory at address 0xffff7698
> 
> 
> I don't see any obvious issues in the code.

This problem is very reproduceable for me, so I added some assertions to
the code to try to track it down:

%svn diff sys/kern
Index: sys/kern/vfs_cache.c
===================================================================
--- sys/kern/vfs_cache.c	(revision 308477)
+++ sys/kern/vfs_cache.c	(working copy)
_at__at_ -776,6 +776,7 _at__at_
 		mtx_unlock(&neglist->nl_lock);
 	}
 
+	MPASS(ncp == NULL || neglist == NCP2NEGLIST(ncp));             [2]
 	*neglistpp = neglist;
 	*ncpp = ncp;
 }
_at__at_ -815,6 +816,7 _at__at_
 	}
 
 	MPASS(ncp->nc_flag & NCF_NEGATIVE);
+	MPASS(neglist == NCP2NEGLIST(ncp));                            [1]
 	dvlp = VP2VNODELOCK(ncp->nc_dvp);
 	blp = NCP2BUCKETLOCK(ncp);
 	mtx_unlock(&neglist->nl_lock);
_at__at_ -832,6 +834,8 _at__at_
 	SDT_PROBE3(vfs, namecache, shrink_negative, done, ncp->nc_dvp,
 	    nc_get_name(ncp), ncp->nc_neghits);
 
+	MPASS(neglist == NCP2NEGLIST(ncp));
+	mtx_assert(&neglist->nl_lock, MA_OWNED);
 	cache_zap_locked(ncp, true);
 out_unlock_all:
 	mtx_unlock(&neglist->nl_lock);

My first attempt was to add all of the above except [2].  When I re-ran
poudriere, I got a panic at [1].  Since it looked like maybe ncp was on
the wrong list, I added the assertion at [2] and got a different panic:

panic: Bad link elm 0xfffff802a59cb690 prev->next != elm
cpuid = 2
KDB: stack backtrace:
db_trace_self_wrapper() at db_trace_self_wrapper+0x2b/frame 0xfffffe085bfa8230
vpanic() at vpanic+0x182/frame 0xfffffe085bfa82b0
panic() at panic+0x43/frame 0xfffffe085bfa8310
cache_negative_remove() at cache_negative_remove+0x21a/frame 0xfffffe085bfa8350
cache_zap_locked() at cache_zap_locked+0x1ca/frame 0xfffffe085bfa8370
cache_lookup() at cache_lookup+0xe58/frame 0xfffffe085bfa8440
vfs_cache_lookup() at vfs_cache_lookup+0xac/frame 0xfffffe085bfa84a0
VOP_LOOKUP_APV() at VOP_LOOKUP_APV+0xda/frame 0xfffffe085bfa84d0
lookup() at lookup+0x6a2/frame 0xfffffe085bfa8560
namei() at namei+0x57e/frame 0xfffffe085bfa8620
kern_renameat() at kern_renameat+0x1b4/frame 0xfffffe085bfa88a0
ia32_syscall() at ia32_syscall+0x330/frame 0xfffffe085bfa8a30
Xint0x80_syscall() at Xint0x80_syscall+0x95/frame 0xfffffe085bfa8a30


#11 0xffffffff80a5eb1f in vpanic (fmt=<optimized out>, ap=0xfffffe085bfa82f0)
    at /usr/src/sys/kern/kern_shutdown.c:752
#12 0xffffffff80a5eb83 in panic (fmt=0xffffffff81bf2de0 <cnputs_mtx+24> "\004")
    at /usr/src/sys/kern/kern_shutdown.c:690
#13 0xffffffff80b0d55a in cache_negative_remove (ncp=0xfffff802a59cb690, 
    neg_locked=<optimized out>) at /usr/src/sys/kern/vfs_cache.c:747
#14 0xffffffff80b0d84a in cache_zap_locked (ncp=0xfffff802a59cb690, 
    neg_locked=false) at /usr/src/sys/kern/vfs_cache.c:890
#15 0xffffffff80b0ab98 in cache_zap_rlocked_bucket (ncp=0xfffff802a59cb690, 
    blp=<optimized out>) at /usr/src/sys/kern/vfs_cache.c:1021
#16 cache_lookup (dvp=<optimized out>, vpp=<optimized out>, 
    cnp=<optimized out>, tsp=<optimized out>, ticksp=<optimized out>)
    at /usr/src/sys/kern/vfs_cache.c:1308
#17 0xffffffff80b0e97c in vfs_cache_lookup (ap=<optimized out>)
    at /usr/src/sys/kern/vfs_cache.c:2024
#18 0xffffffff8103fcea in VOP_LOOKUP_APV (vop=<optimized out>, 
    a=<optimized out>) at vnode_if.c:127
#19 0xffffffff80b17ed2 in VOP_LOOKUP (vpp=0xfffffe085bfa86e8, 
    cnp=0xfffffe085bfa8710, dvp=<optimized out>) at ./vnode_if.h:54
#20 lookup (ndp=<optimized out>) at /usr/src/sys/kern/vfs_lookup.c:830
#21 0xffffffff80b1748e in namei (ndp=<optimized out>)
    at /usr/src/sys/kern/vfs_lookup.c:397
#22 0xffffffff80b30bc4 in kern_renameat (td=<optimized out>, oldfd=10, 
    old=0x28e72a59 <error: Cannot access memory at address 0x28e72a59>, 
    newfd=10, 
    new=0x28e72601 <error: Cannot access memory at address 0x28e72601>, 
    pathseg=<optimized out>) at /usr/src/sys/kern/vfs_syscalls.c:3515
#23 0xffffffff80ff98a0 in syscallenter (td=0xfffff801ebfd8500, 
    sa=<optimized out>)
    at /usr/src/sys/amd64/ia32/../../kern/subr_syscall.c:135
#24 ia32_syscall (frame=0xfffffe085bfa8a40)
    at /usr/src/sys/amd64/ia32/ia32_syscall.c:187
#25 <signal handler called>


#13 0xffffffff80b0d55a in cache_negative_remove (ncp=0xfffff802a59cb690, 
    neg_locked=<optimized out>) at /usr/src/sys/kern/vfs_cache.c:747
747			TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
(kgdb) list
742		} else {
743			mtx_assert(&neglist->nl_lock, MA_OWNED);
744			mtx_assert(&ncneg_hot.nl_lock, MA_OWNED);
745		}
746		if (ncp->nc_flag & NCF_HOTNEGATIVE) {
747			TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
748		} else {
749			TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
750		}
751		if (list_locked)

(kgdb) print ncp
$1 = (struct namecache *) 0xfffff802a59cb690

(kgdb) print *ncp
$2 = {nc_hash = {le_next = 0x0, le_prev = 0xfffffe0001814178}, nc_src = {
    le_next = 0xfffff80415e22290, le_prev = 0xfffff8048e5597a8}, nc_dst = {
    tqe_next = 0xfffff80421744a10, tqe_prev = 0xfffff80421744a30}, 
  nc_dvp = 0xfffff8048e559760, n_un = {nu_vp = 0x0, nu_neghits = 0}, 
  nc_flag = 32 ' ', nc_nlen = 25 '\031', 
  nc_name = 0xfffff802a59cb6d2 "gtk3-icon-browser.desktop"}
(kgdb) print ncp->nc_dst->tqe_prev   
$8 = (struct namecache **) 0xfffff80421744a30
(kgdb) print *(ncp->nc_dst->tqe_prev)
$9 = (struct namecache *) 0x0


The line number for the failed assertion doesn't make sense.  The value
of nc_flag is 32 (0x20), but NCF_HOTNEGATIVE is defined as 0x40 and
NCF_NEGATIVE is defined as 0x20.  In any case, the list seems to be
messed up.
Received on Sat Nov 12 2016 - 06:29:57 UTC

This archive was generated by hypermail 2.4.0 : Wed May 19 2021 - 11:41:08 UTC