On Fri, 2014-06-06 at 10:12 -0400, Glen Barber wrote: > Two machines in the cluster panic last night with the same backtrace. > It is unclear yet exactly what was happening on the systems, but both > are port building machines using ports-mgmt/tinderbox. > > Any ideas or information on how to further debug this would be > appreciated. > > Script started on Fri Jun 6 14:01:53 2014 > command: /bin/sh > # uname -a > FreeBSD redbuild04.nyi.freebsd.org 11.0-CURRENT FreeBSD 11.0-CURRENT #1 r267110: Thu Jun 5 15:57:43 UTC 2014 sbruno_at_redbuild04.nyi.freebsd.org:/usr/obj/usr/src/sys/REDBUILD amd64 > # kgdb ./kernel.debug /var/crash/vmcore.0 > GNU gdb 6.1.1 [FreeBSD] > Copyright 2004 Free Software Foundation, Inc. > GDB is free software, covered by the GNU General Public License, and you are > welcome to change it and/or distribute copies of it under certain conditions. > Type "show copying" to see the conditions. > There is absolutely no warranty for GDB. Type "show warranty" for details. > This GDB was configured as "amd64-marcel-freebsd"... > > Unread portion of the kernel message buffer: > panic: deadlkres: possible deadlock detected on allproc_lock > > cpuid = 17 > KDB: stack backtrace: > db_trace_self_wrapper() at db_trace_self_wrapper+0x2b/frame 0xfffffe1838702a20 > kdb_backtrace() at kdb_backtrace+0x39/frame 0xfffffe1838702ad0 > panic() at panic+0x155/frame 0xfffffe1838702b50 > deadlkres() at deadlkres+0x42a/frame 0xfffffe1838702bb0 > fork_exit() at fork_exit+0x9a/frame 0xfffffe1838702bf0 > fork_trampoline() at fork_trampoline+0xe/frame 0xfffffe1838702bf0 > --- trap 0, rip = 0, rsp = 0xfffffe1838702cb0, rbp = 0 --- > KDB: enter: panic > > Reading symbols from /boot/kernel/zfs.ko.symbols...done. > Loaded symbols for /boot/kernel/zfs.ko.symbols > Reading symbols from /boot/kernel/opensolaris.ko.symbols...done. > Loaded symbols for /boot/kernel/opensolaris.ko.symbols > Reading symbols from /boot/kernel/ums.ko.symbols...done. > Loaded symbols for /boot/kernel/ums.ko.symbols > Reading symbols from /boot/kernel/linprocfs.ko.symbols...done. > Loaded symbols for /boot/kernel/linprocfs.ko.symbols > Reading symbols from /boot/kernel/linux.ko.symbols...done. > Loaded symbols for /boot/kernel/linux.ko.symbols > #0 doadump (textdump=-946873840) at pcpu.h:219 > 219 __asm("movq %%gs:%1,%0" : "=r" (td) > (kgdb) bt > #0 doadump (textdump=-946873840) at pcpu.h:219 > #1 0xffffffff8034e865 in db_fncall (dummy1=<value optimized out>, > dummy2=<value optimized out>, dummy3=<value optimized out>, > dummy4=<value optimized out>) at /usr/src/sys/ddb/db_command.c:578 > #2 0xffffffff8034e54d in db_command (cmd_table=0x0) > at /usr/src/sys/ddb/db_command.c:449 > #3 0xffffffff8034e2c4 in db_command_loop () > at /usr/src/sys/ddb/db_command.c:502 > #4 0xffffffff80350d20 in db_trap (type=<value optimized out>, code=0) > at /usr/src/sys/ddb/db_main.c:231 > #5 0xffffffff809a9bd9 in kdb_trap (type=3, code=0, tf=<value optimized out>) > at /usr/src/sys/kern/subr_kdb.c:656 > #6 0xffffffff80dc00e3 in trap (frame=0xfffffe1838702a00) > at /usr/src/sys/amd64/amd64/trap.c:551 > #7 0xffffffff80da29c2 in calltrap () > at /usr/src/sys/amd64/amd64/exception.S:231 > #8 0xffffffff809a933e in kdb_enter (why=0xffffffff81039a72 "panic", > msg=<value optimized out>) at cpufunc.h:63 > #9 0xffffffff8096a8b5 in panic (fmt=<value optimized out>) > at /usr/src/sys/kern/kern_shutdown.c:749 > #10 0xffffffff8090d16a in deadlkres () at /usr/src/sys/kern/kern_clock.c:203 > #11 0xffffffff8093170a in fork_exit (callout=0xffffffff8090cd40 <deadlkres>, > arg=0x0, frame=0xfffffe1838702c00) at /usr/src/sys/kern/kern_fork.c:977 > ---Type <return> to continue, or q <return> to quit--- > #12 0xffffffff80da2efe in fork_trampoline () > at /usr/src/sys/amd64/amd64/exception.S:605 > #13 0x0000000000000000 in ?? () > Current language: auto; currently minimal > (kgdb) fr 10 > #10 0xffffffff8090d16a in deadlkres () at /usr/src/sys/kern/kern_clock.c:203 > 203 panic("%s: possible deadlock detected on allproc_lock\n", > (kgdb) l > 198 * priority inversion problem leading to starvation. > 199 * If the lock can't be held after 100 tries, panic. > 200 */ > 201 if (!sx_try_slock(&allproc_lock)) { > 202 if (tryl > 100) > 203 panic("%s: possible deadlock detected on allproc_lock\n", > 204 __func__); > 205 tryl++; > 206 pause("allproc", sleepfreq * hz); > 207 continue; > (kgdb) up > #11 0xffffffff8093170a in fork_exit (callout=0xffffffff8090cd40 <deadlkres>, > arg=0x0, frame=0xfffffe1838702c00) at /usr/src/sys/kern/kern_fork.c:977 > 977 callout(arg, frame); > (kgdb) l > 972 * cpu_set_fork_handler intercepts this function call to > 973 * have this call a non-return function to stay in kernel mode. > 974 * initproc has its own fork handler, but it does return. > 975 */ > 976 KASSERT(callout != NULL, ("NULL callout in fork_exit")); > 977 callout(arg, frame); > 978 > 979 /* > 980 * Check if a kernel thread misbehaved and returned from its main > 981 * function. > (kgdb) list *0xffffffff8090cd40 > 0xffffffff8090cd40 is in deadlkres (/usr/src/sys/kern/kern_clock.c:185). > 180 static int blktime_threshold = 900; > 181 static int sleepfreq = 3; > 182 > 183 static void > 184 deadlkres(void) > 185 { > 186 struct proc *p; > 187 struct thread *td; > 188 void *wchan; > 189 int blkticks, i, slpticks, slptype, tryl, tticks; > (kgdb) quit > # ^D > Script done on Fri Jun 6 14:03:30 2014 > > Thanks. > > Glen > It seems that this is a degenerative condition. Processes are hanging around longer and longer. For example on one "bad" system: sbruno_at_redbuild03.nyi:~ % ps auwxxx | wc -l 91378 Running top seems to indicate a lot of processes are blocking on "proctr": ast pid: 62403; load averages: 4.45, 4.37, 4.02 up 0+04:39:30 18:03:54 91586 processes7 running, 91575 sleeping, 4 zombie CPU: 0.3% user, 0.0% nice, 20.4% system, 0.0% interrupt, 79.3% idle Mem: 41G Active, 5942M Inact, 25G Wired, 72M Cache, 22G Free ARC: 10G Total, 5902M MFU, 3643M MRU, 1040K Anon, 205M Header, 381M Other Swap: PID USERNAME THR PRI NICE SIZE RES STATE C TIME WCPU COMMAND 57535 root 1 98 0 128M 109M CPU7 7 0:19 102.68% fstat 61644 sbruno 1 78 0 225M 205M CPU17 17 0:02 41.35% top 62196 root 1 52 0 129M 109M select 10 0:01 35.58% top 61373 www 1 48 0 17012K 2424K CPU8 8 0:01 15.56% sh 61818 root 1 52 0 0K 16K proctr 21 0:00 13.70% sh 26930 root 1 39 0 10404K 2216K wait 16 2:21 10.11% sh 62149 root 1 44 0 17012K 2540K proctr 4 0:00 9.89% sh 61536 root 1 41 0 14536K 3500K CPU0 0 0:01 9.81% sh 1549 root 1 49 0 17012K 2528K proctr 2 1:33 9.23% sh 62150 root 1 52 0 8352K 2212K proctr 21 0:00 8.63% sh 57718 root 1 46 0 15104K 3272K proctr 5 0:02 8.52% gmake 1543 root 1 49 0 17012K 2544K proctr 14 1:37 7.75% sh 16466 root 1 28 0 5836K 1328K allpro 11 2:00 7.67% xargs 1544 root 1 36 0 17012K 2520K piperd 22 1:42 7.53% sh 62290 root 1 49 0 14536K 2304K proctr 6 0:00 7.02% sh 61966 root 1 52 0 8352K 2212K wait 12 0:00 6.81% sh 1550 root 1 40 0 17012K 2480K allpro 22 1:41 6.52% sh 79993 root 1 38 0 5352K 1196K proctr 3 1:48 4.10% find 8583 root 1 21 0 23520K 3692K pause 17 0:03 4.00% csh 1539 root 1 49 0 17012K 2536K piperd 9 1:48 3.65% sh 61041 root 1 31 0 17012K 2412K CPU13 13 0:00 3.58% sh 1541 root 1 34 0 17012K 2544K piperd 5 1:40 3.14% sh 62117 root 1 52 0 8352K 2212K wait 4 0:00 3.12% sh 62264 root 1 28 0 17012K 2516K CPU23 23 0:00 2.98% sh 60757 www 1 45 0 0K 16K proctr 9 0:01 2.73% sh 59971 root 1 35 0 17012K 2404K wait 21 0:01 2.28% sh 62228 root 1 40 0 17012K 2532K CPU16 16 0:00 1.85% sh 86763 root 29 52 0 25990M 264M wait 5 0:17 0.21% java 62156 root 1 52 0 0K 16K proctr 4 0:00 0.09% grep 962 mysql 28 20 0 857M 552M uwait 21 2:03 0.08% mysqld 1542 root 1 48 0 17012K 2544K proctr 1 1:33 0.04% sh 72051 root 1 52 0 16988K 3500K wait 21 1:27 0.04% sh 61874 root 1 52 0 8352K 2212K wait 3 0:00 0.03% sh 1545 root 1 46 0 17012K 2544K proctr 15 1:34 0.02% sh 60883 www 1 21 0 18740K 2308K proctr 18 0:00 0.02% egrep 24924 sbruno 1 20 0 86412K 7696K select 2 0:00 0.01% sshd 715 root 1 20 0 14404K 1920K select 15 0:01 0.01% powerd 61586 root 1 52 0 8352K 2212K wait 12 0:01 0.01% shReceived on Fri Jun 06 2014 - 16:04:16 UTC
This archive was generated by hypermail 2.4.0 : Wed May 19 2021 - 11:40:49 UTC