Re: panic in deadlkres() on r267110

From: Sean Bruno <sbruno_at_ignoranthack.me>
Date: Fri, 06 Jun 2014 11:04:09 -0700
On Fri, 2014-06-06 at 10:12 -0400, Glen Barber wrote:
> Two machines in the cluster panic last night with the same backtrace.
> It is unclear yet exactly what was happening on the systems, but both
> are port building machines using ports-mgmt/tinderbox.
> 
> Any ideas or information on how to further debug this would be
> appreciated.
> 
> Script started on Fri Jun  6 14:01:53 2014
> command: /bin/sh
> # uname -a
> FreeBSD redbuild04.nyi.freebsd.org 11.0-CURRENT FreeBSD 11.0-CURRENT #1 r267110: Thu Jun  5 15:57:43 UTC 2014     sbruno_at_redbuild04.nyi.freebsd.org:/usr/obj/usr/src/sys/REDBUILD  amd64
> # kgdb ./kernel.debug /var/crash/vmcore.0
> GNU gdb 6.1.1 [FreeBSD]
> Copyright 2004 Free Software Foundation, Inc.
> GDB is free software, covered by the GNU General Public License, and you are
> welcome to change it and/or distribute copies of it under certain conditions.
> Type "show copying" to see the conditions.
> There is absolutely no warranty for GDB.  Type "show warranty" for details.
> This GDB was configured as "amd64-marcel-freebsd"...
> 
> Unread portion of the kernel message buffer:
> panic: deadlkres: possible deadlock detected on allproc_lock
> 
> cpuid = 17
> KDB: stack backtrace:
> db_trace_self_wrapper() at db_trace_self_wrapper+0x2b/frame 0xfffffe1838702a20
> kdb_backtrace() at kdb_backtrace+0x39/frame 0xfffffe1838702ad0
> panic() at panic+0x155/frame 0xfffffe1838702b50
> deadlkres() at deadlkres+0x42a/frame 0xfffffe1838702bb0
> fork_exit() at fork_exit+0x9a/frame 0xfffffe1838702bf0
> fork_trampoline() at fork_trampoline+0xe/frame 0xfffffe1838702bf0
> --- trap 0, rip = 0, rsp = 0xfffffe1838702cb0, rbp = 0 ---
> KDB: enter: panic
> 
> Reading symbols from /boot/kernel/zfs.ko.symbols...done.
> Loaded symbols for /boot/kernel/zfs.ko.symbols
> Reading symbols from /boot/kernel/opensolaris.ko.symbols...done.
> Loaded symbols for /boot/kernel/opensolaris.ko.symbols
> Reading symbols from /boot/kernel/ums.ko.symbols...done.
> Loaded symbols for /boot/kernel/ums.ko.symbols
> Reading symbols from /boot/kernel/linprocfs.ko.symbols...done.
> Loaded symbols for /boot/kernel/linprocfs.ko.symbols
> Reading symbols from /boot/kernel/linux.ko.symbols...done.
> Loaded symbols for /boot/kernel/linux.ko.symbols
> #0  doadump (textdump=-946873840) at pcpu.h:219
> 219             __asm("movq %%gs:%1,%0" : "=r" (td)
> (kgdb) bt
> #0  doadump (textdump=-946873840) at pcpu.h:219
> #1  0xffffffff8034e865 in db_fncall (dummy1=<value optimized out>, 
>     dummy2=<value optimized out>, dummy3=<value optimized out>, 
>     dummy4=<value optimized out>) at /usr/src/sys/ddb/db_command.c:578
> #2  0xffffffff8034e54d in db_command (cmd_table=0x0)
>     at /usr/src/sys/ddb/db_command.c:449
> #3  0xffffffff8034e2c4 in db_command_loop ()
>     at /usr/src/sys/ddb/db_command.c:502
> #4  0xffffffff80350d20 in db_trap (type=<value optimized out>, code=0)
>     at /usr/src/sys/ddb/db_main.c:231
> #5  0xffffffff809a9bd9 in kdb_trap (type=3, code=0, tf=<value optimized out>)
>     at /usr/src/sys/kern/subr_kdb.c:656
> #6  0xffffffff80dc00e3 in trap (frame=0xfffffe1838702a00)
>     at /usr/src/sys/amd64/amd64/trap.c:551
> #7  0xffffffff80da29c2 in calltrap ()
>     at /usr/src/sys/amd64/amd64/exception.S:231
> #8  0xffffffff809a933e in kdb_enter (why=0xffffffff81039a72 "panic", 
>     msg=<value optimized out>) at cpufunc.h:63
> #9  0xffffffff8096a8b5 in panic (fmt=<value optimized out>)
>     at /usr/src/sys/kern/kern_shutdown.c:749
> #10 0xffffffff8090d16a in deadlkres () at /usr/src/sys/kern/kern_clock.c:203
> #11 0xffffffff8093170a in fork_exit (callout=0xffffffff8090cd40 <deadlkres>, 
>     arg=0x0, frame=0xfffffe1838702c00) at /usr/src/sys/kern/kern_fork.c:977
> ---Type <return> to continue, or q <return> to quit---
> #12 0xffffffff80da2efe in fork_trampoline ()
>     at /usr/src/sys/amd64/amd64/exception.S:605
> #13 0x0000000000000000 in ?? ()
> Current language:  auto; currently minimal
> (kgdb) fr 10
> #10 0xffffffff8090d16a in deadlkres () at /usr/src/sys/kern/kern_clock.c:203
> 203                     panic("%s: possible deadlock detected on allproc_lock\n",
> (kgdb) l
> 198                      * priority inversion problem leading to starvation.
> 199                      * If the lock can't be held after 100 tries, panic.
> 200                      */
> 201                     if (!sx_try_slock(&allproc_lock)) {
> 202                             if (tryl > 100)
> 203                     panic("%s: possible deadlock detected on allproc_lock\n",
> 204                                         __func__);
> 205                             tryl++;
> 206                             pause("allproc", sleepfreq * hz);
> 207                             continue;
> (kgdb) up
> #11 0xffffffff8093170a in fork_exit (callout=0xffffffff8090cd40 <deadlkres>, 
>     arg=0x0, frame=0xfffffe1838702c00) at /usr/src/sys/kern/kern_fork.c:977
> 977             callout(arg, frame);
> (kgdb) l
> 972              * cpu_set_fork_handler intercepts this function call to
> 973              * have this call a non-return function to stay in kernel mode.
> 974              * initproc has its own fork handler, but it does return.
> 975              */
> 976             KASSERT(callout != NULL, ("NULL callout in fork_exit"));
> 977             callout(arg, frame);
> 978     
> 979             /*
> 980              * Check if a kernel thread misbehaved and returned from its main
> 981              * function.
> (kgdb) list *0xffffffff8090cd40
> 0xffffffff8090cd40 is in deadlkres (/usr/src/sys/kern/kern_clock.c:185).
> 180     static int blktime_threshold = 900;
> 181     static int sleepfreq = 3;
> 182     
> 183     static void
> 184     deadlkres(void)
> 185     {
> 186             struct proc *p;
> 187             struct thread *td;
> 188             void *wchan;
> 189             int blkticks, i, slpticks, slptype, tryl, tticks;
> (kgdb) quit
> # ^D
> Script done on Fri Jun  6 14:03:30 2014
> 
> Thanks.
> 
> Glen
> 

It seems that this is a degenerative condition.  Processes are hanging
around longer and longer.  For example on one "bad" system:
sbruno_at_redbuild03.nyi:~ % ps auwxxx | wc -l
   91378

Running top seems to indicate a lot of processes are blocking on
"proctr":

last pid: 62403;  load averages:  4.45,  4.37,  4.02
up 0+04:39:30  18:03:54
91586 processes:  7 running, 91575 sleeping, 4 zombie
CPU:  0.3% user,  0.0% nice, 20.4% system,  0.0% interrupt, 79.3% idle
Mem: 41G Active, 5942M Inact, 25G Wired, 72M Cache, 22G Free
ARC: 10G Total, 5902M MFU, 3643M MRU, 1040K Anon, 205M Header, 381M
Other
Swap: 

  PID USERNAME    THR PRI NICE   SIZE    RES STATE   C   TIME    WCPU
COMMAND
57535 root          1  98    0   128M   109M CPU7    7   0:19 102.68%
fstat
61644 sbruno        1  78    0   225M   205M CPU17  17   0:02  41.35%
top
62196 root          1  52    0   129M   109M select 10   0:01  35.58%
top
61373 www           1  48    0 17012K  2424K CPU8    8   0:01  15.56% sh
61818 root          1  52    0     0K    16K proctr 21   0:00  13.70% sh
26930 root          1  39    0 10404K  2216K wait   16   2:21  10.11% sh
62149 root          1  44    0 17012K  2540K proctr  4   0:00   9.89% sh
61536 root          1  41    0 14536K  3500K CPU0    0   0:01   9.81% sh
 1549 root          1  49    0 17012K  2528K proctr  2   1:33   9.23% sh
62150 root          1  52    0  8352K  2212K proctr 21   0:00   8.63% sh
57718 root          1  46    0 15104K  3272K proctr  5   0:02   8.52%
gmake
 1543 root          1  49    0 17012K  2544K proctr 14   1:37   7.75% sh
16466 root          1  28    0  5836K  1328K allpro 11   2:00   7.67%
xargs
 1544 root          1  36    0 17012K  2520K piperd 22   1:42   7.53% sh
62290 root          1  49    0 14536K  2304K proctr  6   0:00   7.02% sh
61966 root          1  52    0  8352K  2212K wait   12   0:00   6.81% sh
 1550 root          1  40    0 17012K  2480K allpro 22   1:41   6.52% sh
79993 root          1  38    0  5352K  1196K proctr  3   1:48   4.10%
find
 8583 root          1  21    0 23520K  3692K pause  17   0:03   4.00%
csh
 1539 root          1  49    0 17012K  2536K piperd  9   1:48   3.65% sh
61041 root          1  31    0 17012K  2412K CPU13  13   0:00   3.58% sh
 1541 root          1  34    0 17012K  2544K piperd  5   1:40   3.14% sh
62117 root          1  52    0  8352K  2212K wait    4   0:00   3.12% sh
62264 root          1  28    0 17012K  2516K CPU23  23   0:00   2.98% sh
60757 www           1  45    0     0K    16K proctr  9   0:01   2.73% sh
59971 root          1  35    0 17012K  2404K wait   21   0:01   2.28% sh
62228 root          1  40    0 17012K  2532K CPU16  16   0:00   1.85% sh
86763 root         29  52    0 25990M   264M wait    5   0:17   0.21%
java
62156 root          1  52    0     0K    16K proctr  4   0:00   0.09%
grep
  962 mysql        28  20    0   857M   552M uwait  21   2:03   0.08%
mysqld
 1542 root          1  48    0 17012K  2544K proctr  1   1:33   0.04% sh
72051 root          1  52    0 16988K  3500K wait   21   1:27   0.04% sh
61874 root          1  52    0  8352K  2212K wait    3   0:00   0.03% sh
 1545 root          1  46    0 17012K  2544K proctr 15   1:34   0.02% sh
60883 www           1  21    0 18740K  2308K proctr 18   0:00   0.02%
egrep
24924 sbruno        1  20    0 86412K  7696K select  2   0:00   0.01%
sshd
  715 root          1  20    0 14404K  1920K select 15   0:01   0.01%
powerd
61586 root          1  52    0  8352K  2212K wait   12   0:01   0.01% sh
Received on Fri Jun 06 2014 - 16:04:16 UTC

This archive was generated by hypermail 2.4.0 : Wed May 19 2021 - 11:40:49 UTC