Re: kernel panic in sbflush_internal

From: Steve Kargl <sgk_at_troutmask.apl.washington.edu>
Date: Tue, 22 May 2007 09:52:42 -0700
On Tue, May 22, 2007 at 08:21:13AM -0400, Robert Watson wrote:
> 
> On Mon, 21 May 2007, Steven G. Kargl wrote:
> 
> >One of my colleagues brought down a node on my cluster while running a MPI 
> >job.  The kernel coredump shows
> >
> >Script started on Mon May 21 17:02:53 2007
> >node12:root[201] kgdb kernel.debug vmcore.0
> >
> >Unread portion of the kernel message buffer:
> >panic: sbflush_internal: cc 4294965848 || mb 0 || mbcnt 0
> >cpuid = 0
> >Uptime: 7h6m34s
> >Physical memory: 16119 MB
> >Dumping 631 MB: 616 600 584 568 552 536 520 504 488 472 456 440 424 408 
> >392 376 360 344 328 312 296 280 264 248 232 216 200 184 168 152 136 120 
> >104 88 72 56 40 24 8
> 
> Is the kernel build date an accurate reflection of the source code version 
> it is being used with?  Could you let me know what file revisions are in 
> use for uipc_socket.c, uipc_sockbuf2.c, uipc_syscalls.c, tcp_usrreq.c, 
> tcp_input.c, tcp_subr.c?  Could you print *sb in frame #4, *so in frame #7, 
> *tp in frame $5, and *inp in #5 (if defined) -- otherwise, (struct inpcb 
> *)so->so_pcb, if non-NULL, in frame #6.
> 

The requested info follows below.  Here are a few details that I omitted. 
In /var/log/messages on this node, I see

May 21 17:28:38 node12 kernel: bge0: watchdog timeout -- resetting
May 21 17:28:38 node12 kernel: bge0: link state changed to DOWN
May 21 17:28:40 node12 kernel: bge0: link state changed to UP
May 21 17:29:33 node12 kernel: bge0: watchdog timeout -- resetting
May 21 17:29:33 node12 kernel: bge0: link state changed to DOWN
May 21 17:29:35 node12 kernel: bge0: link state changed to UP

Google has suggested that I set hw.pci.enable_msix="0" and 
hw.pci.enable_msi="0" to potentially eliminate the above.  I also 
have
kern.ipc.nmbclusters="65536"
net.inet.tcp.sendspace="65536"
net.inet.tcp.recvspace="65536"
in my loader.conf.

The user's home directory is made available via NFS from a different 
node.  His MPI job is pushing multiple GB/s across the GigE net among
4 identical nodes.

node12:kargl[203] ident kernel.debug | grep uipc
  sys/kern/uipc_sockbuf.c,v 1.170 2007/05/16 20:41:07 rwatson
  sys/kern/uipc_socket.c,v 1.300 2007/05/16 20:41:07 rwatson
  sys/kern/uipc_syscalls.c,v 1.255 2007/05/16 20:41:08 rwatson
  sys/kern/uipc_usrreq.c,v 1.204 2007/05/11 12:28:45 rwatson

node12:kargl[206] ident kernel.debug | grep tcp_
     sys/netinet/tcp_sack.c,v 1.40 2007/05/11 11:21:43 rwatson

node10:kargl[202] ident sys/netinet/tcp_input.c
     sys/netinet/tcp_input.c,v 1.351 2007/05/16 17:14:25 andre

node10:kargl[203] ident sys/netinet/tcp_usrreq.c
     sys/netinet/tcp_usrreq.c,v 1.156 2007/05/11 10:20:50 rwatson

node10:kargl[204] ident sys/netinet/tcp_subr.c
     sys/netinet/tcp_subr.c,v 1.283 2007/05/16 17:14:25 andre


node12:root[201] kgdb kernel.debug vmcore.0
(kgdb) frame 4
#4  0xffffffff802f50cb in sbflush (sb=0xffffff031243ab68)
    at /usr/src/sys/kern/uipc_sockbuf.c:825
825     /usr/src/sys/kern/uipc_sockbuf.c: No such file or directory.
        in /usr/src/sys/kern/uipc_sockbuf.c
(kgdb) print *sb
$1 = {sb_sel = {si_thrlist = {tqe_next = 0xffffff03121ae8b0, 
      tqe_prev = 0xffffff031059d5f8}, si_thread = 0x0, si_note = {kl_list = {
        slh_first = 0x0}, kl_lock = 0xffffffff8027ac80 <knlist_mtx_lock>, 
      kl_unlock = 0xffffffff8027acb0 <knlist_mtx_unlock>, 
      kl_locked = 0xffffffff8027ace0 <knlist_mtx_locked>, 
      kl_lockarg = 0xffffff031243abb0}, si_flags = 0}, sb_mtx = {
    lock_object = {lo_name = 0xffffffff8050cd15 "so_rcv", 
      lo_type = 0xffffffff8050cd15 "so_rcv", lo_flags = 16973824, 
      lo_witness_data = {lod_list = {stqe_next = 0x0}, lod_witness = 0x0}}, 
    mtx_lock = 18446742987443408160, mtx_recurse = 0}, sb_sx = {lock_object = {
      lo_name = 0xffffffff8050d3d0 "so_rcv_sx", 
      lo_type = 0xffffffff8050d3d0 "so_rcv_sx", lo_flags = 37421056, 
      lo_witness_data = {lod_list = {stqe_next = 0x0}, lod_witness = 0x0}}, 
    sx_lock = 1, sx_recurse = 0}, sb_state = 32, sb_mb = 0x0, sb_mbtail = 0x0, 
  sb_lastrecord = 0x0, sb_sndptr = 0x0, sb_sndptroff = 0, sb_cc = 4294965848, 
  sb_hiwat = 131056, sb_mbcnt = 0, sb_mbmax = 262144, sb_ctl = 0, 
  sb_lowat = 1, sb_timeo = 0, sb_flags = 0}
(kgdb) frame 7
#7  0xffffffff802f67f2 in soclose (so=0xffffff031243aae0)
    at /usr/src/sys/kern/uipc_socket.c:642
642     /usr/src/sys/kern/uipc_socket.c: No such file or directory.
        in /usr/src/sys/kern/uipc_socket.c
(kgdb) print *so
$2 = {so_count = 1, so_type = 1, so_options = 0, so_linger = 0, 
  so_state = 266, so_qstate = 0, so_pcb = 0xffffff0311a04690, 
  so_proto = 0xffffffff80673030, so_head = 0x0, so_incomp = {tqh_first = 0x0, 
    tqh_last = 0x0}, so_comp = {tqh_first = 0x0, tqh_last = 0x0}, so_list = {
    tqe_next = 0x0, tqe_prev = 0xffffff0311f3c860}, so_qlen = 0, 
  so_incqlen = 0, so_qlimit = 0, so_timeo = 0, so_error = 0, so_sigio = 0x0, 
  so_oobmark = 0, so_aiojobq = {tqh_first = 0x0, 
    tqh_last = 0xffffff031243ab58}, so_rcv = {sb_sel = {si_thrlist = {
        tqe_next = 0xffffff03121ae8b0, tqe_prev = 0xffffff031059d5f8}, 
      si_thread = 0x0, si_note = {kl_list = {slh_first = 0x0}, 
        kl_lock = 0xffffffff8027ac80 <knlist_mtx_lock>, 
        kl_unlock = 0xffffffff8027acb0 <knlist_mtx_unlock>, 
        kl_locked = 0xffffffff8027ace0 <knlist_mtx_locked>, 
        kl_lockarg = 0xffffff031243abb0}, si_flags = 0}, sb_mtx = {
      lock_object = {lo_name = 0xffffffff8050cd15 "so_rcv", 
        lo_type = 0xffffffff8050cd15 "so_rcv", lo_flags = 16973824, 
        lo_witness_data = {lod_list = {stqe_next = 0x0}, lod_witness = 0x0}}, 
      mtx_lock = 18446742987443408160, mtx_recurse = 0}, sb_sx = {
      lock_object = {lo_name = 0xffffffff8050d3d0 "so_rcv_sx", 
        lo_type = 0xffffffff8050d3d0 "so_rcv_sx", lo_flags = 37421056, 
        lo_witness_data = {lod_list = {stqe_next = 0x0}, lod_witness = 0x0}}, 
      sx_lock = 1, sx_recurse = 0}, sb_state = 32, sb_mb = 0x0, 
    sb_mbtail = 0x0, sb_lastrecord = 0x0, sb_sndptr = 0x0, sb_sndptroff = 0, 
    sb_cc = 4294965848, sb_hiwat = 131056, sb_mbcnt = 0, sb_mbmax = 262144, 
    sb_ctl = 0, sb_lowat = 1, sb_timeo = 0, sb_flags = 0}, so_snd = {sb_sel = {
      si_thrlist = {tqe_next = 0x0, tqe_prev = 0x0}, si_thread = 0x0, 
      si_note = {kl_list = {slh_first = 0x0}, 
        kl_lock = 0xffffffff8027ac80 <knlist_mtx_lock>, 
        kl_unlock = 0xffffffff8027acb0 <knlist_mtx_unlock>, 
        kl_locked = 0xffffffff8027ace0 <knlist_mtx_locked>, 
        kl_lockarg = 0xffffff031243aca8}, si_flags = 0}, sb_mtx = {
      lock_object = {lo_name = 0xffffffff8050cd1c "so_snd", 
        lo_type = 0xffffffff8050cd1c "so_snd", lo_flags = 16973824, 
        lo_witness_data = {lod_list = {stqe_next = 0x0}, lod_witness = 0x0}}, 
      mtx_lock = 4, mtx_recurse = 0}, sb_sx = {lock_object = {
        lo_name = 0xffffffff8050d3c6 "so_snd_sx", 
        lo_type = 0xffffffff8050d3c6 "so_snd_sx", lo_flags = 37421056, 
        lo_witness_data = {lod_list = {stqe_next = 0x0}, lod_witness = 0x0}}, 
      sx_lock = 1, sx_recurse = 0}, sb_state = 16, sb_mb = 0x0, 
    sb_mbtail = 0x0, sb_lastrecord = 0x0, sb_sndptr = 0x0, sb_sndptroff = 0, 
    sb_cc = 0, sb_hiwat = 131056, sb_mbcnt = 0, sb_mbmax = 262144, sb_ctl = 0, 
    sb_lowat = 2048, sb_timeo = 0, sb_flags = 0}, so_upcall = 0, 
  so_upcallarg = 0x0, so_cred = 0xffffff031a37e700, so_label = 0x0, 
  so_peerlabel = 0x0, so_gencnt = 1035, so_emuldata = 0x0, so_accf = 0x0}
(kgdb) frame 5
#5  0xffffffff803b7246 in tcp_disconnect (tp=0xffffff03101f73e0)
    at /usr/src/sys/netinet/tcp_usrreq.c:1496
1496    /usr/src/sys/netinet/tcp_usrreq.c: No such file or directory.
        in /usr/src/sys/netinet/tcp_usrreq.c
(kgdb) print *tp
$3 = {t_segq = {lh_first = 0x0}, t_segqlen = 0, t_dupacks = 0, 
  t_timers = 0xffffff03101f7578, t_inpcb = 0xffffff0311a04690, t_state = 5, 
  t_flags = 2097636, snd_una = 1204918958, snd_max = 1204918958, 
  snd_nxt = 1204918958, snd_up = 1204918958, snd_wl1 = 3283435452, 
  snd_wl2 = 1204918958, iss = 3229930725, irs = 1013480283, 
  rcv_nxt = 3283435453, rcv_adv = 3283567957, rcv_wnd = 132504, 
  rcv_up = 3283435452, snd_wnd = 130816, snd_cwnd = 7240, snd_bwnd = 6144, 
  snd_ssthresh = 14480, snd_bandwidth = 12140, snd_recover = 1202449118, 
  t_maxopd = 1460, t_rcvtime = 25596928, t_starttime = 8536916, t_rtttime = 0, 
  t_rtseq = 1204918886, t_bw_rtttime = 22602960, t_bw_rtseq = 1204918958, 
  t_rxtcur = 314, t_maxseg = 1448, t_srtt = 425, t_rttvar = 403, 
  t_rxtshift = 0, t_rttmin = 3, t_rttbest = 10, t_rttupdated = 800408, 
  max_sndwnd = 130816, t_softerror = 0, t_oobflags = 0 '\0', t_iobc = 0 '\0', 
  snd_scale = 8 '\b', rcv_scale = 8 '\b', request_r_scale = 8 '\b', 
  ts_recent = 25589283, ts_recent_age = 25596928, ts_offset = 3770841565, 
  last_ack_sent = 3283435453, snd_cwnd_prev = 133216, 
  snd_ssthresh_prev = 18824, snd_recover_prev = 1191617750, 
  t_badrxtwin = 22521286, snd_limited = 2 '\002', snd_numholes = 0, 
  snd_holes = {tqh_first = 0x0, tqh_last = 0xffffff03101f7508}, snd_fack = 0, 
  rcv_numsacks = 0, sackblks = {{start = 0, end = 0}, {start = 0, end = 0}, {
      start = 0, end = 0}, {start = 0, end = 0}, {start = 0, end = 0}, {
      start = 0, end = 0}}, sack_newdata = 0, sackhint = {nexthole = 0x0, 
    sack_bytes_rexmit = 0}, t_rttlow = 100, rfbuf_ts = 0, rfbuf_cnt = 0}
(kgdb) print *inp
$4 = {inp_hash = {le_next = 0x0, le_prev = 0xffffff0000c18788}, inp_list = {
    le_next = 0xffffff0311a047a8, le_prev = 0xffffff0311a04588}, inp_flow = 0, 
  inp_inc = {inc_flags = 0 '\0', inc_len = 0 '\0', inc_pad = 0, inc_ie = {
      ie_fport = 25290, ie_lport = 21464, ie_dependfaddr = {ie46_foreign = {
          ia46_pad32 = {0, 0, 0}, ia46_addr4 = {s_addr = 234924224}}, 
        ie6_foreign = {__u6_addr = {
            __u6_addr8 = '\0' <repeats 12 times>, "\300\250\000\016", __u6_addr16 = {
              0, 0, 0, 0, 0, 0, 43200, 3584}, __u6_addr32 = {0, 0, 0, 
              234924224}}}}, ie_dependladdr = {ie46_local = {ia46_pad32 = {0, 
            0, 0}, ia46_addr4 = {s_addr = 201369792}}, ie6_local = {
          __u6_addr = {__u6_addr8 = '\0' <repeats 12 times>, "\300\250\000\f", 
            __u6_addr16 = {0, 0, 0, 0, 0, 0, 43200, 3072}, __u6_addr32 = {0, 
              0, 0, 201369792}}}}}}, inp_ppcb = 0xffffff03101f73e0, 
  inp_pcbinfo = 0xffffffff806ca680, inp_socket = 0xffffff031243aae0, 
  inp_label = 0x0, inp_flags = 8388608, inp_sp = 0x0, inp_vflag = 1 '\001', 
  inp_ip_ttl = 64 '@', inp_ip_p = 0 '\0', inp_ip_minttl = 0 '\0', 
  inp_depend4 = {inp4_ip_tos = 0 '\0', inp4_options = 0x0, 
    inp4_moptions = 0x0}, inp_depend6 = {inp6_options = 0x0, 
    inp6_outputopts = 0x0, inp6_moptions = 0x0, inp6_icmp6filt = 0x0, 
    inp6_cksum = 0, inp6_hops = 0}, inp_portlist = {le_next = 0x0, 
    le_prev = 0xffffff0003431390}, inp_phd = 0xffffff0003431380, 
  inp_gencnt = 264, inp_mtx = {lock_object = {
      lo_name = 0xffffffff80514d23 "inp", 
      lo_type = 0xffffffff8051713d "tcpinp", lo_flags = 21692416, 
      lo_witness_data = {lod_list = {stqe_next = 0x0}, lod_witness = 0x0}}, 
    mtx_lock = 18446742987443408160, mtx_recurse = 0}}
(kgdb) quit

-- 
Steve
Received on Tue May 22 2007 - 14:53:30 UTC

This archive was generated by hypermail 2.4.0 : Wed May 19 2021 - 11:39:10 UTC