Hi all, J. Porter Clark sent me some traces and dumps to analyze, but I was not able to reproduce it until yesterday. I guess I do know now what's going on. A patch to solve this problem is at the end. I'm hunted down the same panic Doug White tried to fix one year ago in FreeBSD 5. I've added some debug output to see what was going on: ptsopen(): tty with state 131112 has refcnt 3 (ttyp9) dev refcount = 2 ptsopen(): tty with state 131112 has refcnt 2 (ttyp8) dev refcount = 2 ptsopen(): tty with state 0 has refcnt 1 (ttyp5) dev refcount = 2 ptsopen(): tty with state 131112 has refcnt 3 (ttyp9) dev refcount = 2 ptsopen(): tty with state 131112 has refcnt 2 (ttyp2) dev refcount = 2 ptsopen(): tty with state 0 has refcnt 1 (ttyp6) dev refcount = 2 ptsopen(): tty with state 0 has refcnt 1 (ttyp3) dev refcount = 2 ptsopen(): tty with state 0 has refcnt 1 (ttyp5) dev refcount = 2 ptsopen(): tty with state 131112 has refcnt 3 (ttyp0) dev refcount = 2 ptsopen(): tty with state 131112 has refcnt 2 (ttyp8) dev refcount = 2 ptsopen(): tty with state 0 has refcnt 1 (ttyp6) dev refcount = 2 ttyrel(): tty refcnt is now 0 (ttyp3) ptsopen(): tty with state 0 has refcnt 1 (ttyp4) dev refcount = 2 ptsopen(): tty with state 0 has refcnt 0 (ttyp3) dev refcount = 2 panic() Interesting. This shouldn't happen in any case. ptsopen() is called with a tty which has a refcount of 0 which means ttyrel() has freed our struct tty before. Interesting is also that I've called ttyrel() and also ptsopen() with mtx_assert(&Giant, MA_OWNED); So it's not a missing GIANT lock at all. The box recovers if refcount > 0 and state is set to 0, but all tty/devfs operations are locked then for 2-3 minutes. In the meantime I wrote a perl script to crash a box in 5-10 minutes without significant load, just doing tty operations and writing stuff at the same time to 'closed' ttys. Devfs open doesn't check if the tty is still in use in some cases, I've added now checks for ptcopen(), ptsopen(), ttyopen() to check if we have been called with a refcount of zero. this should solve the panics with ttwakeup() we have in RELENG_5 and RELENG_6. A patch is attached at the end. Unfortunatly the backtrace in kgdb is broken because there is a null frame in between. I'll write another mail about this and I'll add later some ASSERTS in ptsopen() to get a usable trace. Fatal trap 12: page fault while in kernel mode cpuid = 1; apic id = 01 fault virtual address = 0x0 fault code = supervisor read, page not present instruction pointer = 0x20:0x0 stack pointer = 0x28:0xe8dd8974 frame pointer = 0x28:0xe8dd8988 code segment = base 0x0, limit 0xfffff, type 0x1b = DPL 0, pres 1, def32 1, gran 1 processor eflags = interrupt enabled, resume, IOPL = 0 current process = 70635 (perl) trap number = 12 panic: page fault cpuid = 1 KDB: stack backtrace: kdb_backtrace(100,c6bf2c00,28,e8dd8934,c,...) at kdb_backtrace+0x29 panic(c08a9756,c08fd779,0,fffff,c6bfd89b,...) at panic+0x114 trap_fatal(e8dd8934,0,c6bf2c00,c71f6cb8,c,...) at trap_fatal+0x2ce trap_pfault(e8dd8934,0,0) at trap_pfault+0x1d7 trap(c6710008,e8dd0028,c06c0028,c6bf2c00,c643ec00,...) at trap+0x2fd calltrap() at calltrap+0x5 --- trap 0xc, eip = 0, esp = 0xe8dd8974, ebp = 0xe8dd8988 --- (null)(c643ec88,0,0,c643ec10,c643ec00,...) at 0 ttwakeup(c643ec00,c643ec00,c6dd7980,c643ec00,e8dd89d8,...) at ttwakeup+0x65 ttymodem(c643ec00,1) at ttymodem+0x178 ptsopen(c6bf5200,3,2000,c6bf2c00) at ptsopen+0x99 giant_open(c6bf5200,3,2000,c6bf2c00) at giant_open+0x4f devfs_open(e8dd8a64) at devfs_open+0x20f VOP_OPEN_APV(c094ca20,e8dd8a64) at VOP_OPEN_APV+0x38 vn_open_cred(e8dd8bcc,e8dd8ccc,0,c6e43d00,1,...) at vn_open_cred+0x434 vn_open(e8dd8bcc,e8dd8ccc,0,1) at vn_open+0x1e kern_open(c6bf2c00,bfbfe8b0,0,3,0,...) at kern_open+0xb6 open(c6bf2c00,e8dd8d04) at open+0x1a syscall(808003b,3b,bfbf003b,2,0,...) at syscall+0x2bf Xint0x80_syscall() at Xint0x80_syscall+0x1f --- syscall (5, FreeBSD ELF32, open), eip = 0x28256aeb, esp = 0xbfbfe87c, ebp = 0xbfbfe8d8 --- (kgdb) p *(struct cdev *)0xc6bf5200 $9 = {si_priv = 0xc6bf5200, si_flags = 4, si_atime = {tv_sec = 1158948685, tv_nsec = 0}, si_ctime = {tv_sec = 1158968040, tv_nsec = 0}, si_mtime = {tv_sec = 1158948685, tv_nsec = 0}, si_uid = 0, si_gid = 0, si_mode = 438, si_cred = 0xc67f2900, si_drv0 = 6, si_refcount = 2, si_list = {le_next = 0xc680dc00, le_prev = 0xc6b57738}, si_clone = {le_next = 0x0, le_prev = 0x0}, si_children = { lh_first = 0x0}, si_siblings = {le_next = 0x0, le_prev = 0x0}, si_parent = 0x0, si_name = 0xc6bf5278 "ttyp6", si_drv1 = 0xc6dd7980, si_drv2 = 0x0, si_devsw = 0xc0959460, si_iosize_max = 65536, si_usecount = 1, si_threadcount = 4, __si_u = {__sit_tty = 0xc643ec00, __sid_snapdata = 0xc643ec00}, __si_namebuf = "ttyp3", '\0' <repeats 58 times>} $36 = {pt_flags = -960540160, pt_selr = {si_thrlist = {tqe_next = 0x4, tqe_prev = 0x4514274d}, si_thread = 0x0, si_note = {kl_list = { slh_first = 0x451472e8}, kl_lock = 0, kl_unlock = 0x4514274d, kl_locked = 0, kl_lockarg = 0x0}, si_flags = 0}, pt_selw = { si_thrlist = {tqe_next = 0x1b6, tqe_prev = 0xc67f2900}, si_thread = 0x6, si_note = {kl_list = {slh_first = 0x2}, kl_lock = 0xc680dc00, kl_unlock = 0xc6b57738, kl_locked = 0, kl_lockarg = 0x0}, si_flags = 0}, pt_send = 0 '\0', pt_ucntl = 0 '\0', pt_tty = 0x0, devs = 0x0, devc = 0xc6bf5278, pt_prison = 0xc6dd7980} kgdb) p *(struct tty *)0xc643ec00 $1 = {t_rawq = {c_cc = 0, c_cbcount = 0, c_cbmax = 0, c_cbreserved = 0, c_cf = 0x0, c_cl = 0x0}, t_rawcc = 288210, t_canq = {c_cc = 0, c_cbcount = 0, c_cbmax = 0, c_cbreserved = 0, c_cf = 0x0, c_cl = 0x0}, t_cancc = 30, t_outq = {c_cc = 0, c_cbcount = 0, c_cbmax = 0, c_cbreserved = 0, c_cf = 0x0, c_cl = 0x0}, t_outcc = 288974, t_line = 0, t_dev = 0xc6bf5200, t_mdev = 0x0, t_devunit = 0, t_state = 131080, t_flags = 0, t_timeout = -1, t_pgrp = 0x0, t_session = 0x0, t_sigio = 0x0, t_rsel = {si_thrlist = { tqe_next = 0x0, tqe_prev = 0x0}, si_thread = 0x0, si_note = {kl_list = {slh_first = 0x0}, kl_lock = 0, kl_unlock = 0, kl_locked = 0xc064f070 <knlist_mtx_locked>, kl_lockarg = 0x0}, si_flags = 0}, t_wsel = {si_thrlist = {tqe_next = 0x0, tqe_prev = 0x0}, si_thread = 0x0, si_note = {kl_list = {slh_first = 0x0}, kl_lock = 0, kl_unlock = 0, kl_locked = 0xc064f070 <knlist_mtx_locked>, kl_lockarg = 0x0}, si_flags = 0}, t_termios = {c_iflag = 11010, c_oflag = 3, c_cflag = 19200, c_lflag = 1483, c_cc = "\004ÿÿ\177\027\025\022\b\003\034\032\031\021\023\026\017\001\000\024ÿ", c_ispeed = 9600, c_ospeed = 9600}, t_init_in = {c_iflag = 11010, c_oflag = 3, c_cflag = 19200, c_lflag = 1483, c_cc = "\004ÿÿ\177\027\025\022\b\003\034\032\031\021\023\026\017\001\000\024ÿ", c_ispeed = 9600, c_ospeed = 9600}, t_init_out = { c_iflag = 11010, c_oflag = 3, c_cflag = 19200, c_lflag = 1483, c_cc = "\004ÿÿ\177\027\025\022\b\003\034\032\031\021\023\026\017\001\000\024ÿ", c_ispeed = 9600, c_ospeed = 9600}, t_lock_in = { c_iflag = 0, c_oflag = 0, c_cflag = 0, c_lflag = 0, c_cc = '\0' <repeats 19 times>, c_ispeed = 0, c_ospeed = 0}, t_lock_out = { c_iflag = 0, c_oflag = 0, c_cflag = 0, c_lflag = 0, c_cc = '\0' <repeats 19 times>, c_ispeed = 0, c_ospeed = 0}, t_winsize = { ws_row = 0, ws_col = 0, ws_xpixel = 0, ws_ypixel = 0}, t_sc = 0xc6dd7980, t_lsc = 0x0, t_column = 131840, t_rocount = 0, t_rocol = 0, t_ififosize = 512, t_ihiwat = 7680, t_ilowat = 6720, t_ispeedwat = 0, t_ohiwat = 1296, t_olowat = 256, t_ospeedwat = 0, t_gen = 15, t_list = {tqe_next = 0xc66f9c00, tqe_prev = 0xc657d5dc}, t_actout = 0, t_wopeners = 0, t_mtx = {mtx_object = { lo_class = 0xc0953f44, lo_name = 0xc08daf1c "tty", lo_type = 0xc08daf1c "tty", lo_flags = 131072, lo_list = {tqe_next = 0x0, tqe_prev = 0x0}, lo_witness = 0x0}, mtx_lock = 6, mtx_recurse = 0}, t_refcnt = 0, t_hotchar = 0, t_dtr_wait = 3000, t_do_timestamp = 0, t_timestamp = {tv_sec = 0, tv_usec = 0}, t_pps = 0x0, t_oproc = 0xc06a25d4 <ptsstart>, t_stop = 0xc06a28c4 <ptsstop>, t_param = 0, t_modem = 0, t_break = 0, t_ioctl = 0, t_open = 0, t_purge = 0, t_close = 0, t_cioctl = 0} --- sys/kern/tty.c Sun Nov 6 16:09:32 2005 +++ sys/kern/tty.c Sat Sep 23 13:16:51 2006 _at__at_ -3101,6 +3101,12 _at__at_ struct tty *tp; tp = dev->si_tty; + + /* XXX It can happen that devfs_open() calls us with tp->t_refcnt == 0 */ + if (tp == NULL || tp->t_refcnt == 0) { + return (ENXIO); + } + s = spltty(); /* * We jump to this label after all non-interrupted sleeps to pick --- sys/kern/tty_pty.c Thu Mar 30 16:46:56 2006 +++ sys/kern/tty_pty.c Sat Sep 23 13:17:36 2006 _at__at_ -170,6 +170,12 _at__at_ return(ENXIO); pt = dev->si_drv1; tp = dev->si_tty; + + /* XXX It can happen that devfs_open() calls us with tp->t_refcnt == 0 */ + if (tp == NULL || tp->t_refcnt == 0) { + return (ENXIO); + } + if ((tp->t_state & TS_ISOPEN) == 0) { ttyinitmode(tp, 1, 0); } else if (tp->t_state & TS_XCLUDE && suser(td)) _at__at_ -276,6 +282,12 _at__at_ if (!dev->si_drv1) return(ENXIO); tp = dev->si_tty; + + /* XXX It can happen that devfs_open() calls us with tp->t_refcnt == 0 */ + if (tp == NULL || tp->t_refcnt == 0) { + return (ENXIO); + } + if (tp->t_oproc) return (EIO); tp->t_timeout = -1; With this patch the system recovers fine: # ttyrel(): tty refcnt is now 1 (ttyp6) ttyref(): tty refcnt is now 2 (ttyp6) ttyref(): tty refcnt is now 2 (ttyp1) ttyrel(): tty refcnt is now 0 (ttyp3) ttyrel(): tty refcnt is now 0 (ttyp7) ttyrel(): tty refcnt is now 0 (ttyp8) ttyrel(): tty refcnt is now 0 (ttyp9) ttyref(): tty refcnt is now 1 (ttyp9) [...] Martin Blapp, <mb_at_imp.ch> <mbr_at_FreeBSD.org> ------------------------------------------------------------------ ImproWare AG, UNIXSP & ISP, Zurlindenstrasse 29, 4133 Pratteln, CH Phone: +41 61 826 93 00 Fax: +41 61 826 93 01 PGP: <finger -l mbr_at_freebsd.org> PGP Fingerprint: B434 53FC C87C FE7B 0A18 B84C 8686 EF22 D300 551E ------------------------------------------------------------------Received on Sat Sep 23 2006 - 09:42:34 UTC
This archive was generated by hypermail 2.4.0 : Wed May 19 2021 - 11:39:00 UTC