Re: Instant panic while trying run ports-mgmt/poudriere

From: Konstantin Belousov <kostikbel_at_gmail.com>
Date: Sun, 23 Aug 2015 15:54:42 +0300
On Sun, Aug 23, 2015 at 12:08:16PM +0300, Konstantin Belousov wrote:
> On Sun, Aug 23, 2015 at 09:54:28AM +0300, Andriy Gapon wrote:
> > On 12/08/2015 17:11, Lawrence Stewart wrote:
> > > On 08/07/15 07:33, Pawel Pekala wrote:
> > >> Hi K.,
> > >>
> > >> On 2015-08-06 12:33 -0700, "K. Macy" <kmacy_at_freebsd.org> wrote:
> > >>> Is this still happening?
> > >>
> > >> Still crashes:
> > > 
> > > +1 for me running r286617
> > 
> > Here is another +1 with r286922.
> > I can add a couple of bits of debugging data:
> > 
> > (kgdb) fr 8
> > #8  0xffffffff80639d60 in knote (list=0xfffff8019a733ea0,
> > hint=2147483648, lockflags=<value optimized out>) at
> > /usr/src/sys/kern/kern_event.c:1964
> > 1964                    } else if ((lockflags & KNF_NOKQLOCK) != 0) {
> > (kgdb) p *list
> > $2 = {kl_list = {slh_first = 0x0}, kl_lock = 0xffffffff8063a1e0
> > <knlist_mtx_lock>, kl_unlock = 0xffffffff8063a200 <knlist_mtx_unlock>,
> >   kl_assert_locked = 0xffffffff8063a220 <knlist_mtx_assert_locked>,
> > kl_assert_unlocked = 0xffffffff8063a240 <knlist_mtx_assert_unlocked>,
> >   kl_lockarg = 0xfffff8019a733bb0}
> > (kgdb) disassemble
> > Dump of assembler code for function knote:
> > 0xffffffff80639d00 <knote+0>:   push   %rbp
> > 0xffffffff80639d01 <knote+1>:   mov    %rsp,%rbp
> > 0xffffffff80639d04 <knote+4>:   push   %r15
> > 0xffffffff80639d06 <knote+6>:   push   %r14
> > 0xffffffff80639d08 <knote+8>:   push   %r13
> > 0xffffffff80639d0a <knote+10>:  push   %r12
> > 0xffffffff80639d0c <knote+12>:  push   %rbx
> > 0xffffffff80639d0d <knote+13>:  sub    $0x18,%rsp
> > 0xffffffff80639d11 <knote+17>:  mov    %edx,%r12d
> > 0xffffffff80639d14 <knote+20>:  mov    %rsi,-0x30(%rbp)
> > 0xffffffff80639d18 <knote+24>:  mov    %rdi,%rbx
> > 0xffffffff80639d1b <knote+27>:  test   %rbx,%rbx
> > 0xffffffff80639d1e <knote+30>:  je     0xffffffff80639ef6 <knote+502>
> > 0xffffffff80639d24 <knote+36>:  mov    %r12d,%eax
> > 0xffffffff80639d27 <knote+39>:  and    $0x1,%eax
> > 0xffffffff80639d2a <knote+42>:  mov    %eax,-0x3c(%rbp)
> > 0xffffffff80639d2d <knote+45>:  mov    0x28(%rbx),%rdi
> > 0xffffffff80639d31 <knote+49>:  je     0xffffffff80639d38 <knote+56>
> > 0xffffffff80639d33 <knote+51>:  callq  *0x18(%rbx)
> > 0xffffffff80639d36 <knote+54>:  jmp    0xffffffff80639d42 <knote+66>
> > 0xffffffff80639d38 <knote+56>:  callq  *0x20(%rbx)
> > 0xffffffff80639d3b <knote+59>:  mov    0x28(%rbx),%rdi
> > 0xffffffff80639d3f <knote+63>:  callq  *0x8(%rbx)
> > 0xffffffff80639d42 <knote+66>:  mov    %rbx,-0x38(%rbp)
> > 0xffffffff80639d46 <knote+70>:  mov    (%rbx),%rbx
> > 0xffffffff80639d49 <knote+73>:  test   %rbx,%rbx
> > 0xffffffff80639d4c <knote+76>:  je     0xffffffff80639ee5 <knote+485>
> > 0xffffffff80639d52 <knote+82>:  and    $0x2,%r12d
> > 0xffffffff80639d56 <knote+86>:  nopw   %cs:0x0(%rax,%rax,1)
> > 0xffffffff80639d60 <knote+96>:  mov    0x28(%rbx),%r14
> > 
> > Panic is in the last quoted instruction.
> > And:
> > (kgdb) i reg
> > rax            0x246    582
> > rbx            0xdeadc0dedeadc0de       -2401050962867404578
> > rcx            0x0      0
> > rdx            0x12e    302
> > rsi            0xffffffff80a26a5a       -2136839590
> > rdi            0xffffffff80e81b80       -2132272256
> > rbp            0xfffffe02b7efea20       0xfffffe02b7efea20
> > rsp            0xfffffe02b7efe9e0       0xfffffe02b7efe9e0
> > r8             0xffffffff80a269ce       -2136839730
> > r9             0xffffffff80e82838       -2132269000
> > r10            0x10000  65536
> > r11            0xffffffff80fabd10       -2131051248
> > r12            0x0      0
> > r13            0xfffff801ff84a818       -8787511171048
> > r14            0xfffff801ff84a800       -8787511171072
> > r15            0xfffff8019a6974f0       -8789207452432
> > rip            0xffffffff80639d60       0xffffffff80639d60 <knote+96>
> > eflags         0x10286  66182
> > 
> > I think that $rbx stands out here (this is a kernel with INVARIANTS).
> > 
> > Looking at the code, is it possible that one of the calls from within
> > the loop's body modifies the list?  If that is so and provided that is a
> > valid behavior, then maybe using SLIST_FOREACH_SAFE would help.
> 
> This is first time a useful debugging data was posted.
> 
> The 0x28 offset may indicate either kn_kq member access of the struct
> knote, or kq_list of the struct kqueue.
> 
> kl_list.slh_first of the list parameter is NULL, how would a list
> iteration loop even start ?  Can you look up the list argument value
> from the previous frame (%rdi is overwritten, so debugger might be
> confused) ?

After looking at your data closely, I think you are right.  The panic
occurs when the exit1(9) does KNOTE_LOCKED(NOTE_EXIT).  This is the
only case in the tree where filter uses knlist_remove_inevent() to detach
processed note, so indeed the slist is modified under the iterator.

Below is the patch with the suggested change and unrelated cleanup of
the uma(9) KPI use.  Please test, everybody who has a panic with the
backtrace pointing to the sys_exit().

diff --git a/sys/kern/kern_event.c b/sys/kern/kern_event.c
index a4388aa..2f15f7f 100644
--- a/sys/kern/kern_event.c
+++ b/sys/kern/kern_event.c
_at__at_ -1106,7 +1106,12 _at__at_ kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int wa
 		return EINVAL;
 
 	if (kev->flags & EV_ADD)
-		tkn = knote_alloc(waitok);	/* prevent waiting with locks */
+		/*
+		 * Prevent waiting with locks.  Non-sleepable
+		 * allocation failures are handled in the loop, only
+		 * if the spare knote appears to be actually required.
+		 */
+		tkn = knote_alloc(waitok);
 	else
 		tkn = NULL;
 
_at__at_ -1310,8 +1315,7 _at__at_ done:
 		FILEDESC_XUNLOCK(td->td_proc->p_fd);
 	if (fp != NULL)
 		fdrop(fp, td);
-	if (tkn != NULL)
-		knote_free(tkn);
+	knote_free(tkn);
 	if (fops != NULL)
 		kqueue_fo_release(filt);
 	return (error);
_at__at_ -1507,10 +1511,6 _at__at_ kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops,
 	} else
 		asbt = 0;
 	marker = knote_alloc(1);
-	if (marker == NULL) {
-		error = ENOMEM;
-		goto done_nl;
-	}
 	marker->kn_status = KN_MARKER;
 	KQ_LOCK(kq);
 
_at__at_ -1929,7 +1929,7 _at__at_ void
 knote(struct knlist *list, long hint, int lockflags)
 {
 	struct kqueue *kq;
-	struct knote *kn;
+	struct knote *kn, *tkn;
 	int error;
 
 	if (list == NULL)
_at__at_ -1948,7 +1948,7 _at__at_ knote(struct knlist *list, long hint, int lockflags)
 	 * only safe if you want to remove the current item, which we are
 	 * not doing.
 	 */
-	SLIST_FOREACH(kn, &list->kl_list, kn_selnext) {
+	SLIST_FOREACH_SAFE(kn, &list->kl_list, kn_selnext, tkn) {
 		kq = kn->kn_kq;
 		KQ_LOCK(kq);
 		if ((kn->kn_status & (KN_INFLUX | KN_SCAN)) == KN_INFLUX) {
_at__at_ -2385,15 +2385,16 _at__at_ SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL);
 static struct knote *
 knote_alloc(int waitok)
 {
-	return ((struct knote *)uma_zalloc(knote_zone,
-	    (waitok ? M_WAITOK : M_NOWAIT)|M_ZERO));
+
+	return (uma_zalloc(knote_zone, (waitok ? M_WAITOK : M_NOWAIT) |
+	    M_ZERO));
 }
 
 static void
 knote_free(struct knote *kn)
 {
-	if (kn != NULL)
-		uma_zfree(knote_zone, kn);
+
+	uma_zfree(knote_zone, kn);
 }
 
 /*
Received on Sun Aug 23 2015 - 10:54:49 UTC

This archive was generated by hypermail 2.4.0 : Wed May 19 2021 - 11:40:59 UTC