Re: panic possibly on on bridge member removal

From: John Baldwin <jhb_at_freebsd.org>
Date: Mon, 1 Oct 2012 14:29:31 -0400
On Monday, October 01, 2012 11:05:30 am Kim Culhan wrote:
> On Mon, Oct 1, 2012 at 9:49 AM, John Baldwin <jhb_at_freebsd.org> wrote:
> > On Monday, October 01, 2012 9:08:30 am Kim Culhan wrote:
> >> On Mon, Oct 1, 2012 at 8:04 AM, John Baldwin <jhb_at_freebsd.org> wrote:
> >> > On Saturday, September 29, 2012 8:40:03 am Kim Culhan wrote:
> >> >> After a few hours of operation involving tap0 added to the bridge
> >> >> running openvpn
> >> >> and shutting down openvpn which removes tap0 from the bridge, the
> >> >> machine is found to have a panic:
> >> >>
> >> >> Fatal trap 12: page fault while in kernel mode
> >> >> cpuid = 1; apic id = 01
> >> >> fault virtual address   = 0x188
> >> >> fault code              = supervisor read data, page not present
> >> >> instruction pointer     = 0x20:0xffffffff82a14f96
> >> >> stack pointer           = 0x28:0xffffff8000285670
> >> >> frame pointer           = 0x28:0xffffff80002856b0
> >> >> code segment            = base 0x0, limit 0xfffff, type 0x1b
> >> >>                         = DPL 0, pres 1, long 1, def32 0, gran 1
> >> >> processor eflags        = interrupt enabled, resume, IOPL = 0
> >> >> current process         = 12 (swi5: fast taskq)
> >> >> [ thread pid 12 tid 100022 ]
> >> >> Stopped at      bridge_enqueue+0x86:    calll   *0x188(%r12)
> >> >> db> bt
> >> >> Tracing pid 12 tid 100022 td 0xfffffe0003aff000
> >> >> bridge_enqueue() at bridge_enqueue+0x86
> >> >
> >> > Can you run 'gdb /boot/kernel/kernel' and do 'l *bridge_enqueue+0x86'?
> >> >
> >> > --
> >> > John Baldwin
> >>
> >> (gdb) l *bridge_enqueue+0x86
> >> No symbol "bridge_enqueue" in current context.
> >> (gdb)
> >
> > Oh, are you using if_bridge.ko as a module?  If so, you can try running 'gdb
> > /boot/kernel/if_bridge.ko' instead.
> 
> (gdb) l *bridge_enqueue+0x86
> 0x2f96 is in bridge_enqueue
> (/usr/src/sys/modules/if_bridge/../../net/if_bridge.c:1810).
> 1805                                    continue;
> 1806                            }
> 1807                            m->m_flags &= ~M_VLANTAG;
> 1808                    }
> 1809
> 1810                    if ((err = dst_ifp->if_transmit(dst_ifp, m))) {
> 1811                            m_freem(m0);
> 1812                            break;
> 1813                    }
> 1814            }
> (gdb)

Hmm, try this perhaps.  I think there is a race where the departing bridge
member can be destroyed before the bridge code is finishing using it.  The
way the bridge driver is supposed to prevent that is by ensuring that
bridge_ifdetach() doesn't return until all other references to the detaching
interface are known to be released in the bridge code.  I think what is
happening for you is that a bridge_enqueue() performed outside the lock is
running into dst_ifp that has been if_free()'d and had its if_transmit set
to zero when it was reallocated for something else.

Index: if_bridge.c
===================================================================
--- if_bridge.c	(revision 241096)
+++ if_bridge.c	(working copy)
_at__at_ -1833,6 +1833,7 _at__at_ static void
 bridge_dummynet(struct mbuf *m, struct ifnet *ifp)
 {
 	struct bridge_softc *sc;
+	int error = 0;
 
 	sc = ifp->if_bridge;
 
_at__at_ -1846,18 +1847,30 _at__at_ bridge_dummynet(struct mbuf *m, struct ifnet *ifp)
 		return;
 	}
 
+	BRIDGE_LOCK(sc);
+	BRIDGE_LOCK2REF(sc, error);
+	if (error) {
+		m_freem(m);
+		return;
+	}
+
 	if (PFIL_HOOKED(&V_inet_pfil_hook)
 #ifdef INET6
 	    || PFIL_HOOKED(&V_inet6_pfil_hook)
 #endif
 	    ) {
-		if (bridge_pfil(&m, sc->sc_ifp, ifp, PFIL_OUT) != 0)
+		if (bridge_pfil(&m, sc->sc_ifp, ifp, PFIL_OUT) != 0) {
+			BRIDGE_UNREF(sc);
 			return;
-		if (m == NULL)
+		}
+		if (m == NULL) {
+			BRIDGE_UNREF(sc);
 			return;
+		}
 	}
 
 	bridge_enqueue(sc, ifp, m);
+	BRIDGE_UNREF(sc);
 }
 
 /*
_at__at_ -1971,8 +1984,13 _at__at_ sendunicast:
 		return (0);
 	}
 
-	BRIDGE_UNLOCK(sc);
+	BRIDGE_LOCK2REF(sc, error);
+	if (error) {
+		m_freem(m);
+		return (0);
+	}
 	bridge_enqueue(sc, dst_if, m);
+	BRIDGE_UNREF(sc);
 	return (0);
 }
 
_at__at_ -2000,8 +2018,13 _at__at_ bridge_transmit(struct ifnet *ifp, struct mbuf *m)
 
 		eh = mtod(m, struct ether_header *);
 		dst_if = bridge_rtlookup(sc, eh->ether_dhost, 1);
-		BRIDGE_UNLOCK(sc);
-		error = bridge_enqueue(sc, dst_if, m);
+		BRIDGE_LOCK2REF(sc, error);
+		if (error)
+			m_freem(m);
+		else {
+			error = bridge_enqueue(sc, dst_if, m);
+			BRIDGE_UNREF(sc);
+		}
 	} else
 		bridge_broadcast(sc, ifp, m, 0);
 
_at__at_ -2145,20 +2168,29 _at__at_ bridge_forward(struct bridge_softc *sc, struct bri
 	    dbif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING)
 		goto drop;
 
-	BRIDGE_UNLOCK(sc);
+	BRIDGE_LOCK2REF(sc, error);
+	if (error) {
+		m_freem(m);
+		return;
+	}
 
 	if (PFIL_HOOKED(&V_inet_pfil_hook)
 #ifdef INET6
 	    || PFIL_HOOKED(&V_inet6_pfil_hook)
 #endif
 	    ) {
-		if (bridge_pfil(&m, ifp, dst_if, PFIL_OUT) != 0)
+		if (bridge_pfil(&m, ifp, dst_if, PFIL_OUT) != 0) {
+			BRIDGE_UNREF(sc);
 			return;
-		if (m == NULL)
+		}
+		if (m == NULL) {
+			BRIDGE_UNREF(sc);
 			return;
+		}
 	}
 
 	bridge_enqueue(sc, dst_if, m);
+	BRIDGE_UNREF(sc);
 	return;
 
 drop:

-- 
John Baldwin
Received on Mon Oct 01 2012 - 16:38:08 UTC

This archive was generated by hypermail 2.4.0 : Wed May 19 2021 - 11:40:31 UTC