Re: [panic] Race in IEEE802.11 layer towards device drivers

From: PseudoCylon <moonlightakkiy_at_yahoo.ca>
Date: Sun, 18 Jul 2010 21:21:25 -0700 (PDT)
[NB] Obviously, I didn't click "reply ALL" last time, so here are missing part

> > > >  -if(vap->iv_opmode == IEEE80211_M_HOSTAP){
> > > > 
> > > > -RUN_LOCK(sc);
> > > > +if  (vap->iv_opmode == IEEE80211_M_HOSTAP)
> > > > 
> > > >  sc->cmdq_key_set =  RUN_CMDQ_GO;
> > > > 
> > > > -RUN_UNLOCK(sc);
> > > > -}
> > > > 
> > > > 
> > > > Why  are you removing these locks?
> > 
> > It is simple assignment, it must be atomic.
> 
> Not necessarily. If you don't put lock statements around it or use the 
> volatile keyword, the compiler can re-organize the execution order. I will 
> have a look at it.
> 
> > 
> > > Another question:
> > >      i = RUN_CMDQ_GET(&sc->cmdq_store);
> > >      DPRINTF("cmdq_store=%d\n", i);
> > >    
> > >    sc->cmdq[i].func =  run_update_beacon_cb;
> > >    sc->cmdq[i].arg0 =  vap;
> > > 
> > > Why is this code and similar places not enclosed with  mutexes?
> > 
> > First, I couldn't use a lock in key_delete() because of LoR. So, I use
> > atomic instead. RUN_CMDQ_GET is atomic_fetch_add(). Whatever executes that
> > code gets unique place (cmdq[i]) to write, so there shouldn't be any race.
> > 
> > Then out of order execution happened. Specially, when key_set() overtakes
> > key_delete(), encryption fails. So, all deferred processes are called back
> > via run_cmdq_cb() to maintain the order. Because cmdq functions are first
> > written for key_delete() where lock causes LoR. So, lock isn't needed.
> > 
> > run_cmdq_cb() uses lock. But it is for calling callback functions locked.
> > So that, functions just call another function locked, like
> > ##_callback()
> > {
> >        LOCK();
> >        ##_locked();
> >        UNLOCK();
> > }
> > won't be needed.
> 
> If the run_cmdq_cb() is running at the same time which you are queuing 
> elements, then I note that you set .func before .arg0. The ##_callback() code 
> only checks if .func has been set. Actually the "i" increment should be after 
> that you filled out the data, and then you see that you cannot use atomic. I 
> think the most simple solution is to add another mutex, sc->sc_cmdq_mtx, 
which 
> protects the queue and it's associated data. Also, what do you do if the 
queue 
> wraps around? You should have a mechanism to prevent that, because you then 
> might start executing commands in random order?
> 

Here is a patch (patch against P4 if_run.c rev 14 and if_runvar.h rev 8)

-- begin patch --


diff --git a/dev/usb/wlan/if_run.c b/dev/usb/wlan/if_run.c
index 8c96534..c988ad4 100644
--- a/dev/usb/wlan/if_run.c
+++ b/dev/usb/wlan/if_run.c
_at__at_ -90,12 +90,6 _at__at_ SYSCTL_INT(_hw_usb_run, OID_AUTO, debug, CTLFLAG_RW, 
&run_debug, 0,
 #define IEEE80211_HAS_ADDR4(wh) \
 (((wh)->i_fc[1] & IEEE80211_FC1_DIR_MASK) == IEEE80211_FC1_DIR_DSTODS)
 
-/*
- * Because of LOR in run_key_delete(), use atomic instead.
- * '& RUN_CMDQ_MASQ' is to loop cmdq[].
- */
-#define RUN_CMDQ_GET(c)(atomic_fetchadd_32((c), 1) & RUN_CMDQ_MASQ)
-
 static const struct usb_device_id run_devs[] = {
     { USB_VP(USB_VENDOR_ABOCOM,USB_PRODUCT_ABOCOM_RT2770) },
     { USB_VP(USB_VENDOR_ABOCOM,USB_PRODUCT_ABOCOM_RT2870) },
_at__at_ -554,6 +548,8 _at__at_ run_attach(device_t self)
 
 mtx_init(&sc->sc_mtx, device_get_nameunit(sc->sc_dev),
     MTX_NETWORK_LOCK, MTX_DEF);
+mtx_init(&sc->sc_cmdq_mtx, device_get_nameunit(sc->sc_dev),
+    MTX_NETWORK_LOCK, MTX_DEF);
 
 iface_index = RT2860_IFACE_INDEX;
 
_at__at_ -737,6 +733,7 _at__at_ run_detach(device_t self)
 }
 
 mtx_destroy(&sc->sc_mtx);
+mtx_destroy(&sc->sc_cmdq_mtx);
 
 return (0);
 }
_at__at_ -830,9 +827,6 _at__at_ run_vap_create(struct ieee80211com *ic,
 if(sc->rvp_cnt++ == 0)
 ic->ic_opmode = opmode;
 
-if(opmode == IEEE80211_M_HOSTAP)
-sc->cmdq_run = RUN_CMDQ_GO;
-
 DPRINTF("rvp_id=%d bmap=%x rvp_cnt=%d\n",
     rvp->rvp_id, sc->rvp_bmap, sc->rvp_cnt);
 
_at__at_ -889,27 +883,31 _at__at_ run_cmdq_cb(void *arg, int pending)
 struct run_softc *sc = arg;
 uint8_t i;
 
-/* call cmdq[].func locked */
-RUN_LOCK(sc);
-for(i = sc->cmdq_exec; sc->cmdq[i].func && pending;
-    i = sc->cmdq_exec, pending--){
+RUN_CMDQ_LOCK(sc);
+for (i = sc->cmdq_exec; sc->cmdq[i].func; i = sc->cmdq_exec) {
 DPRINTFN(6, "cmdq_exec=%d pending=%d\n", i, pending);
-if(sc->cmdq_run == RUN_CMDQ_GO){
+if (sc->cmdq_run == RUN_CMDQ_GO ||
+    (sc->cmdq_key_set == RUN_CMDQ_GO &&
+    sc->cmdq[i].func == run_key_set_cb)) {
+RUN_CMDQ_UNLOCK(sc);
+RUN_LOCK(sc);
 /*
  * If arg0 is NULL, callback func needs more
  * than one arg. So, pass ptr to cmdq struct.
  */
-if(sc->cmdq[i].arg0)
+if (sc->cmdq[i].arg0)
 sc->cmdq[i].func(sc->cmdq[i].arg0);
 else
 sc->cmdq[i].func(&sc->cmdq[i]);
+RUN_UNLOCK(sc);
+RUN_CMDQ_LOCK(sc);
 }
 sc->cmdq[i].arg0 = NULL;
 sc->cmdq[i].func = NULL;
 sc->cmdq_exec++;
 sc->cmdq_exec &= RUN_CMDQ_MASQ;
 }
-RUN_UNLOCK(sc);
+RUN_CMDQ_UNLOCK(sc);
 }
 
 static void
_at__at_ -1771,6 +1769,19 _at__at_ run_newstate(struct ieee80211vap *vap, enum 
ieee80211_state nstate, int arg)
 case IEEE80211_S_INIT:
 restart_ratectl = 1;
 
+/*
+ * When hostapd has set a key, don't clear it.
+ * But, when the device is being brought down, clear it.
+ */
+if (sc->cmdq_key_set != RUN_CMDQ_GO ||
+    ostate == IEEE80211_S_RUN) {
+/* clear shared key table */
+run_set_region_4(sc,
+    RT2860_SKEY(rvp->rvp_id, 0), 0, 4 * 32);
+/* clear shared key mode */
+run_set_region_4(sc, RT2860_SKEY_MODE_0_7, 0, 4);
+}
+
 if (ostate != IEEE80211_S_RUN)
 break;
 
_at__at_ -1922,12 +1933,24 _at__at_ run_wme_update(struct ieee80211com *ic)
 struct run_softc *sc = ic->ic_ifp->if_softc;
 
 /* sometime called wothout lock */
-if(mtx_owned(&ic->ic_comlock.mtx)){
-uint32_t i = RUN_CMDQ_GET(&sc->cmdq_store);
+if (mtx_owned(&ic->ic_comlock.mtx)) {
+RUN_CMDQ_LOCK(sc);
+uint8_t i = sc->cmdq_store;
 DPRINTF("cmdq_store=%d\n", i);
+
+if (sc->cmdq[i].func != NULL) {
+DPRINTF("cmdq is full\n");
+RUN_CMDQ_UNLOCK(sc);
+return (-1);
+}
+
 sc->cmdq[i].func = run_wme_update_cb;
 sc->cmdq[i].arg0 = ic;
+sc->cmdq_store++;
+sc->cmdq_store &= RUN_CMDQ_MASQ;
 ieee80211_runtask(ic, &sc->cmdq_task);
+
+RUN_CMDQ_UNLOCK(sc);
 return (0);
 }
 
_at__at_ -2085,28 +2108,38 _at__at_ run_key_set(struct ieee80211vap *vap, struct 
ieee80211_key *k,
 {
 struct ieee80211com *ic = vap->iv_ic;
 struct run_softc *sc = ic->ic_ifp->if_softc;
-uint32_t i;
+uint8_t i;
 
-i = RUN_CMDQ_GET(&sc->cmdq_store);
+RUN_CMDQ_LOCK(sc);
+
+i = sc->cmdq_store;
 DPRINTF("cmdq_store=%d\n", i);
+
+if (sc->cmdq[i].func != NULL) {
+DPRINTF("cmdq is full\n");
+RUN_CMDQ_UNLOCK(sc);
+return (0);
+}
+
 sc->cmdq[i].func = run_key_set_cb;
 sc->cmdq[i].arg0 = NULL;
 sc->cmdq[i].arg1 = vap;
 sc->cmdq[i].k = k;
 IEEE80211_ADDR_COPY(sc->cmdq[i].mac, mac);
+sc->cmdq_store++;
+sc->cmdq_store &= RUN_CMDQ_MASQ;
 ieee80211_runtask(ic, &sc->cmdq_task);
 
 /*
  * To make sure key will be set when hostapd
  * calls iv_key_set() before if_init().
  */
-if(vap->iv_opmode == IEEE80211_M_HOSTAP){
-RUN_LOCK(sc);
+if (vap->iv_opmode == IEEE80211_M_HOSTAP)
 sc->cmdq_key_set = RUN_CMDQ_GO;
-RUN_UNLOCK(sc);
-}
 
-return(1);
+RUN_CMDQ_UNLOCK(sc);
+
+return (1);
 }
 
 /*
_at__at_ -2154,16 +2187,23 _at__at_ run_key_delete(struct ieee80211vap *vap, struct 
ieee80211_key *k)
 struct ieee80211com *ic = vap->iv_ic;
 struct run_softc *sc = ic->ic_ifp->if_softc;
 struct ieee80211_key *k0;
-uint32_t i;
+uint8_t i;
 
 /*
  * When called back, key might be gone. So, make a copy
  * of some values need to delete keys before deferring.
- * But, because of LOR with node lock, cannot use lock here.
- * So, use atomic instead.
  */
-i = RUN_CMDQ_GET(&sc->cmdq_store);
+RUN_CMDQ_LOCK(sc);
+
+i = sc->cmdq_store;
 DPRINTF("cmdq_store=%d\n", i);
+
+if (sc->cmdq[i].func != NULL) {
+DPRINTF("cmdq is full\n");
+RUN_CMDQ_UNLOCK(sc);
+return (0);
+}
+
 sc->cmdq[i].func = run_key_delete_cb;
 sc->cmdq[i].arg0 = NULL;
 sc->cmdq[i].arg1 = sc;
_at__at_ -2172,9 +2212,13 _at__at_ run_key_delete(struct ieee80211vap *vap, struct 
ieee80211_key *k)
 k0->wk_keyix = k->wk_keyix;
 /* matching wcid was written to wk_pad in run_key_set() */
 k0->wk_pad = k->wk_pad;
+sc->cmdq_store++;
+sc->cmdq_store &= RUN_CMDQ_MASQ;
 ieee80211_runtask(ic, &sc->cmdq_task);
-return (1);/* return fake success */
 
+RUN_CMDQ_UNLOCK(sc);
+
+return (1);
 }
 
 static void
_at__at_ -2362,19 +2406,31 _at__at_ run_newassoc(struct ieee80211_node *ni, int isnew)
 }
 
 /* only interested in true associations */
-if (isnew && ni->ni_associd != 0){
-
+if (isnew && ni->ni_associd != 0) {
 /*
- * This function could is called though timeout function.
+ * This function could be called though timeout function.
  * Need to defer.
  */
-uint32_t cnt = RUN_CMDQ_GET(&sc->cmdq_store);
+RUN_CMDQ_LOCK(sc);
+
+uint8_t cnt = sc->cmdq_store;
 DPRINTF("cmdq_store=%d\n", cnt);
+
+if (sc->cmdq[cnt].func != NULL) {
+DPRINTF("cmdq is full\n");
+RUN_CMDQ_UNLOCK(sc);
+return;
+}
+
 sc->cmdq[cnt].func = run_newassoc_cb;
 sc->cmdq[cnt].arg0 = NULL;
 sc->cmdq[cnt].arg1 = ni;
 sc->cmdq[cnt].wcid = wcid;
+sc->cmdq_store++;
+sc->cmdq_store &= RUN_CMDQ_MASQ;
 ieee80211_runtask(ic, &sc->cmdq_task);
+
+RUN_CMDQ_UNLOCK(sc);
 }
 
 DPRINTF("new assoc isnew=%d associd=%x addr=%s\n",
_at__at_ -2805,11 +2861,20 _at__at_ tr_setup:
 if (error != USB_ERR_CANCELLED) {
 if (error == USB_ERR_TIMEOUT) {
 device_printf(sc->sc_dev, "device timeout\n");
-uint32_t i = RUN_CMDQ_GET(&sc->cmdq_store);
+
+RUN_CMDQ_LOCK(sc);
+uint8_t i = sc->cmdq_store;
 DPRINTF("cmdq_store=%d\n", i);
-sc->cmdq[i].func = run_usb_timeout_cb;
-sc->cmdq[i].arg0 = vap;
-ieee80211_runtask(ic, &sc->cmdq_task);
+if (sc->cmdq[i].func != NULL)
+DPRINTF("cmdq is full\n");
+else {
+sc->cmdq[i].func = run_usb_timeout_cb;
+sc->cmdq[i].arg0 = vap;
+sc->cmdq_store++;
+sc->cmdq_store &= RUN_CMDQ_MASQ;
+ieee80211_runtask(ic, &sc->cmdq_task);
+}
+RUN_CMDQ_UNLOCK(sc);
 }
 
 /*
_at__at_ -3067,11 +3132,24 _at__at_ run_tx(struct run_softc *sc, struct mbuf *m, struct 
ieee80211_node *ni)
  * With multiple vaps or if_bridge, if_start() is called
  * with a non-sleepable lock, tcpinp. So, need to defer.
  */
-uint32_t i = RUN_CMDQ_GET(&sc->cmdq_store);
+RUN_CMDQ_LOCK(sc);
+uint8_t i = sc->cmdq_store;
 DPRINTFN(6, "cmdq_store=%d\n", i);
-sc->cmdq[i].func = run_drain_fifo;
-sc->cmdq[i].arg0 = sc;
-ieee80211_runtask(ic, &sc->cmdq_task);
+if (sc->cmdq[i].func != NULL) {
+DPRINTF("cmdq is full\n");
+/*
+ * Keep going.
+ * no need to stop Tx
+ * just because reading Tx stats failed
+ */
+} else {
+sc->cmdq[i].func = run_drain_fifo;
+sc->cmdq[i].arg0 = sc;
+sc->cmdq_store++;
+sc->cmdq_store &= RUN_CMDQ_MASQ;
+ieee80211_runtask(ic, &sc->cmdq_task);
+}
+RUN_CMDQ_UNLOCK(sc);
 }
 }
 
_at__at_ -3188,7 +3266,7 _at__at_ run_sendprot(struct run_softc *sc,
 ackrate = ieee80211_ack_rate(ic->ic_rt, rate);
 
 isshort = (ic->ic_flags & IEEE80211_F_SHPREAMBLE) != 0;
-dur = ieee80211_compute_duration(ic->ic_rt, pktlen, rate, isshort);
+dur = ieee80211_compute_duration(ic->ic_rt, pktlen, rate, isshort)
     + ieee80211_ack_duration(ic->ic_rt, rate, isshort);
 wflags = RT2860_TX_FRAG;
 
_at__at_ -3906,14 +3984,27 _at__at_ run_update_beacon(struct ieee80211vap *vap, int item)
 {
 struct ieee80211com *ic = vap->iv_ic;
 struct run_softc *sc = ic->ic_ifp->if_softc;
-uint32_t i;
+uint8_t i;
+
+RUN_CMDQ_LOCK(sc);
 
-i = RUN_CMDQ_GET(&sc->cmdq_store);
+i = sc->cmdq_store;
 DPRINTF("cmdq_store=%d\n", i);
+
+if (sc->cmdq[i].func != NULL) {
+DPRINTF("cmdq is full\n");
+RUN_CMDQ_UNLOCK(sc);
+return;
+}
+
 sc->cmdq[i].func = run_update_beacon_cb;
 sc->cmdq[i].arg0 = vap;
+sc->cmdq_store++;
+sc->cmdq_store &= RUN_CMDQ_MASQ;
 ieee80211_runtask(ic, &sc->cmdq_task);
 
+RUN_CMDQ_UNLOCK(sc);
+
 return;
 }
 
_at__at_ -4693,14 +4784,6 _at__at_ run_init_locked(struct run_softc *sc)
 /* clear WCID attribute table */
 run_set_region_4(sc, RT2860_WCID_ATTR(0), 0, 8 * 32);
 
-/* hostapd sets a key before init. So, don't clear it. */
-if(sc->cmdq_key_set != RUN_CMDQ_GO){
-/* clear shared key table */
-run_set_region_4(sc, RT2860_SKEY(0, 0), 0, 8 * 32);
-/* clear shared key mode */
-run_set_region_4(sc, RT2860_SKEY_MODE_0_7, 0, 4);
-}
-
 run_read(sc, RT2860_US_CYC_CNT, &tmp);
 tmp = (tmp & ~0xff) | 0x1e;
 run_write(sc, RT2860_US_CYC_CNT, tmp);
_at__at_ -4807,7 +4890,7 _at__at_ run_stop(void *arg)
 ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE);
 
 sc->ratectl_run = RUN_RATECTL_OFF;
-sc->cmdq_run = sc->cmdq_key_set;
+sc->cmdq_run = RUN_CMDQ_ABORT;
 
 RUN_UNLOCK(sc);
 
diff --git a/dev/usb/wlan/if_runvar.h b/dev/usb/wlan/if_runvar.h
index 39addbf..a6fddaa 100644
--- a/dev/usb/wlan/if_runvar.h
+++ b/dev/usb/wlan/if_runvar.h
_at__at_ -202,6 +202,7 _at__at_ struct run_softc {
 uint8_tsc_bssid[6];
 
 struct mtxsc_mtx;
+struct mtxsc_cmdq_mtx;
 
 struct run_endpoint_queuesc_epq[RUN_EP_QUEUES];
 
_at__at_ -210,12 +211,12 _at__at_ struct run_softc {
 uint8_tratectl_run;
 #define RUN_RATECTL_OFF0
 
-/* need to be power of 2, otherwise RUN_CMDQ_GET fails */
+/* need to be power of 2, otherwise run_cmdq_cb() fails */
 #define RUN_CMDQ_MAX16
 #define RUN_CMDQ_MASQ(RUN_CMDQ_MAX - 1)
 struct run_cmdqcmdq[RUN_CMDQ_MAX];
 struct taskcmdq_task;
-uint32_tcmdq_store;
+uint8_tcmdq_store;
 uint8_tcmdq_exec;
 uint8_tcmdq_run;
 uint8_tcmdq_key_set;
_at__at_ -255,4 +256,7 _at__at_ struct run_softc {
 #define RUN_UNLOCK(sc)mtx_unlock(&(sc)->sc_mtx)
 #define RUN_LOCK_ASSERT(sc, t)mtx_assert(&(sc)->sc_mtx, t)
 
+#define RUN_CMDQ_LOCK(sc)mtx_lock(&(sc)->sc_cmdq_mtx)
+#define RUN_CMDQ_UNLOCK(sc)mtx_unlock(&(sc)->sc_cmdq_mtx)
+
 #endif/* _IF_RUNVAR_H_ */


-- end patch --
Received on Mon Jul 19 2010 - 02:21:27 UTC

This archive was generated by hypermail 2.4.0 : Wed May 19 2021 - 11:40:05 UTC