=================================================================== RCS file: /usr/local/www/cvsroot/FreeBSD/src/sys/dev/em/if_em.c,v retrieving revision 1.65.2.10 retrieving revision 1.98 diff -u -p -r1.65.2.10 -r1.98 --- sys/dev/em/if_em.c 2006/01/10 10:09:03 1.65.2.10 +++ sys/dev/em/if_em.c 2006/01/11 00:30:25 1.98 @@ -31,7 +31,7 @@ POSSIBILITY OF SUCH DAMAGE. ***************************************************************************/ -/*$FreeBSD: src/sys/dev/em/if_em.c,v 1.65.2.10 2006/01/10 10:09:03 glebius Exp $*/ +/*$FreeBSD: /usr/local/www/cvsroot/FreeBSD/src/sys/dev/em/if_em.c,v 1.98 2006/01/11 00:30:25 scottl Exp $*/ #ifdef HAVE_KERNEL_OPTION_HEADERS #include "opt_device_polling.h" @@ -141,6 +141,9 @@ static int em_shutdown(device_t); static int em_suspend(device_t); static int em_resume(device_t); static void em_intr(void *); +#ifndef NO_EM_FASTINTR +static void em_intr_fast(void *); +#endif static void em_start(struct ifnet *); static void em_start_locked(struct ifnet *ifp); static int em_ioctl(struct ifnet *, u_long, caddr_t); @@ -168,7 +171,10 @@ static void em_update_stats_counters(str static void em_clean_transmit_interrupts(struct adapter *); static int em_allocate_receive_structures(struct adapter *); static int em_allocate_transmit_structures(struct adapter *); -static void em_process_receive_interrupts(struct adapter *, int); +static int em_process_receive_interrupts(struct adapter *, int); +#ifndef __NO_STRICT_ALIGNMENT +static int em_fixup_rx(struct adapter *); +#endif static void em_receive_checksum(struct adapter *, struct em_rx_desc *, struct mbuf *); @@ -206,6 +212,12 @@ static int em_sysctl_int_delay(SYSCTL_H static void em_add_int_delay_sysctl(struct adapter *, const char *, const char *, struct em_int_delay_info *, int, int); +#ifndef NO_EM_FASTINTR +static void em_add_int_process_limit(struct adapter *, const char *, + const char *, int *, int); +static void em_handle_rxtx(void *context, int pending); +static void em_handle_link(void 
*context, int pending); +#endif #ifdef DEVICE_POLLING static poll_handler_t em_poll; #endif @@ -254,6 +266,10 @@ TUNABLE_INT("hw.em.tx_abs_int_delay", &e TUNABLE_INT("hw.em.rx_abs_int_delay", &em_rx_abs_int_delay_dflt); TUNABLE_INT("hw.em.rxd", &em_rxd); TUNABLE_INT("hw.em.txd", &em_txd); +#ifndef NO_EM_FASTINTR +static int em_rx_process_limit = 100; +TUNABLE_INT("hw.em.rx_process_limit", &em_rx_process_limit); +#endif /********************************************************************* * Device identification routine @@ -376,6 +392,13 @@ em_attach(device_t dev) em_tx_abs_int_delay_dflt); } + /* Sysctls for limiting the amount of work done in the taskqueue */ +#ifndef NO_EM_FASTINTR + em_add_int_process_limit(adapter, "rx_processing_limit", + "max number of rx packets to process", &adapter->rx_process_limit, + em_rx_process_limit); +#endif + /* * Validate number of transmit and receive descriptors. It * must not exceed hardware maximum, and must be multiple @@ -430,7 +453,6 @@ em_attach(device_t dev) */ adapter->hw.report_tx_early = 1; - if (em_allocate_pci_resources(adapter)) { printf("em%d: Allocation of PCI resources failed\n", adapter->unit); @@ -560,6 +582,17 @@ em_detach(device_t dev) ether_poll_deregister(ifp); #endif + if (adapter->res_interrupt != NULL) { + bus_teardown_intr(dev, adapter->res_interrupt, + adapter->int_handler_tag); + bus_release_resource(dev, SYS_RES_IRQ, 0, + adapter->res_interrupt); + adapter->res_interrupt = NULL; + if (adapter->tq != NULL) { + taskqueue_drain(adapter->tq, &adapter->rxtx_task); + taskqueue_drain(taskqueue_fast, &adapter->link_task); + } + } EM_LOCK(adapter); adapter->in_detach = 1; em_stop(adapter); @@ -723,16 +756,6 @@ em_ioctl(struct ifnet *ifp, u_long comma break; case SIOCSIFMTU: { -#ifndef __NO_STRICT_ALIGNMENT - if (ifr->ifr_mtu > ETHERMTU) - /* - * XXX - * Due to the limitation of DMA engine, it needs fix-up - * code for strict alignment architectures. Disable - * jumbo frame until we have better solutions. 
- */ - error = EINVAL; -#else int max_frame_size; IOCTL_DEBUGOUT("ioctl rcv'd: SIOCSIFMTU (Set Interface MTU)"); @@ -761,7 +784,6 @@ em_ioctl(struct ifnet *ifp, u_long comma ifp->if_mtu + ETHER_HDR_LEN + ETHER_CRC_LEN; em_init_locked(adapter); EM_UNLOCK(adapter); -#endif break; } case SIOCSIFFLAGS: @@ -1060,11 +1082,113 @@ em_poll(struct ifnet *ifp, enum poll_cmd } #endif /* DEVICE_POLLING */ +#ifndef NO_EM_FASTINTR +static void +em_handle_link(void *context, int pending) +{ + struct adapter *adapter = context; + struct ifnet *ifp; + + ifp = adapter->ifp; + + EM_LOCK(adapter); + + callout_stop(&adapter->timer); + adapter->hw.get_link_status = 1; + em_check_for_link(&adapter->hw); + em_print_link_status(adapter); + callout_reset(&adapter->timer, hz, em_local_timer, + adapter); + EM_UNLOCK(adapter); +} + +static void +em_handle_rxtx(void *context, int pending) +{ + struct adapter *adapter = context; + struct ifnet *ifp; + + ifp = adapter->ifp; + + /* + * TODO: + * It should be possible to run the tx clean loop without the lock. + */ + if (ifp->if_drv_flags & IFF_DRV_RUNNING) { + if (em_process_receive_interrupts(adapter, + adapter->rx_process_limit) != 0) + taskqueue_enqueue(adapter->tq, &adapter->rxtx_task); + EM_LOCK(adapter); + em_clean_transmit_interrupts(adapter); + + if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) + em_start_locked(ifp); + EM_UNLOCK(adapter); + } + + em_enable_intr(adapter); + return; +} +#endif + /********************************************************************* * * Interrupt Service routine * **********************************************************************/ +#ifndef NO_EM_FASTINTR +static void +em_intr_fast(void *arg) +{ + struct adapter *adapter = arg; + struct ifnet *ifp; + uint32_t reg_icr; + + ifp = adapter->ifp; + +#ifdef DEVICE_POLLING + if (ifp->if_capenable & IFCAP_POLLING) { + return; + } +#endif /* DEVICE_POLLING */ + + reg_icr = E1000_READ_REG(&adapter->hw, ICR); + + /* Hot eject? 
*/ + if (reg_icr == 0xffffffff) + return; + + /* Definitely not our interrupt. */ + if (reg_icr == 0x0) + return; + + /* + * Starting with the 82571 chip, bit 31 should be used to + * determine whether the interrupt belongs to us. + */ + if (adapter->hw.mac_type >= em_82571 && + (reg_icr & E1000_ICR_INT_ASSERTED) == 0) + return; + + /* + * Mask interrupts until the taskqueue is finished running. This is + * cheap, just assume that it is needed. This also works around the + * MSI message reordering errata on certain systems. + */ + em_disable_intr(adapter); + taskqueue_enqueue(adapter->tq, &adapter->rxtx_task); + + /* Link status change */ + if (reg_icr & (E1000_ICR_RXSEQ | E1000_ICR_LSC)) + taskqueue_enqueue(taskqueue_fast, &adapter->link_task); + + if (reg_icr & E1000_ICR_RXO) { + adapter->rx_overruns++; + } + return; +} +#endif + static void em_intr(void *arg) { @@ -1865,7 +1989,7 @@ em_allocate_pci_resources(struct adapter if (adapter->hw.mac_type > em_82543) { /* Figure our where our IO BAR is ? */ - for (rid = PCIR_BAR(0); rid < PCIR_CARDBUSCIS;) { + for (rid = PCIR_BAR(0); rid < PCIR_CIS;) { val = pci_read_config(dev, rid, 4); if (E1000_BAR_TYPE(val) == E1000_BAR_TYPE_IO) { adapter->io_rid = rid; @@ -1876,7 +2000,7 @@ em_allocate_pci_resources(struct adapter if (E1000_BAR_MEM_TYPE(val) == E1000_BAR_MEM_TYPE_64BIT) rid += 4; } - if (rid >= PCIR_CARDBUSCIS) { + if (rid >= PCIR_CIS) { printf("em%d: Unable to locate IO BAR\n", adapter->unit); return (ENXIO); } @@ -1905,13 +2029,40 @@ em_allocate_pci_resources(struct adapter adapter->unit); return(ENXIO); } + + /* + * Try allocating a fast interrupt and the associated deferred + * processing contexts. If that doesn't work, try just using an + * ithread. 
+ */ +#ifndef NO_EM_FASTINTR if (bus_setup_intr(dev, adapter->res_interrupt, - INTR_TYPE_NET | INTR_MPSAFE, - (void (*)(void *)) em_intr, adapter, - &adapter->int_handler_tag)) { - printf("em%d: Error registering interrupt handler!\n", - adapter->unit); - return(ENXIO); + INTR_TYPE_NET | INTR_FAST, em_intr_fast, adapter, + &adapter->int_handler_tag) == 0) { + + /* Init the deferred processing contexts. */ + TASK_INIT(&adapter->rxtx_task, 0, em_handle_rxtx, adapter); + TASK_INIT(&adapter->link_task, 0, em_handle_link, adapter); + adapter->tq = taskqueue_create_fast("em_taskq", M_NOWAIT, + taskqueue_thread_enqueue, + &adapter->tq, &adapter->tqproc); + kthread_create(taskqueue_thread_loop, + &adapter->tq, &adapter->tqproc, + 0, 0, "%s taskq", device_get_nameunit(adapter->dev)); + mtx_lock_spin(&sched_lock); + sched_prio(FIRST_THREAD_IN_PROC(adapter->tqproc), PI_NET); + mtx_unlock_spin(&sched_lock); + } +#endif + if (adapter->int_handler_tag == NULL) { + if (bus_setup_intr(dev, adapter->res_interrupt, + INTR_TYPE_NET | INTR_MPSAFE, + em_intr, adapter, + &adapter->int_handler_tag)) { + printf("em%d: Error registering interrupt handler!\n", + adapter->unit); + return(ENXIO); + } } adapter->hw.back = &adapter->osdep; @@ -1924,6 +2075,9 @@ em_free_pci_resources(struct adapter * a { device_t dev = adapter->dev; + if (adapter->tq != NULL) { + taskqueue_free(adapter->tq); + } if (adapter->res_interrupt != NULL) { bus_teardown_intr(dev, adapter->res_interrupt, adapter->int_handler_tag); @@ -2897,7 +3051,7 @@ em_free_receive_structures(struct adapte * count < 0. * *********************************************************************/ -static void +static int em_process_receive_interrupts(struct adapter * adapter, int count) { struct ifnet *ifp; @@ -2910,8 +3064,6 @@ em_process_receive_interrupts(struct ada /* Pointer to the receive descriptor being examined. 
*/ struct em_rx_desc *current_desc; - mtx_assert(&adapter->mtx, MA_OWNED); - ifp = adapter->ifp; i = adapter->next_rx_desc_to_check; current_desc = &adapter->rx_desc_base[i]; @@ -2919,7 +3071,7 @@ em_process_receive_interrupts(struct ada BUS_DMASYNC_POSTREAD); if (!((current_desc->status) & E1000_RXD_STAT_DD)) { - return; + return (0); } while ((current_desc->status & E1000_RXD_STAT_DD) && @@ -3014,12 +3166,19 @@ em_process_receive_interrupts(struct ada ifp->if_ipackets++; em_receive_checksum(adapter, current_desc, adapter->fmp); +#ifndef __NO_STRICT_ALIGNMENT + if (ifp->if_mtu > ETHERMTU && + em_fixup_rx(adapter) != 0) + goto skip; + +#endif if (current_desc->status & E1000_RXD_STAT_VP) VLAN_INPUT_TAG(ifp, adapter->fmp, (le16toh(current_desc->special) & - E1000_RXD_SPC_VLAN_MASK), - adapter->fmp = NULL); - + E1000_RXD_SPC_VLAN_MASK)); +#ifndef __NO_STRICT_ALIGNMENT +skip: +#endif m = adapter->fmp; adapter->fmp = NULL; adapter->lmp = NULL; @@ -3038,24 +3197,75 @@ em_process_receive_interrupts(struct ada bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); - /* Advance the E1000's Receive Queue #0 "Tail Pointer". */ - E1000_WRITE_REG(&adapter->hw, RDT, i); - /* Advance our pointers to the next descriptor */ if (++i == adapter->num_rx_desc) i = 0; if (m != NULL) { adapter->next_rx_desc_to_check = i; - EM_UNLOCK(adapter); (*ifp->if_input)(ifp, m); - EM_LOCK(adapter); i = adapter->next_rx_desc_to_check; } current_desc = &adapter->rx_desc_base[i]; } adapter->next_rx_desc_to_check = i; - return; + + /* Advance the E1000's Receive Queue #0 "Tail Pointer". */ + if (--i < 0) i = adapter->num_rx_desc - 1; + E1000_WRITE_REG(&adapter->hw, RDT, i); + + if (!((current_desc->status) & E1000_RXD_STAT_DD)) { + return (0); + } + return (1); +} + +#ifndef __NO_STRICT_ALIGNMENT +/* + * When jumbo frames are enabled we should realign entire payload on + * architectures with strict alignment.
This is a serious design mistake of 8254x + * as it nullifies DMA operations. 8254x just allows RX buffer size to be + * 2048/4096/8192/16384. What we really want is 2048 - ETHER_ALIGN to align its + * payload. On architectures without strict alignment restrictions 8254x still + * performs unaligned memory access which would reduce the performance too. + * To avoid copying over an entire frame to align, we allocate a new mbuf and + * copy ethernet header to the new mbuf. The new mbuf is prepended into the + * existing mbuf chain. + * + * Be aware, best performance of the 8254x is achieved only when jumbo frame is + * not used at all on architectures with strict alignment. + */ +static int +em_fixup_rx(struct adapter *adapter) +{ + struct mbuf *m, *n; + int error; + + error = 0; + m = adapter->fmp; + if (m->m_len <= (MCLBYTES - ETHER_HDR_LEN)) { + bcopy(m->m_data, m->m_data + ETHER_HDR_LEN, m->m_len); + m->m_data += ETHER_HDR_LEN; + } else { + MGETHDR(n, M_DONTWAIT, MT_DATA); + if (n != NULL) { + bcopy(m->m_data, n->m_data, ETHER_HDR_LEN); + m->m_data += ETHER_HDR_LEN; + m->m_len -= ETHER_HDR_LEN; + n->m_len = ETHER_HDR_LEN; + M_MOVE_PKTHDR(n, m); + n->m_next = m; + adapter->fmp = n; + } else { + adapter->dropped_pkts++; + m_freem(adapter->fmp); + adapter->fmp = NULL; + error = ENOMEM; + } + } + + return (error); } +#endif /********************************************************************* * @@ -3563,3 +3773,15 @@ em_add_int_delay_sysctl(struct adapter * OID_AUTO, name, CTLTYPE_INT|CTLFLAG_RW, info, 0, em_sysctl_int_delay, "I", description); } + +#ifndef NO_EM_FASTINTR +static void +em_add_int_process_limit(struct adapter *adapter, const char *name, + const char *description, int *limit, int value) +{ + *limit = value; + SYSCTL_ADD_INT(device_get_sysctl_ctx(adapter->dev), + SYSCTL_CHILDREN(device_get_sysctl_tree(adapter->dev)), + OID_AUTO, name, CTLTYPE_INT|CTLFLAG_RW, limit, value, description); +} +#endif