Re: mlx5(4) jumbo receive

From: Konstantin Belousov <kostikbel_at_gmail.com>
Date: Tue, 24 Apr 2018 16:59:59 +0300
On Tue, Apr 24, 2018 at 06:44:24AM -0700, Rodney W. Grimes wrote:
> > Hello,
> > the patch below is of interest for people who use Mellanox Connect-X/4
> > and 5 ethernet adapters and configure it for jumbo frames.  The patch
> > should improve the system behavior on low or fragmented memory situations.
> > See the commit message for detailed description.  Also, the patch should
> > not cause performance degradation for the normal 1500 MTU by keeping the
> > same configuration as currently unpatched driver.
> > 
> > I am interested in hearing about regressions (or not) caused by the
> > change.
> 
> I'll hopefully get  to test this early next week, but my schedule
> is very full until then.
> 
> I do have a fist full of sytle(9) and other comments though.
> 
> You can probably ignore my nits on capitalize start of comments,
> it seems that this may be the style of this code....
> 
> > commit f25c0c624177a1ca06ca051b0adb842acb66ec11
> > Author: Konstantin Belousov <konstantinb_at_mellanox.com>
> > Date:   Wed Nov 22 20:14:53 2017 +0200
> > 
> >     mlx5en: Handle jumbo frames without requiring big clusters.
> >     
> >     The scatter list is formed by the chunks of MCLBYTES each, and larger
> >     than default packets are returned to the stack as the mbuf chain.
> >     
> >     This is an improvements over the original patch by hanss_at_mellanox.com
> >     which solves some busdma issues and sizes WQEs scatter list to only
> >     have the minimal sufficient number of elements.  In particular, for
> >     the default MTU 1500 bytes the receive queue format is not changed
> >     comparing to the unpatched driver, avoiding a reported performance
> >     regression.
> >     
> >     Issue: 1065432
> >     Issue: 1184316
> >     Change-Id: Ib63a5ef55abd6ef1ec9b296e2cef88e4604bbd29
> >     Signed-off-by: Konstantin Belousov <konstantinb_at_mellanox.com>
> > 
> > diff --git a/sys/dev/mlx5/mlx5_en/en.h b/sys/dev/mlx5/mlx5_en/en.h
> > index b5000c32eaf..ff5ef446e34 100644
> > --- a/sys/dev/mlx5/mlx5_en/en.h
> > +++ b/sys/dev/mlx5/mlx5_en/en.h
> > _at__at_ -80,8 +80,19 _at__at_
> >  #define	MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE                0xa
> >  #define	MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE                0xe
> >  
> > -/* freeBSD HW LRO is limited by 16KB - the size of max mbuf */
> Though it is tempting to fix spelling errors and stylish things
> while working on code it usually helps to sepeate those into
> either a pre or post commit to reduce the lines of code change
> that need to be looked at when reviewing a diff.  This also helps
> diff(1) to chunk things a bit better.
> 
> > +#define	MLX5E_MAX_RX_SEGS 7
> > +
> > +#ifndef MLX5E_MAX_RX_BYTES
> > +#define	MLX5E_MAX_RX_BYTES MCLBYTES
> > +#endif
> > +
> > +#if (MLX5E_MAX_RX_SEGS == 1)
> > +/* FreeBSD HW LRO is limited by 16KB - the size of max mbuf */
> >  #define	MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ                 MJUM16BYTES
> > +#else
> > +#define	MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ \
> > +    MIN(65535, MLX5E_MAX_RX_SEGS * MLX5E_MAX_RX_BYTES)
> > +#endif
> >  #define	MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC      0x10
> >  #define	MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC_FROM_CQE	0x3
> >  #define	MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_PKTS      0x20
> > _at__at_ -530,6 +541,7 _at__at_ struct mlx5e_rq {
> >  	struct mtx mtx;
> >  	bus_dma_tag_t dma_tag;
> >  	u32	wqe_sz;
> > +	u32	nsegs;
> 
> Alphabetic order, and should just use modify the current u32 line:
> 	u32	nsegs, wqe_sz;
> 
> >  	struct mlx5e_rq_mbuf *mbuf;
> >  	struct ifnet *ifp;
> >  	struct mlx5e_rq_stats stats;
> > _at__at_ -795,9 +807,12 _at__at_ struct mlx5e_tx_wqe {
> >  
> >  struct mlx5e_rx_wqe {
> >  	struct mlx5_wqe_srq_next_seg next;
> > -	struct mlx5_wqe_data_seg data;
> > +	struct mlx5_wqe_data_seg data[];
> 
> None standard way to declare a pointer?
This is not a pointer declaration.  It is a flexible array member.

For the style issues, I will take look later.
> 
> >  };
> >  
> > +/* the size of the structure above must be power of two */
> the should be The.
> 
> > +CTASSERT(powerof2(sizeof(struct mlx5e_rx_wqe)));
> > +
> >  struct mlx5e_eeprom {
> >  	int	lock_bit;
> >  	int	i2c_addr;
> > diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_main.c b/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
> > index c2c4b0d7744..25544e561e3 100644
> > --- a/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
> > +++ b/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
> > _at__at_ -33,9 +33,12 _at__at_
> >  #ifndef ETH_DRIVER_VERSION
> >  #define	ETH_DRIVER_VERSION	"3.4.1"
> >  #endif
> > +
> Adding white space is a style change, and actually unwanted right here
> as the next line is tightly coupled to the previous line.
> 
> >  char mlx5e_version[] = "Mellanox Ethernet driver"
> >      " (" ETH_DRIVER_VERSION ")";
> >  
> > +static int mlx5e_get_wqe_sz(struct mlx5e_priv *priv, u32 *wqe_sz, u32 *nsegs);
> > +
> 
> Could you move the function before use and eliminate the need
> for the forward declare?
> 
> >  struct mlx5e_channel_param {
> >  	struct mlx5e_rq_param rq;
> >  	struct mlx5e_sq_param sq;
> > _at__at_ -810,6 +813,11 _at__at_ mlx5e_create_rq(struct mlx5e_channel *c,
> >  	int wq_sz;
> >  	int err;
> >  	int i;
> > +	u32 nsegs, wqe_sz;
> > +
> > +	err = mlx5e_get_wqe_sz(priv, &wqe_sz, &nsegs);
> > +	if (err != 0)
> > +		goto done;
> >  
> >  	/* Create DMA descriptor TAG */
> >  	if ((err = -bus_dma_tag_create(
> > _at__at_ -819,9 +827,9 _at__at_ mlx5e_create_rq(struct mlx5e_channel *c,
> >  	    BUS_SPACE_MAXADDR,		/* lowaddr */
> >  	    BUS_SPACE_MAXADDR,		/* highaddr */
> >  	    NULL, NULL,			/* filter, filterarg */
> > -	    MJUM16BYTES,		/* maxsize */
> > -	    1,				/* nsegments */
> > -	    MJUM16BYTES,		/* maxsegsize */
> > +	    nsegs * MLX5E_MAX_RX_BYTES,	/* maxsize */
> > +	    nsegs,			/* nsegments */
> > +	    nsegs * MLX5E_MAX_RX_BYTES,	/* maxsegsize */
> >  	    0,				/* flags */
> >  	    NULL, NULL,			/* lockfunc, lockfuncarg */
> >  	    &rq->dma_tag)))
> > _at__at_ -834,23 +842,9 _at__at_ mlx5e_create_rq(struct mlx5e_channel *c,
> >  
> >  	rq->wq.db = &rq->wq.db[MLX5_RCV_DBR];
> >  
> > -	if (priv->params.hw_lro_en) {
> > -		rq->wqe_sz = priv->params.lro_wqe_sz;
> > -	} else {
> > -		rq->wqe_sz = MLX5E_SW2MB_MTU(priv->ifp->if_mtu);
> > -	}
> > -	if (rq->wqe_sz > MJUM16BYTES) {
> > -		err = -ENOMEM;
> > +	err = mlx5e_get_wqe_sz(priv, &rq->wqe_sz, &rq->nsegs);
> > +	if (err != 0)
> >  		goto err_rq_wq_destroy;
> > -	} else if (rq->wqe_sz > MJUM9BYTES) {
> > -		rq->wqe_sz = MJUM16BYTES;
> > -	} else if (rq->wqe_sz > MJUMPAGESIZE) {
> > -		rq->wqe_sz = MJUM9BYTES;
> > -	} else if (rq->wqe_sz > MCLBYTES) {
> > -		rq->wqe_sz = MJUMPAGESIZE;
> > -	} else {
> > -		rq->wqe_sz = MCLBYTES;
> > -	}
> >  
> >  	wq_sz = mlx5_wq_ll_get_size(&rq->wq);
> >  
> > _at__at_ -861,7 +855,11 _at__at_ mlx5e_create_rq(struct mlx5e_channel *c,
> >  	rq->mbuf = malloc(wq_sz * sizeof(rq->mbuf[0]), M_MLX5EN, M_WAITOK | M_ZERO);
> >  	for (i = 0; i != wq_sz; i++) {
> >  		struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(&rq->wq, i);
> > +#if (MLX5E_MAX_RX_SEGS == 1)
> >  		uint32_t byte_count = rq->wqe_sz - MLX5E_NET_IP_ALIGN;
> > +#else
> > +		int j;
> > +#endif
> >  
> >  		err = -bus_dmamap_create(rq->dma_tag, 0, &rq->mbuf[i].dma_map);
> >  		if (err != 0) {
> > _at__at_ -869,8 +867,15 _at__at_ mlx5e_create_rq(struct mlx5e_channel *c,
> >  				bus_dmamap_destroy(rq->dma_tag, rq->mbuf[i].dma_map);
> >  			goto err_rq_mbuf_free;
> >  		}
> > -		wqe->data.lkey = c->mkey_be;
> > -		wqe->data.byte_count = cpu_to_be32(byte_count | MLX5_HW_START_PADDING);
> > +
> > +		/* set value for constant fields */
> > +#if (MLX5E_MAX_RX_SEGS == 1)
> > +		wqe->data[0].lkey = c->mkey_be;
> > +		wqe->data[0].byte_count = cpu_to_be32(byte_count | MLX5_HW_START_PADDING);
> > +#else
> > +		for (j = 0; j < rq->nsegs; j++)
> > +			wqe->data[j].lkey = c->mkey_be;
> > +#endif
> >  	}
> >  
> >  	rq->ifp = c->ifp;
> > _at__at_ -1809,16 +1814,51 _at__at_ mlx5e_close_channel_wait(struct mlx5e_channel *volatile *pp)
> >  	free(c, M_MLX5EN);
> >  }
> >  
> > +static int
> > +mlx5e_get_wqe_sz(struct mlx5e_priv *priv, u32 *wqe_sz, u32 *nsegs)
> > +{
> > +	u32 r, n;
> Alpah order on variables:
> 	u32 n, r;
> 
> > +
> > +	r = priv->params.hw_lro_en ? priv->params.lro_wqe_sz :
> > +	    MLX5E_SW2MB_MTU(priv->ifp->if_mtu);
> > +	if (r > MJUM16BYTES)
> > +		return (-ENOMEM);
> > +
> > +	if (r > MJUM9BYTES)
> > +		r = MJUM16BYTES;
> > +	else if (r > MJUMPAGESIZE)
> > +		r = MJUM9BYTES;
> > +	else if (r > MCLBYTES)
> > +		r = MJUMPAGESIZE;
> > +	else
> > +		r = MCLBYTES;
> > +
> > +	/*
> > +	 * n + 1 must be a power of two, because stride size must be.
> > +	 * Stride size is 16 * (n + 1), as the first segment is
> > +	 * control.
> > +	 */
> > +	for (n = howmany(r, MLX5E_MAX_RX_BYTES); !powerof2(n + 1); n++)
> > +		;
> > +
> > +	*wqe_sz = r;
> > +	*nsegs = n;
> > +	return (0);
> > +}
> > +
> >  static void
> >  mlx5e_build_rq_param(struct mlx5e_priv *priv,
> >      struct mlx5e_rq_param *param)
> >  {
> >  	void *rqc = param->rqc;
> >  	void *wq = MLX5_ADDR_OF(rqc, rqc, wq);
> > +	u32 wqe_sz, nsegs;
> Variable order: nsegs, wqe_sz;
> 
> >  
> > +	mlx5e_get_wqe_sz(priv, &wqe_sz, &nsegs);
> >  	MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_LINKED_LIST);
> >  	MLX5_SET(wq, wq, end_padding_mode, MLX5_WQ_END_PAD_MODE_ALIGN);
> > -	MLX5_SET(wq, wq, log_wq_stride, ilog2(sizeof(struct mlx5e_rx_wqe)));
> > +	MLX5_SET(wq, wq, log_wq_stride, ilog2(sizeof(struct mlx5e_rx_wqe) +
> > +	    nsegs * sizeof(struct mlx5_wqe_data_seg)));
> >  	MLX5_SET(wq, wq, log_wq_sz, priv->params.log_rq_size);
> >  	MLX5_SET(wq, wq, pd, priv->pdn);
> >  
> > diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c b/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c
> > index fb14be43b32..5868b0a2f55 100644
> > --- a/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c
> > +++ b/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c
> > _at__at_ -32,21 +32,47 _at__at_ static inline int
> >  mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq,
> >      struct mlx5e_rx_wqe *wqe, u16 ix)
> >  {
> > -	bus_dma_segment_t segs[1];
> > +	bus_dma_segment_t segs[rq->nsegs];
> This code line compiles?   Or another odd pointer declare?
> 
> >  	struct mbuf *mb;
> >  	int nsegs;
> >  	int err;
> > -
> > +#if (MLX5E_MAX_RX_SEGS != 1)
> > +	struct mbuf *mb_head;
> > +	int i;
> > +#endif
> >  	if (rq->mbuf[ix].mbuf != NULL)
> >  		return (0);
> >  
> > +#if (MLX5E_MAX_RX_SEGS == 1)
> >  	mb = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rq->wqe_sz);
> >  	if (unlikely(!mb))
> >  		return (-ENOMEM);
> >  
> > -	/* set initial mbuf length */
> >  	mb->m_pkthdr.len = mb->m_len = rq->wqe_sz;
> > +#else
> > +	mb_head = mb = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
> > +	    MLX5E_MAX_RX_BYTES);
> > +	if (unlikely(mb == NULL))
> > +		return (-ENOMEM);
> > +
> > +	mb->m_len = MLX5E_MAX_RX_BYTES;
> > +	mb->m_pkthdr.len = MLX5E_MAX_RX_BYTES;
> >  
> > +	for (i = 1; i < rq->nsegs; i++) {
> > +		if (mb_head->m_pkthdr.len >= rq->wqe_sz)
> > +			break;
> > +		mb = mb->m_next = m_getjcl(M_NOWAIT, MT_DATA, 0,
> > +		    MLX5E_MAX_RX_BYTES);
> > +		if (unlikely(mb == NULL)) {
> > +			m_freem(mb_head);
> > +			return (-ENOMEM);
> > +		}
> > +		mb->m_len = MLX5E_MAX_RX_BYTES;
> > +		mb_head->m_pkthdr.len += MLX5E_MAX_RX_BYTES;
> > +	}
> > +	/* rewind to first mbuf in chain */
> > +	mb = mb_head;
> > +#endif
> >  	/* get IP header aligned */
> >  	m_adj(mb, MLX5E_NET_IP_ALIGN);
> >  
> > _at__at_ -54,12 +80,26 _at__at_ mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq,
> >  	    mb, segs, &nsegs, BUS_DMA_NOWAIT);
> >  	if (err != 0)
> >  		goto err_free_mbuf;
> > -	if (unlikely(nsegs != 1)) {
> > +	if (unlikely(nsegs == 0)) {
> >  		bus_dmamap_unload(rq->dma_tag, rq->mbuf[ix].dma_map);
> >  		err = -ENOMEM;
> >  		goto err_free_mbuf;
> >  	}
> > -	wqe->data.addr = cpu_to_be64(segs[0].ds_addr);
> > +#if (MLX5E_MAX_RX_SEGS == 1)
> > +	wqe->data[0].addr = cpu_to_be64(segs[0].ds_addr);
> > +#else
> > +	wqe->data[0].addr = cpu_to_be64(segs[0].ds_addr);
> > +	wqe->data[0].byte_count = cpu_to_be32(segs[0].ds_len |
> > +	    MLX5_HW_START_PADDING);
> > +	for (i = 1; i != nsegs; i++) {
> > +		wqe->data[i].addr = cpu_to_be64(segs[i].ds_addr);
> > +		wqe->data[i].byte_count = cpu_to_be32(segs[i].ds_len);
> > +	}
> > +	for (; i < rq->nsegs; i++) {
> > +		wqe->data[i].addr = 0;
> > +		wqe->data[i].byte_count = 0;
> > +	}
> > +#endif
> >  
> >  	rq->mbuf[ix].mbuf = mb;
> >  	rq->mbuf[ix].data = mb->m_data;
> > _at__at_ -214,6 +254,9 _at__at_ mlx5e_build_rx_mbuf(struct mlx5_cqe64 *cqe,
> >  {
> >  	struct ifnet *ifp = rq->ifp;
> >  	struct mlx5e_channel *c;
> > +#if (MLX5E_MAX_RX_SEGS != 1)
> > +	struct mbuf *mb_head;
> > +#endif
> >  	int lro_num_seg;	/* HW LRO session aggregated packets counter */
> >  	uint64_t tstmp;
> >  
> > _at__at_ -224,7 +267,26 _at__at_ mlx5e_build_rx_mbuf(struct mlx5_cqe64 *cqe,
> >  		rq->stats.lro_bytes += cqe_bcnt;
> >  	}
> >  
> > +#if (MLX5E_MAX_RX_SEGS == 1)
> >  	mb->m_pkthdr.len = mb->m_len = cqe_bcnt;
> > +#else
> > +	mb->m_pkthdr.len = cqe_bcnt;
> > +	for (mb_head = mb; mb != NULL; mb = mb->m_next) {
> > +		if (mb->m_len > cqe_bcnt)
> > +			mb->m_len = cqe_bcnt;
> > +		cqe_bcnt -= mb->m_len;
> > +		if (likely(cqe_bcnt == 0)) {
> > +			if (likely(mb->m_next != NULL)) {
> > +				/* trim off empty mbufs */
> Trim
> 
> > +				m_freem(mb->m_next);
> > +				mb->m_next = NULL;
> > +			}
> > +			break;
> > +		}
> > +	}
> > +	/* rewind to first mbuf in chain */
> Rewind
> 
> > +	mb = mb_head;
> > +#endif
> >  	/* check if a Toeplitz hash was computed */
> >  	if (cqe->rss_hash_type != 0) {
> >  		mb->m_pkthdr.flowid = be32_to_cpu(cqe->rss_hash_result);
> > _at__at_ -447,7 +509,10 _at__at_ mlx5e_rx_cq_comp(struct mlx5_core_cq *mcq)
> >  	int i = 0;
> >  
> >  #ifdef HAVE_PER_CQ_EVENT_PACKET
> > -	struct mbuf *mb = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rq->wqe_sz);
> > +#if (MHLEN < 15)
> > +#error "MHLEN is too small"
> > +#endif
> > +	struct mbuf *mb = m_gethdr(M_NOWAIT, MT_DATA);
> >  
> >  	if (mb != NULL) {
> >  		/* this code is used for debugging purpose only */
> > _______________________________________________
> > freebsd-net_at_freebsd.org mailing list
> > https://lists.freebsd.org/mailman/listinfo/freebsd-net
> > To unsubscribe, send any mail to "freebsd-net-unsubscribe_at_freebsd.org"
> > 
> 
> -- 
> Rod Grimes                                                 rgrimes_at_freebsd.org
Received on Tue Apr 24 2018 - 12:00:19 UTC

This archive was generated by hypermail 2.4.0 : Wed May 19 2021 - 11:41:15 UTC