mlx5_rxtx.c revision 9ecc306d
1/*-
2 *   BSD LICENSE
3 *
4 *   Copyright 2015 6WIND S.A.
5 *   Copyright 2015 Mellanox.
6 *
7 *   Redistribution and use in source and binary forms, with or without
8 *   modification, are permitted provided that the following conditions
9 *   are met:
10 *
11 *     * Redistributions of source code must retain the above copyright
12 *       notice, this list of conditions and the following disclaimer.
13 *     * Redistributions in binary form must reproduce the above copyright
14 *       notice, this list of conditions and the following disclaimer in
15 *       the documentation and/or other materials provided with the
16 *       distribution.
17 *     * Neither the name of 6WIND S.A. nor the names of its
18 *       contributors may be used to endorse or promote products derived
19 *       from this software without specific prior written permission.
20 *
21 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
34#include <assert.h>
35#include <stdint.h>
36#include <string.h>
37#include <stdlib.h>
38
39/* Verbs header. */
40/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
41#ifdef PEDANTIC
42#pragma GCC diagnostic ignored "-Wpedantic"
43#endif
44#include <infiniband/verbs.h>
45#include <infiniband/mlx5_hw.h>
46#include <infiniband/arch.h>
47#ifdef PEDANTIC
48#pragma GCC diagnostic error "-Wpedantic"
49#endif
50
51/* DPDK headers don't like -pedantic. */
52#ifdef PEDANTIC
53#pragma GCC diagnostic ignored "-Wpedantic"
54#endif
55#include <rte_mbuf.h>
56#include <rte_mempool.h>
57#include <rte_prefetch.h>
58#include <rte_common.h>
59#include <rte_branch_prediction.h>
60#include <rte_ether.h>
61#ifdef PEDANTIC
62#pragma GCC diagnostic error "-Wpedantic"
63#endif
64
65#include "mlx5.h"
66#include "mlx5_utils.h"
67#include "mlx5_rxtx.h"
68#include "mlx5_autoconf.h"
69#include "mlx5_defs.h"
70#include "mlx5_prm.h"
71
72#ifndef NDEBUG
73
74/**
75 * Verify or set magic value in CQE.
76 *
77 * @param cqe
78 *   Pointer to CQE.
79 *
80 * @return
81 *   0 the first time the CQE is seen, 1 on subsequent calls.
82 */
83static inline int
84check_cqe64_seen(volatile struct mlx5_cqe64 *cqe)
85{
86	static const uint8_t magic[] = "seen";
87	volatile uint8_t (*buf)[sizeof(cqe->rsvd40)] = &cqe->rsvd40;
88	int ret = 1;
89	unsigned int i;
90
91	for (i = 0; i < sizeof(magic) && i < sizeof(*buf); ++i)
92		if (!ret || (*buf)[i] != magic[i]) {
93			ret = 0;
94			(*buf)[i] = magic[i];
95		}
96	return ret;
97}
98
99#endif /* NDEBUG */
100
101static inline int
102check_cqe64(volatile struct mlx5_cqe64 *cqe,
103	    unsigned int cqes_n, const uint16_t ci)
104	    __attribute__((always_inline));
105
106/**
107 * Check whether CQE is valid.
108 *
109 * @param cqe
110 *   Pointer to CQE.
111 * @param cqes_n
112 *   Size of completion queue.
113 * @param ci
114 *   Consumer index.
115 *
116 * @return
117 *   0 on success, 1 on failure.
118 */
119static inline int
120check_cqe64(volatile struct mlx5_cqe64 *cqe,
121		unsigned int cqes_n, const uint16_t ci)
122{
123	uint16_t idx = ci & cqes_n;
124	uint8_t op_own = cqe->op_own;
125	uint8_t op_owner = MLX5_CQE_OWNER(op_own);
126	uint8_t op_code = MLX5_CQE_OPCODE(op_own);
127
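	/*
	 * The expected ownership bit flips each time the consumer index wraps
	 * around the CQ; a CQE whose owner bit does not match that parity has
	 * not been written by hardware yet.
	 */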
128	if (unlikely((op_owner != (!!(idx))) || (op_code == MLX5_CQE_INVALID)))
129		return 1; /* No CQE. */
130#ifndef NDEBUG
131	if ((op_code == MLX5_CQE_RESP_ERR) ||
132	    (op_code == MLX5_CQE_REQ_ERR)) {
133		volatile struct mlx5_err_cqe *err_cqe = (volatile void *)cqe;
134		uint8_t syndrome = err_cqe->syndrome;
135
136		if ((syndrome == MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR) ||
137		    (syndrome == MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR))
138			return 0;
139		if (!check_cqe64_seen(cqe))
140			ERROR("unexpected CQE error %u (0x%02x)"
141			      " syndrome 0x%02x",
142			      op_code, op_code, syndrome);
143		return 1;
144	} else if ((op_code != MLX5_CQE_RESP_SEND) &&
145		   (op_code != MLX5_CQE_REQ)) {
146		if (!check_cqe64_seen(cqe))
147			ERROR("unexpected CQE opcode %u (0x%02x)",
148			      op_code, op_code);
149		return 1;
150	}
151#endif /* NDEBUG */
152	return 0;
153}
154
155/**
156 * Manage TX completions.
157 *
158 * When sending a burst, mlx5_tx_burst() posts several WRs.
159 *
160 * @param txq
161 *   Pointer to TX queue structure.
162 */
163static void
164txq_complete(struct txq *txq)
165{
166	const unsigned int elts_n = txq->elts_n;
167	const unsigned int cqe_n = txq->cqe_n;
168	const unsigned int cqe_cnt = cqe_n - 1;
169	uint16_t elts_free = txq->elts_tail;
170	uint16_t elts_tail;
171	uint16_t cq_ci = txq->cq_ci;
172	volatile struct mlx5_cqe64 *cqe = NULL;
173	volatile union mlx5_wqe *wqe;
174
175	do {
176		volatile struct mlx5_cqe64 *tmp;
177
178		tmp = &(*txq->cqes)[cq_ci & cqe_cnt].cqe64;
179		if (check_cqe64(tmp, cqe_n, cq_ci))
180			break;
181		cqe = tmp;
182#ifndef NDEBUG
183		if (MLX5_CQE_FORMAT(cqe->op_own) == MLX5_COMPRESSED) {
184			if (!check_cqe64_seen(cqe))
185				ERROR("unexpected compressed CQE, TX stopped");
186			return;
187		}
188		if ((MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_RESP_ERR) ||
189		    (MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_REQ_ERR)) {
190			if (!check_cqe64_seen(cqe))
191				ERROR("unexpected error CQE, TX stopped");
192			return;
193		}
194#endif /* NDEBUG */
195		++cq_ci;
196	} while (1);
197	if (unlikely(cqe == NULL))
198		return;
199	wqe = &(*txq->wqes)[htons(cqe->wqe_counter) & (txq->wqe_n - 1)];
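	/*
	 * ctrl.data[3] holds the elts_head value saved by the burst function
	 * when it requested this completion ("Save elts_head" below), i.e.
	 * how far the element ring can be released.
	 */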
200	elts_tail = wqe->wqe.ctrl.data[3];
201	assert(elts_tail < txq->wqe_n);
202	/* Free buffers. */
203	while (elts_free != elts_tail) {
204		struct rte_mbuf *elt = (*txq->elts)[elts_free];
205		unsigned int elts_free_next =
206			(elts_free + 1) & (elts_n - 1);
207		struct rte_mbuf *elt_next = (*txq->elts)[elts_free_next];
208
209#ifndef NDEBUG
210		/* Poisoning. */
211		memset(&(*txq->elts)[elts_free],
212		       0x66,
213		       sizeof((*txq->elts)[elts_free]));
214#endif
215		RTE_MBUF_PREFETCH_TO_FREE(elt_next);
216		/* Only one segment needs to be freed. */
217		rte_pktmbuf_free_seg(elt);
218		elts_free = elts_free_next;
219	}
220	txq->cq_ci = cq_ci;
221	txq->elts_tail = elts_tail;
222	/* Update the consumer index. */
223	rte_wmb();
224	*txq->cq_db = htonl(cq_ci);
225}
226
227/**
228 * Get Memory Pool (MP) from mbuf. If mbuf is indirect, the pool from which
229 * the cloned mbuf is allocated is returned instead.
230 *
231 * @param buf
232 *   Pointer to mbuf.
233 *
234 * @return
235 *   Memory pool where data is located for given mbuf.
236 */
237static struct rte_mempool *
238txq_mb2mp(struct rte_mbuf *buf)
239{
240	if (unlikely(RTE_MBUF_INDIRECT(buf)))
241		return rte_mbuf_from_indirect(buf)->pool;
242	return buf->pool;
243}
244
245static inline uint32_t
246txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
247	__attribute__((always_inline));
248
249/**
250 * Get Memory Region (MR) <-> Memory Pool (MP) association from txq->mp2mr[].
251 * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
252 * remove an entry first.
253 *
254 * @param txq
255 *   Pointer to TX queue structure.
256 * @param[in] mp
257 *   Memory Pool for which a Memory Region lkey must be returned.
258 *
259 * @return
260 *   mr->lkey on success, (uint32_t)-1 on failure.
261 */
262static inline uint32_t
263txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
264{
265	unsigned int i;
266	uint32_t lkey = (uint32_t)-1;
267
268	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
269		if (unlikely(txq->mp2mr[i].mp == NULL)) {
270			/* Unknown MP, add a new MR for it. */
271			break;
272		}
273		if (txq->mp2mr[i].mp == mp) {
274			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
275			assert(htonl(txq->mp2mr[i].mr->lkey) ==
276			       txq->mp2mr[i].lkey);
277			lkey = txq->mp2mr[i].lkey;
278			break;
279		}
280	}
281	if (unlikely(lkey == (uint32_t)-1))
282		lkey = txq_mp2mr_reg(txq, mp, i);
283	return lkey;
284}
285
286/**
287 * Write a regular WQE.
288 *
289 * @param txq
290 *   Pointer to TX queue structure.
291 * @param wqe
292 *   Pointer to the WQE to fill.
293 * @param addr
294 *   Buffer data address.
295 * @param length
296 *   Packet length.
297 * @param lkey
298 *   Memory region lkey.
299 */
300static inline void
301mlx5_wqe_write(struct txq *txq, volatile union mlx5_wqe *wqe,
302	       uintptr_t addr, uint32_t length, uint32_t lkey)
303{
304	wqe->wqe.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
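	/*
	 * qp_num_8s is the QP number pre-shifted left by 8; the low 6 bits of
	 * this dword carry the WQE size (DS) in 16-byte units, 4 here for one
	 * complete 64-byte WQE (control + Ethernet + one data segment).
	 */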
305	wqe->wqe.ctrl.data[1] = htonl((txq->qp_num_8s) | 4);
306	wqe->wqe.ctrl.data[2] = 0;
307	wqe->wqe.ctrl.data[3] = 0;
308	wqe->inl.eseg.rsvd0 = 0;
309	wqe->inl.eseg.rsvd1 = 0;
310	wqe->inl.eseg.mss = 0;
311	wqe->inl.eseg.rsvd2 = 0;
312	wqe->wqe.eseg.inline_hdr_sz = htons(MLX5_ETH_INLINE_HEADER_SIZE);
313	/* Copy MLX5_ETH_INLINE_HEADER_SIZE bytes into the inline header. */
314	rte_memcpy((uint8_t *)(uintptr_t)wqe->wqe.eseg.inline_hdr_start,
315		   (uint8_t *)(uintptr_t)addr,
316		   MLX5_ETH_INLINE_HEADER_SIZE);
317	addr += MLX5_ETH_INLINE_HEADER_SIZE;
318	length -= MLX5_ETH_INLINE_HEADER_SIZE;
319	/* Store remaining data in data segment. */
320	wqe->wqe.dseg.byte_count = htonl(length);
321	wqe->wqe.dseg.lkey = lkey;
322	wqe->wqe.dseg.addr = htonll(addr);
323	/* Increment consumer index. */
324	++txq->wqe_ci;
325}
326
327/**
328 * Write a regular WQE with VLAN.
329 *
330 * @param txq
331 *   Pointer to TX queue structure.
332 * @param wqe
333 *   Pointer to the WQE to fill.
334 * @param addr
335 *   Buffer data address.
336 * @param length
337 *   Packet length.
338 * @param lkey
339 *   Memory region lkey.
340 * @param vlan_tci
341 *   VLAN field to insert in packet.
342 */
343static inline void
344mlx5_wqe_write_vlan(struct txq *txq, volatile union mlx5_wqe *wqe,
345		    uintptr_t addr, uint32_t length, uint32_t lkey,
346		    uint16_t vlan_tci)
347{
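	/* 802.1Q TPID (0x8100) in the upper 16 bits, followed by the VLAN TCI. */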
348	uint32_t vlan = htonl(0x81000000 | vlan_tci);
349
350	wqe->wqe.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
351	wqe->wqe.ctrl.data[1] = htonl((txq->qp_num_8s) | 4);
352	wqe->wqe.ctrl.data[2] = 0;
353	wqe->wqe.ctrl.data[3] = 0;
354	wqe->inl.eseg.rsvd0 = 0;
355	wqe->inl.eseg.rsvd1 = 0;
356	wqe->inl.eseg.mss = 0;
357	wqe->inl.eseg.rsvd2 = 0;
358	wqe->wqe.eseg.inline_hdr_sz = htons(MLX5_ETH_VLAN_INLINE_HEADER_SIZE);
359	/*
360	 * Copy 12 bytes of source & destination MAC address.
361	 * Copy 4 bytes of VLAN.
362	 * Copy 2 bytes of Ether type.
363	 */
364	rte_memcpy((uint8_t *)(uintptr_t)wqe->wqe.eseg.inline_hdr_start,
365		   (uint8_t *)(uintptr_t)addr, 12);
366	rte_memcpy((uint8_t *)((uintptr_t)wqe->wqe.eseg.inline_hdr_start + 12),
367		   &vlan, sizeof(vlan));
368	rte_memcpy((uint8_t *)((uintptr_t)wqe->wqe.eseg.inline_hdr_start + 16),
369		   (uint8_t *)((uintptr_t)addr + 12), 2);
370	addr += MLX5_ETH_VLAN_INLINE_HEADER_SIZE - sizeof(vlan);
371	length -= MLX5_ETH_VLAN_INLINE_HEADER_SIZE - sizeof(vlan);
372	/* Store remaining data in data segment. */
373	wqe->wqe.dseg.byte_count = htonl(length);
374	wqe->wqe.dseg.lkey = lkey;
375	wqe->wqe.dseg.addr = htonll(addr);
376	/* Increment consumer index. */
377	++txq->wqe_ci;
378}
379
380/**
381 * Write an inline WQE.
382 *
383 * @param txq
384 *   Pointer to TX queue structure.
385 * @param wqe
386 *   Pointer to the WQE to fill.
387 * @param addr
388 *   Buffer data address.
389 * @param length
390 *   Packet length.
393 */
394static inline void
395mlx5_wqe_write_inline(struct txq *txq, volatile union mlx5_wqe *wqe,
396		      uintptr_t addr, uint32_t length)
397{
398	uint32_t size;
399	uint16_t wqe_cnt = txq->wqe_n - 1;
400	uint16_t wqe_ci = txq->wqe_ci + 1;
401
402	/* Copy MLX5_ETH_INLINE_HEADER_SIZE bytes into the inline header. */
403	rte_memcpy((void *)(uintptr_t)wqe->inl.eseg.inline_hdr_start,
404		   (void *)(uintptr_t)addr,
405		   MLX5_ETH_INLINE_HEADER_SIZE);
406	addr += MLX5_ETH_INLINE_HEADER_SIZE;
407	length -= MLX5_ETH_INLINE_HEADER_SIZE;
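	/*
	 * WQE size in 16-byte units: 3 for the control and Ethernet segments,
	 * plus the 4-byte inline segment header and the inlined data rounded
	 * up to 16 bytes (the VLAN variant below derives the same value from
	 * the structure sizes).
	 */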
408	size = 3 + ((4 + length + 15) / 16);
409	wqe->inl.byte_cnt = htonl(length | MLX5_INLINE_SEG);
410	rte_memcpy((void *)(uintptr_t)&wqe->inl.data[0],
411		   (void *)addr, MLX5_WQE64_INL_DATA);
412	addr += MLX5_WQE64_INL_DATA;
413	length -= MLX5_WQE64_INL_DATA;
414	while (length) {
415		volatile union mlx5_wqe *wqe_next =
416			&(*txq->wqes)[wqe_ci & wqe_cnt];
417		uint32_t copy_bytes = (length > sizeof(*wqe)) ?
418				      sizeof(*wqe) :
419				      length;
420
421		rte_mov64((uint8_t *)(uintptr_t)&wqe_next->data[0],
422			  (uint8_t *)addr);
423		addr += copy_bytes;
424		length -= copy_bytes;
425		++wqe_ci;
426	}
427	assert(size < 64);
428	wqe->inl.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
429	wqe->inl.ctrl.data[1] = htonl(txq->qp_num_8s | size);
430	wqe->inl.ctrl.data[2] = 0;
431	wqe->inl.ctrl.data[3] = 0;
432	wqe->inl.eseg.rsvd0 = 0;
433	wqe->inl.eseg.rsvd1 = 0;
434	wqe->inl.eseg.mss = 0;
435	wqe->inl.eseg.rsvd2 = 0;
436	wqe->inl.eseg.inline_hdr_sz = htons(MLX5_ETH_INLINE_HEADER_SIZE);
437	/* Increment consumer index. */
438	txq->wqe_ci = wqe_ci;
439}
440
441/**
442 * Write an inline WQE with VLAN.
443 *
444 * @param txq
445 *   Pointer to TX queue structure.
446 * @param wqe
447 *   Pointer to the WQE to fill.
448 * @param addr
449 *   Buffer data address.
450 * @param length
451 *   Packet length.
452 * @param lkey
453 *   Memory region lkey.
454 * @param vlan_tci
455 *   VLAN field to insert in packet.
456 */
457static inline void
458mlx5_wqe_write_inline_vlan(struct txq *txq, volatile union mlx5_wqe *wqe,
459			   uintptr_t addr, uint32_t length, uint16_t vlan_tci)
460{
461	uint32_t size;
462	uint32_t wqe_cnt = txq->wqe_n - 1;
463	uint16_t wqe_ci = txq->wqe_ci + 1;
464	uint32_t vlan = htonl(0x81000000 | vlan_tci);
465
466	/*
467	 * Copy 12 bytes of source & destination MAC address.
468	 * Copy 4 bytes of VLAN.
469	 * Copy 2 bytes of Ether type.
470	 */
471	rte_memcpy((uint8_t *)(uintptr_t)wqe->inl.eseg.inline_hdr_start,
472		   (uint8_t *)addr, 12);
473	rte_memcpy((uint8_t *)(uintptr_t)wqe->inl.eseg.inline_hdr_start + 12,
474		   &vlan, sizeof(vlan));
475	rte_memcpy((uint8_t *)((uintptr_t)wqe->inl.eseg.inline_hdr_start + 16),
476		   (uint8_t *)(addr + 12), 2);
477	addr += MLX5_ETH_VLAN_INLINE_HEADER_SIZE - sizeof(vlan);
478	length -= MLX5_ETH_VLAN_INLINE_HEADER_SIZE - sizeof(vlan);
479	size = (sizeof(wqe->inl.ctrl.ctrl) +
480		sizeof(wqe->inl.eseg) +
481		sizeof(wqe->inl.byte_cnt) +
482		length + 15) / 16;
483	wqe->inl.byte_cnt = htonl(length | MLX5_INLINE_SEG);
484	rte_memcpy((void *)(uintptr_t)&wqe->inl.data[0],
485		   (void *)addr, MLX5_WQE64_INL_DATA);
486	addr += MLX5_WQE64_INL_DATA;
487	length -= MLX5_WQE64_INL_DATA;
488	while (length) {
489		volatile union mlx5_wqe *wqe_next =
490			&(*txq->wqes)[wqe_ci & wqe_cnt];
491		uint32_t copy_bytes = (length > sizeof(*wqe)) ?
492				      sizeof(*wqe) :
493				      length;
494
495		rte_mov64((uint8_t *)(uintptr_t)&wqe_next->data[0],
496			  (uint8_t *)addr);
497		addr += copy_bytes;
498		length -= copy_bytes;
499		++wqe_ci;
500	}
501	assert(size < 64);
502	wqe->inl.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
503	wqe->inl.ctrl.data[1] = htonl(txq->qp_num_8s | size);
504	wqe->inl.ctrl.data[2] = 0;
505	wqe->inl.ctrl.data[3] = 0;
506	wqe->inl.eseg.rsvd0 = 0;
507	wqe->inl.eseg.rsvd1 = 0;
508	wqe->inl.eseg.mss = 0;
509	wqe->inl.eseg.rsvd2 = 0;
510	wqe->inl.eseg.inline_hdr_sz = htons(MLX5_ETH_VLAN_INLINE_HEADER_SIZE);
511	/* Increment consumer index. */
512	txq->wqe_ci = wqe_ci;
513}
514
515/**
516 * Ring TX queue doorbell.
517 *
518 * @param txq
519 *   Pointer to TX queue structure.
520 */
521static inline void
522mlx5_tx_dbrec(struct txq *txq)
523{
524	uint8_t *dst = (uint8_t *)((uintptr_t)txq->bf_reg + txq->bf_offset);
525	uint32_t data[4] = {
526		htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND),
527		htonl(txq->qp_num_8s),
528		0,
529		0,
530	};
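	/*
	 * Publish the new producer index through the doorbell record, then
	 * copy 16 bytes of control data to the BlueFlame register so the
	 * device can start fetching the WQE immediately; bf_offset alternates
	 * between the two BlueFlame buffers.
	 */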
531	rte_wmb();
532	*txq->qp_db = htonl(txq->wqe_ci);
533	/* Ensure ordering between DB record and BF copy. */
534	rte_wmb();
535	rte_mov16(dst, (uint8_t *)data);
536	txq->bf_offset ^= txq->bf_buf_size;
537}
538
539/**
540 * Prefetch a CQE.
541 *
542 * @param txq
543 *   Pointer to TX queue structure.
544 * @param ci
545 *   CQE consumer index.
546 */
547static inline void
548tx_prefetch_cqe(struct txq *txq, uint16_t ci)
549{
550	volatile struct mlx5_cqe64 *cqe;
551
552	cqe = &(*txq->cqes)[ci & (txq->cqe_n - 1)].cqe64;
553	rte_prefetch0(cqe);
554}
555
556/**
557 * Prefetch a WQE.
558 *
559 * @param txq
560 *   Pointer to TX queue structure.
561 * @param ci
562 *   WQE consumer index.
563 */
564static inline void
565tx_prefetch_wqe(struct txq *txq, uint16_t ci)
566{
567	volatile union mlx5_wqe *wqe;
568
569	wqe = &(*txq->wqes)[ci & (txq->wqe_n - 1)];
570	rte_prefetch0(wqe);
571}
572
573/**
574 * DPDK callback for TX.
575 *
576 * @param dpdk_txq
577 *   Generic pointer to TX queue structure.
578 * @param[in] pkts
579 *   Packets to transmit.
580 * @param pkts_n
581 *   Number of packets in array.
582 *
583 * @return
584 *   Number of packets successfully transmitted (<= pkts_n).
585 */
586uint16_t
587mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
588{
589	struct txq *txq = (struct txq *)dpdk_txq;
590	uint16_t elts_head = txq->elts_head;
591	const unsigned int elts_n = txq->elts_n;
592	unsigned int i = 0;
593	unsigned int j = 0;
594	unsigned int max;
595	unsigned int comp;
596	volatile union mlx5_wqe *wqe = NULL;
597
598	if (unlikely(!pkts_n))
599		return 0;
600	/* Prefetch first packet cacheline. */
601	tx_prefetch_cqe(txq, txq->cq_ci);
602	tx_prefetch_cqe(txq, txq->cq_ci + 1);
603	rte_prefetch0(*pkts);
604	/* Start processing. */
605	txq_complete(txq);
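	/*
	 * Both indexes stay in [0, elts_n); when elts_head has wrapped below
	 * elts_tail the subtraction goes negative and max exceeds elts_n,
	 * which the correction below compensates for.
	 */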
606	max = (elts_n - (elts_head - txq->elts_tail));
607	if (max > elts_n)
608		max -= elts_n;
609	do {
610		struct rte_mbuf *buf = *(pkts++);
611		unsigned int elts_head_next;
612		uintptr_t addr;
613		uint32_t length;
614		uint32_t lkey;
615		unsigned int segs_n = buf->nb_segs;
616		volatile struct mlx5_wqe_data_seg *dseg;
617		unsigned int ds = sizeof(*wqe) / 16;
618
619		/*
620		 * Make sure there is enough room to store this packet and
621		 * that one ring entry remains unused.
622		 */
623		assert(segs_n);
624		if (max < segs_n + 1)
625			break;
626		max -= segs_n;
627		--pkts_n;
628		elts_head_next = (elts_head + 1) & (elts_n - 1);
629		wqe = &(*txq->wqes)[txq->wqe_ci & (txq->wqe_n - 1)];
630		dseg = &wqe->wqe.dseg;
631		rte_prefetch0(wqe);
632		if (pkts_n)
633			rte_prefetch0(*pkts);
634		/* Retrieve buffer information. */
635		addr = rte_pktmbuf_mtod(buf, uintptr_t);
636		length = DATA_LEN(buf);
637		/* Update element. */
638		(*txq->elts)[elts_head] = buf;
639		/* Prefetch next buffer data. */
640		if (pkts_n)
641			rte_prefetch0(rte_pktmbuf_mtod(*pkts,
642						       volatile void *));
643		/* Retrieve Memory Region key for this memory pool. */
644		lkey = txq_mp2mr(txq, txq_mb2mp(buf));
645		if (buf->ol_flags & PKT_TX_VLAN_PKT)
646			mlx5_wqe_write_vlan(txq, wqe, addr, length, lkey,
647					    buf->vlan_tci);
648		else
649			mlx5_wqe_write(txq, wqe, addr, length, lkey);
650		/* Should we enable HW CKSUM offload? */
651		if (buf->ol_flags &
652		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
653			wqe->wqe.eseg.cs_flags =
654				MLX5_ETH_WQE_L3_CSUM |
655				MLX5_ETH_WQE_L4_CSUM;
656		} else {
657			wqe->wqe.eseg.cs_flags = 0;
658		}
659		while (--segs_n) {
660			/*
661			 * Spill on next WQE when the current one does not have
662			 * enough room left. Size of WQE must be a multiple
663			 * of data segment size.
664			 */
665			assert(!(sizeof(*wqe) % sizeof(*dseg)));
666			if (!(ds % (sizeof(*wqe) / 16)))
667				dseg = (volatile void *)
668					&(*txq->wqes)[txq->wqe_ci++ &
669						      (txq->wqe_n - 1)];
670			else
671				++dseg;
672			++ds;
673			buf = buf->next;
674			assert(buf);
675			/* Store segment information. */
676			dseg->byte_count = htonl(DATA_LEN(buf));
677			dseg->lkey = txq_mp2mr(txq, txq_mb2mp(buf));
678			dseg->addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t));
679			(*txq->elts)[elts_head_next] = buf;
680			elts_head_next = (elts_head_next + 1) & (elts_n - 1);
681#ifdef MLX5_PMD_SOFT_COUNTERS
682			length += DATA_LEN(buf);
683#endif
684			++j;
685		}
686		/* Update DS field in WQE. */
687		wqe->wqe.ctrl.data[1] &= htonl(0xffffffc0);
688		wqe->wqe.ctrl.data[1] |= htonl(ds & 0x3f);
689		elts_head = elts_head_next;
690#ifdef MLX5_PMD_SOFT_COUNTERS
691		/* Increment sent bytes counter. */
692		txq->stats.obytes += length;
693#endif
695		++i;
696	} while (pkts_n);
697	/* Take a shortcut if nothing must be sent. */
698	if (unlikely(i == 0))
699		return 0;
700	/* Check whether completion threshold has been reached. */
701	comp = txq->elts_comp + i + j;
702	if (comp >= MLX5_TX_COMP_THRESH) {
703		/* Request completion on last WQE. */
704		wqe->wqe.ctrl.data[2] = htonl(8);
705		/* Save elts_head in unused "immediate" field of WQE. */
706		wqe->wqe.ctrl.data[3] = elts_head;
707		txq->elts_comp = 0;
708	} else {
709		txq->elts_comp = comp;
710	}
711#ifdef MLX5_PMD_SOFT_COUNTERS
712	/* Increment sent packets counter. */
713	txq->stats.opackets += i;
714#endif
715	/* Ring QP doorbell. */
716	mlx5_tx_dbrec(txq);
717	txq->elts_head = elts_head;
718	return i;
719}
720
721/**
722 * DPDK callback for TX with inline support.
723 *
724 * @param dpdk_txq
725 *   Generic pointer to TX queue structure.
726 * @param[in] pkts
727 *   Packets to transmit.
728 * @param pkts_n
729 *   Number of packets in array.
730 *
731 * @return
732 *   Number of packets successfully transmitted (<= pkts_n).
733 */
734uint16_t
735mlx5_tx_burst_inline(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
736{
737	struct txq *txq = (struct txq *)dpdk_txq;
738	uint16_t elts_head = txq->elts_head;
739	const unsigned int elts_n = txq->elts_n;
740	unsigned int i = 0;
741	unsigned int j = 0;
742	unsigned int max;
743	unsigned int comp;
744	volatile union mlx5_wqe *wqe = NULL;
745	unsigned int max_inline = txq->max_inline;
746
747	if (unlikely(!pkts_n))
748		return 0;
749	/* Prefetch first packet cacheline. */
750	tx_prefetch_cqe(txq, txq->cq_ci);
751	tx_prefetch_cqe(txq, txq->cq_ci + 1);
752	rte_prefetch0(*pkts);
753	/* Start processing. */
754	txq_complete(txq);
755	max = (elts_n - (elts_head - txq->elts_tail));
756	if (max > elts_n)
757		max -= elts_n;
758	do {
759		struct rte_mbuf *buf = *(pkts++);
760		unsigned int elts_head_next;
761		uintptr_t addr;
762		uint32_t length;
763		uint32_t lkey;
764		unsigned int segs_n = buf->nb_segs;
765		volatile struct mlx5_wqe_data_seg *dseg;
766		unsigned int ds = sizeof(*wqe) / 16;
767
768		/*
769		 * Make sure there is enough room to store this packet and
770		 * that one ring entry remains unused.
771		 */
772		assert(segs_n);
773		if (max < segs_n + 1)
774			break;
775		max -= segs_n;
776		--pkts_n;
777		elts_head_next = (elts_head + 1) & (elts_n - 1);
778		wqe = &(*txq->wqes)[txq->wqe_ci & (txq->wqe_n - 1)];
779		dseg = &wqe->wqe.dseg;
780		tx_prefetch_wqe(txq, txq->wqe_ci);
781		tx_prefetch_wqe(txq, txq->wqe_ci + 1);
782		if (pkts_n)
783			rte_prefetch0(*pkts);
784		/* Should we enable HW CKSUM offload? */
785		if (buf->ol_flags &
786		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
787			wqe->inl.eseg.cs_flags =
788				MLX5_ETH_WQE_L3_CSUM |
789				MLX5_ETH_WQE_L4_CSUM;
790		} else {
791			wqe->inl.eseg.cs_flags = 0;
792		}
793		/* Retrieve buffer information. */
794		addr = rte_pktmbuf_mtod(buf, uintptr_t);
795		length = DATA_LEN(buf);
796		/* Update element. */
797		(*txq->elts)[elts_head] = buf;
798		/* Prefetch next buffer data. */
799		if (pkts_n)
800			rte_prefetch0(rte_pktmbuf_mtod(*pkts,
801						       volatile void *));
802		if ((length <= max_inline) && (segs_n == 1)) {
803			if (buf->ol_flags & PKT_TX_VLAN_PKT)
804				mlx5_wqe_write_inline_vlan(txq, wqe,
805							   addr, length,
806							   buf->vlan_tci);
807			else
808				mlx5_wqe_write_inline(txq, wqe, addr, length);
809			goto skip_segs;
810		} else {
811			/* Retrieve Memory Region key for this memory pool. */
812			lkey = txq_mp2mr(txq, txq_mb2mp(buf));
813			if (buf->ol_flags & PKT_TX_VLAN_PKT)
814				mlx5_wqe_write_vlan(txq, wqe, addr, length,
815						    lkey, buf->vlan_tci);
816			else
817				mlx5_wqe_write(txq, wqe, addr, length, lkey);
818		}
819		while (--segs_n) {
820			/*
821			 * Spill on next WQE when the current one does not have
822			 * enough room left. Size of WQE must be a multiple
823			 * of data segment size.
824			 */
825			assert(!(sizeof(*wqe) % sizeof(*dseg)));
826			if (!(ds % (sizeof(*wqe) / 16)))
827				dseg = (volatile void *)
828					&(*txq->wqes)[txq->wqe_ci++ &
829						      (txq->wqe_n - 1)];
830			else
831				++dseg;
832			++ds;
833			buf = buf->next;
834			assert(buf);
835			/* Store segment information. */
836			dseg->byte_count = htonl(DATA_LEN(buf));
837			dseg->lkey = txq_mp2mr(txq, txq_mb2mp(buf));
838			dseg->addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t));
839			(*txq->elts)[elts_head_next] = buf;
840			elts_head_next = (elts_head_next + 1) & (elts_n - 1);
841#ifdef MLX5_PMD_SOFT_COUNTERS
842			length += DATA_LEN(buf);
843#endif
844			++j;
845		}
846		/* Update DS field in WQE. */
847		wqe->inl.ctrl.data[1] &= htonl(0xffffffc0);
848		wqe->inl.ctrl.data[1] |= htonl(ds & 0x3f);
849skip_segs:
850		elts_head = elts_head_next;
851#ifdef MLX5_PMD_SOFT_COUNTERS
852		/* Increment sent bytes counter. */
853		txq->stats.obytes += length;
854#endif
855		++i;
856	} while (pkts_n);
857	/* Take a shortcut if nothing must be sent. */
858	if (unlikely(i == 0))
859		return 0;
860	/* Check whether completion threshold has been reached. */
861	comp = txq->elts_comp + i + j;
862	if (comp >= MLX5_TX_COMP_THRESH) {
863		/* Request completion on last WQE. */
864		wqe->inl.ctrl.data[2] = htonl(8);
865		/* Save elts_head in unused "immediate" field of WQE. */
866		wqe->inl.ctrl.data[3] = elts_head;
867		txq->elts_comp = 0;
868	} else {
869		txq->elts_comp = comp;
870	}
871#ifdef MLX5_PMD_SOFT_COUNTERS
872	/* Increment sent packets counter. */
873	txq->stats.opackets += i;
874#endif
875	/* Ring QP doorbell. */
876	mlx5_tx_dbrec(txq);
877	txq->elts_head = elts_head;
878	return i;
879}
880
881/**
882 * Open an MPW session.
883 *
884 * @param txq
885 *   Pointer to TX queue structure.
886 * @param mpw
887 *   Pointer to MPW session structure.
888 * @param length
889 *   Packet length.
890 */
891static inline void
892mlx5_mpw_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
893{
894	uint16_t idx = txq->wqe_ci & (txq->wqe_n - 1);
895	volatile struct mlx5_wqe_data_seg (*dseg)[MLX5_MPW_DSEG_MAX] =
896		(volatile struct mlx5_wqe_data_seg (*)[])
897		(uintptr_t)&(*txq->wqes)[(idx + 1) & (txq->wqe_n - 1)];
898
899	mpw->state = MLX5_MPW_STATE_OPENED;
900	mpw->pkts_n = 0;
901	mpw->len = length;
902	mpw->total_len = 0;
903	mpw->wqe = &(*txq->wqes)[idx];
904	mpw->wqe->mpw.eseg.mss = htons(length);
905	mpw->wqe->mpw.eseg.inline_hdr_sz = 0;
906	mpw->wqe->mpw.eseg.rsvd0 = 0;
907	mpw->wqe->mpw.eseg.rsvd1 = 0;
908	mpw->wqe->mpw.eseg.rsvd2 = 0;
909	mpw->wqe->mpw.ctrl.data[0] = htonl((MLX5_OPC_MOD_MPW << 24) |
910					   (txq->wqe_ci << 8) |
911					   MLX5_OPCODE_LSO_MPW);
912	mpw->wqe->mpw.ctrl.data[2] = 0;
913	mpw->wqe->mpw.ctrl.data[3] = 0;
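	/*
	 * The 64-byte MPW WQE has room for two data segments after the
	 * control and Ethernet segments; pointers 2-4 therefore reference the
	 * following WQE slot computed above.
	 */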
914	mpw->data.dseg[0] = &mpw->wqe->mpw.dseg[0];
915	mpw->data.dseg[1] = &mpw->wqe->mpw.dseg[1];
916	mpw->data.dseg[2] = &(*dseg)[0];
917	mpw->data.dseg[3] = &(*dseg)[1];
918	mpw->data.dseg[4] = &(*dseg)[2];
919}
920
921/**
922 * Close an MPW session.
923 *
924 * @param txq
925 *   Pointer to TX queue structure.
926 * @param mpw
927 *   Pointer to MPW session structure.
928 */
929static inline void
930mlx5_mpw_close(struct txq *txq, struct mlx5_mpw *mpw)
931{
932	unsigned int num = mpw->pkts_n;
933
934	/*
935	 * Store size as a multiple of 16 bytes. Control and Ethernet segments
936	 * count as 2.
937	 */
938	mpw->wqe->mpw.ctrl.data[1] = htonl(txq->qp_num_8s | (2 + num));
939	mpw->state = MLX5_MPW_STATE_CLOSED;
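	/*
	 * Up to two data segments fit in the first WQE slot; more packets
	 * spill into the next slot, which is then consumed as well.
	 */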
940	if (num < 3)
941		++txq->wqe_ci;
942	else
943		txq->wqe_ci += 2;
944	tx_prefetch_wqe(txq, txq->wqe_ci);
945	tx_prefetch_wqe(txq, txq->wqe_ci + 1);
946}
947
948/**
949 * DPDK callback for TX with MPW support.
950 *
951 * @param dpdk_txq
952 *   Generic pointer to TX queue structure.
953 * @param[in] pkts
954 *   Packets to transmit.
955 * @param pkts_n
956 *   Number of packets in array.
957 *
958 * @return
959 *   Number of packets successfully transmitted (<= pkts_n).
960 */
961uint16_t
962mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
963{
964	struct txq *txq = (struct txq *)dpdk_txq;
965	uint16_t elts_head = txq->elts_head;
966	const unsigned int elts_n = txq->elts_n;
967	unsigned int i = 0;
968	unsigned int j = 0;
969	unsigned int max;
970	unsigned int comp;
971	struct mlx5_mpw mpw = {
972		.state = MLX5_MPW_STATE_CLOSED,
973	};
974
975	if (unlikely(!pkts_n))
976		return 0;
977	/* Prefetch first packet cacheline. */
978	tx_prefetch_cqe(txq, txq->cq_ci);
979	tx_prefetch_wqe(txq, txq->wqe_ci);
980	tx_prefetch_wqe(txq, txq->wqe_ci + 1);
981	/* Start processing. */
982	txq_complete(txq);
983	max = (elts_n - (elts_head - txq->elts_tail));
984	if (max > elts_n)
985		max -= elts_n;
986	do {
987		struct rte_mbuf *buf = *(pkts++);
988		unsigned int elts_head_next;
989		uint32_t length;
990		unsigned int segs_n = buf->nb_segs;
991		uint32_t cs_flags = 0;
992
993		/*
994		 * Make sure there is enough room to store this packet and
995		 * that one ring entry remains unused.
996		 */
997		assert(segs_n);
998		if (max < segs_n + 1)
999			break;
1000		/* Do not bother with packets MPW cannot handle (too many segments). */
1001		if (segs_n > MLX5_MPW_DSEG_MAX)
1002			break;
1003		max -= segs_n;
1004		--pkts_n;
1005		/* Should we enable HW CKSUM offload? */
1006		if (buf->ol_flags &
1007		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
1008			cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
1009		/* Retrieve packet information. */
1010		length = PKT_LEN(buf);
1011		assert(length);
1012		/* Start new session if packet differs. */
1013		if ((mpw.state == MLX5_MPW_STATE_OPENED) &&
1014		    ((mpw.len != length) ||
1015		     (segs_n != 1) ||
1016		     (mpw.wqe->mpw.eseg.cs_flags != cs_flags)))
1017			mlx5_mpw_close(txq, &mpw);
1018		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
1019			mlx5_mpw_new(txq, &mpw, length);
1020			mpw.wqe->mpw.eseg.cs_flags = cs_flags;
1021		}
1022		/* Multi-segment packets must be alone in their MPW. */
1023		assert((segs_n == 1) || (mpw.pkts_n == 0));
1024#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
1025		length = 0;
1026#endif
1027		do {
1028			volatile struct mlx5_wqe_data_seg *dseg;
1029			uintptr_t addr;
1030
1031			elts_head_next = (elts_head + 1) & (elts_n - 1);
1032			assert(buf);
1033			(*txq->elts)[elts_head] = buf;
1034			dseg = mpw.data.dseg[mpw.pkts_n];
1035			addr = rte_pktmbuf_mtod(buf, uintptr_t);
1036			*dseg = (struct mlx5_wqe_data_seg){
1037				.byte_count = htonl(DATA_LEN(buf)),
1038				.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
1039				.addr = htonll(addr),
1040			};
1041			elts_head = elts_head_next;
1042#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
1043			length += DATA_LEN(buf);
1044#endif
1045			buf = buf->next;
1046			++mpw.pkts_n;
1047			++j;
1048		} while (--segs_n);
1049		assert(length == mpw.len);
1050		if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
1051			mlx5_mpw_close(txq, &mpw);
1052		elts_head = elts_head_next;
1053#ifdef MLX5_PMD_SOFT_COUNTERS
1054		/* Increment sent bytes counter. */
1055		txq->stats.obytes += length;
1056#endif
1057		++i;
1058	} while (pkts_n);
1059	/* Take a shortcut if nothing must be sent. */
1060	if (unlikely(i == 0))
1061		return 0;
1062	/* Check whether completion threshold has been reached. */
1063	/* "j" includes both packets and segments. */
1064	comp = txq->elts_comp + j;
1065	if (comp >= MLX5_TX_COMP_THRESH) {
1066		volatile union mlx5_wqe *wqe = mpw.wqe;
1067
1068		/* Request completion on last WQE. */
1069		wqe->mpw.ctrl.data[2] = htonl(8);
1070		/* Save elts_head in unused "immediate" field of WQE. */
1071		wqe->mpw.ctrl.data[3] = elts_head;
1072		txq->elts_comp = 0;
1073	} else {
1074		txq->elts_comp = comp;
1075	}
1076#ifdef MLX5_PMD_SOFT_COUNTERS
1077	/* Increment sent packets counter. */
1078	txq->stats.opackets += i;
1079#endif
1080	/* Ring QP doorbell. */
1081	if (mpw.state == MLX5_MPW_STATE_OPENED)
1082		mlx5_mpw_close(txq, &mpw);
1083	mlx5_tx_dbrec(txq);
1084	txq->elts_head = elts_head;
1085	return i;
1086}
1087
1088/**
1089 * Open an MPW inline session.
1090 *
1091 * @param txq
1092 *   Pointer to TX queue structure.
1093 * @param mpw
1094 *   Pointer to MPW session structure.
1095 * @param length
1096 *   Packet length.
1097 */
1098static inline void
1099mlx5_mpw_inline_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
1100{
1101	uint16_t idx = txq->wqe_ci & (txq->wqe_n - 1);
1102
1103	mpw->state = MLX5_MPW_INL_STATE_OPENED;
1104	mpw->pkts_n = 0;
1105	mpw->len = length;
1106	mpw->total_len = 0;
1107	mpw->wqe = &(*txq->wqes)[idx];
1108	mpw->wqe->mpw_inl.ctrl.data[0] = htonl((MLX5_OPC_MOD_MPW << 24) |
1109					       (txq->wqe_ci << 8) |
1110					       MLX5_OPCODE_LSO_MPW);
1111	mpw->wqe->mpw_inl.ctrl.data[2] = 0;
1112	mpw->wqe->mpw_inl.ctrl.data[3] = 0;
1113	mpw->wqe->mpw_inl.eseg.mss = htons(length);
1114	mpw->wqe->mpw_inl.eseg.inline_hdr_sz = 0;
1115	mpw->wqe->mpw_inl.eseg.cs_flags = 0;
1116	mpw->wqe->mpw_inl.eseg.rsvd0 = 0;
1117	mpw->wqe->mpw_inl.eseg.rsvd1 = 0;
1118	mpw->wqe->mpw_inl.eseg.rsvd2 = 0;
1119	mpw->data.raw = &mpw->wqe->mpw_inl.data[0];
1120}
1121
1122/**
1123 * Close an MPW inline session.
1124 *
1125 * @param txq
1126 *   Pointer to TX queue structure.
1127 * @param mpw
1128 *   Pointer to MPW session structure.
1129 */
1130static inline void
1131mlx5_mpw_inline_close(struct txq *txq, struct mlx5_mpw *mpw)
1132{
1133	unsigned int size;
1134
1135	size = sizeof(*mpw->wqe) - MLX5_MWQE64_INL_DATA + mpw->total_len;
1136	/*
1137	 * Store size as a multiple of 16 bytes. Control and Ethernet segments
1138	 * count as 2.
1139	 */
1140	mpw->wqe->mpw_inl.ctrl.data[1] =
1141		htonl(txq->qp_num_8s | ((size + 15) / 16));
1142	mpw->state = MLX5_MPW_STATE_CLOSED;
1143	mpw->wqe->mpw_inl.byte_cnt = htonl(mpw->total_len | MLX5_INLINE_SEG);
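	/* Advance by as many 64-byte WQE slots as the session used, rounding up. */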
1144	txq->wqe_ci += (size + (sizeof(*mpw->wqe) - 1)) / sizeof(*mpw->wqe);
1145}
1146
1147/**
1148 * DPDK callback for TX with MPW inline support.
1149 *
1150 * @param dpdk_txq
1151 *   Generic pointer to TX queue structure.
1152 * @param[in] pkts
1153 *   Packets to transmit.
1154 * @param pkts_n
1155 *   Number of packets in array.
1156 *
1157 * @return
1158 *   Number of packets successfully transmitted (<= pkts_n).
1159 */
1160uint16_t
1161mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
1162			 uint16_t pkts_n)
1163{
1164	struct txq *txq = (struct txq *)dpdk_txq;
1165	uint16_t elts_head = txq->elts_head;
1166	const unsigned int elts_n = txq->elts_n;
1167	unsigned int i = 0;
1168	unsigned int j = 0;
1169	unsigned int max;
1170	unsigned int comp;
1171	unsigned int inline_room = txq->max_inline;
1172	struct mlx5_mpw mpw = {
1173		.state = MLX5_MPW_STATE_CLOSED,
1174	};
1175
1176	if (unlikely(!pkts_n))
1177		return 0;
1178	/* Prefetch first packet cacheline. */
1179	tx_prefetch_cqe(txq, txq->cq_ci);
1180	tx_prefetch_wqe(txq, txq->wqe_ci);
1181	tx_prefetch_wqe(txq, txq->wqe_ci + 1);
1182	/* Start processing. */
1183	txq_complete(txq);
1184	max = (elts_n - (elts_head - txq->elts_tail));
1185	if (max > elts_n)
1186		max -= elts_n;
1187	do {
1188		struct rte_mbuf *buf = *(pkts++);
1189		unsigned int elts_head_next;
1190		uintptr_t addr;
1191		uint32_t length;
1192		unsigned int segs_n = buf->nb_segs;
1193		uint32_t cs_flags = 0;
1194
1195		/*
1196		 * Make sure there is enough room to store this packet and
1197		 * that one ring entry remains unused.
1198		 */
1199		assert(segs_n);
1200		if (max < segs_n + 1)
1201			break;
1202		/* Do not bother with packets MPW cannot handle (too many segments). */
1203		if (segs_n > MLX5_MPW_DSEG_MAX)
1204			break;
1205		max -= segs_n;
1206		--pkts_n;
1207		/* Should we enable HW CKSUM offload? */
1208		if (buf->ol_flags &
1209		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
1210			cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
1211		/* Retrieve packet information. */
1212		length = PKT_LEN(buf);
1213		/* Start new session if packet differs. */
1214		if (mpw.state == MLX5_MPW_STATE_OPENED) {
1215			if ((mpw.len != length) ||
1216			    (segs_n != 1) ||
1217			    (mpw.wqe->mpw.eseg.cs_flags != cs_flags))
1218				mlx5_mpw_close(txq, &mpw);
1219		} else if (mpw.state == MLX5_MPW_INL_STATE_OPENED) {
1220			if ((mpw.len != length) ||
1221			    (segs_n != 1) ||
1222			    (length > inline_room) ||
1223			    (mpw.wqe->mpw_inl.eseg.cs_flags != cs_flags)) {
1224				mlx5_mpw_inline_close(txq, &mpw);
1225				inline_room = txq->max_inline;
1226			}
1227		}
1228		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
1229			if ((segs_n != 1) ||
1230			    (length > inline_room)) {
1231				mlx5_mpw_new(txq, &mpw, length);
1232				mpw.wqe->mpw.eseg.cs_flags = cs_flags;
1233			} else {
1234				mlx5_mpw_inline_new(txq, &mpw, length);
1235				mpw.wqe->mpw_inl.eseg.cs_flags = cs_flags;
1236			}
1237		}
1238		/* Multi-segment packets must be alone in their MPW. */
1239		assert((segs_n == 1) || (mpw.pkts_n == 0));
1240		if (mpw.state == MLX5_MPW_STATE_OPENED) {
1241			assert(inline_room == txq->max_inline);
1242#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
1243			length = 0;
1244#endif
1245			do {
1246				volatile struct mlx5_wqe_data_seg *dseg;
1247
1248				elts_head_next =
1249					(elts_head + 1) & (elts_n - 1);
1250				assert(buf);
1251				(*txq->elts)[elts_head] = buf;
1252				dseg = mpw.data.dseg[mpw.pkts_n];
1253				addr = rte_pktmbuf_mtod(buf, uintptr_t);
1254				*dseg = (struct mlx5_wqe_data_seg){
1255					.byte_count = htonl(DATA_LEN(buf)),
1256					.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
1257					.addr = htonll(addr),
1258				};
1259				elts_head = elts_head_next;
1260#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
1261				length += DATA_LEN(buf);
1262#endif
1263				buf = buf->next;
1264				++mpw.pkts_n;
1265				++j;
1266			} while (--segs_n);
1267			assert(length == mpw.len);
1268			if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
1269				mlx5_mpw_close(txq, &mpw);
1270		} else {
1271			unsigned int max;
1272
1273			assert(mpw.state == MLX5_MPW_INL_STATE_OPENED);
1274			assert(length <= inline_room);
1275			assert(length == DATA_LEN(buf));
1276			elts_head_next = (elts_head + 1) & (elts_n - 1);
1277			addr = rte_pktmbuf_mtod(buf, uintptr_t);
1278			(*txq->elts)[elts_head] = buf;
1279			/* Maximum number of bytes before wrapping. */
1280			max = ((uintptr_t)&(*txq->wqes)[txq->wqe_n] -
1281			       (uintptr_t)mpw.data.raw);
1282			if (length > max) {
1283				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
1284					   (void *)addr,
1285					   max);
1286				mpw.data.raw =
1287					(volatile void *)&(*txq->wqes)[0];
1288				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
1289					   (void *)(addr + max),
1290					   length - max);
1291				mpw.data.raw += length - max;
1292			} else {
1293				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
1294					   (void *)addr,
1295					   length);
1296				mpw.data.raw += length;
1297			}
1298			if ((uintptr_t)mpw.data.raw ==
1299			    (uintptr_t)&(*txq->wqes)[txq->wqe_n])
1300				mpw.data.raw =
1301					(volatile void *)&(*txq->wqes)[0];
1302			++mpw.pkts_n;
1303			++j;
1304			if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) {
1305				mlx5_mpw_inline_close(txq, &mpw);
1306				inline_room = txq->max_inline;
1307			} else {
1308				inline_room -= length;
1309			}
1310		}
1311		mpw.total_len += length;
1312		elts_head = elts_head_next;
1313#ifdef MLX5_PMD_SOFT_COUNTERS
1314		/* Increment sent bytes counter. */
1315		txq->stats.obytes += length;
1316#endif
1317		++i;
1318	} while (pkts_n);
1319	/* Take a shortcut if nothing must be sent. */
1320	if (unlikely(i == 0))
1321		return 0;
1322	/* Check whether completion threshold has been reached. */
1323	/* "j" includes both packets and segments. */
1324	comp = txq->elts_comp + j;
1325	if (comp >= MLX5_TX_COMP_THRESH) {
1326		volatile union mlx5_wqe *wqe = mpw.wqe;
1327
1328		/* Request completion on last WQE. */
1329		wqe->mpw_inl.ctrl.data[2] = htonl(8);
1330		/* Save elts_head in unused "immediate" field of WQE. */
1331		wqe->mpw_inl.ctrl.data[3] = elts_head;
1332		txq->elts_comp = 0;
1333	} else {
1334		txq->elts_comp = comp;
1335	}
1336#ifdef MLX5_PMD_SOFT_COUNTERS
1337	/* Increment sent packets counter. */
1338	txq->stats.opackets += i;
1339#endif
1340	/* Ring QP doorbell. */
1341	if (mpw.state == MLX5_MPW_INL_STATE_OPENED)
1342		mlx5_mpw_inline_close(txq, &mpw);
1343	else if (mpw.state == MLX5_MPW_STATE_OPENED)
1344		mlx5_mpw_close(txq, &mpw);
1345	mlx5_tx_dbrec(txq);
1346	txq->elts_head = elts_head;
1347	return i;
1348}
1349
1350/**
1351 * Translate RX completion flags to packet type.
1352 *
1353 * @param[in] cqe
1354 *   Pointer to CQE.
1355 *
1356 * @note: fix mlx5_dev_supported_ptypes_get() if any change here.
1357 *
1358 * @return
1359 *   Packet type for struct rte_mbuf.
1360 */
1361static inline uint32_t
1362rxq_cq_to_pkt_type(volatile struct mlx5_cqe64 *cqe)
1363{
1364	uint32_t pkt_type;
1365	uint8_t flags = cqe->l4_hdr_type_etc;
1366	uint8_t info = cqe->rsvd0[0];
1367
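	/*
	 * TRANSPOSE() moves each completion flag bit to the bit position of
	 * the corresponding RTE_PTYPE_* value.
	 */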
1368	if (info & IBV_EXP_CQ_RX_TUNNEL_PACKET)
1369		pkt_type =
1370			TRANSPOSE(flags,
1371				  IBV_EXP_CQ_RX_OUTER_IPV4_PACKET,
1372				  RTE_PTYPE_L3_IPV4) |
1373			TRANSPOSE(flags,
1374				  IBV_EXP_CQ_RX_OUTER_IPV6_PACKET,
1375				  RTE_PTYPE_L3_IPV6) |
1376			TRANSPOSE(flags,
1377				  IBV_EXP_CQ_RX_IPV4_PACKET,
1378				  RTE_PTYPE_INNER_L3_IPV4) |
1379			TRANSPOSE(flags,
1380				  IBV_EXP_CQ_RX_IPV6_PACKET,
1381				  RTE_PTYPE_INNER_L3_IPV6);
1382	else
1383		pkt_type =
1384			TRANSPOSE(flags,
1385				  MLX5_CQE_L3_HDR_TYPE_IPV6,
1386				  RTE_PTYPE_L3_IPV6) |
1387			TRANSPOSE(flags,
1388				  MLX5_CQE_L3_HDR_TYPE_IPV4,
1389				  RTE_PTYPE_L3_IPV4);
1390	return pkt_type;
1391}
1392
1393/**
1394 * Get size of the next packet for a given CQE. For compressed CQEs, the
1395 * consumer index is updated only once all packets of the current one have
1396 * been processed.
1397 *
1398 * @param rxq
1399 *   Pointer to RX queue.
1400 * @param cqe
1401 *   CQE to process.
1402 *
1403 * @return
1404 *   Packet size in bytes (0 if there is none), -1 in case of completion
1405 *   with error.
1406 */
1407static inline int
1408mlx5_rx_poll_len(struct rxq *rxq, volatile struct mlx5_cqe64 *cqe,
1409		 uint16_t cqe_cnt)
1410{
1411	struct rxq_zip *zip = &rxq->zip;
1412	uint16_t cqe_n = cqe_cnt + 1;
1413	int len = 0;
1414
1415	/* Process compressed data in the CQE and mini arrays. */
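	/*
	 * zip->ai is the position in the current mini-CQE array, zip->ca the
	 * CQ index of that array, zip->na the index of the next array and
	 * zip->cq_ci the CQ index at which processing resumes once the whole
	 * compressed session has been consumed.
	 */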
1416	if (zip->ai) {
1417		volatile struct mlx5_mini_cqe8 (*mc)[8] =
1418			(volatile struct mlx5_mini_cqe8 (*)[8])
1419			(uintptr_t)(&(*rxq->cqes)[zip->ca & cqe_cnt].cqe64);
1420
1421		len = ntohl((*mc)[zip->ai & 7].byte_cnt);
1422		if ((++zip->ai & 7) == 0) {
1423			/*
1424			 * Increment consumer index to skip the number of
1425			 * CQEs consumed. Hardware leaves holes in the CQ
1426			 * ring for software use.
1427			 */
1428			zip->ca = zip->na;
1429			zip->na += 8;
1430		}
1431		if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) {
1432			uint16_t idx = rxq->cq_ci;
1433			uint16_t end = zip->cq_ci;
1434
1435			while (idx != end) {
1436				(*rxq->cqes)[idx & cqe_cnt].cqe64.op_own =
1437					MLX5_CQE_INVALIDATE;
1438				++idx;
1439			}
1440			rxq->cq_ci = zip->cq_ci;
1441			zip->ai = 0;
1442		}
1443	/* No compressed data, get next CQE and verify if it is compressed. */
1444	} else {
1445		int ret;
1446		int8_t op_own;
1447
1448		ret = check_cqe64(cqe, cqe_n, rxq->cq_ci);
1449		if (unlikely(ret == 1))
1450			return 0;
1451		++rxq->cq_ci;
1452		op_own = cqe->op_own;
1453		if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) {
1454			volatile struct mlx5_mini_cqe8 (*mc)[8] =
1455				(volatile struct mlx5_mini_cqe8 (*)[8])
1456				(uintptr_t)(&(*rxq->cqes)[rxq->cq_ci &
1457							  cqe_cnt].cqe64);
1458
1459			/* Fix endianness. */
1460			zip->cqe_cnt = ntohl(cqe->byte_cnt);
1461			/*
1462			 * Current mini array position is the one returned by
1463			 * check_cqe64().
1464			 *
1465			 * If completion comprises several mini arrays, as a
1466			 * special case the second one is located 7 CQEs after
1467			 * the initial CQE instead of 8 for subsequent ones.
1468			 */
1469			zip->ca = rxq->cq_ci & cqe_cnt;
1470			zip->na = zip->ca + 7;
1471			/* Compute the next non-compressed CQE. */
1472			--rxq->cq_ci;
1473			zip->cq_ci = rxq->cq_ci + zip->cqe_cnt;
1474			/* Get packet size to return. */
1475			len = ntohl((*mc)[0].byte_cnt);
1476			zip->ai = 1;
1477		} else {
1478			len = ntohl(cqe->byte_cnt);
1479		}
1480		/* Error while receiving packet. */
1481		if (unlikely(MLX5_CQE_OPCODE(op_own) == MLX5_CQE_RESP_ERR))
1482			return -1;
1483	}
1484	return len;
1485}
1486
1487/**
1488 * Translate RX completion flags to offload flags.
1489 *
1490 * @param[in] rxq
1491 *   Pointer to RX queue structure.
1492 * @param[in] cqe
1493 *   Pointer to CQE.
1494 *
1495 * @return
1496 *   Offload flags (ol_flags) for struct rte_mbuf.
1497 */
1498static inline uint32_t
1499rxq_cq_to_ol_flags(struct rxq *rxq, volatile struct mlx5_cqe64 *cqe)
1500{
1501	uint32_t ol_flags = 0;
1502	uint8_t l3_hdr = (cqe->l4_hdr_type_etc) & MLX5_CQE_L3_HDR_TYPE_MASK;
1503	uint8_t l4_hdr = (cqe->l4_hdr_type_etc) & MLX5_CQE_L4_HDR_TYPE_MASK;
1504	uint8_t info = cqe->rsvd0[0];
1505
1506	if ((l3_hdr == MLX5_CQE_L3_HDR_TYPE_IPV4) ||
1507	    (l3_hdr == MLX5_CQE_L3_HDR_TYPE_IPV6))
1508		ol_flags |=
1509			(!(cqe->hds_ip_ext & MLX5_CQE_L3_OK) *
1510			 PKT_RX_IP_CKSUM_BAD);
1511	if ((l4_hdr == MLX5_CQE_L4_HDR_TYPE_TCP) ||
1512	    (l4_hdr == MLX5_CQE_L4_HDR_TYPE_TCP_EMP_ACK) ||
1513	    (l4_hdr == MLX5_CQE_L4_HDR_TYPE_TCP_ACK) ||
1514	    (l4_hdr == MLX5_CQE_L4_HDR_TYPE_UDP))
1515		ol_flags |=
1516			(!(cqe->hds_ip_ext & MLX5_CQE_L4_OK) *
1517			 PKT_RX_L4_CKSUM_BAD);
1518	/*
1519	 * PKT_RX_IP_CKSUM_BAD and PKT_RX_L4_CKSUM_BAD are used in place
1520	 * of PKT_RX_EIP_CKSUM_BAD because the latter is not functional
1521	 * (its value is 0).
1522	 */
1523	if ((info & IBV_EXP_CQ_RX_TUNNEL_PACKET) && (rxq->csum_l2tun))
1524		ol_flags |=
1525			TRANSPOSE(~cqe->l4_hdr_type_etc,
1526				  IBV_EXP_CQ_RX_OUTER_IP_CSUM_OK,
1527				  PKT_RX_IP_CKSUM_BAD) |
1528			TRANSPOSE(~cqe->l4_hdr_type_etc,
1529				  IBV_EXP_CQ_RX_OUTER_TCP_UDP_CSUM_OK,
1530				  PKT_RX_L4_CKSUM_BAD);
1531	return ol_flags;
1532}
1533
1534/**
1535 * DPDK callback for RX.
1536 *
1537 * @param dpdk_rxq
1538 *   Generic pointer to RX queue structure.
1539 * @param[out] pkts
1540 *   Array to store received packets.
1541 * @param pkts_n
1542 *   Maximum number of packets in array.
1543 *
1544 * @return
1545 *   Number of packets successfully received (<= pkts_n).
1546 */
1547uint16_t
1548mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
1549{
1550	struct rxq *rxq = dpdk_rxq;
1551	const unsigned int wqe_cnt = rxq->elts_n - 1;
1552	const unsigned int cqe_cnt = rxq->cqe_n - 1;
1553	const unsigned int sges_n = rxq->sges_n;
1554	struct rte_mbuf *pkt = NULL;
1555	struct rte_mbuf *seg = NULL;
1556	volatile struct mlx5_cqe64 *cqe =
1557		&(*rxq->cqes)[rxq->cq_ci & cqe_cnt].cqe64;
1558	unsigned int i = 0;
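	/*
	 * Each packet occupies a stride of 2^sges_n descriptors (scattered
	 * RX); rq_ci counts individual descriptors here while rxq->rq_ci
	 * counts strides.
	 */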
1559	unsigned int rq_ci = rxq->rq_ci << sges_n;
1560	int len;
1561
1562	while (pkts_n) {
1563		unsigned int idx = rq_ci & wqe_cnt;
1564		volatile struct mlx5_wqe_data_seg *wqe = &(*rxq->wqes)[idx];
1565		struct rte_mbuf *rep = (*rxq->elts)[idx];
1566
1567		if (pkt)
1568			NEXT(seg) = rep;
1569		seg = rep;
1570		rte_prefetch0(seg);
1571		rte_prefetch0(cqe);
1572		rte_prefetch0(wqe);
1573		rep = rte_mbuf_raw_alloc(rxq->mp);
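		/*
		 * On allocation failure, release the segments already chained
		 * to the packet under construction and stop; the ring entry
		 * keeps its current buffer.
		 */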
1574		if (unlikely(rep == NULL)) {
1575			while (pkt != seg) {
1576				assert(pkt != (*rxq->elts)[idx]);
1577				seg = NEXT(pkt);
1578				rte_mbuf_refcnt_set(pkt, 0);
1579				__rte_mbuf_raw_free(pkt);
1580				pkt = seg;
1581			}
1582			++rxq->stats.rx_nombuf;
1583			break;
1584		}
1585		if (!pkt) {
1586			cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt].cqe64;
1587			len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt);
1588			if (len == 0) {
1589				rte_mbuf_refcnt_set(rep, 0);
1590				__rte_mbuf_raw_free(rep);
1591				break;
1592			}
1593			if (unlikely(len == -1)) {
1594				/* RX error, packet is likely too large. */
1595				rte_mbuf_refcnt_set(rep, 0);
1596				__rte_mbuf_raw_free(rep);
1597				++rxq->stats.idropped;
1598				goto skip;
1599			}
1600			pkt = seg;
1601			assert(len >= (rxq->crc_present << 2));
1602			/* Update packet information. */
1603			pkt->packet_type = 0;
1604			pkt->ol_flags = 0;
1605			if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip |
1606			    rxq->crc_present) {
1607				if (rxq->csum) {
1608					pkt->packet_type =
1609						rxq_cq_to_pkt_type(cqe);
1610					pkt->ol_flags =
1611						rxq_cq_to_ol_flags(rxq, cqe);
1612				}
1613				if (cqe->l4_hdr_type_etc &
1614				    MLX5_CQE_VLAN_STRIPPED) {
1615					pkt->ol_flags |= PKT_RX_VLAN_PKT |
1616						PKT_RX_VLAN_STRIPPED;
1617					pkt->vlan_tci = ntohs(cqe->vlan_info);
1618				}
1619				if (rxq->crc_present)
1620					len -= ETHER_CRC_LEN;
1621			}
1622			PKT_LEN(pkt) = len;
1623		}
1624		DATA_LEN(rep) = DATA_LEN(seg);
1625		PKT_LEN(rep) = PKT_LEN(seg);
1626		SET_DATA_OFF(rep, DATA_OFF(seg));
1627		NB_SEGS(rep) = NB_SEGS(seg);
1628		PORT(rep) = PORT(seg);
1629		NEXT(rep) = NULL;
1630		(*rxq->elts)[idx] = rep;
1631		/*
1632		 * Fill NIC descriptor with the new buffer.  The lkey and size
1633		 * of the buffers are already known, only the buffer address
1634		 * changes.
1635		 */
1636		wqe->addr = htonll(rte_pktmbuf_mtod(rep, uintptr_t));
1637		if (len > DATA_LEN(seg)) {
1638			len -= DATA_LEN(seg);
1639			++NB_SEGS(pkt);
1640			++rq_ci;
1641			continue;
1642		}
1643		DATA_LEN(seg) = len;
1644#ifdef MLX5_PMD_SOFT_COUNTERS
1645		/* Increment bytes counter. */
1646		rxq->stats.ibytes += PKT_LEN(pkt);
1647#endif
1648		/* Return packet. */
1649		*(pkts++) = pkt;
1650		pkt = NULL;
1651		--pkts_n;
1652		++i;
1653skip:
1654		/* Align consumer index to the next stride. */
1655		rq_ci >>= sges_n;
1656		++rq_ci;
1657		rq_ci <<= sges_n;
1658	}
1659	if (unlikely((i == 0) && ((rq_ci >> sges_n) == rxq->rq_ci)))
1660		return 0;
1661	/* Update the consumer index. */
1662	rxq->rq_ci = rq_ci >> sges_n;
1663	rte_wmb();
1664	*rxq->cq_db = htonl(rxq->cq_ci);
1665	rte_wmb();
1666	*rxq->rq_db = htonl(rxq->rq_ci);
1667#ifdef MLX5_PMD_SOFT_COUNTERS
1668	/* Increment packets counter. */
1669	rxq->stats.ipackets += i;
1670#endif
1671	return i;
1672}
1673
1674/**
1675 * Dummy DPDK callback for TX.
1676 *
1677 * This function is used to temporarily replace the real callback during
1678 * unsafe control operations on the queue, or in case of error.
1679 *
1680 * @param dpdk_txq
1681 *   Generic pointer to TX queue structure.
1682 * @param[in] pkts
1683 *   Packets to transmit.
1684 * @param pkts_n
1685 *   Number of packets in array.
1686 *
1687 * @return
1688 *   Number of packets successfully transmitted (<= pkts_n).
1689 */
1690uint16_t
1691removed_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
1692{
1693	(void)dpdk_txq;
1694	(void)pkts;
1695	(void)pkts_n;
1696	return 0;
1697}
1698
1699/**
1700 * Dummy DPDK callback for RX.
1701 *
1702 * This function is used to temporarily replace the real callback during
1703 * unsafe control operations on the queue, or in case of error.
1704 *
1705 * @param dpdk_rxq
1706 *   Generic pointer to RX queue structure.
1707 * @param[out] pkts
1708 *   Array to store received packets.
1709 * @param pkts_n
1710 *   Maximum number of packets in array.
1711 *
1712 * @return
1713 *   Number of packets successfully received (<= pkts_n).
1714 */
1715uint16_t
1716removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
1717{
1718	(void)dpdk_rxq;
1719	(void)pkts;
1720	(void)pkts_n;
1721	return 0;
1722}
1723