/*-
 *   BSD LICENSE
 *
 *   Copyright 2015 6WIND S.A.
 *   Copyright 2015 Mellanox.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of 6WIND S.A. nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <assert.h>
#include <stdint.h>
#include <string.h>
#include <stdlib.h>

/* Verbs header. */
/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
#include <infiniband/verbs.h>
#include <infiniband/mlx5_hw.h>
#include <infiniband/arch.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-Wpedantic"
#endif

/* DPDK headers don't like -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
#include <rte_mbuf.h>
#include <rte_mempool.h>
#include <rte_prefetch.h>
#include <rte_common.h>
#include <rte_branch_prediction.h>
#include <rte_ether.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-Wpedantic"
#endif

#include "mlx5.h"
#include "mlx5_utils.h"
#include "mlx5_rxtx.h"
#include "mlx5_autoconf.h"
#include "mlx5_defs.h"
#include "mlx5_prm.h"

static inline int
check_cqe(volatile struct mlx5_cqe *cqe,
	  unsigned int cqes_n, const uint16_t ci)
	  __attribute__((always_inline));

static inline uint32_t
txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
	__attribute__((always_inline));

static inline void
mlx5_tx_dbrec(struct txq *txq, volatile struct mlx5_wqe *wqe)
	__attribute__((always_inline));

static inline uint32_t
rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe)
	__attribute__((always_inline));

static inline int
mlx5_rx_poll_len(struct rxq *rxq, volatile struct mlx5_cqe *cqe,
		 uint16_t cqe_cnt, uint32_t *rss_hash)
		 __attribute__((always_inline));

static inline uint32_t
rxq_cq_to_ol_flags(struct rxq *rxq, volatile struct mlx5_cqe *cqe)
		   __attribute__((always_inline));

#ifndef NDEBUG

/**
 * Verify or set magic value in CQE.
 *
 * @param cqe
 *   Pointer to CQE.
 *
 * @return
 *   0 the first time, non-zero afterwards.
 */
static inline int
check_cqe_seen(volatile struct mlx5_cqe *cqe)
{
	static const uint8_t magic[] = "seen";
	volatile uint8_t (*buf)[sizeof(cqe->rsvd3)] = &cqe->rsvd3;
	int ret = 1;
	unsigned int i;

	for (i = 0; i < sizeof(magic) && i < sizeof(*buf); ++i)
		if (!ret || (*buf)[i] != magic[i]) {
			ret = 0;
			(*buf)[i] = magic[i];
		}
	return ret;
}

#endif /* NDEBUG */

/**
 * Check whether CQE is valid.
 *
 * @param cqe
 *   Pointer to CQE.
 * @param cqes_n
 *   Size of completion queue.
 * @param ci
 *   Consumer index.
 *
 * @return
 *   0 on success, 1 on failure.
 */
static inline int
check_cqe(volatile struct mlx5_cqe *cqe,
	  unsigned int cqes_n, const uint16_t ci)
{
	uint16_t idx = ci & cqes_n;
	uint8_t op_own = cqe->op_own;
	uint8_t op_owner = MLX5_CQE_OWNER(op_own);
	uint8_t op_code = MLX5_CQE_OPCODE(op_own);

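	/*
	 * A CQE is considered valid only when its owner bit matches the
	 * expected phase of the consumer index (ci & cqes_n) and its opcode
	 * is not MLX5_CQE_INVALID.
	 */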
	if (unlikely((op_owner != (!!(idx))) || (op_code == MLX5_CQE_INVALID)))
		return 1; /* No CQE. */
#ifndef NDEBUG
	if ((op_code == MLX5_CQE_RESP_ERR) ||
	    (op_code == MLX5_CQE_REQ_ERR)) {
		volatile struct mlx5_err_cqe *err_cqe = (volatile void *)cqe;
		uint8_t syndrome = err_cqe->syndrome;

		if ((syndrome == MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR) ||
		    (syndrome == MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR))
			return 0;
		if (!check_cqe_seen(cqe))
			ERROR("unexpected CQE error %u (0x%02x)"
			      " syndrome 0x%02x",
			      op_code, op_code, syndrome);
		return 1;
	} else if ((op_code != MLX5_CQE_RESP_SEND) &&
		   (op_code != MLX5_CQE_REQ)) {
		if (!check_cqe_seen(cqe))
			ERROR("unexpected CQE opcode %u (0x%02x)",
			      op_code, op_code);
		return 1;
	}
#endif /* NDEBUG */
	return 0;
}

static inline void
txq_complete(struct txq *txq) __attribute__((always_inline));

/**
 * Manage TX completions.
 *
 * When sending a burst, mlx5_tx_burst() posts several WRs.
 *
 * @param txq
 *   Pointer to TX queue structure.
 */
static inline void
txq_complete(struct txq *txq)
{
	const unsigned int elts_n = 1 << txq->elts_n;
	const unsigned int cqe_n = 1 << txq->cqe_n;
	const unsigned int cqe_cnt = cqe_n - 1;
	uint16_t elts_free = txq->elts_tail;
	uint16_t elts_tail;
	uint16_t cq_ci = txq->cq_ci;
	volatile struct mlx5_cqe *cqe = NULL;
	volatile struct mlx5_wqe *wqe;

	do {
		volatile struct mlx5_cqe *tmp;

		tmp = &(*txq->cqes)[cq_ci & cqe_cnt];
		if (check_cqe(tmp, cqe_n, cq_ci))
			break;
		cqe = tmp;
#ifndef NDEBUG
		if (MLX5_CQE_FORMAT(cqe->op_own) == MLX5_COMPRESSED) {
			if (!check_cqe_seen(cqe))
				ERROR("unexpected compressed CQE, TX stopped");
			return;
		}
		if ((MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_RESP_ERR) ||
		    (MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_REQ_ERR)) {
			if (!check_cqe_seen(cqe))
				ERROR("unexpected error CQE, TX stopped");
			return;
		}
#endif /* NDEBUG */
		++cq_ci;
	} while (1);
	if (unlikely(cqe == NULL))
		return;
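	/*
	 * The last processed CQE reports the counter of the WQE which
	 * requested this completion; its ctrl[3] field carries the elts_head
	 * value saved there by the TX burst function.
	 */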
	wqe = &(*txq->wqes)[ntohs(cqe->wqe_counter) &
			    ((1 << txq->wqe_n) - 1)].hdr;
	elts_tail = wqe->ctrl[3];
	assert(elts_tail < (1 << txq->wqe_n));
	/* Free buffers. */
	while (elts_free != elts_tail) {
		struct rte_mbuf *elt = (*txq->elts)[elts_free];
		unsigned int elts_free_next =
			(elts_free + 1) & (elts_n - 1);
		struct rte_mbuf *elt_next = (*txq->elts)[elts_free_next];

#ifndef NDEBUG
		/* Poisoning. */
		memset(&(*txq->elts)[elts_free],
		       0x66,
		       sizeof((*txq->elts)[elts_free]));
#endif
		RTE_MBUF_PREFETCH_TO_FREE(elt_next);
		/* Only one segment needs to be freed. */
		rte_pktmbuf_free_seg(elt);
		elts_free = elts_free_next;
	}
	txq->cq_ci = cq_ci;
	txq->elts_tail = elts_tail;
	/* Update the consumer index. */
	rte_wmb();
	*txq->cq_db = htonl(cq_ci);
}

/**
 * Get Memory Pool (MP) from mbuf. If mbuf is indirect, the pool of the
 * underlying direct mbuf (where the data resides) is returned instead.
 *
 * @param buf
 *   Pointer to mbuf.
 *
 * @return
 *   Memory pool where data is located for given mbuf.
 */
static struct rte_mempool *
txq_mb2mp(struct rte_mbuf *buf)
{
	if (unlikely(RTE_MBUF_INDIRECT(buf)))
		return rte_mbuf_from_indirect(buf)->pool;
	return buf->pool;
}

/**
 * Get Memory Region (MR) <-> Memory Pool (MP) association from txq->mp2mr[].
 * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
 * remove an entry first.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param[in] mp
 *   Memory Pool for which a Memory Region lkey must be returned.
 *
 * @return
 *   mr->lkey on success, (uint32_t)-1 on failure.
 */
static inline uint32_t
txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
{
	unsigned int i;
	uint32_t lkey = (uint32_t)-1;

	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
		if (unlikely(txq->mp2mr[i].mp == NULL)) {
			/* Unknown MP, add a new MR for it. */
			break;
		}
		if (txq->mp2mr[i].mp == mp) {
			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
			assert(htonl(txq->mp2mr[i].mr->lkey) ==
			       txq->mp2mr[i].lkey);
			lkey = txq->mp2mr[i].lkey;
			break;
		}
	}
	if (unlikely(lkey == (uint32_t)-1))
		lkey = txq_mp2mr_reg(txq, mp, i);
	return lkey;
}

/**
 * Ring TX queue doorbell.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param wqe
 *   Pointer to the last WQE posted in the NIC.
 */
static inline void
mlx5_tx_dbrec(struct txq *txq, volatile struct mlx5_wqe *wqe)
{
	uint64_t *dst = (uint64_t *)((uintptr_t)txq->bf_reg);
	volatile uint64_t *src = ((volatile uint64_t *)wqe);

	rte_wmb();
	*txq->qp_db = htonl(txq->wqe_ci);
	/* Ensure ordering between DB record and BF copy. */
	rte_wmb();
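	/* Write the first 8 bytes of the last WQE to the BlueFlame register. */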
	*dst = *src;
}

/**
 * Prefetch a CQE.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param ci
 *   CQE consumer index.
 */
static inline void
tx_prefetch_cqe(struct txq *txq, uint16_t ci)
{
	volatile struct mlx5_cqe *cqe;

	cqe = &(*txq->cqes)[ci & ((1 << txq->cqe_n) - 1)];
	rte_prefetch0(cqe);
}

/**
 * Prefetch a WQE.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param ci
 *   WQE consumer index.
 */
static inline void
tx_prefetch_wqe(struct txq *txq, uint16_t ci)
{
	volatile struct mlx5_wqe64 *wqe;

	wqe = &(*txq->wqes)[ci & ((1 << txq->wqe_n) - 1)];
	rte_prefetch0(wqe);
}

/**
 * DPDK callback for TX.
 *
 * @param dpdk_txq
 *   Generic pointer to TX queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
uint16_t
mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	struct txq *txq = (struct txq *)dpdk_txq;
	uint16_t elts_head = txq->elts_head;
	const unsigned int elts_n = 1 << txq->elts_n;
	unsigned int i = 0;
	unsigned int j = 0;
	unsigned int max;
	unsigned int comp;
	volatile struct mlx5_wqe *wqe = NULL;
	unsigned int segs_n = 0;
	struct rte_mbuf *buf = NULL;
	uint8_t *raw;

	if (unlikely(!pkts_n))
		return 0;
	/* Prefetch first packet cacheline. */
	tx_prefetch_cqe(txq, txq->cq_ci);
	tx_prefetch_cqe(txq, txq->cq_ci + 1);
	rte_prefetch0(*pkts);
	/* Start processing. */
	txq_complete(txq);
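	/* Number of free entries left in the TX elements ring. */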
	max = (elts_n - (elts_head - txq->elts_tail));
	if (max > elts_n)
		max -= elts_n;
	do {
		volatile struct mlx5_wqe_data_seg *dseg = NULL;
		uint32_t length;
		unsigned int ds = 0;
		uintptr_t addr;
#ifdef MLX5_PMD_SOFT_COUNTERS
		uint32_t total_length = 0;
#endif

		/* first_seg */
		buf = *(pkts++);
		segs_n = buf->nb_segs;
		/*
		 * Make sure there is enough room to store this packet and
		 * that one ring entry remains unused.
		 */
		assert(segs_n);
		if (max < segs_n + 1)
			break;
		max -= segs_n;
		--segs_n;
		if (!segs_n)
			--pkts_n;
		wqe = &(*txq->wqes)[txq->wqe_ci &
				    ((1 << txq->wqe_n) - 1)].hdr;
		tx_prefetch_wqe(txq, txq->wqe_ci + 1);
		if (pkts_n > 1)
			rte_prefetch0(*pkts);
		addr = rte_pktmbuf_mtod(buf, uintptr_t);
		length = DATA_LEN(buf);
#ifdef MLX5_PMD_SOFT_COUNTERS
		total_length = length;
#endif
		if (length < (MLX5_WQE_DWORD_SIZE + 2)) {
			txq->stats.oerrors++;
			break;
		}
		/* Update element. */
		(*txq->elts)[elts_head] = buf;
		elts_head = (elts_head + 1) & (elts_n - 1);
		/* Prefetch next buffer data. */
		if (pkts_n > 1) {
			volatile void *pkt_addr;

			pkt_addr = rte_pktmbuf_mtod(*pkts, volatile void *);
			rte_prefetch0(pkt_addr);
		}
		/* Should we enable HW CKSUM offload? */
		if (buf->ol_flags &
		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
			wqe->eseg.cs_flags =
				MLX5_ETH_WQE_L3_CSUM |
				MLX5_ETH_WQE_L4_CSUM;
		} else {
			wqe->eseg.cs_flags = 0;
		}
		raw = (uint8_t *)(uintptr_t)&wqe->eseg.inline_hdr[0];
		/* Start the known and common part of the WQE structure. */
		wqe->ctrl[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
		wqe->ctrl[2] = 0;
		wqe->ctrl[3] = 0;
		wqe->eseg.rsvd0 = 0;
		wqe->eseg.rsvd1 = 0;
		wqe->eseg.mss = 0;
		wqe->eseg.rsvd2 = 0;
		/* Start by copying the Ethernet Header. */
		memcpy((uint8_t *)raw, ((uint8_t *)addr), 16);
		length -= MLX5_WQE_DWORD_SIZE;
		addr += MLX5_WQE_DWORD_SIZE;
		/* Replace the Ethernet type by the VLAN if necessary. */
		if (buf->ol_flags & PKT_TX_VLAN_PKT) {
			uint32_t vlan = htonl(0x81000000 | buf->vlan_tci);

			memcpy((uint8_t *)(raw + MLX5_WQE_DWORD_SIZE -
					   sizeof(vlan)),
			       &vlan, sizeof(vlan));
			addr -= sizeof(vlan);
			length += sizeof(vlan);
		}
		/* Inline if enough room. */
		if (txq->max_inline != 0) {
			uintptr_t end =
				(uintptr_t)&(*txq->wqes)[1 << txq->wqe_n];
			uint16_t max_inline =
				txq->max_inline * RTE_CACHE_LINE_SIZE;
			uint16_t pkt_inline_sz = MLX5_WQE_DWORD_SIZE;
			uint16_t room;

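			/*
			 * Skip the Ethernet header copied above and compute
			 * the room left in the WQE ring for inline data.
			 */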
			raw += MLX5_WQE_DWORD_SIZE;
			room = end - (uintptr_t)raw;
			if (room > max_inline) {
				uintptr_t addr_end = (addr + max_inline) &
					~(RTE_CACHE_LINE_SIZE - 1);
				uint16_t copy_b = ((addr_end - addr) > length) ?
						  length :
						  (addr_end - addr);

				rte_memcpy((void *)raw, (void *)addr, copy_b);
				addr += copy_b;
				length -= copy_b;
				pkt_inline_sz += copy_b;
				/* Sanity check. */
				assert(addr <= addr_end);
			}
			/* Store the inlined packet size in the WQE. */
			wqe->eseg.inline_hdr_sz = htons(pkt_inline_sz);
			/*
			 * 2 DWORDs consumed by the WQE header + 1 DSEG +
			 * the size of the inline part of the packet.
			 */
			ds = 2 + MLX5_WQE_DS(pkt_inline_sz - 2);
			if (length > 0) {
				dseg = (struct mlx5_wqe_data_seg *)
					((uintptr_t)wqe +
					 (ds * MLX5_WQE_DWORD_SIZE));
				if ((uintptr_t)dseg >= end)
					dseg = (struct mlx5_wqe_data_seg *)
						((uintptr_t)&(*txq->wqes)[0]);
				goto use_dseg;
			} else if (!segs_n) {
				goto next_pkt;
			} else {
				goto next_seg;
			}
		} else {
			/*
			 * No inline has been done in the packet, only the
			 * Ethernet Header has been stored.
			 */
			wqe->eseg.inline_hdr_sz = htons(MLX5_WQE_DWORD_SIZE);
			dseg = (struct mlx5_wqe_data_seg *)
				((uintptr_t)wqe + (3 * MLX5_WQE_DWORD_SIZE));
			ds = 3;
use_dseg:
			/* Add the remaining packet as a simple ds. */
			*dseg = (struct mlx5_wqe_data_seg) {
				.addr = htonll(addr),
				.byte_count = htonl(length),
				.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
			};
			++ds;
			if (!segs_n)
				goto next_pkt;
		}
next_seg:
		assert(buf);
		assert(ds);
		assert(wqe);
		/*
		 * Spill on next WQE when the current one does not have
		 * enough room left. The size of a WQE must be a multiple
		 * of the data segment size.
		 */
		assert(!(MLX5_WQE_SIZE % MLX5_WQE_DWORD_SIZE));
		if (!(ds % (MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE))) {
			unsigned int n = (txq->wqe_ci + ((ds + 3) / 4)) &
				((1 << txq->wqe_n) - 1);

			dseg = (struct mlx5_wqe_data_seg *)
				((uintptr_t)&(*txq->wqes)[n]);
			tx_prefetch_wqe(txq, n + 1);
		} else {
			++dseg;
		}
		++ds;
		buf = buf->next;
		assert(buf);
		length = DATA_LEN(buf);
#ifdef MLX5_PMD_SOFT_COUNTERS
		total_length += length;
#endif
		/* Store segment information. */
		*dseg = (struct mlx5_wqe_data_seg) {
			.addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t)),
			.byte_count = htonl(length),
			.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
		};
		(*txq->elts)[elts_head] = buf;
		elts_head = (elts_head + 1) & (elts_n - 1);
		++j;
		--segs_n;
		if (segs_n)
			goto next_seg;
		else
			--pkts_n;
next_pkt:
		++i;
		wqe->ctrl[1] = htonl(txq->qp_num_8s | ds);
		txq->wqe_ci += (ds + 3) / 4;
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Increment sent bytes counter. */
		txq->stats.obytes += total_length;
#endif
	} while (pkts_n);
	/* Take a shortcut if nothing must be sent. */
	if (unlikely(i == 0))
		return 0;
	/* Check whether completion threshold has been reached. */
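	/* "i" counts packets while "j" counts their additional segments. */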
	comp = txq->elts_comp + i + j;
	if (comp >= MLX5_TX_COMP_THRESH) {
		/* Request completion on last WQE. */
		wqe->ctrl[2] = htonl(8);
		/* Save elts_head in unused "immediate" field of WQE. */
		wqe->ctrl[3] = elts_head;
		txq->elts_comp = 0;
	} else {
		txq->elts_comp = comp;
	}
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Increment sent packets counter. */
	txq->stats.opackets += i;
#endif
	/* Ring QP doorbell. */
	mlx5_tx_dbrec(txq, (volatile struct mlx5_wqe *)wqe);
	txq->elts_head = elts_head;
	return i;
}

/**
 * Open a MPW session.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param mpw
 *   Pointer to MPW session structure.
 * @param length
 *   Packet length.
 */
static inline void
mlx5_mpw_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
{
	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
	volatile struct mlx5_wqe_data_seg (*dseg)[MLX5_MPW_DSEG_MAX] =
		(volatile struct mlx5_wqe_data_seg (*)[])
		(uintptr_t)&(*txq->wqes)[(idx + 1) & ((1 << txq->wqe_n) - 1)];

	mpw->state = MLX5_MPW_STATE_OPENED;
	mpw->pkts_n = 0;
	mpw->len = length;
	mpw->total_len = 0;
	mpw->wqe = (volatile struct mlx5_wqe *)&(*txq->wqes)[idx].hdr;
	mpw->wqe->eseg.mss = htons(length);
	mpw->wqe->eseg.inline_hdr_sz = 0;
	mpw->wqe->eseg.rsvd0 = 0;
	mpw->wqe->eseg.rsvd1 = 0;
	mpw->wqe->eseg.rsvd2 = 0;
	mpw->wqe->ctrl[0] = htonl((MLX5_OPC_MOD_MPW << 24) |
				  (txq->wqe_ci << 8) | MLX5_OPCODE_TSO);
	mpw->wqe->ctrl[2] = 0;
	mpw->wqe->ctrl[3] = 0;
	mpw->data.dseg[0] = (volatile struct mlx5_wqe_data_seg *)
		(((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE));
	mpw->data.dseg[1] = (volatile struct mlx5_wqe_data_seg *)
		(((uintptr_t)mpw->wqe) + (3 * MLX5_WQE_DWORD_SIZE));
	mpw->data.dseg[2] = &(*dseg)[0];
	mpw->data.dseg[3] = &(*dseg)[1];
	mpw->data.dseg[4] = &(*dseg)[2];
}

/**
 * Close a MPW session.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param mpw
 *   Pointer to MPW session structure.
 */
static inline void
mlx5_mpw_close(struct txq *txq, struct mlx5_mpw *mpw)
{
	unsigned int num = mpw->pkts_n;

	/*
	 * Store size in multiple of 16 bytes. Control and Ethernet segments
	 * count as 2.
	 */
	mpw->wqe->ctrl[1] = htonl(txq->qp_num_8s | (2 + num));
	mpw->state = MLX5_MPW_STATE_CLOSED;
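	/*
	 * The first WQE holds the control segment, the Ethernet segment and
	 * two data segments; the remaining data segments live in the next
	 * WQE, which is only consumed when more than two packets were
	 * aggregated.
	 */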
	if (num < 3)
		++txq->wqe_ci;
	else
		txq->wqe_ci += 2;
	tx_prefetch_wqe(txq, txq->wqe_ci);
	tx_prefetch_wqe(txq, txq->wqe_ci + 1);
}

/**
 * DPDK callback for TX with MPW support.
 *
 * @param dpdk_txq
 *   Generic pointer to TX queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
uint16_t
mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	struct txq *txq = (struct txq *)dpdk_txq;
	uint16_t elts_head = txq->elts_head;
	const unsigned int elts_n = 1 << txq->elts_n;
	unsigned int i = 0;
	unsigned int j = 0;
	unsigned int max;
	unsigned int comp;
	struct mlx5_mpw mpw = {
		.state = MLX5_MPW_STATE_CLOSED,
	};

	if (unlikely(!pkts_n))
		return 0;
	/* Prefetch first packet cacheline. */
	tx_prefetch_cqe(txq, txq->cq_ci);
	tx_prefetch_wqe(txq, txq->wqe_ci);
	tx_prefetch_wqe(txq, txq->wqe_ci + 1);
	/* Start processing. */
	txq_complete(txq);
	max = (elts_n - (elts_head - txq->elts_tail));
	if (max > elts_n)
		max -= elts_n;
	do {
		struct rte_mbuf *buf = *(pkts++);
		unsigned int elts_head_next;
		uint32_t length;
		unsigned int segs_n = buf->nb_segs;
		uint32_t cs_flags = 0;

		/*
		 * Make sure there is enough room to store this packet and
		 * that one ring entry remains unused.
		 */
		assert(segs_n);
		if (max < segs_n + 1)
			break;
		/* Do not bother with large packets that MPW cannot handle. */
		if (segs_n > MLX5_MPW_DSEG_MAX) {
			txq->stats.oerrors++;
			break;
		}
		max -= segs_n;
		--pkts_n;
		/* Should we enable HW CKSUM offload? */
		if (buf->ol_flags &
		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
			cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
		/* Retrieve packet information. */
		length = PKT_LEN(buf);
		assert(length);
		/* Start new session if packet differs. */
		if ((mpw.state == MLX5_MPW_STATE_OPENED) &&
		    ((mpw.len != length) ||
		     (segs_n != 1) ||
		     (mpw.wqe->eseg.cs_flags != cs_flags)))
			mlx5_mpw_close(txq, &mpw);
		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
			mlx5_mpw_new(txq, &mpw, length);
			mpw.wqe->eseg.cs_flags = cs_flags;
		}
		/* Multi-segment packets must be alone in their MPW. */
		assert((segs_n == 1) || (mpw.pkts_n == 0));
#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
		length = 0;
#endif
		do {
			volatile struct mlx5_wqe_data_seg *dseg;
			uintptr_t addr;

			elts_head_next = (elts_head + 1) & (elts_n - 1);
			assert(buf);
			(*txq->elts)[elts_head] = buf;
			dseg = mpw.data.dseg[mpw.pkts_n];
			addr = rte_pktmbuf_mtod(buf, uintptr_t);
			*dseg = (struct mlx5_wqe_data_seg){
				.byte_count = htonl(DATA_LEN(buf)),
				.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
				.addr = htonll(addr),
			};
			elts_head = elts_head_next;
#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
			length += DATA_LEN(buf);
#endif
			buf = buf->next;
			++mpw.pkts_n;
			++j;
		} while (--segs_n);
		assert(length == mpw.len);
		if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
			mlx5_mpw_close(txq, &mpw);
		elts_head = elts_head_next;
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Increment sent bytes counter. */
		txq->stats.obytes += length;
#endif
		++i;
	} while (pkts_n);
	/* Take a shortcut if nothing must be sent. */
	if (unlikely(i == 0))
		return 0;
	/* Check whether completion threshold has been reached. */
	/* "j" includes both packets and segments. */
	comp = txq->elts_comp + j;
	if (comp >= MLX5_TX_COMP_THRESH) {
		volatile struct mlx5_wqe *wqe = mpw.wqe;

		/* Request completion on last WQE. */
		wqe->ctrl[2] = htonl(8);
		/* Save elts_head in unused "immediate" field of WQE. */
		wqe->ctrl[3] = elts_head;
		txq->elts_comp = 0;
	} else {
		txq->elts_comp = comp;
	}
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Increment sent packets counter. */
	txq->stats.opackets += i;
#endif
	/* Ring QP doorbell. */
	if (mpw.state == MLX5_MPW_STATE_OPENED)
		mlx5_mpw_close(txq, &mpw);
	mlx5_tx_dbrec(txq, mpw.wqe);
	txq->elts_head = elts_head;
	return i;
}

/**
 * Open a MPW inline session.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param mpw
 *   Pointer to MPW session structure.
 * @param length
 *   Packet length.
 */
static inline void
mlx5_mpw_inline_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
{
	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
	struct mlx5_wqe_inl_small *inl;

	mpw->state = MLX5_MPW_INL_STATE_OPENED;
	mpw->pkts_n = 0;
	mpw->len = length;
	mpw->total_len = 0;
	mpw->wqe = (volatile struct mlx5_wqe *)&(*txq->wqes)[idx].hdr;
	mpw->wqe->ctrl[0] = htonl((MLX5_OPC_MOD_MPW << 24) |
				  (txq->wqe_ci << 8) |
				  MLX5_OPCODE_TSO);
	mpw->wqe->ctrl[2] = 0;
	mpw->wqe->ctrl[3] = 0;
	mpw->wqe->eseg.mss = htons(length);
	mpw->wqe->eseg.inline_hdr_sz = 0;
	mpw->wqe->eseg.cs_flags = 0;
	mpw->wqe->eseg.rsvd0 = 0;
	mpw->wqe->eseg.rsvd1 = 0;
	mpw->wqe->eseg.rsvd2 = 0;
	inl = (struct mlx5_wqe_inl_small *)
		(((uintptr_t)mpw->wqe) + 2 * MLX5_WQE_DWORD_SIZE);
	mpw->data.raw = (uint8_t *)&inl->raw;
}

/**
 * Close a MPW inline session.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param mpw
 *   Pointer to MPW session structure.
 */
static inline void
mlx5_mpw_inline_close(struct txq *txq, struct mlx5_mpw *mpw)
{
	unsigned int size;
	struct mlx5_wqe_inl_small *inl = (struct mlx5_wqe_inl_small *)
		(((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE));

	size = MLX5_WQE_SIZE - MLX5_MWQE64_INL_DATA + mpw->total_len;
	/*
	 * Store size in multiple of 16 bytes. Control and Ethernet segments
	 * count as 2.
	 */
	mpw->wqe->ctrl[1] = htonl(txq->qp_num_8s | MLX5_WQE_DS(size));
	mpw->state = MLX5_MPW_STATE_CLOSED;
	inl->byte_cnt = htonl(mpw->total_len | MLX5_INLINE_SEG);
	txq->wqe_ci += (size + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE;
}

/**
 * DPDK callback for TX with MPW inline support.
 *
 * @param dpdk_txq
 *   Generic pointer to TX queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
uint16_t
mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
			 uint16_t pkts_n)
{
	struct txq *txq = (struct txq *)dpdk_txq;
	uint16_t elts_head = txq->elts_head;
	const unsigned int elts_n = 1 << txq->elts_n;
	unsigned int i = 0;
	unsigned int j = 0;
	unsigned int max;
	unsigned int comp;
	unsigned int inline_room = txq->max_inline * RTE_CACHE_LINE_SIZE;
	struct mlx5_mpw mpw = {
		.state = MLX5_MPW_STATE_CLOSED,
	};

	if (unlikely(!pkts_n))
		return 0;
	/* Prefetch first packet cacheline. */
	tx_prefetch_cqe(txq, txq->cq_ci);
	tx_prefetch_wqe(txq, txq->wqe_ci);
	tx_prefetch_wqe(txq, txq->wqe_ci + 1);
	/* Start processing. */
	txq_complete(txq);
	max = (elts_n - (elts_head - txq->elts_tail));
	if (max > elts_n)
		max -= elts_n;
	do {
		struct rte_mbuf *buf = *(pkts++);
		unsigned int elts_head_next;
		uintptr_t addr;
		uint32_t length;
		unsigned int segs_n = buf->nb_segs;
		uint32_t cs_flags = 0;

		/*
		 * Make sure there is enough room to store this packet and
		 * that one ring entry remains unused.
		 */
		assert(segs_n);
		if (max < segs_n + 1)
			break;
		/* Do not bother with large packets that MPW cannot handle. */
		if (segs_n > MLX5_MPW_DSEG_MAX) {
			txq->stats.oerrors++;
			break;
		}
		max -= segs_n;
		--pkts_n;
		/* Should we enable HW CKSUM offload? */
		if (buf->ol_flags &
		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
			cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
		/* Retrieve packet information. */
		length = PKT_LEN(buf);
		/* Start new session if packet differs. */
		if (mpw.state == MLX5_MPW_STATE_OPENED) {
			if ((mpw.len != length) ||
			    (segs_n != 1) ||
			    (mpw.wqe->eseg.cs_flags != cs_flags))
				mlx5_mpw_close(txq, &mpw);
		} else if (mpw.state == MLX5_MPW_INL_STATE_OPENED) {
			if ((mpw.len != length) ||
			    (segs_n != 1) ||
			    (length > inline_room) ||
			    (mpw.wqe->eseg.cs_flags != cs_flags)) {
				mlx5_mpw_inline_close(txq, &mpw);
				inline_room =
					txq->max_inline * RTE_CACHE_LINE_SIZE;
			}
		}
		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
			if ((segs_n != 1) ||
			    (length > inline_room)) {
				mlx5_mpw_new(txq, &mpw, length);
				mpw.wqe->eseg.cs_flags = cs_flags;
			} else {
				mlx5_mpw_inline_new(txq, &mpw, length);
				mpw.wqe->eseg.cs_flags = cs_flags;
			}
		}
		/* Multi-segment packets must be alone in their MPW. */
		assert((segs_n == 1) || (mpw.pkts_n == 0));
		if (mpw.state == MLX5_MPW_STATE_OPENED) {
			assert(inline_room ==
			       txq->max_inline * RTE_CACHE_LINE_SIZE);
#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
			length = 0;
#endif
			do {
				volatile struct mlx5_wqe_data_seg *dseg;

				elts_head_next =
					(elts_head + 1) & (elts_n - 1);
				assert(buf);
				(*txq->elts)[elts_head] = buf;
				dseg = mpw.data.dseg[mpw.pkts_n];
				addr = rte_pktmbuf_mtod(buf, uintptr_t);
				*dseg = (struct mlx5_wqe_data_seg){
					.byte_count = htonl(DATA_LEN(buf)),
					.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
					.addr = htonll(addr),
				};
				elts_head = elts_head_next;
#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
				length += DATA_LEN(buf);
#endif
				buf = buf->next;
				++mpw.pkts_n;
				++j;
			} while (--segs_n);
			assert(length == mpw.len);
			if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
				mlx5_mpw_close(txq, &mpw);
		} else {
			unsigned int max;

			assert(mpw.state == MLX5_MPW_INL_STATE_OPENED);
			assert(length <= inline_room);
			assert(length == DATA_LEN(buf));
			elts_head_next = (elts_head + 1) & (elts_n - 1);
			addr = rte_pktmbuf_mtod(buf, uintptr_t);
			(*txq->elts)[elts_head] = buf;
			/* Maximum number of bytes before wrapping. */
			max = ((uintptr_t)&(*txq->wqes)[1 << txq->wqe_n] -
			       (uintptr_t)mpw.data.raw);
			if (length > max) {
				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
					   (void *)addr,
					   max);
				mpw.data.raw =
					(volatile void *)&(*txq->wqes)[0];
				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
					   (void *)(addr + max),
					   length - max);
				mpw.data.raw += length - max;
			} else {
				rte_memcpy((void *)(uintptr_t)mpw.data.raw,
					   (void *)addr,
					   length);
				mpw.data.raw += length;
			}
			if ((uintptr_t)mpw.data.raw ==
			    (uintptr_t)&(*txq->wqes)[1 << txq->wqe_n])
				mpw.data.raw =
					(volatile void *)&(*txq->wqes)[0];
			++mpw.pkts_n;
			mpw.total_len += length;
			++j;
			if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) {
				mlx5_mpw_inline_close(txq, &mpw);
				inline_room =
					txq->max_inline * RTE_CACHE_LINE_SIZE;
			} else {
				inline_room -= length;
			}
		}
		elts_head = elts_head_next;
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Increment sent bytes counter. */
		txq->stats.obytes += length;
#endif
		++i;
	} while (pkts_n);
	/* Take a shortcut if nothing must be sent. */
	if (unlikely(i == 0))
		return 0;
	/* Check whether completion threshold has been reached. */
	/* "j" includes both packets and segments. */
	comp = txq->elts_comp + j;
	if (comp >= MLX5_TX_COMP_THRESH) {
		volatile struct mlx5_wqe *wqe = mpw.wqe;

		/* Request completion on last WQE. */
		wqe->ctrl[2] = htonl(8);
		/* Save elts_head in unused "immediate" field of WQE. */
		wqe->ctrl[3] = elts_head;
		txq->elts_comp = 0;
	} else {
		txq->elts_comp = comp;
	}
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Increment sent packets counter. */
	txq->stats.opackets += i;
#endif
	/* Ring QP doorbell. */
	if (mpw.state == MLX5_MPW_INL_STATE_OPENED)
		mlx5_mpw_inline_close(txq, &mpw);
	else if (mpw.state == MLX5_MPW_STATE_OPENED)
		mlx5_mpw_close(txq, &mpw);
	mlx5_tx_dbrec(txq, mpw.wqe);
	txq->elts_head = elts_head;
	return i;
}

/**
 * Translate RX completion flags to packet type.
 *
 * @param[in] cqe
 *   Pointer to CQE.
 *
 * @note: keep mlx5_dev_supported_ptypes_get() in sync with any change here.
 *
 * @return
 *   Packet type for struct rte_mbuf.
 */
static inline uint32_t
rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe)
{
	uint32_t pkt_type;
	uint16_t flags = ntohs(cqe->hdr_type_etc);

	if (cqe->pkt_info & MLX5_CQE_RX_TUNNEL_PACKET) {
		pkt_type =
			TRANSPOSE(flags,
				  MLX5_CQE_RX_IPV4_PACKET,
				  RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN) |
			TRANSPOSE(flags,
				  MLX5_CQE_RX_IPV6_PACKET,
				  RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN);
		pkt_type |= ((cqe->pkt_info & MLX5_CQE_RX_OUTER_PACKET) ?
			     RTE_PTYPE_L3_IPV6_EXT_UNKNOWN :
			     RTE_PTYPE_L3_IPV4_EXT_UNKNOWN);
	} else {
		pkt_type =
			TRANSPOSE(flags,
				  MLX5_CQE_L3_HDR_TYPE_IPV6,
				  RTE_PTYPE_L3_IPV6_EXT_UNKNOWN) |
			TRANSPOSE(flags,
				  MLX5_CQE_L3_HDR_TYPE_IPV4,
				  RTE_PTYPE_L3_IPV4_EXT_UNKNOWN);
	}
	return pkt_type;
}

/**
 * Get size of the next packet for a given CQE. For compressed CQEs, the
 * consumer index is updated only once all packets of the current one have
 * been processed.
 *
 * @param rxq
 *   Pointer to RX queue.
 * @param cqe
 *   CQE to process.
 * @param[out] rss_hash
 *   Packet RSS Hash result.
 *
 * @return
 *   Packet size in bytes (0 if there is none), -1 in case of completion
 *   with error.
 */
static inline int
mlx5_rx_poll_len(struct rxq *rxq, volatile struct mlx5_cqe *cqe,
		 uint16_t cqe_cnt, uint32_t *rss_hash)
{
	struct rxq_zip *zip = &rxq->zip;
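	/* cqe_cnt is the CQ size mask, hence cqe_n is the ring size. */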
	uint16_t cqe_n = cqe_cnt + 1;
	int len = 0;

	/* Process compressed data in the CQE and mini arrays. */
	if (zip->ai) {
		volatile struct mlx5_mini_cqe8 (*mc)[8] =
			(volatile struct mlx5_mini_cqe8 (*)[8])
			(uintptr_t)(&(*rxq->cqes)[zip->ca & cqe_cnt]);

		len = ntohl((*mc)[zip->ai & 7].byte_cnt);
		*rss_hash = ntohl((*mc)[zip->ai & 7].rx_hash_result);
		if ((++zip->ai & 7) == 0) {
			/*
			 * Increment consumer index to skip the number of
			 * CQEs consumed. Hardware leaves holes in the CQ
			 * ring for software use.
			 */
			zip->ca = zip->na;
			zip->na += 8;
		}
		if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) {
			uint16_t idx = rxq->cq_ci;
			uint16_t end = zip->cq_ci;

			while (idx != end) {
				(*rxq->cqes)[idx & cqe_cnt].op_own =
					MLX5_CQE_INVALIDATE;
				++idx;
			}
			rxq->cq_ci = zip->cq_ci;
			zip->ai = 0;
		}
	/* No compressed data, get next CQE and verify if it is compressed. */
	} else {
		int ret;
		int8_t op_own;

		ret = check_cqe(cqe, cqe_n, rxq->cq_ci);
		if (unlikely(ret == 1))
			return 0;
		++rxq->cq_ci;
		op_own = cqe->op_own;
		if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) {
			volatile struct mlx5_mini_cqe8 (*mc)[8] =
				(volatile struct mlx5_mini_cqe8 (*)[8])
				(uintptr_t)(&(*rxq->cqes)[rxq->cq_ci &
							  cqe_cnt]);

			/* Fix endianness. */
			zip->cqe_cnt = ntohl(cqe->byte_cnt);
			/*
			 * Current mini array position is the one returned by
			 * check_cqe().
			 *
			 * If completion comprises several mini arrays, as a
			 * special case the second one is located 7 CQEs after
			 * the initial CQE instead of 8 for subsequent ones.
			 */
			zip->ca = rxq->cq_ci & cqe_cnt;
			zip->na = zip->ca + 7;
			/* Compute the next non compressed CQE. */
			--rxq->cq_ci;
			zip->cq_ci = rxq->cq_ci + zip->cqe_cnt;
			/* Get packet size to return. */
			len = ntohl((*mc)[0].byte_cnt);
			*rss_hash = ntohl((*mc)[0].rx_hash_result);
			zip->ai = 1;
		} else {
			len = ntohl(cqe->byte_cnt);
			*rss_hash = ntohl(cqe->rx_hash_res);
		}
		/* Error while receiving packet. */
		if (unlikely(MLX5_CQE_OPCODE(op_own) == MLX5_CQE_RESP_ERR))
			return -1;
	}
	return len;
}

/**
 * Translate RX completion flags to offload flags.
 *
 * @param[in] rxq
 *   Pointer to RX queue structure.
 * @param[in] cqe
 *   Pointer to CQE.
 *
 * @return
 *   Offload flags (ol_flags) for struct rte_mbuf.
 */
static inline uint32_t
rxq_cq_to_ol_flags(struct rxq *rxq, volatile struct mlx5_cqe *cqe)
{
	uint32_t ol_flags = 0;
	uint16_t flags = ntohs(cqe->hdr_type_etc);

	ol_flags =
		TRANSPOSE(flags,
			  MLX5_CQE_RX_L3_HDR_VALID,
			  PKT_RX_IP_CKSUM_GOOD) |
		TRANSPOSE(flags,
			  MLX5_CQE_RX_L4_HDR_VALID,
			  PKT_RX_L4_CKSUM_GOOD);
	if ((cqe->pkt_info & MLX5_CQE_RX_TUNNEL_PACKET) && (rxq->csum_l2tun))
		ol_flags |=
			TRANSPOSE(flags,
				  MLX5_CQE_RX_L3_HDR_VALID,
				  PKT_RX_IP_CKSUM_GOOD) |
			TRANSPOSE(flags,
				  MLX5_CQE_RX_L4_HDR_VALID,
				  PKT_RX_L4_CKSUM_GOOD);
	return ol_flags;
}

/**
 * DPDK callback for RX.
 *
 * @param dpdk_rxq
 *   Generic pointer to RX queue structure.
 * @param[out] pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 *
 * @return
 *   Number of packets successfully received (<= pkts_n).
 */
uint16_t
mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	struct rxq *rxq = dpdk_rxq;
	const unsigned int wqe_cnt = (1 << rxq->elts_n) - 1;
	const unsigned int cqe_cnt = (1 << rxq->cqe_n) - 1;
	const unsigned int sges_n = rxq->sges_n;
	struct rte_mbuf *pkt = NULL;
	struct rte_mbuf *seg = NULL;
	volatile struct mlx5_cqe *cqe =
		&(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
	unsigned int i = 0;
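	/*
	 * rxq->rq_ci counts strides of 2^sges_n descriptors (one stride per
	 * packet); the local rq_ci below works in single descriptor units,
	 * hence the shifts.
	 */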
	unsigned int rq_ci = rxq->rq_ci << sges_n;
	int len = 0; /* keep its value across iterations. */

	while (pkts_n) {
		unsigned int idx = rq_ci & wqe_cnt;
		volatile struct mlx5_wqe_data_seg *wqe = &(*rxq->wqes)[idx];
		struct rte_mbuf *rep = (*rxq->elts)[idx];
		uint32_t rss_hash_res = 0;

		if (pkt)
			NEXT(seg) = rep;
		seg = rep;
		rte_prefetch0(seg);
		rte_prefetch0(cqe);
		rte_prefetch0(wqe);
		rep = rte_mbuf_raw_alloc(rxq->mp);
		if (unlikely(rep == NULL)) {
			++rxq->stats.rx_nombuf;
			if (!pkt) {
				/*
				 * no buffers before we even started,
				 * bail out silently.
				 */
				break;
			}
			while (pkt != seg) {
				assert(pkt != (*rxq->elts)[idx]);
				rep = NEXT(pkt);
				rte_mbuf_refcnt_set(pkt, 0);
				__rte_mbuf_raw_free(pkt);
				pkt = rep;
			}
			break;
		}
		if (!pkt) {
			cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
			len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt,
					       &rss_hash_res);
			if (!len) {
				rte_mbuf_refcnt_set(rep, 0);
				__rte_mbuf_raw_free(rep);
				break;
			}
			if (unlikely(len == -1)) {
				/* RX error, packet is likely too large. */
				rte_mbuf_refcnt_set(rep, 0);
				__rte_mbuf_raw_free(rep);
				++rxq->stats.idropped;
				goto skip;
			}
			pkt = seg;
			assert(len >= (rxq->crc_present << 2));
			/* Update packet information. */
			pkt->packet_type = 0;
			pkt->ol_flags = 0;
			if (rss_hash_res && rxq->rss_hash) {
				pkt->hash.rss = rss_hash_res;
				pkt->ol_flags = PKT_RX_RSS_HASH;
			}
			if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip |
			    rxq->crc_present) {
				if (rxq->csum) {
					pkt->packet_type =
						rxq_cq_to_pkt_type(cqe);
					pkt->ol_flags |=
						rxq_cq_to_ol_flags(rxq, cqe);
				}
				if (ntohs(cqe->hdr_type_etc) &
				    MLX5_CQE_VLAN_STRIPPED) {
					pkt->ol_flags |= PKT_RX_VLAN_PKT |
						PKT_RX_VLAN_STRIPPED;
					pkt->vlan_tci = ntohs(cqe->vlan_info);
				}
				if (rxq->crc_present)
					len -= ETHER_CRC_LEN;
			}
			PKT_LEN(pkt) = len;
		}
		DATA_LEN(rep) = DATA_LEN(seg);
		PKT_LEN(rep) = PKT_LEN(seg);
		SET_DATA_OFF(rep, DATA_OFF(seg));
		NB_SEGS(rep) = NB_SEGS(seg);
		PORT(rep) = PORT(seg);
		NEXT(rep) = NULL;
		(*rxq->elts)[idx] = rep;
		/*
		 * Fill NIC descriptor with the new buffer.  The lkey and size
		 * of the buffers are already known, only the buffer address
		 * changes.
		 */
		wqe->addr = htonll(rte_pktmbuf_mtod(rep, uintptr_t));
		if (len > DATA_LEN(seg)) {
			len -= DATA_LEN(seg);
			++NB_SEGS(pkt);
			++rq_ci;
			continue;
		}
		DATA_LEN(seg) = len;
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Increment bytes counter. */
		rxq->stats.ibytes += PKT_LEN(pkt);
#endif
		/* Return packet. */
		*(pkts++) = pkt;
		pkt = NULL;
		--pkts_n;
		++i;
skip:
		/* Align consumer index to the next stride. */
		rq_ci >>= sges_n;
		++rq_ci;
		rq_ci <<= sges_n;
	}
	if (unlikely((i == 0) && ((rq_ci >> sges_n) == rxq->rq_ci)))
		return 0;
	/* Update the consumer index. */
	rxq->rq_ci = rq_ci >> sges_n;
	rte_wmb();
	*rxq->cq_db = htonl(rxq->cq_ci);
	rte_wmb();
	*rxq->rq_db = htonl(rxq->rq_ci);
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Increment packets counter. */
	rxq->stats.ipackets += i;
#endif
	return i;
}

/**
 * Dummy DPDK callback for TX.
 *
 * This function is used to temporarily replace the real callback during
 * unsafe control operations on the queue, or in case of error.
 *
 * @param dpdk_txq
 *   Generic pointer to TX queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
uint16_t
removed_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	(void)dpdk_txq;
	(void)pkts;
	(void)pkts_n;
	return 0;
}

/**
 * Dummy DPDK callback for RX.
 *
 * This function is used to temporarily replace the real callback during
 * unsafe control operations on the queue, or in case of error.
 *
 * @param dpdk_rxq
 *   Generic pointer to RX queue structure.
 * @param[out] pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 *
 * @return
 *   Number of packets successfully received (<= pkts_n).
 */
uint16_t
removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	(void)dpdk_rxq;
	(void)pkts;
	(void)pkts_n;
	return 0;
}