vmxnet3_rxtx.c revision 3d9b7210
/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/queue.h>

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <stdint.h>
#include <stdarg.h>
#include <unistd.h>
#include <inttypes.h>

#include <rte_byteorder.h>
#include <rte_common.h>
#include <rte_cycles.h>
#include <rte_log.h>
#include <rte_debug.h>
#include <rte_interrupts.h>
#include <rte_pci.h>
#include <rte_memory.h>
#include <rte_memzone.h>
#include <rte_launch.h>
#include <rte_eal.h>
#include <rte_per_lcore.h>
#include <rte_lcore.h>
#include <rte_atomic.h>
#include <rte_branch_prediction.h>
#include <rte_mempool.h>
#include <rte_malloc.h>
#include <rte_mbuf.h>
#include <rte_ether.h>
#include <rte_ethdev.h>
#include <rte_prefetch.h>
#include <rte_ip.h>
#include <rte_udp.h>
#include <rte_tcp.h>
#include <rte_sctp.h>
#include <rte_string_fns.h>
#include <rte_errno.h>

#include "base/vmxnet3_defs.h"
#include "vmxnet3_ring.h"

#include "vmxnet3_logs.h"
#include "vmxnet3_ethdev.h"

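/* BAR0 producer-index registers for the two Rx command rings of a queue. */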
static const uint32_t rxprod_reg[2] = {VMXNET3_REG_RXPROD, VMXNET3_REG_RXPROD2};

static int vmxnet3_post_rx_bufs(vmxnet3_rx_queue_t*, uint8_t);
static void vmxnet3_tq_tx_complete(vmxnet3_tx_queue_t *);
#ifdef RTE_LIBRTE_VMXNET3_DEBUG_DRIVER_NOT_USED
static void vmxnet3_rxq_dump(struct vmxnet3_rx_queue *);
static void vmxnet3_txq_dump(struct vmxnet3_tx_queue *);
#endif

#ifdef RTE_LIBRTE_VMXNET3_DEBUG_DRIVER_NOT_USED
static void
vmxnet3_rxq_dump(struct vmxnet3_rx_queue *rxq)
{
	uint32_t avail = 0;

	if (rxq == NULL)
		return;

	PMD_RX_LOG(DEBUG,
		   "RXQ: cmd0 base : %p cmd1 base : %p comp ring base : %p.",
		   rxq->cmd_ring[0].base, rxq->cmd_ring[1].base, rxq->comp_ring.base);
	PMD_RX_LOG(DEBUG,
		   "RXQ: cmd0 basePA : 0x%lx cmd1 basePA : 0x%lx comp ring basePA : 0x%lx.",
		   (unsigned long)rxq->cmd_ring[0].basePA,
		   (unsigned long)rxq->cmd_ring[1].basePA,
		   (unsigned long)rxq->comp_ring.basePA);

	avail = vmxnet3_cmd_ring_desc_avail(&rxq->cmd_ring[0]);
	PMD_RX_LOG(DEBUG,
		   "RXQ:cmd0: size=%u; free=%u; next2proc=%u; queued=%u",
		   (uint32_t)rxq->cmd_ring[0].size, avail,
		   rxq->comp_ring.next2proc,
		   rxq->cmd_ring[0].size - avail);

	avail = vmxnet3_cmd_ring_desc_avail(&rxq->cmd_ring[1]);
	PMD_RX_LOG(DEBUG, "RXQ:cmd1 size=%u; free=%u; next2proc=%u; queued=%u",
		   (uint32_t)rxq->cmd_ring[1].size, avail, rxq->comp_ring.next2proc,
		   rxq->cmd_ring[1].size - avail);

}

static void
vmxnet3_txq_dump(struct vmxnet3_tx_queue *txq)
{
	uint32_t avail = 0;

	if (txq == NULL)
		return;

	PMD_TX_LOG(DEBUG, "TXQ: cmd base : %p comp ring base : %p data ring base : %p.",
		   txq->cmd_ring.base, txq->comp_ring.base, txq->data_ring.base);
	PMD_TX_LOG(DEBUG, "TXQ: cmd basePA : 0x%lx comp ring basePA : 0x%lx data ring basePA : 0x%lx.",
		   (unsigned long)txq->cmd_ring.basePA,
		   (unsigned long)txq->comp_ring.basePA,
		   (unsigned long)txq->data_ring.basePA);

	avail = vmxnet3_cmd_ring_desc_avail(&txq->cmd_ring);
	PMD_TX_LOG(DEBUG, "TXQ: size=%u; free=%u; next2proc=%u; queued=%u",
		   (uint32_t)txq->cmd_ring.size, avail,
		   txq->comp_ring.next2proc, txq->cmd_ring.size - avail);
}
#endif

static void
vmxnet3_tx_cmd_ring_release_mbufs(vmxnet3_cmd_ring_t *ring)
{
	while (ring->next2comp != ring->next2fill) {
		/* No need to worry about desc ownership, device is quiesced by now. */
		vmxnet3_buf_info_t *buf_info = ring->buf_info + ring->next2comp;

		if (buf_info->m) {
			rte_pktmbuf_free(buf_info->m);
			buf_info->m = NULL;
			buf_info->bufPA = 0;
			buf_info->len = 0;
		}
		vmxnet3_cmd_ring_adv_next2comp(ring);
	}
}

static void
vmxnet3_rx_cmd_ring_release_mbufs(vmxnet3_cmd_ring_t *ring)
{
	uint32_t i;

	for (i = 0; i < ring->size; i++) {
		/* No need to worry about desc ownership, device is quiesced by now. */
		vmxnet3_buf_info_t *buf_info = &ring->buf_info[i];

		if (buf_info->m) {
			rte_pktmbuf_free_seg(buf_info->m);
			buf_info->m = NULL;
			buf_info->bufPA = 0;
			buf_info->len = 0;
		}
		vmxnet3_cmd_ring_adv_next2comp(ring);
	}
}

static void
vmxnet3_cmd_ring_release(vmxnet3_cmd_ring_t *ring)
{
	rte_free(ring->buf_info);
	ring->buf_info = NULL;
}

void
vmxnet3_dev_tx_queue_release(void *txq)
{
	vmxnet3_tx_queue_t *tq = txq;

	if (tq != NULL) {
		/* Release mbufs */
		vmxnet3_tx_cmd_ring_release_mbufs(&tq->cmd_ring);
		/* Release the cmd_ring */
		vmxnet3_cmd_ring_release(&tq->cmd_ring);
	}
}

void
vmxnet3_dev_rx_queue_release(void *rxq)
{
	int i;
	vmxnet3_rx_queue_t *rq = rxq;

	if (rq != NULL) {
		/* Release mbufs */
		for (i = 0; i < VMXNET3_RX_CMDRING_SIZE; i++)
			vmxnet3_rx_cmd_ring_release_mbufs(&rq->cmd_ring[i]);

		/* Release both the cmd_rings */
		for (i = 0; i < VMXNET3_RX_CMDRING_SIZE; i++)
			vmxnet3_cmd_ring_release(&rq->cmd_ring[i]);
	}
}

static void
vmxnet3_dev_tx_queue_reset(void *txq)
{
	vmxnet3_tx_queue_t *tq = txq;
	struct vmxnet3_cmd_ring *ring = &tq->cmd_ring;
	struct vmxnet3_comp_ring *comp_ring = &tq->comp_ring;
	struct vmxnet3_data_ring *data_ring = &tq->data_ring;
	int size;

	if (tq != NULL) {
		/* Release the cmd_ring mbufs */
		vmxnet3_tx_cmd_ring_release_mbufs(&tq->cmd_ring);
	}

	/* Tx vmxnet rings structure initialization */
	ring->next2fill = 0;
	ring->next2comp = 0;
	ring->gen = VMXNET3_INIT_GEN;
	comp_ring->next2proc = 0;
	comp_ring->gen = VMXNET3_INIT_GEN;

	size = sizeof(struct Vmxnet3_TxDesc) * ring->size;
	size += sizeof(struct Vmxnet3_TxCompDesc) * comp_ring->size;
	size += sizeof(struct Vmxnet3_TxDataDesc) * data_ring->size;

	memset(ring->base, 0, size);
}

static void
vmxnet3_dev_rx_queue_reset(void *rxq)
{
	int i;
	vmxnet3_rx_queue_t *rq = rxq;
	struct vmxnet3_cmd_ring *ring0, *ring1;
	struct vmxnet3_comp_ring *comp_ring;
	int size;

	if (rq != NULL) {
		/* Release both the cmd_rings mbufs */
		for (i = 0; i < VMXNET3_RX_CMDRING_SIZE; i++)
			vmxnet3_rx_cmd_ring_release_mbufs(&rq->cmd_ring[i]);
	}

	ring0 = &rq->cmd_ring[0];
	ring1 = &rq->cmd_ring[1];
	comp_ring = &rq->comp_ring;

	/* Rx vmxnet rings structure initialization */
	ring0->next2fill = 0;
	ring1->next2fill = 0;
	ring0->next2comp = 0;
	ring1->next2comp = 0;
	ring0->gen = VMXNET3_INIT_GEN;
	ring1->gen = VMXNET3_INIT_GEN;
	comp_ring->next2proc = 0;
	comp_ring->gen = VMXNET3_INIT_GEN;

	size = sizeof(struct Vmxnet3_RxDesc) * (ring0->size + ring1->size);
	size += sizeof(struct Vmxnet3_RxCompDesc) * comp_ring->size;

	memset(ring0->base, 0, size);
}

void
vmxnet3_dev_clear_queues(struct rte_eth_dev *dev)
{
	unsigned i;

	PMD_INIT_FUNC_TRACE();

	for (i = 0; i < dev->data->nb_tx_queues; i++) {
		struct vmxnet3_tx_queue *txq = dev->data->tx_queues[i];

		if (txq != NULL) {
			txq->stopped = TRUE;
			vmxnet3_dev_tx_queue_reset(txq);
		}
	}

	for (i = 0; i < dev->data->nb_rx_queues; i++) {
		struct vmxnet3_rx_queue *rxq = dev->data->rx_queues[i];

		if (rxq != NULL) {
			rxq->stopped = TRUE;
			vmxnet3_dev_rx_queue_reset(rxq);
		}
	}
}

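/*
 * Free the mbuf of the packet whose EOP descriptor generated this Tx
 * completion and release every command-ring descriptor belonging to that
 * packet. Returns the number of descriptors released.
 */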
static int
vmxnet3_unmap_pkt(uint16_t eop_idx, vmxnet3_tx_queue_t *txq)
{
	int completed = 0;
	struct rte_mbuf *mbuf;

	/* Release cmd_ring descriptor and free mbuf */
	RTE_ASSERT(txq->cmd_ring.base[eop_idx].txd.eop == 1);

	mbuf = txq->cmd_ring.buf_info[eop_idx].m;
	if (mbuf == NULL)
		rte_panic("EOP desc does not point to a valid mbuf");
	rte_pktmbuf_free(mbuf);

	txq->cmd_ring.buf_info[eop_idx].m = NULL;

	while (txq->cmd_ring.next2comp != eop_idx) {
		/* no out-of-order completion */
		RTE_ASSERT(txq->cmd_ring.base[txq->cmd_ring.next2comp].txd.cq == 0);
		vmxnet3_cmd_ring_adv_next2comp(&txq->cmd_ring);
		completed++;
	}

	/* Mark the txd for which tcd was generated as completed */
	vmxnet3_cmd_ring_adv_next2comp(&txq->cmd_ring);

	return completed + 1;
}

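/*
 * Walk the Tx completion ring while its descriptors carry the current
 * generation bit and reclaim the command-ring descriptors they complete.
 */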
static void
vmxnet3_tq_tx_complete(vmxnet3_tx_queue_t *txq)
{
	int completed = 0;
	vmxnet3_comp_ring_t *comp_ring = &txq->comp_ring;
	struct Vmxnet3_TxCompDesc *tcd = (struct Vmxnet3_TxCompDesc *)
		(comp_ring->base + comp_ring->next2proc);

	while (tcd->gen == comp_ring->gen) {
		completed += vmxnet3_unmap_pkt(tcd->txdIdx, txq);

		vmxnet3_comp_ring_adv_next2proc(comp_ring);
		tcd = (struct Vmxnet3_TxCompDesc *)(comp_ring->base +
						    comp_ring->next2proc);
	}

	PMD_TX_LOG(DEBUG, "Processed %d tx comps & command descs.", completed);
}

uint16_t
vmxnet3_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
		  uint16_t nb_pkts)
{
	uint16_t nb_tx;
	vmxnet3_tx_queue_t *txq = tx_queue;
	struct vmxnet3_hw *hw = txq->hw;
	Vmxnet3_TxQueueCtrl *txq_ctrl = &txq->shared->ctrl;
	uint32_t deferred = rte_le_to_cpu_32(txq_ctrl->txNumDeferred);

	if (unlikely(txq->stopped)) {
		PMD_TX_LOG(DEBUG, "Tx queue is stopped.");
		return 0;
	}

	/* Free up the comp_descriptors aggressively */
	vmxnet3_tq_tx_complete(txq);

	nb_tx = 0;
	while (nb_tx < nb_pkts) {
		Vmxnet3_GenericDesc *gdesc;
		vmxnet3_buf_info_t *tbi;
		uint32_t first2fill, avail, dw2;
		struct rte_mbuf *txm = tx_pkts[nb_tx];
		struct rte_mbuf *m_seg = txm;
		int copy_size = 0;
		bool tso = (txm->ol_flags & PKT_TX_TCP_SEG) != 0;
		/* # of descriptors needed for a packet. */
		unsigned count = txm->nb_segs;

		avail = vmxnet3_cmd_ring_desc_avail(&txq->cmd_ring);
		if (count > avail) {
			/* Is command ring full? */
			if (unlikely(avail == 0)) {
				PMD_TX_LOG(DEBUG, "No free ring descriptors");
				txq->stats.tx_ring_full++;
				txq->stats.drop_total += (nb_pkts - nb_tx);
				break;
			}

			/* Command ring is not full but cannot handle the
			 * multi-segmented packet. Let's try the next packet
			 * in this case.
			 */
			PMD_TX_LOG(DEBUG, "Running out of ring descriptors "
				   "(avail %d needed %d)", avail, count);
			txq->stats.drop_total++;
			if (tso)
				txq->stats.drop_tso++;
			rte_pktmbuf_free(txm);
			nb_tx++;
			continue;
		}

		/* Drop non-TSO packet that is excessively fragmented */
		if (unlikely(!tso && count > VMXNET3_MAX_TXD_PER_PKT)) {
			PMD_TX_LOG(ERR, "Non-TSO packet cannot occupy more than %d tx "
				   "descriptors. Packet dropped.", VMXNET3_MAX_TXD_PER_PKT);
			txq->stats.drop_too_many_segs++;
			txq->stats.drop_total++;
			rte_pktmbuf_free(txm);
			nb_tx++;
			continue;
		}

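		/* Small single-segment packets are copied into the Tx data
		 * ring; the SOP descriptor then points at the data ring entry
		 * instead of the mbuf.
		 */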
		if (txm->nb_segs == 1 &&
		    rte_pktmbuf_pkt_len(txm) <= VMXNET3_HDR_COPY_SIZE) {
			struct Vmxnet3_TxDataDesc *tdd;

			tdd = txq->data_ring.base + txq->cmd_ring.next2fill;
			copy_size = rte_pktmbuf_pkt_len(txm);
			rte_memcpy(tdd->data, rte_pktmbuf_mtod(txm, char *), copy_size);
		}

		/* use the previous gen bit for the SOP desc */
		dw2 = (txq->cmd_ring.gen ^ 0x1) << VMXNET3_TXD_GEN_SHIFT;
		first2fill = txq->cmd_ring.next2fill;
		do {
			/* Remember the transmit buffer for cleanup */
			tbi = txq->cmd_ring.buf_info + txq->cmd_ring.next2fill;

			/* NB: the following assumes that VMXNET3 maximum
			 * transmit buffer size (16K) is greater than
			 * the maximum mbuf segment size.
			 */
			gdesc = txq->cmd_ring.base + txq->cmd_ring.next2fill;
			if (copy_size)
				gdesc->txd.addr = rte_cpu_to_le_64(txq->data_ring.basePA +
								   txq->cmd_ring.next2fill *
								   sizeof(struct Vmxnet3_TxDataDesc));
			else
				gdesc->txd.addr = rte_mbuf_data_dma_addr(m_seg);

			gdesc->dword[2] = dw2 | m_seg->data_len;
			gdesc->dword[3] = 0;

			/* move to the next2fill descriptor */
			vmxnet3_cmd_ring_adv_next2fill(&txq->cmd_ring);

			/* use the right gen for non-SOP desc */
			dw2 = txq->cmd_ring.gen << VMXNET3_TXD_GEN_SHIFT;
		} while ((m_seg = m_seg->next) != NULL);

		/* set the last buf_info for the pkt */
		tbi->m = txm;
		/* Update the EOP descriptor */
		gdesc->dword[3] |= VMXNET3_TXD_EOP | VMXNET3_TXD_CQ;

		/* Add VLAN tag if present */
		gdesc = txq->cmd_ring.base + first2fill;
		if (txm->ol_flags & PKT_TX_VLAN_PKT) {
			gdesc->txd.ti = 1;
			gdesc->txd.tci = txm->vlan_tci;
		}

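		/* For TSO the device emits one packet per MSS-sized segment,
		 * so each segment counts towards the doorbell threshold below.
		 */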
		if (tso) {
			uint16_t mss = txm->tso_segsz;

			RTE_ASSERT(mss > 0);

			gdesc->txd.hlen = txm->l2_len + txm->l3_len + txm->l4_len;
			gdesc->txd.om = VMXNET3_OM_TSO;
			gdesc->txd.msscof = mss;

			deferred += (rte_pktmbuf_pkt_len(txm) - gdesc->txd.hlen + mss - 1) / mss;
		} else if (txm->ol_flags & PKT_TX_L4_MASK) {
			gdesc->txd.om = VMXNET3_OM_CSUM;
			gdesc->txd.hlen = txm->l2_len + txm->l3_len;

			switch (txm->ol_flags & PKT_TX_L4_MASK) {
			case PKT_TX_TCP_CKSUM:
				gdesc->txd.msscof = gdesc->txd.hlen + offsetof(struct tcp_hdr, cksum);
				break;
			case PKT_TX_UDP_CKSUM:
				gdesc->txd.msscof = gdesc->txd.hlen + offsetof(struct udp_hdr, dgram_cksum);
				break;
			default:
				PMD_TX_LOG(WARNING, "requested cksum offload not supported %#llx",
					   txm->ol_flags & PKT_TX_L4_MASK);
				abort();
			}
			deferred++;
		} else {
			gdesc->txd.hlen = 0;
			gdesc->txd.om = VMXNET3_OM_NONE;
			gdesc->txd.msscof = 0;
			deferred++;
		}

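		/* The compiler barrier keeps the descriptor writes above from
		 * being reordered past the generation-bit flip that hands the
		 * SOP descriptor to the device.
		 */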
		/* flip the GEN bit on the SOP */
		rte_compiler_barrier();
		gdesc->dword[2] ^= VMXNET3_TXD_GEN;

		txq_ctrl->txNumDeferred = rte_cpu_to_le_32(deferred);
		nb_tx++;
	}

	PMD_TX_LOG(DEBUG, "vmxnet3 txThreshold: %u", rte_le_to_cpu_32(txq_ctrl->txThreshold));

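	/* Ring the Tx producer doorbell only once the number of deferred
	 * packets crosses the threshold supplied by the device.
	 */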
	if (deferred >= rte_le_to_cpu_32(txq_ctrl->txThreshold)) {
		txq_ctrl->txNumDeferred = 0;
		/* Notify vSwitch that packets are available. */
		VMXNET3_WRITE_BAR0_REG(hw, (VMXNET3_REG_TXPROD + txq->queue_id * VMXNET3_REG_ALIGN),
				       txq->cmd_ring.next2fill);
	}

	return nb_tx;
}

/*
 *  Allocate mbufs and post Rx descriptors with the buffer details so that
 *  the device can receive packets into those buffers.
 *  Ring layout:
 *      Among the two rings, the 1st ring contains buffers of type 0 and type 1.
 *      bufs_per_pkt is set such that for non-LRO cases all the buffers required
 *      by a frame fit in the 1st ring (1st buf of type 0 and the rest of type 1).
 *      The 2nd ring contains buffers of type 1 alone and is mostly used only
 *      for LRO.
 */
static int
vmxnet3_post_rx_bufs(vmxnet3_rx_queue_t *rxq, uint8_t ring_id)
{
	int err = 0;
	uint32_t i = 0, val = 0;
	struct vmxnet3_cmd_ring *ring = &rxq->cmd_ring[ring_id];

	if (ring_id == 0) {
		/* Usually: One HEAD type buf per packet
		 * val = (ring->next2fill % rxq->hw->bufs_per_pkt) ?
		 * VMXNET3_RXD_BTYPE_BODY : VMXNET3_RXD_BTYPE_HEAD;
		 */

		/* We use single packet buffer so all heads here */
		val = VMXNET3_RXD_BTYPE_HEAD;
	} else {
		/* All BODY type buffers for 2nd ring */
		val = VMXNET3_RXD_BTYPE_BODY;
	}

	while (vmxnet3_cmd_ring_desc_avail(ring) > 0) {
		struct Vmxnet3_RxDesc *rxd;
		struct rte_mbuf *mbuf;
		vmxnet3_buf_info_t *buf_info = &ring->buf_info[ring->next2fill];

		rxd = (struct Vmxnet3_RxDesc *)(ring->base + ring->next2fill);

		/* Allocate blank mbuf for the current Rx Descriptor */
		mbuf = rte_mbuf_raw_alloc(rxq->mp);
		if (unlikely(mbuf == NULL)) {
			PMD_RX_LOG(ERR, "Error allocating mbuf");
			rxq->stats.rx_buf_alloc_failure++;
			err = ENOMEM;
			break;
		}

		/*
		 * Load the mbuf pointer into the buf_info entry for this descriptor;
		 * the buf_info structure is the equivalent of a virtio-virtqueue cookie.
		 */
		buf_info->m = mbuf;
		buf_info->len = (uint16_t)(mbuf->buf_len -
					   RTE_PKTMBUF_HEADROOM);
		buf_info->bufPA = rte_mbuf_data_dma_addr_default(mbuf);

		/* Load Rx Descriptor with the buffer's GPA */
		rxd->addr = buf_info->bufPA;

		/* After this point rxd->addr MUST not be NULL */
		rxd->btype = val;
		rxd->len = buf_info->len;
		/* Flip gen bit at the end to change ownership */
		rxd->gen = ring->gen;

		vmxnet3_cmd_ring_adv_next2fill(ring);
		i++;
	}

	/* Return error only if no buffers are posted at present */
	if (vmxnet3_cmd_ring_desc_avail(ring) >= (ring->size - 1))
		return -err;
	else
		return i;
}


/* Receive side checksum and other offloads */
static void
vmxnet3_rx_offload(const Vmxnet3_RxCompDesc *rcd, struct rte_mbuf *rxm)
{
	/* Check for RSS */
	if (rcd->rssType != VMXNET3_RCD_RSS_TYPE_NONE) {
		rxm->ol_flags |= PKT_RX_RSS_HASH;
		rxm->hash.rss = rcd->rssHash;
	}

	/* Check packet type, checksum errors, etc. Only support IPv4 for now. */
	if (rcd->v4) {
		struct ether_hdr *eth = rte_pktmbuf_mtod(rxm, struct ether_hdr *);
		struct ipv4_hdr *ip = (struct ipv4_hdr *)(eth + 1);

		if (((ip->version_ihl & 0xf) << 2) > (int)sizeof(struct ipv4_hdr))
			rxm->packet_type = RTE_PTYPE_L3_IPV4_EXT;
		else
			rxm->packet_type = RTE_PTYPE_L3_IPV4;

		if (!rcd->cnc) {
			if (!rcd->ipc)
				rxm->ol_flags |= PKT_RX_IP_CKSUM_BAD;

			if ((rcd->tcp || rcd->udp) && !rcd->tuc)
				rxm->ol_flags |= PKT_RX_L4_CKSUM_BAD;
		}
	}
}

/*
 * Process the Rx Completion Ring of given vmxnet3_rx_queue
 * for nb_pkts burst and return the number of packets received
 */
uint16_t
vmxnet3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
{
	uint16_t nb_rx;
	uint32_t nb_rxd, idx;
	uint8_t ring_idx;
	vmxnet3_rx_queue_t *rxq;
	Vmxnet3_RxCompDesc *rcd;
	vmxnet3_buf_info_t *rbi;
	Vmxnet3_RxDesc *rxd;
	struct rte_mbuf *rxm = NULL;
	struct vmxnet3_hw *hw;

	nb_rx = 0;
	ring_idx = 0;
	nb_rxd = 0;
	idx = 0;

	rxq = rx_queue;
	hw = rxq->hw;

	rcd = &rxq->comp_ring.base[rxq->comp_ring.next2proc].rcd;

	if (unlikely(rxq->stopped)) {
		PMD_RX_LOG(DEBUG, "Rx queue is stopped.");
		return 0;
	}

	while (rcd->gen == rxq->comp_ring.gen) {
		if (nb_rx >= nb_pkts)
			break;

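		/* rqID in the completion descriptor identifies the command
		 * ring: qid1 maps to cmd_ring[0] and qid2 to cmd_ring[1].
		 */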
		idx = rcd->rxdIdx;
		ring_idx = (uint8_t)((rcd->rqID == rxq->qid1) ? 0 : 1);
		rxd = (Vmxnet3_RxDesc *)rxq->cmd_ring[ring_idx].base + idx;
		RTE_SET_USED(rxd); /* used only for assert when enabled */
		rbi = rxq->cmd_ring[ring_idx].buf_info + idx;

		PMD_RX_LOG(DEBUG, "rxd idx: %d ring idx: %d.", idx, ring_idx);

		RTE_ASSERT(rcd->len <= rxd->len);
		RTE_ASSERT(rbi->m);

		/* Get the packet buffer pointer from buf_info */
		rxm = rbi->m;

		/* Clear descriptor associated buf_info to be reused */
		rbi->m = NULL;
		rbi->bufPA = 0;

		/* Update the index that we received a packet */
		rxq->cmd_ring[ring_idx].next2comp = idx;

		/* For RCD with EOP set, check if there is frame error */
		if (unlikely(rcd->eop && rcd->err)) {
			rxq->stats.drop_total++;
			rxq->stats.drop_err++;

			if (!rcd->fcs) {
				rxq->stats.drop_fcs++;
				PMD_RX_LOG(ERR, "Recv packet dropped due to frame err.");
			}
			PMD_RX_LOG(ERR, "Error in received packet rcd#:%d rxd:%d",
				   (int)(rcd - (struct Vmxnet3_RxCompDesc *)
					 rxq->comp_ring.base), rcd->rxdIdx);
			rte_pktmbuf_free_seg(rxm);
			goto rcd_done;
		}

		/* Initialize newly received packet buffer */
		rxm->port = rxq->port_id;
		rxm->nb_segs = 1;
		rxm->next = NULL;
		rxm->pkt_len = (uint16_t)rcd->len;
		rxm->data_len = (uint16_t)rcd->len;
		rxm->data_off = RTE_PKTMBUF_HEADROOM;
		rxm->ol_flags = 0;
		rxm->vlan_tci = 0;

		/*
		 * If this is the first buffer of the received packet,
		 * set the pointer to the first mbuf of the packet
		 * Otherwise, update the total length and the number of segments
		 * of the current scattered packet, and update the pointer to
		 * the last mbuf of the current packet.
		 */
		if (rcd->sop) {
			RTE_ASSERT(rxd->btype == VMXNET3_RXD_BTYPE_HEAD);

			if (unlikely(rcd->len == 0)) {
				RTE_ASSERT(rcd->eop);

				PMD_RX_LOG(DEBUG,
					   "Rx buf was skipped. rxring[%d][%d])",
					   ring_idx, idx);
				rte_pktmbuf_free_seg(rxm);
				goto rcd_done;
			}

			rxq->start_seg = rxm;
			vmxnet3_rx_offload(rcd, rxm);
		} else {
			struct rte_mbuf *start = rxq->start_seg;

			RTE_ASSERT(rxd->btype == VMXNET3_RXD_BTYPE_BODY);

			start->pkt_len += rxm->data_len;
			start->nb_segs++;

			rxq->last_seg->next = rxm;
		}
		rxq->last_seg = rxm;

		if (rcd->eop) {
			struct rte_mbuf *start = rxq->start_seg;

			/* Check for hardware stripped VLAN tag */
			if (rcd->ts) {
				start->ol_flags |= (PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED);
				start->vlan_tci = rte_le_to_cpu_16((uint16_t)rcd->tci);
			}

			rx_pkts[nb_rx++] = start;
			rxq->start_seg = NULL;
		}

rcd_done:
		rxq->cmd_ring[ring_idx].next2comp = idx;
		VMXNET3_INC_RING_IDX_ONLY(rxq->cmd_ring[ring_idx].next2comp,
					  rxq->cmd_ring[ring_idx].size);

		/* It's time to allocate some new buf and renew descriptors */
		vmxnet3_post_rx_bufs(rxq, ring_idx);
		if (unlikely(rxq->shared->ctrl.updateRxProd)) {
			VMXNET3_WRITE_BAR0_REG(hw, rxprod_reg[ring_idx] + (rxq->queue_id * VMXNET3_REG_ALIGN),
					       rxq->cmd_ring[ring_idx].next2fill);
		}

		/* Advance to the next descriptor in comp_ring */
		vmxnet3_comp_ring_adv_next2proc(&rxq->comp_ring);

		rcd = &rxq->comp_ring.base[rxq->comp_ring.next2proc].rcd;
		nb_rxd++;
		if (nb_rxd > rxq->cmd_ring[0].size) {
			PMD_RX_LOG(ERR, "Used up quota of receiving packets,"
				   " relinquish control.");
			break;
		}
	}

	return nb_rx;
}

/*
 * Create memzone for device rings. malloc can't be used as the physical address is
 * needed. If the memzone is already created, then this function returns a ptr
 * to the old one.
 */
static const struct rte_memzone *
ring_dma_zone_reserve(struct rte_eth_dev *dev, const char *ring_name,
		      uint16_t queue_id, uint32_t ring_size, int socket_id)
{
	char z_name[RTE_MEMZONE_NAMESIZE];
	const struct rte_memzone *mz;

	snprintf(z_name, sizeof(z_name), "%s_%s_%d_%d",
		 dev->driver->pci_drv.driver.name, ring_name,
		 dev->data->port_id, queue_id);

	mz = rte_memzone_lookup(z_name);
	if (mz)
		return mz;

	return rte_memzone_reserve_aligned(z_name, ring_size,
					   socket_id, 0, VMXNET3_RING_BA_ALIGN);
}

int
vmxnet3_dev_tx_queue_setup(struct rte_eth_dev *dev,
			   uint16_t queue_idx,
			   uint16_t nb_desc,
			   unsigned int socket_id,
			   const struct rte_eth_txconf *tx_conf)
{
	struct vmxnet3_hw *hw = dev->data->dev_private;
	const struct rte_memzone *mz;
	struct vmxnet3_tx_queue *txq;
	struct vmxnet3_cmd_ring *ring;
	struct vmxnet3_comp_ring *comp_ring;
	struct vmxnet3_data_ring *data_ring;
	int size;

	PMD_INIT_FUNC_TRACE();

	if ((tx_conf->txq_flags & ETH_TXQ_FLAGS_NOXSUMSCTP) !=
	    ETH_TXQ_FLAGS_NOXSUMSCTP) {
		PMD_INIT_LOG(ERR, "SCTP checksum offload not supported");
		return -EINVAL;
	}

	txq = rte_zmalloc("ethdev_tx_queue", sizeof(struct vmxnet3_tx_queue),
			  RTE_CACHE_LINE_SIZE);
	if (txq == NULL) {
		PMD_INIT_LOG(ERR, "Can not allocate tx queue structure");
		return -ENOMEM;
	}

	txq->queue_id = queue_idx;
	txq->port_id = dev->data->port_id;
	txq->shared = &hw->tqd_start[queue_idx];
	txq->hw = hw;
	txq->qid = queue_idx;
	txq->stopped = TRUE;

	ring = &txq->cmd_ring;
	comp_ring = &txq->comp_ring;
	data_ring = &txq->data_ring;

	/* Tx vmxnet ring length should be between 512-4096 */
	if (nb_desc < VMXNET3_DEF_TX_RING_SIZE) {
		PMD_INIT_LOG(ERR, "VMXNET3 Tx Ring Size Min: %u",
			     VMXNET3_DEF_TX_RING_SIZE);
		return -EINVAL;
	} else if (nb_desc > VMXNET3_TX_RING_MAX_SIZE) {
		PMD_INIT_LOG(ERR, "VMXNET3 Tx Ring Size Max: %u",
			     VMXNET3_TX_RING_MAX_SIZE);
		return -EINVAL;
	} else {
		ring->size = nb_desc;
		ring->size &= ~VMXNET3_RING_SIZE_MASK;
	}
	comp_ring->size = data_ring->size = ring->size;

	/* Tx vmxnet rings structure initialization */
	ring->next2fill = 0;
	ring->next2comp = 0;
	ring->gen = VMXNET3_INIT_GEN;
	comp_ring->next2proc = 0;
	comp_ring->gen = VMXNET3_INIT_GEN;

	size = sizeof(struct Vmxnet3_TxDesc) * ring->size;
	size += sizeof(struct Vmxnet3_TxCompDesc) * comp_ring->size;
	size += sizeof(struct Vmxnet3_TxDataDesc) * data_ring->size;

	mz = ring_dma_zone_reserve(dev, "txdesc", queue_idx, size, socket_id);
	if (mz == NULL) {
		PMD_INIT_LOG(ERR, "ERROR: Creating queue descriptors zone");
		return -ENOMEM;
	}
	memset(mz->addr, 0, mz->len);

	/* cmd_ring initialization */
	ring->base = mz->addr;
	ring->basePA = mz->phys_addr;

	/* comp_ring initialization */
	comp_ring->base = ring->base + ring->size;
	comp_ring->basePA = ring->basePA +
		(sizeof(struct Vmxnet3_TxDesc) * ring->size);

	/* data_ring initialization */
	data_ring->base = (Vmxnet3_TxDataDesc *)(comp_ring->base + comp_ring->size);
	data_ring->basePA = comp_ring->basePA +
			(sizeof(struct Vmxnet3_TxCompDesc) * comp_ring->size);

	/* cmd_ring0 buf_info allocation */
	ring->buf_info = rte_zmalloc("tx_ring_buf_info",
				     ring->size * sizeof(vmxnet3_buf_info_t), RTE_CACHE_LINE_SIZE);
	if (ring->buf_info == NULL) {
		PMD_INIT_LOG(ERR, "ERROR: Creating tx_buf_info structure");
		return -ENOMEM;
	}

	/* Update the data portion with txq */
	dev->data->tx_queues[queue_idx] = txq;

	return 0;
}

int
vmxnet3_dev_rx_queue_setup(struct rte_eth_dev *dev,
			   uint16_t queue_idx,
			   uint16_t nb_desc,
			   unsigned int socket_id,
			   __rte_unused const struct rte_eth_rxconf *rx_conf,
			   struct rte_mempool *mp)
{
	const struct rte_memzone *mz;
	struct vmxnet3_rx_queue *rxq;
	struct vmxnet3_hw *hw = dev->data->dev_private;
	struct vmxnet3_cmd_ring *ring0, *ring1, *ring;
	struct vmxnet3_comp_ring *comp_ring;
	int size;
	uint8_t i;
	char mem_name[32];

	PMD_INIT_FUNC_TRACE();

	rxq = rte_zmalloc("ethdev_rx_queue", sizeof(struct vmxnet3_rx_queue),
			  RTE_CACHE_LINE_SIZE);
	if (rxq == NULL) {
		PMD_INIT_LOG(ERR, "Can not allocate rx queue structure");
		return -ENOMEM;
	}

	rxq->mp = mp;
	rxq->queue_id = queue_idx;
	rxq->port_id = dev->data->port_id;
	rxq->shared = &hw->rqd_start[queue_idx];
	rxq->hw = hw;
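	/* Completion descriptors identify the second command ring of this
	 * queue by qid2 = queue_idx + num_rx_queues.
	 */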
	rxq->qid1 = queue_idx;
	rxq->qid2 = queue_idx + hw->num_rx_queues;
	rxq->stopped = TRUE;

	ring0 = &rxq->cmd_ring[0];
	ring1 = &rxq->cmd_ring[1];
	comp_ring = &rxq->comp_ring;

	/* Rx vmxnet rings length should be between 256-4096 */
	if (nb_desc < VMXNET3_DEF_RX_RING_SIZE) {
		PMD_INIT_LOG(ERR, "VMXNET3 Rx Ring Size Min: 256");
		return -EINVAL;
	} else if (nb_desc > VMXNET3_RX_RING_MAX_SIZE) {
		PMD_INIT_LOG(ERR, "VMXNET3 Rx Ring Size Max: 4096");
		return -EINVAL;
	} else {
		ring0->size = nb_desc;
		ring0->size &= ~VMXNET3_RING_SIZE_MASK;
		ring1->size = ring0->size;
	}

	comp_ring->size = ring0->size + ring1->size;

	/* Rx vmxnet rings structure initialization */
	ring0->next2fill = 0;
	ring1->next2fill = 0;
	ring0->next2comp = 0;
	ring1->next2comp = 0;
	ring0->gen = VMXNET3_INIT_GEN;
	ring1->gen = VMXNET3_INIT_GEN;
	comp_ring->next2proc = 0;
	comp_ring->gen = VMXNET3_INIT_GEN;

	size = sizeof(struct Vmxnet3_RxDesc) * (ring0->size + ring1->size);
	size += sizeof(struct Vmxnet3_RxCompDesc) * comp_ring->size;

	mz = ring_dma_zone_reserve(dev, "rxdesc", queue_idx, size, socket_id);
	if (mz == NULL) {
		PMD_INIT_LOG(ERR, "ERROR: Creating queue descriptors zone");
		return -ENOMEM;
	}
	memset(mz->addr, 0, mz->len);

	/* cmd_ring0 initialization */
	ring0->base = mz->addr;
	ring0->basePA = mz->phys_addr;

	/* cmd_ring1 initialization */
	ring1->base = ring0->base + ring0->size;
	ring1->basePA = ring0->basePA + sizeof(struct Vmxnet3_RxDesc) * ring0->size;

	/* comp_ring initialization */
	comp_ring->base = ring1->base + ring1->size;
	comp_ring->basePA = ring1->basePA + sizeof(struct Vmxnet3_RxDesc) *
		ring1->size;

	/* cmd_ring0-cmd_ring1 buf_info allocation */
	for (i = 0; i < VMXNET3_RX_CMDRING_SIZE; i++) {

		ring = &rxq->cmd_ring[i];
		ring->rid = i;
		snprintf(mem_name, sizeof(mem_name), "rx_ring_%d_buf_info", i);

		ring->buf_info = rte_zmalloc(mem_name,
					     ring->size * sizeof(vmxnet3_buf_info_t),
					     RTE_CACHE_LINE_SIZE);
		if (ring->buf_info == NULL) {
			PMD_INIT_LOG(ERR, "ERROR: Creating rx_buf_info structure");
			return -ENOMEM;
		}
	}

	/* Update the data portion with rxq */
	dev->data->rx_queues[queue_idx] = rxq;

	return 0;
}

/*
 * Initializes Receive Unit
 * Load mbufs in rx queue in advance
 */
int
vmxnet3_dev_rxtx_init(struct rte_eth_dev *dev)
{
	struct vmxnet3_hw *hw = dev->data->dev_private;

	int i, ret;
	uint8_t j;

	PMD_INIT_FUNC_TRACE();

	for (i = 0; i < hw->num_rx_queues; i++) {
		vmxnet3_rx_queue_t *rxq = dev->data->rx_queues[i];

		for (j = 0; j < VMXNET3_RX_CMDRING_SIZE; j++) {
			/* Post as many buffers as the ring can hold */
			ret = vmxnet3_post_rx_bufs(rxq, j);
			if (ret <= 0) {
				PMD_INIT_LOG(ERR,
					     "ERROR: Posting Rxq: %d buffers ring: %d",
					     i, j);
				return -ret;
			}
			/*
			 * Update the device with the next2fill index so it
			 * can use the posted mbufs for incoming packets.
			 */
			if (unlikely(rxq->shared->ctrl.updateRxProd)) {
				VMXNET3_WRITE_BAR0_REG(hw, rxprod_reg[j] + (rxq->queue_id * VMXNET3_REG_ALIGN),
						       rxq->cmd_ring[j].next2fill);
			}
		}
		rxq->stopped = FALSE;
		rxq->start_seg = NULL;
	}

	for (i = 0; i < dev->data->nb_tx_queues; i++) {
		struct vmxnet3_tx_queue *txq = dev->data->tx_queues[i];

		txq->stopped = FALSE;
	}

	return 0;
}

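/* Default 40-byte Toeplitz hash key, used when the application does not
 * supply one.
 */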
static uint8_t rss_intel_key[40] = {
	0x6D, 0x5A, 0x56, 0xDA, 0x25, 0x5B, 0x0E, 0xC2,
	0x41, 0x67, 0x25, 0x3D, 0x43, 0xA3, 0x8F, 0xB0,
	0xD0, 0xCA, 0x2B, 0xCB, 0xAE, 0x7B, 0x30, 0xB4,
	0x77, 0xCB, 0x2D, 0xA3, 0x80, 0x30, 0xF2, 0x0C,
	0x6A, 0x42, 0xB7, 0x3B, 0xBE, 0xAC, 0x01, 0xFA,
};

/*
 * Configure RSS feature
 */
int
vmxnet3_rss_configure(struct rte_eth_dev *dev)
{
	struct vmxnet3_hw *hw = dev->data->dev_private;
	struct VMXNET3_RSSConf *dev_rss_conf;
	struct rte_eth_rss_conf *port_rss_conf;
	uint64_t rss_hf;
	uint8_t i, j;

	PMD_INIT_FUNC_TRACE();

	dev_rss_conf = hw->rss_conf;
	port_rss_conf = &dev->data->dev_conf.rx_adv_conf.rss_conf;

	/* loading hashFunc */
	dev_rss_conf->hashFunc = VMXNET3_RSS_HASH_FUNC_TOEPLITZ;
	/* loading hashKeySize */
	dev_rss_conf->hashKeySize = VMXNET3_RSS_MAX_KEY_SIZE;
	/* loading indTableSize: Must not exceed VMXNET3_RSS_MAX_IND_TABLE_SIZE (128) */
	dev_rss_conf->indTableSize = (uint16_t)(hw->num_rx_queues * 4);

	if (port_rss_conf->rss_key == NULL) {
		/* Default hash key */
		port_rss_conf->rss_key = rss_intel_key;
	}

	/* loading hashKey */
	memcpy(&dev_rss_conf->hashKey[0], port_rss_conf->rss_key,
	       dev_rss_conf->hashKeySize);

	/* loading indTable */
	for (i = 0, j = 0; i < dev_rss_conf->indTableSize; i++, j++) {
		if (j == dev->data->nb_rx_queues)
			j = 0;
		dev_rss_conf->indTable[i] = j;
	}

	/* loading hashType */
	dev_rss_conf->hashType = 0;
	rss_hf = port_rss_conf->rss_hf & VMXNET3_RSS_OFFLOAD_ALL;
	if (rss_hf & ETH_RSS_IPV4)
		dev_rss_conf->hashType |= VMXNET3_RSS_HASH_TYPE_IPV4;
	if (rss_hf & ETH_RSS_NONFRAG_IPV4_TCP)
		dev_rss_conf->hashType |= VMXNET3_RSS_HASH_TYPE_TCP_IPV4;
	if (rss_hf & ETH_RSS_IPV6)
		dev_rss_conf->hashType |= VMXNET3_RSS_HASH_TYPE_IPV6;
	if (rss_hf & ETH_RSS_NONFRAG_IPV6_TCP)
		dev_rss_conf->hashType |= VMXNET3_RSS_HASH_TYPE_TCP_IPV6;

	return VMXNET3_SUCCESS;
}
1129