/* vmxnet3_rxtx.c, revision c300e355 */
/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/queue.h>

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <stdint.h>
#include <stdarg.h>
#include <unistd.h>
#include <inttypes.h>

#include <rte_byteorder.h>
#include <rte_common.h>
#include <rte_cycles.h>
#include <rte_log.h>
#include <rte_debug.h>
#include <rte_interrupts.h>
#include <rte_pci.h>
#include <rte_memory.h>
#include <rte_memzone.h>
#include <rte_launch.h>
#include <rte_eal.h>
#include <rte_per_lcore.h>
#include <rte_lcore.h>
#include <rte_atomic.h>
#include <rte_branch_prediction.h>
#include <rte_ring.h>
#include <rte_mempool.h>
#include <rte_malloc.h>
#include <rte_mbuf.h>
#include <rte_ether.h>
#include <rte_ethdev.h>
#include <rte_prefetch.h>
#include <rte_ip.h>
#include <rte_udp.h>
#include <rte_tcp.h>
#include <rte_sctp.h>
#include <rte_string_fns.h>
#include <rte_errno.h>

#include "base/vmxnet3_defs.h"
#include "vmxnet3_ring.h"

#include "vmxnet3_logs.h"
#include "vmxnet3_ethdev.h"

static const uint32_t rxprod_reg[2] = {VMXNET3_REG_RXPROD, VMXNET3_REG_RXPROD2};

static int vmxnet3_post_rx_bufs(vmxnet3_rx_queue_t*, uint8_t);
static void vmxnet3_tq_tx_complete(vmxnet3_tx_queue_t *);
#ifdef RTE_LIBRTE_VMXNET3_DEBUG_DRIVER_NOT_USED
static void vmxnet3_rxq_dump(struct vmxnet3_rx_queue *);
static void vmxnet3_txq_dump(struct vmxnet3_tx_queue *);
#endif

#ifdef RTE_LIBRTE_VMXNET3_DEBUG_DRIVER_NOT_USED
static void
vmxnet3_rxq_dump(struct vmxnet3_rx_queue *rxq)
{
	uint32_t avail = 0;

	if (rxq == NULL)
		return;

	PMD_RX_LOG(DEBUG,
		   "RXQ: cmd0 base : %p cmd1 base : %p comp ring base : %p.",
		   rxq->cmd_ring[0].base, rxq->cmd_ring[1].base, rxq->comp_ring.base);
	PMD_RX_LOG(DEBUG,
		   "RXQ: cmd0 basePA : 0x%lx cmd1 basePA : 0x%lx comp ring basePA : 0x%lx.",
		   (unsigned long)rxq->cmd_ring[0].basePA,
		   (unsigned long)rxq->cmd_ring[1].basePA,
		   (unsigned long)rxq->comp_ring.basePA);

	avail = vmxnet3_cmd_ring_desc_avail(&rxq->cmd_ring[0]);
	PMD_RX_LOG(DEBUG,
		   "RXQ:cmd0: size=%u; free=%u; next2proc=%u; queued=%u",
		   (uint32_t)rxq->cmd_ring[0].size, avail,
		   rxq->comp_ring.next2proc,
		   rxq->cmd_ring[0].size - avail);

	avail = vmxnet3_cmd_ring_desc_avail(&rxq->cmd_ring[1]);
	PMD_RX_LOG(DEBUG, "RXQ:cmd1 size=%u; free=%u; next2proc=%u; queued=%u",
		   (uint32_t)rxq->cmd_ring[1].size, avail, rxq->comp_ring.next2proc,
		   rxq->cmd_ring[1].size - avail);

}

static void
vmxnet3_txq_dump(struct vmxnet3_tx_queue *txq)
{
	uint32_t avail = 0;

	if (txq == NULL)
		return;

	PMD_TX_LOG(DEBUG, "TXQ: cmd base : %p comp ring base : %p data ring base : %p.",
		   txq->cmd_ring.base, txq->comp_ring.base, txq->data_ring.base);
	PMD_TX_LOG(DEBUG, "TXQ: cmd basePA : 0x%lx comp ring basePA : 0x%lx data ring basePA : 0x%lx.",
		   (unsigned long)txq->cmd_ring.basePA,
		   (unsigned long)txq->comp_ring.basePA,
		   (unsigned long)txq->data_ring.basePA);

	avail = vmxnet3_cmd_ring_desc_avail(&txq->cmd_ring);
	PMD_TX_LOG(DEBUG, "TXQ: size=%u; free=%u; next2proc=%u; queued=%u",
		   (uint32_t)txq->cmd_ring.size, avail,
		   txq->comp_ring.next2proc, txq->cmd_ring.size - avail);
}
#endif

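/*
 * Helpers used on queue release/reset: walk a command ring and free any
 * mbufs still attached to its buf_info entries. The Tx variant only walks
 * the in-flight [next2comp, next2fill) window and frees whole packets; the
 * Rx variant scans every descriptor and frees individual segments. Both
 * assume the device has already been quiesced, so descriptor ownership no
 * longer matters.
 */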
static void
vmxnet3_tx_cmd_ring_release_mbufs(vmxnet3_cmd_ring_t *ring)
{
	while (ring->next2comp != ring->next2fill) {
		/* No need to worry about desc ownership, device is quiesced by now. */
		vmxnet3_buf_info_t *buf_info = ring->buf_info + ring->next2comp;

		if (buf_info->m) {
			rte_pktmbuf_free(buf_info->m);
			buf_info->m = NULL;
			buf_info->bufPA = 0;
			buf_info->len = 0;
		}
		vmxnet3_cmd_ring_adv_next2comp(ring);
	}
}

static void
vmxnet3_rx_cmd_ring_release_mbufs(vmxnet3_cmd_ring_t *ring)
{
	uint32_t i;

	for (i = 0; i < ring->size; i++) {
		/* No need to worry about desc ownership, device is quiesced by now. */
		vmxnet3_buf_info_t *buf_info = &ring->buf_info[i];

		if (buf_info->m) {
			rte_pktmbuf_free_seg(buf_info->m);
			buf_info->m = NULL;
			buf_info->bufPA = 0;
			buf_info->len = 0;
		}
		vmxnet3_cmd_ring_adv_next2comp(ring);
	}
}

static void
vmxnet3_cmd_ring_release(vmxnet3_cmd_ring_t *ring)
{
	rte_free(ring->buf_info);
	ring->buf_info = NULL;
}


void
vmxnet3_dev_tx_queue_release(void *txq)
{
	vmxnet3_tx_queue_t *tq = txq;

	if (tq != NULL) {
		/* Release mbufs */
		vmxnet3_tx_cmd_ring_release_mbufs(&tq->cmd_ring);
		/* Release the cmd_ring */
		vmxnet3_cmd_ring_release(&tq->cmd_ring);
	}
}

void
vmxnet3_dev_rx_queue_release(void *rxq)
{
	int i;
	vmxnet3_rx_queue_t *rq = rxq;

	if (rq != NULL) {
		/* Release mbufs */
		for (i = 0; i < VMXNET3_RX_CMDRING_SIZE; i++)
			vmxnet3_rx_cmd_ring_release_mbufs(&rq->cmd_ring[i]);

		/* Release both the cmd_rings */
		for (i = 0; i < VMXNET3_RX_CMDRING_SIZE; i++)
			vmxnet3_cmd_ring_release(&rq->cmd_ring[i]);
	}
}

static void
vmxnet3_dev_tx_queue_reset(void *txq)
{
	vmxnet3_tx_queue_t *tq = txq;
	struct vmxnet3_cmd_ring *ring = &tq->cmd_ring;
	struct vmxnet3_comp_ring *comp_ring = &tq->comp_ring;
	struct vmxnet3_data_ring *data_ring = &tq->data_ring;
	int size;

	if (tq != NULL) {
		/* Release the cmd_ring mbufs */
		vmxnet3_tx_cmd_ring_release_mbufs(&tq->cmd_ring);
	}

	/* Tx vmxnet rings structure initialization */
	ring->next2fill = 0;
	ring->next2comp = 0;
	ring->gen = VMXNET3_INIT_GEN;
	comp_ring->next2proc = 0;
	comp_ring->gen = VMXNET3_INIT_GEN;

	size = sizeof(struct Vmxnet3_TxDesc) * ring->size;
	size += sizeof(struct Vmxnet3_TxCompDesc) * comp_ring->size;
	size += sizeof(struct Vmxnet3_TxDataDesc) * data_ring->size;

	memset(ring->base, 0, size);
}

static void
vmxnet3_dev_rx_queue_reset(void *rxq)
{
	int i;
	vmxnet3_rx_queue_t *rq = rxq;
	struct vmxnet3_cmd_ring *ring0, *ring1;
	struct vmxnet3_comp_ring *comp_ring;
	int size;

	if (rq != NULL) {
		/* Release both the cmd_rings mbufs */
		for (i = 0; i < VMXNET3_RX_CMDRING_SIZE; i++)
			vmxnet3_rx_cmd_ring_release_mbufs(&rq->cmd_ring[i]);
	}

	ring0 = &rq->cmd_ring[0];
	ring1 = &rq->cmd_ring[1];
	comp_ring = &rq->comp_ring;

	/* Rx vmxnet rings structure initialization */
	ring0->next2fill = 0;
	ring1->next2fill = 0;
	ring0->next2comp = 0;
	ring1->next2comp = 0;
	ring0->gen = VMXNET3_INIT_GEN;
	ring1->gen = VMXNET3_INIT_GEN;
	comp_ring->next2proc = 0;
	comp_ring->gen = VMXNET3_INIT_GEN;

	size = sizeof(struct Vmxnet3_RxDesc) * (ring0->size + ring1->size);
	size += sizeof(struct Vmxnet3_RxCompDesc) * comp_ring->size;

	memset(ring0->base, 0, size);
}

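/*
 * Stop all Tx and Rx queues of the port and reset their ring state so the
 * device can be reconfigured or restarted from a clean slate.
 */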
void
vmxnet3_dev_clear_queues(struct rte_eth_dev *dev)
{
	unsigned i;

	PMD_INIT_FUNC_TRACE();

	for (i = 0; i < dev->data->nb_tx_queues; i++) {
		struct vmxnet3_tx_queue *txq = dev->data->tx_queues[i];

		if (txq != NULL) {
			txq->stopped = TRUE;
			vmxnet3_dev_tx_queue_reset(txq);
		}
	}

	for (i = 0; i < dev->data->nb_rx_queues; i++) {
		struct vmxnet3_rx_queue *rxq = dev->data->rx_queues[i];

		if (rxq != NULL) {
			rxq->stopped = TRUE;
			vmxnet3_dev_rx_queue_reset(rxq);
		}
	}
}

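/*
 * Reclaim the descriptors of one transmitted packet, identified by the index
 * of its EOP descriptor, and free the associated mbuf chain. Returns the
 * number of command-ring descriptors released.
 */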
static int
vmxnet3_unmap_pkt(uint16_t eop_idx, vmxnet3_tx_queue_t *txq)
{
	int completed = 0;
	struct rte_mbuf *mbuf;

	/* Release cmd_ring descriptor and free mbuf */
	RTE_ASSERT(txq->cmd_ring.base[eop_idx].txd.eop == 1);

	mbuf = txq->cmd_ring.buf_info[eop_idx].m;
	if (mbuf == NULL)
		rte_panic("EOP desc does not point to a valid mbuf");
	rte_pktmbuf_free(mbuf);

	txq->cmd_ring.buf_info[eop_idx].m = NULL;

	while (txq->cmd_ring.next2comp != eop_idx) {
		/* no out-of-order completion */
		RTE_ASSERT(txq->cmd_ring.base[txq->cmd_ring.next2comp].txd.cq == 0);
		vmxnet3_cmd_ring_adv_next2comp(&txq->cmd_ring);
		completed++;
	}

	/* Mark the txd for which tcd was generated as completed */
	vmxnet3_cmd_ring_adv_next2comp(&txq->cmd_ring);

	return completed + 1;
}

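/*
 * Walk the Tx completion ring and reclaim every descriptor whose generation
 * bit matches the ring's current generation, i.e. every completion the
 * device has written since the previous pass.
 */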
static void
vmxnet3_tq_tx_complete(vmxnet3_tx_queue_t *txq)
{
	int completed = 0;
	vmxnet3_comp_ring_t *comp_ring = &txq->comp_ring;
	struct Vmxnet3_TxCompDesc *tcd = (struct Vmxnet3_TxCompDesc *)
		(comp_ring->base + comp_ring->next2proc);

	while (tcd->gen == comp_ring->gen) {
		completed += vmxnet3_unmap_pkt(tcd->txdIdx, txq);

		vmxnet3_comp_ring_adv_next2proc(comp_ring);
		tcd = (struct Vmxnet3_TxCompDesc *)(comp_ring->base +
						    comp_ring->next2proc);
	}

	PMD_TX_LOG(DEBUG, "Processed %d tx comps & command descs.", completed);
}

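/*
 * Burst transmit routine for this PMD (registered as the ethdev
 * tx_pkt_burst callback elsewhere in the driver). For each packet it
 * reclaims completed descriptors, copies very small single-segment packets
 * into the data ring, fills one Tx descriptor per mbuf segment, applies the
 * VLAN/TSO/L4-checksum offload fields on the SOP descriptor, and finally
 * flips the SOP generation bit to hand the packet to the device. The TXPROD
 * doorbell is only written once the number of deferred packets crosses the
 * threshold advertised by the device.
 */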
uint16_t
vmxnet3_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
		  uint16_t nb_pkts)
{
	uint16_t nb_tx;
	vmxnet3_tx_queue_t *txq = tx_queue;
	struct vmxnet3_hw *hw = txq->hw;
	Vmxnet3_TxQueueCtrl *txq_ctrl = &txq->shared->ctrl;
	uint32_t deferred = rte_le_to_cpu_32(txq_ctrl->txNumDeferred);

	if (unlikely(txq->stopped)) {
		PMD_TX_LOG(DEBUG, "Tx queue is stopped.");
		return 0;
	}

	/* Free up the comp_descriptors aggressively */
	vmxnet3_tq_tx_complete(txq);

	nb_tx = 0;
	while (nb_tx < nb_pkts) {
		Vmxnet3_GenericDesc *gdesc;
		vmxnet3_buf_info_t *tbi;
		uint32_t first2fill, avail, dw2;
		struct rte_mbuf *txm = tx_pkts[nb_tx];
		struct rte_mbuf *m_seg = txm;
		int copy_size = 0;
		bool tso = (txm->ol_flags & PKT_TX_TCP_SEG) != 0;
		/* # of descriptors needed for a packet. */
		unsigned count = txm->nb_segs;

		avail = vmxnet3_cmd_ring_desc_avail(&txq->cmd_ring);
		if (count > avail) {
			/* Is command ring full? */
			if (unlikely(avail == 0)) {
				PMD_TX_LOG(DEBUG, "No free ring descriptors");
				txq->stats.tx_ring_full++;
				txq->stats.drop_total += (nb_pkts - nb_tx);
				break;
			}

			/* Command ring is not full but cannot handle the
			 * multi-segmented packet. Let's try the next packet
			 * in this case.
			 */
			PMD_TX_LOG(DEBUG, "Running out of ring descriptors "
				   "(avail %d needed %d)", avail, count);
			txq->stats.drop_total++;
			if (tso)
				txq->stats.drop_tso++;
			rte_pktmbuf_free(txm);
			nb_tx++;
			continue;
		}

		/* Drop non-TSO packet that is excessively fragmented */
		if (unlikely(!tso && count > VMXNET3_MAX_TXD_PER_PKT)) {
			PMD_TX_LOG(ERR, "Non-TSO packet cannot occupy more than %d tx "
				   "descriptors. Packet dropped.", VMXNET3_MAX_TXD_PER_PKT);
			txq->stats.drop_too_many_segs++;
			txq->stats.drop_total++;
			rte_pktmbuf_free(txm);
			nb_tx++;
			continue;
		}

		if (txm->nb_segs == 1 && rte_pktmbuf_pkt_len(txm) <= VMXNET3_HDR_COPY_SIZE) {
			struct Vmxnet3_TxDataDesc *tdd;

			tdd = txq->data_ring.base + txq->cmd_ring.next2fill;
			copy_size = rte_pktmbuf_pkt_len(txm);
			rte_memcpy(tdd->data, rte_pktmbuf_mtod(txm, char *), copy_size);
		}

		/* use the previous gen bit for the SOP desc */
		dw2 = (txq->cmd_ring.gen ^ 0x1) << VMXNET3_TXD_GEN_SHIFT;
		first2fill = txq->cmd_ring.next2fill;
		do {
			/* Remember the transmit buffer for cleanup */
			tbi = txq->cmd_ring.buf_info + txq->cmd_ring.next2fill;

			/* NB: the following assumes that the VMXNET3 maximum
			 * transmit buffer size (16K) is greater than the
			 * maximum mbuf segment size.
			 */
			gdesc = txq->cmd_ring.base + txq->cmd_ring.next2fill;
			if (copy_size)
				gdesc->txd.addr = rte_cpu_to_le_64(txq->data_ring.basePA +
								txq->cmd_ring.next2fill *
								sizeof(struct Vmxnet3_TxDataDesc));
			else
				gdesc->txd.addr = rte_mbuf_data_dma_addr(m_seg);

			gdesc->dword[2] = dw2 | m_seg->data_len;
			gdesc->dword[3] = 0;

			/* move to the next2fill descriptor */
			vmxnet3_cmd_ring_adv_next2fill(&txq->cmd_ring);

			/* use the right gen for non-SOP desc */
			dw2 = txq->cmd_ring.gen << VMXNET3_TXD_GEN_SHIFT;
		} while ((m_seg = m_seg->next) != NULL);

		/* set the last buf_info for the pkt */
		tbi->m = txm;
		/* Update the EOP descriptor */
		gdesc->dword[3] |= VMXNET3_TXD_EOP | VMXNET3_TXD_CQ;

		/* Add VLAN tag if present */
		gdesc = txq->cmd_ring.base + first2fill;
		if (txm->ol_flags & PKT_TX_VLAN_PKT) {
			gdesc->txd.ti = 1;
			gdesc->txd.tci = txm->vlan_tci;
		}

		if (tso) {
			uint16_t mss = txm->tso_segsz;

			RTE_ASSERT(mss > 0);

			gdesc->txd.hlen = txm->l2_len + txm->l3_len + txm->l4_len;
			gdesc->txd.om = VMXNET3_OM_TSO;
			gdesc->txd.msscof = mss;

			deferred += (rte_pktmbuf_pkt_len(txm) - gdesc->txd.hlen + mss - 1) / mss;
		} else if (txm->ol_flags & PKT_TX_L4_MASK) {
			gdesc->txd.om = VMXNET3_OM_CSUM;
			gdesc->txd.hlen = txm->l2_len + txm->l3_len;

			switch (txm->ol_flags & PKT_TX_L4_MASK) {
			case PKT_TX_TCP_CKSUM:
				gdesc->txd.msscof = gdesc->txd.hlen + offsetof(struct tcp_hdr, cksum);
				break;
			case PKT_TX_UDP_CKSUM:
				gdesc->txd.msscof = gdesc->txd.hlen + offsetof(struct udp_hdr, dgram_cksum);
				break;
			default:
				PMD_TX_LOG(WARNING, "requested cksum offload not supported %#llx",
					   txm->ol_flags & PKT_TX_L4_MASK);
				abort();
			}
			deferred++;
		} else {
			gdesc->txd.hlen = 0;
			gdesc->txd.om = VMXNET3_OM_NONE;
			gdesc->txd.msscof = 0;
			deferred++;
		}

		/* flip the GEN bit on the SOP */
		rte_compiler_barrier();
		gdesc->dword[2] ^= VMXNET3_TXD_GEN;

		txq_ctrl->txNumDeferred = rte_cpu_to_le_32(deferred);
		nb_tx++;
	}

	PMD_TX_LOG(DEBUG, "vmxnet3 txThreshold: %u", rte_le_to_cpu_32(txq_ctrl->txThreshold));

	if (deferred >= rte_le_to_cpu_32(txq_ctrl->txThreshold)) {
		txq_ctrl->txNumDeferred = 0;
		/* Notify vSwitch that packets are available. */
		VMXNET3_WRITE_BAR0_REG(hw, (VMXNET3_REG_TXPROD + txq->queue_id * VMXNET3_REG_ALIGN),
				       txq->cmd_ring.next2fill);
	}

	return nb_tx;
}
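
/*
 * For reference, a minimal sketch of how an application of this DPDK
 * generation reaches vmxnet3_xmit_pkts() through the generic ethdev API
 * (port_id, queue_id and the packet count are illustrative, not taken from
 * this file):
 *
 *	struct rte_mbuf *pkts[32];
 *	uint16_t n = 32;                // packets prepared by the application
 *	uint16_t sent = rte_eth_tx_burst(port_id, queue_id, pkts, n);
 *	// rte_eth_tx_burst() dispatches to vmxnet3_xmit_pkts() for ports
 *	// driven by this PMD; unsent mbufs (indexes >= sent) remain owned
 *	// by the caller and must be retried or freed.
 */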

/*
 * Allocates mbufs and clusters, and posts Rx descriptors with the buffer
 * details so that the device can receive packets into those buffers.
 * Ring layout:
 *      Of the two rings, the 1st ring contains buffers of type 0 and type 1.
 *      bufs_per_pkt is set such that for non-LRO cases all the buffers required
 *      by a frame fit in the 1st ring (1st buf of type 0, the rest of type 1).
 *      The 2nd ring contains buffers of type 1 alone and is mostly used only
 *      for LRO.
 */
static int
vmxnet3_post_rx_bufs(vmxnet3_rx_queue_t *rxq, uint8_t ring_id)
{
	int err = 0;
	uint32_t i = 0, val = 0;
	struct vmxnet3_cmd_ring *ring = &rxq->cmd_ring[ring_id];

	if (ring_id == 0) {
		/* Usually: One HEAD type buf per packet
		 * val = (ring->next2fill % rxq->hw->bufs_per_pkt) ?
		 * VMXNET3_RXD_BTYPE_BODY : VMXNET3_RXD_BTYPE_HEAD;
		 */

		/* We use single packet buffer so all heads here */
		val = VMXNET3_RXD_BTYPE_HEAD;
	} else {
		/* All BODY type buffers for 2nd ring */
		val = VMXNET3_RXD_BTYPE_BODY;
	}

	while (vmxnet3_cmd_ring_desc_avail(ring) > 0) {
		struct Vmxnet3_RxDesc *rxd;
		struct rte_mbuf *mbuf;
		vmxnet3_buf_info_t *buf_info = &ring->buf_info[ring->next2fill];

		rxd = (struct Vmxnet3_RxDesc *)(ring->base + ring->next2fill);

		/* Allocate blank mbuf for the current Rx Descriptor */
		mbuf = rte_mbuf_raw_alloc(rxq->mp);
		if (unlikely(mbuf == NULL)) {
			PMD_RX_LOG(ERR, "Error allocating mbuf");
			rxq->stats.rx_buf_alloc_failure++;
			err = ENOMEM;
			break;
		}

		/*
		 * Load the mbuf pointer into the buf_info entry of this descriptor;
		 * buf_info plays the same role as the cookie in a virtio virtqueue.
		 */
		buf_info->m = mbuf;
		buf_info->len = (uint16_t)(mbuf->buf_len -
					   RTE_PKTMBUF_HEADROOM);
		buf_info->bufPA =
			rte_mbuf_data_dma_addr_default(mbuf);

		/* Load Rx Descriptor with the buffer's GPA */
		rxd->addr = buf_info->bufPA;

		/* After this point rxd->addr MUST not be NULL */
		rxd->btype = val;
		rxd->len = buf_info->len;
		/* Flip gen bit at the end to change ownership */
		rxd->gen = ring->gen;

		vmxnet3_cmd_ring_adv_next2fill(ring);
		i++;
	}

	/* Return error only if no buffers are posted at present */
	if (vmxnet3_cmd_ring_desc_avail(ring) >= (ring->size - 1))
		return -err;
	else
		return i;
}


/* Receive side checksum and other offloads */
static void
vmxnet3_rx_offload(const Vmxnet3_RxCompDesc *rcd, struct rte_mbuf *rxm)
{
	/* Check for RSS */
	if (rcd->rssType != VMXNET3_RCD_RSS_TYPE_NONE) {
		rxm->ol_flags |= PKT_RX_RSS_HASH;
		rxm->hash.rss = rcd->rssHash;
	}

	/* Check packet type, checksum errors, etc.; only IPv4 is supported for now. */
	if (rcd->v4) {
		struct ether_hdr *eth = rte_pktmbuf_mtod(rxm, struct ether_hdr *);
		struct ipv4_hdr *ip = (struct ipv4_hdr *)(eth + 1);

		if (((ip->version_ihl & 0xf) << 2) > (int)sizeof(struct ipv4_hdr))
			rxm->packet_type = RTE_PTYPE_L3_IPV4_EXT;
		else
			rxm->packet_type = RTE_PTYPE_L3_IPV4;

		if (!rcd->cnc) {
			if (!rcd->ipc)
				rxm->ol_flags |= PKT_RX_IP_CKSUM_BAD;

			if ((rcd->tcp || rcd->udp) && !rcd->tuc)
				rxm->ol_flags |= PKT_RX_L4_CKSUM_BAD;
		}
	}
}

/*
 * Process the Rx Completion Ring of given vmxnet3_rx_queue
 * for nb_pkts burst and return the number of packets received
 */
uint16_t
vmxnet3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
{
	uint16_t nb_rx;
	uint32_t nb_rxd, idx;
	uint8_t ring_idx;
	vmxnet3_rx_queue_t *rxq;
	Vmxnet3_RxCompDesc *rcd;
	vmxnet3_buf_info_t *rbi;
	Vmxnet3_RxDesc *rxd;
	struct rte_mbuf *rxm = NULL;
	struct vmxnet3_hw *hw;

	nb_rx = 0;
	ring_idx = 0;
	nb_rxd = 0;
	idx = 0;

	rxq = rx_queue;
	hw = rxq->hw;

	rcd = &rxq->comp_ring.base[rxq->comp_ring.next2proc].rcd;

	if (unlikely(rxq->stopped)) {
		PMD_RX_LOG(DEBUG, "Rx queue is stopped.");
		return 0;
	}

	while (rcd->gen == rxq->comp_ring.gen) {
		if (nb_rx >= nb_pkts)
			break;

		idx = rcd->rxdIdx;
		ring_idx = (uint8_t)((rcd->rqID == rxq->qid1) ? 0 : 1);
		rxd = (Vmxnet3_RxDesc *)rxq->cmd_ring[ring_idx].base + idx;
		RTE_SET_USED(rxd); /* used only for assert when enabled */
		rbi = rxq->cmd_ring[ring_idx].buf_info + idx;

		PMD_RX_LOG(DEBUG, "rxd idx: %d ring idx: %d.", idx, ring_idx);

		RTE_ASSERT(rcd->len <= rxd->len);
		RTE_ASSERT(rbi->m);

		/* Get the packet buffer pointer from buf_info */
		rxm = rbi->m;

		/* Clear descriptor associated buf_info to be reused */
		rbi->m = NULL;
		rbi->bufPA = 0;

		/* Record the index at which we received a packet */
		rxq->cmd_ring[ring_idx].next2comp = idx;

		/* For RCD with EOP set, check if there is frame error */
		if (unlikely(rcd->eop && rcd->err)) {
			rxq->stats.drop_total++;
			rxq->stats.drop_err++;

			if (!rcd->fcs) {
				rxq->stats.drop_fcs++;
				PMD_RX_LOG(ERR, "Recv packet dropped due to frame err.");
			}
			PMD_RX_LOG(ERR, "Error in received packet rcd#:%d rxd:%d",
				   (int)(rcd - (struct Vmxnet3_RxCompDesc *)
					 rxq->comp_ring.base), rcd->rxdIdx);
			rte_pktmbuf_free_seg(rxm);
			goto rcd_done;
		}


		/* Initialize newly received packet buffer */
		rxm->port = rxq->port_id;
		rxm->nb_segs = 1;
		rxm->next = NULL;
		rxm->pkt_len = (uint16_t)rcd->len;
		rxm->data_len = (uint16_t)rcd->len;
		rxm->data_off = RTE_PKTMBUF_HEADROOM;
		rxm->ol_flags = 0;
		rxm->vlan_tci = 0;

		/*
		 * If this is the first buffer of the received packet,
		 * set the pointer to the first mbuf of the packet.
		 * Otherwise, update the total length and the number of segments
		 * of the current scattered packet, and update the pointer to
		 * the last mbuf of the current packet.
		 */
		if (rcd->sop) {
			RTE_ASSERT(rxd->btype == VMXNET3_RXD_BTYPE_HEAD);

			if (unlikely(rcd->len == 0)) {
				RTE_ASSERT(rcd->eop);

				PMD_RX_LOG(DEBUG,
					   "Rx buf was skipped. rxring[%d][%d])",
					   ring_idx, idx);
				rte_pktmbuf_free_seg(rxm);
				goto rcd_done;
			}

			rxq->start_seg = rxm;
			vmxnet3_rx_offload(rcd, rxm);
		} else {
			struct rte_mbuf *start = rxq->start_seg;

			RTE_ASSERT(rxd->btype == VMXNET3_RXD_BTYPE_BODY);

			start->pkt_len += rxm->data_len;
			start->nb_segs++;

			rxq->last_seg->next = rxm;
		}
		rxq->last_seg = rxm;

		if (rcd->eop) {
			struct rte_mbuf *start = rxq->start_seg;

			/* Check for hardware stripped VLAN tag */
			if (rcd->ts) {
				start->ol_flags |= (PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED);
				start->vlan_tci = rte_le_to_cpu_16((uint16_t)rcd->tci);
			}

			rx_pkts[nb_rx++] = start;
			rxq->start_seg = NULL;
		}

rcd_done:
		rxq->cmd_ring[ring_idx].next2comp = idx;
		VMXNET3_INC_RING_IDX_ONLY(rxq->cmd_ring[ring_idx].next2comp, rxq->cmd_ring[ring_idx].size);

		/* Allocate new buffers and refill the descriptors */
		vmxnet3_post_rx_bufs(rxq, ring_idx);
		if (unlikely(rxq->shared->ctrl.updateRxProd)) {
			VMXNET3_WRITE_BAR0_REG(hw, rxprod_reg[ring_idx] + (rxq->queue_id * VMXNET3_REG_ALIGN),
					       rxq->cmd_ring[ring_idx].next2fill);
		}

		/* Advance to the next descriptor in comp_ring */
		vmxnet3_comp_ring_adv_next2proc(&rxq->comp_ring);

		rcd = &rxq->comp_ring.base[rxq->comp_ring.next2proc].rcd;
		nb_rxd++;
		if (nb_rxd > rxq->cmd_ring[0].size) {
			PMD_RX_LOG(ERR,
				   "Used up quota of receiving packets,"
				   " relinquish control.");
			break;
		}
	}

	return nb_rx;
}
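
/*
 * For reference, a minimal receive-side sketch against the generic ethdev
 * API of this DPDK generation (identifiers are illustrative only):
 *
 *	struct rte_mbuf *pkts[32];
 *	uint16_t n = rte_eth_rx_burst(port_id, queue_id, pkts, 32);
 *	// rte_eth_rx_burst() dispatches to vmxnet3_recv_pkts() for ports
 *	// driven by this PMD; each returned mbuf (or mbuf chain, for LRO)
 *	// is owned by the caller and must eventually be released with
 *	// rte_pktmbuf_free().
 */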

/*
 * Create a memzone for the device rings. malloc cannot be used here because
 * the physical address of the memory is needed. If the memzone already
 * exists, this function returns a pointer to the existing one.
 */
static const struct rte_memzone *
ring_dma_zone_reserve(struct rte_eth_dev *dev, const char *ring_name,
		      uint16_t queue_id, uint32_t ring_size, int socket_id)
{
	char z_name[RTE_MEMZONE_NAMESIZE];
	const struct rte_memzone *mz;

	snprintf(z_name, sizeof(z_name), "%s_%s_%d_%d",
			dev->driver->pci_drv.name, ring_name,
			dev->data->port_id, queue_id);

	mz = rte_memzone_lookup(z_name);
	if (mz)
		return mz;

	return rte_memzone_reserve_aligned(z_name, ring_size,
			socket_id, 0, VMXNET3_RING_BA_ALIGN);
}

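/*
 * Set up a Tx queue: validate the requested ring size, reserve one
 * physically contiguous memzone laid out as
 * [Vmxnet3_TxDesc ring][Vmxnet3_TxCompDesc ring][Vmxnet3_TxDataDesc ring],
 * and allocate the per-descriptor buf_info array used to track mbufs.
 */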
int
vmxnet3_dev_tx_queue_setup(struct rte_eth_dev *dev,
			   uint16_t queue_idx,
			   uint16_t nb_desc,
			   unsigned int socket_id,
			   __attribute__((unused)) const struct rte_eth_txconf *tx_conf)
{
	struct vmxnet3_hw *hw = dev->data->dev_private;
	const struct rte_memzone *mz;
	struct vmxnet3_tx_queue *txq;
	struct vmxnet3_cmd_ring *ring;
	struct vmxnet3_comp_ring *comp_ring;
	struct vmxnet3_data_ring *data_ring;
	int size;

	PMD_INIT_FUNC_TRACE();

	if ((tx_conf->txq_flags & ETH_TXQ_FLAGS_NOXSUMSCTP) !=
	    ETH_TXQ_FLAGS_NOXSUMSCTP) {
		PMD_INIT_LOG(ERR, "SCTP checksum offload not supported");
		return -EINVAL;
	}

	txq = rte_zmalloc("ethdev_tx_queue", sizeof(struct vmxnet3_tx_queue), RTE_CACHE_LINE_SIZE);
	if (txq == NULL) {
		PMD_INIT_LOG(ERR, "Can not allocate tx queue structure");
		return -ENOMEM;
	}

	txq->queue_id = queue_idx;
	txq->port_id = dev->data->port_id;
	txq->shared = &hw->tqd_start[queue_idx];
	txq->hw = hw;
	txq->qid = queue_idx;
	txq->stopped = TRUE;

	ring = &txq->cmd_ring;
	comp_ring = &txq->comp_ring;
	data_ring = &txq->data_ring;

	/* Tx vmxnet ring length should be between 512-4096 */
	if (nb_desc < VMXNET3_DEF_TX_RING_SIZE) {
		PMD_INIT_LOG(ERR, "VMXNET3 Tx Ring Size Min: %u",
			     VMXNET3_DEF_TX_RING_SIZE);
		return -EINVAL;
	} else if (nb_desc > VMXNET3_TX_RING_MAX_SIZE) {
		PMD_INIT_LOG(ERR, "VMXNET3 Tx Ring Size Max: %u",
			     VMXNET3_TX_RING_MAX_SIZE);
		return -EINVAL;
	} else {
		ring->size = nb_desc;
		ring->size &= ~VMXNET3_RING_SIZE_MASK;
	}
	comp_ring->size = data_ring->size = ring->size;

	/* Tx vmxnet rings structure initialization */
	ring->next2fill = 0;
	ring->next2comp = 0;
	ring->gen = VMXNET3_INIT_GEN;
	comp_ring->next2proc = 0;
	comp_ring->gen = VMXNET3_INIT_GEN;

	size = sizeof(struct Vmxnet3_TxDesc) * ring->size;
	size += sizeof(struct Vmxnet3_TxCompDesc) * comp_ring->size;
	size += sizeof(struct Vmxnet3_TxDataDesc) * data_ring->size;

	mz = ring_dma_zone_reserve(dev, "txdesc", queue_idx, size, socket_id);
	if (mz == NULL) {
		PMD_INIT_LOG(ERR, "ERROR: Creating queue descriptors zone");
		return -ENOMEM;
	}
	memset(mz->addr, 0, mz->len);

	/* cmd_ring initialization */
	ring->base = mz->addr;
	ring->basePA = mz->phys_addr;

	/* comp_ring initialization */
	comp_ring->base = ring->base + ring->size;
	comp_ring->basePA = ring->basePA +
		(sizeof(struct Vmxnet3_TxDesc) * ring->size);

	/* data_ring initialization */
	data_ring->base = (Vmxnet3_TxDataDesc *)(comp_ring->base + comp_ring->size);
	data_ring->basePA = comp_ring->basePA +
			(sizeof(struct Vmxnet3_TxCompDesc) * comp_ring->size);

	/* cmd_ring0 buf_info allocation */
	ring->buf_info = rte_zmalloc("tx_ring_buf_info",
				     ring->size * sizeof(vmxnet3_buf_info_t), RTE_CACHE_LINE_SIZE);
	if (ring->buf_info == NULL) {
		PMD_INIT_LOG(ERR, "ERROR: Creating tx_buf_info structure");
		return -ENOMEM;
	}

	/* Update the data portion with txq */
	dev->data->tx_queues[queue_idx] = txq;

	return 0;
}

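/*
 * Set up an Rx queue: validate the requested ring size, reserve one
 * physically contiguous memzone laid out as
 * [cmd_ring0 RxDesc][cmd_ring1 RxDesc][Vmxnet3_RxCompDesc ring],
 * and allocate a buf_info array for each of the two command rings.
 */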
int
vmxnet3_dev_rx_queue_setup(struct rte_eth_dev *dev,
			   uint16_t queue_idx,
			   uint16_t nb_desc,
			   unsigned int socket_id,
			   __attribute__((unused)) const struct rte_eth_rxconf *rx_conf,
			   struct rte_mempool *mp)
{
	const struct rte_memzone *mz;
	struct vmxnet3_rx_queue *rxq;
	struct vmxnet3_hw     *hw = dev->data->dev_private;
	struct vmxnet3_cmd_ring *ring0, *ring1, *ring;
	struct vmxnet3_comp_ring *comp_ring;
	int size;
	uint8_t i;
	char mem_name[32];

	PMD_INIT_FUNC_TRACE();

	rxq = rte_zmalloc("ethdev_rx_queue", sizeof(struct vmxnet3_rx_queue), RTE_CACHE_LINE_SIZE);
	if (rxq == NULL) {
		PMD_INIT_LOG(ERR, "Can not allocate rx queue structure");
		return -ENOMEM;
	}

	rxq->mp = mp;
	rxq->queue_id = queue_idx;
	rxq->port_id = dev->data->port_id;
	rxq->shared = &hw->rqd_start[queue_idx];
	rxq->hw = hw;
	rxq->qid1 = queue_idx;
	rxq->qid2 = queue_idx + hw->num_rx_queues;
	rxq->stopped = TRUE;

	ring0 = &rxq->cmd_ring[0];
	ring1 = &rxq->cmd_ring[1];
	comp_ring = &rxq->comp_ring;

	/* Rx vmxnet rings length should be between 256-4096 */
	if (nb_desc < VMXNET3_DEF_RX_RING_SIZE) {
		PMD_INIT_LOG(ERR, "VMXNET3 Rx Ring Size Min: 256");
		return -EINVAL;
	} else if (nb_desc > VMXNET3_RX_RING_MAX_SIZE) {
		PMD_INIT_LOG(ERR, "VMXNET3 Rx Ring Size Max: 4096");
		return -EINVAL;
	} else {
		ring0->size = nb_desc;
		ring0->size &= ~VMXNET3_RING_SIZE_MASK;
		ring1->size = ring0->size;
	}

	comp_ring->size = ring0->size + ring1->size;

	/* Rx vmxnet rings structure initialization */
	ring0->next2fill = 0;
	ring1->next2fill = 0;
	ring0->next2comp = 0;
	ring1->next2comp = 0;
	ring0->gen = VMXNET3_INIT_GEN;
	ring1->gen = VMXNET3_INIT_GEN;
	comp_ring->next2proc = 0;
	comp_ring->gen = VMXNET3_INIT_GEN;

	size = sizeof(struct Vmxnet3_RxDesc) * (ring0->size + ring1->size);
	size += sizeof(struct Vmxnet3_RxCompDesc) * comp_ring->size;

	mz = ring_dma_zone_reserve(dev, "rxdesc", queue_idx, size, socket_id);
	if (mz == NULL) {
		PMD_INIT_LOG(ERR, "ERROR: Creating queue descriptors zone");
		return -ENOMEM;
	}
	memset(mz->addr, 0, mz->len);

	/* cmd_ring0 initialization */
	ring0->base = mz->addr;
	ring0->basePA = mz->phys_addr;

	/* cmd_ring1 initialization */
	ring1->base = ring0->base + ring0->size;
	ring1->basePA = ring0->basePA + sizeof(struct Vmxnet3_RxDesc) * ring0->size;

	/* comp_ring initialization */
	comp_ring->base = ring1->base + ring1->size;
	comp_ring->basePA = ring1->basePA + sizeof(struct Vmxnet3_RxDesc) *
		ring1->size;

	/* cmd_ring0-cmd_ring1 buf_info allocation */
	for (i = 0; i < VMXNET3_RX_CMDRING_SIZE; i++) {

		ring = &rxq->cmd_ring[i];
		ring->rid = i;
		snprintf(mem_name, sizeof(mem_name), "rx_ring_%d_buf_info", i);

		ring->buf_info = rte_zmalloc(mem_name, ring->size * sizeof(vmxnet3_buf_info_t), RTE_CACHE_LINE_SIZE);
		if (ring->buf_info == NULL) {
			PMD_INIT_LOG(ERR, "ERROR: Creating rx_buf_info structure");
			return -ENOMEM;
		}
	}

	/* Update the data portion with rxq */
	dev->data->rx_queues[queue_idx] = rxq;

	return 0;
}

/*
 * Initializes the receive unit: pre-load mbufs into the Rx rings and
 * mark all Rx and Tx queues as started.
 */
int
vmxnet3_dev_rxtx_init(struct rte_eth_dev *dev)
{
	struct vmxnet3_hw *hw = dev->data->dev_private;

	int i, ret;
	uint8_t j;

	PMD_INIT_FUNC_TRACE();

	for (i = 0; i < hw->num_rx_queues; i++) {
		vmxnet3_rx_queue_t *rxq = dev->data->rx_queues[i];

		for (j = 0; j < VMXNET3_RX_CMDRING_SIZE; j++) {
			/* vmxnet3_post_rx_bufs() fills every free descriptor in the ring */
			ret = vmxnet3_post_rx_bufs(rxq, j);
			if (ret <= 0) {
				PMD_INIT_LOG(ERR, "ERROR: Posting Rxq: %d buffers ring: %d", i, j);
				return -ret;
			}
			/* Update the device with next2fill so it knows which descriptors carry fresh mbufs */
			if (unlikely(rxq->shared->ctrl.updateRxProd)) {
				VMXNET3_WRITE_BAR0_REG(hw, rxprod_reg[j] + (rxq->queue_id * VMXNET3_REG_ALIGN),
						       rxq->cmd_ring[j].next2fill);
			}
		}
		rxq->stopped = FALSE;
		rxq->start_seg = NULL;
	}

	for (i = 0; i < dev->data->nb_tx_queues; i++) {
		struct vmxnet3_tx_queue *txq = dev->data->tx_queues[i];

		txq->stopped = FALSE;
	}

	return 0;
}

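/*
 * Default 40-byte Toeplitz RSS hash key, used when the application does not
 * supply one. The name suggests it matches the default key shipped with the
 * Intel NIC PMDs, though that provenance is an assumption, not something
 * this file states.
 */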
static uint8_t rss_intel_key[40] = {
	0x6D, 0x5A, 0x56, 0xDA, 0x25, 0x5B, 0x0E, 0xC2,
	0x41, 0x67, 0x25, 0x3D, 0x43, 0xA3, 0x8F, 0xB0,
	0xD0, 0xCA, 0x2B, 0xCB, 0xAE, 0x7B, 0x30, 0xB4,
	0x77, 0xCB, 0x2D, 0xA3, 0x80, 0x30, 0xF2, 0x0C,
	0x6A, 0x42, 0xB7, 0x3B, 0xBE, 0xAC, 0x01, 0xFA,
};

/*
 * Configure RSS feature
 */
int
vmxnet3_rss_configure(struct rte_eth_dev *dev)
{
	struct vmxnet3_hw *hw = dev->data->dev_private;
	struct VMXNET3_RSSConf *dev_rss_conf;
	struct rte_eth_rss_conf *port_rss_conf;
	uint64_t rss_hf;
	uint8_t i, j;

	PMD_INIT_FUNC_TRACE();

	dev_rss_conf = hw->rss_conf;
	port_rss_conf = &dev->data->dev_conf.rx_adv_conf.rss_conf;

	/* loading hashFunc */
	dev_rss_conf->hashFunc = VMXNET3_RSS_HASH_FUNC_TOEPLITZ;
	/* loading hashKeySize */
	dev_rss_conf->hashKeySize = VMXNET3_RSS_MAX_KEY_SIZE;
	/* loading indTableSize: must not exceed VMXNET3_RSS_MAX_IND_TABLE_SIZE (128) */
	dev_rss_conf->indTableSize = (uint16_t)(hw->num_rx_queues * 4);

	if (port_rss_conf->rss_key == NULL) {
		/* Default hash key */
		port_rss_conf->rss_key = rss_intel_key;
	}

	/* loading hashKey */
	memcpy(&dev_rss_conf->hashKey[0], port_rss_conf->rss_key, dev_rss_conf->hashKeySize);

	/* loading indTable */
	for (i = 0, j = 0; i < dev_rss_conf->indTableSize; i++, j++) {
		if (j == dev->data->nb_rx_queues)
			j = 0;
		dev_rss_conf->indTable[i] = j;
	}

	/* loading hashType */
	dev_rss_conf->hashType = 0;
	rss_hf = port_rss_conf->rss_hf & VMXNET3_RSS_OFFLOAD_ALL;
	if (rss_hf & ETH_RSS_IPV4)
		dev_rss_conf->hashType |= VMXNET3_RSS_HASH_TYPE_IPV4;
	if (rss_hf & ETH_RSS_NONFRAG_IPV4_TCP)
		dev_rss_conf->hashType |= VMXNET3_RSS_HASH_TYPE_TCP_IPV4;
	if (rss_hf & ETH_RSS_IPV6)
		dev_rss_conf->hashType |= VMXNET3_RSS_HASH_TYPE_IPV6;
	if (rss_hf & ETH_RSS_NONFRAG_IPV6_TCP)
		dev_rss_conf->hashType |= VMXNET3_RSS_HASH_TYPE_TCP_IPV6;

	return VMXNET3_SUCCESS;
}
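
/*
 * For reference, a minimal sketch of how an application of this DPDK
 * generation would enable RSS so that vmxnet3_rss_configure() picks the
 * settings up from dev_conf (field values are illustrative assumptions):
 *
 *	struct rte_eth_conf port_conf = {
 *		.rxmode = { .mq_mode = ETH_MQ_RX_RSS },
 *		.rx_adv_conf = {
 *			.rss_conf = {
 *				.rss_key = NULL,	// fall back to rss_intel_key
 *				.rss_hf = ETH_RSS_IPV4 | ETH_RSS_NONFRAG_IPV4_TCP,
 *			},
 *		},
 *	};
 *	rte_eth_dev_configure(port_id, nb_rxq, nb_txq, &port_conf);
 */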