virtio_rxtx.c revision 0c3ed7dc
1/*-
2 *   BSD LICENSE
3 *
4 *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5 *   All rights reserved.
6 *
7 *   Redistribution and use in source and binary forms, with or without
8 *   modification, are permitted provided that the following conditions
9 *   are met:
10 *
11 *     * Redistributions of source code must retain the above copyright
12 *       notice, this list of conditions and the following disclaimer.
13 *     * Redistributions in binary form must reproduce the above copyright
14 *       notice, this list of conditions and the following disclaimer in
15 *       the documentation and/or other materials provided with the
16 *       distribution.
17 *     * Neither the name of Intel Corporation nor the names of its
18 *       contributors may be used to endorse or promote products derived
19 *       from this software without specific prior written permission.
20 *
21 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
34#include <stdint.h>
35#include <stdio.h>
36#include <stdlib.h>
37#include <string.h>
38#include <errno.h>
39
40#include <rte_cycles.h>
41#include <rte_memory.h>
42#include <rte_memzone.h>
43#include <rte_branch_prediction.h>
44#include <rte_mempool.h>
45#include <rte_malloc.h>
46#include <rte_mbuf.h>
47#include <rte_ether.h>
48#include <rte_ethdev.h>
49#include <rte_prefetch.h>
50#include <rte_string_fns.h>
51#include <rte_errno.h>
52#include <rte_byteorder.h>
53#include <rte_cpuflags.h>
54#include <rte_net.h>
55#include <rte_ip.h>
56#include <rte_udp.h>
57#include <rte_tcp.h>
58
59#include "virtio_logs.h"
60#include "virtio_ethdev.h"
61#include "virtio_pci.h"
62#include "virtqueue.h"
63#include "virtio_rxtx.h"
64
65#ifdef RTE_LIBRTE_VIRTIO_DEBUG_DUMP
66#define VIRTIO_DUMP_PACKET(m, len) rte_pktmbuf_dump(stdout, m, len)
67#else
68#define  VIRTIO_DUMP_PACKET(m, len) do { } while (0)
69#endif
70
71
72#define VIRTIO_SIMPLE_FLAGS ((uint32_t)ETH_TXQ_FLAGS_NOMULTSEGS | \
73	ETH_TXQ_FLAGS_NOOFFLOADS)
74
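/*
 * Return the descriptor chain starting at desc_idx to the free list.
 * The chain is walked to find its last descriptor (an indirect chain
 * occupies only one slot in the main ring), the freed count is credited
 * back to vq_free_cnt, and the chain is linked after the current free
 * list tail (or becomes the new head if the free list was empty).
 */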
75static void
76vq_ring_free_chain(struct virtqueue *vq, uint16_t desc_idx)
77{
78	struct vring_desc *dp, *dp_tail;
79	struct vq_desc_extra *dxp;
80	uint16_t desc_idx_last = desc_idx;
81
82	dp  = &vq->vq_ring.desc[desc_idx];
83	dxp = &vq->vq_descx[desc_idx];
84	vq->vq_free_cnt = (uint16_t)(vq->vq_free_cnt + dxp->ndescs);
85	if ((dp->flags & VRING_DESC_F_INDIRECT) == 0) {
86		while (dp->flags & VRING_DESC_F_NEXT) {
87			desc_idx_last = dp->next;
88			dp = &vq->vq_ring.desc[dp->next];
89		}
90	}
91	dxp->ndescs = 0;
92
93	/*
94	 * We must append the newly freed chain to the end of the existing
95	 * free chain, if any. If the virtqueue was completely used, then
96	 * the free list head would be VQ_RING_DESC_CHAIN_END.
97	 */
98	if (vq->vq_desc_tail_idx == VQ_RING_DESC_CHAIN_END) {
99		vq->vq_desc_head_idx = desc_idx;
100	} else {
101		dp_tail = &vq->vq_ring.desc[vq->vq_desc_tail_idx];
102		dp_tail->next = desc_idx;
103	}
104
105	vq->vq_desc_tail_idx = desc_idx_last;
106	dp->next = VQ_RING_DESC_CHAIN_END;
107}
108
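/*
 * Pop up to 'num' completed receive buffers from the used ring. Each
 * used element carries the head descriptor index and the number of
 * bytes written by the device; the mbuf is recovered from the cookie
 * stored in vq_descx[] when the buffer was posted. The ring size is a
 * power of two, so the consumer index is masked with (vq_nentries - 1)
 * instead of using a modulo.
 */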
109static uint16_t
110virtqueue_dequeue_burst_rx(struct virtqueue *vq, struct rte_mbuf **rx_pkts,
111			   uint32_t *len, uint16_t num)
112{
113	struct vring_used_elem *uep;
114	struct rte_mbuf *cookie;
115	uint16_t used_idx, desc_idx;
116	uint16_t i;
117
118	/*  Caller does the check */
119	for (i = 0; i < num ; i++) {
120		used_idx = (uint16_t)(vq->vq_used_cons_idx & (vq->vq_nentries - 1));
121		uep = &vq->vq_ring.used->ring[used_idx];
122		desc_idx = (uint16_t) uep->id;
123		len[i] = uep->len;
124		cookie = (struct rte_mbuf *)vq->vq_descx[desc_idx].cookie;
125
126		if (unlikely(cookie == NULL)) {
127			PMD_DRV_LOG(ERR, "vring descriptor with no mbuf cookie at %u\n",
128				vq->vq_used_cons_idx);
129			break;
130		}
131
132		rte_prefetch0(cookie);
133		rte_packet_prefetch(rte_pktmbuf_mtod(cookie, void *));
134		rx_pkts[i]  = cookie;
135		vq->vq_used_cons_idx++;
136		vq_ring_free_chain(vq, desc_idx);
137		vq->vq_descx[desc_idx].cookie = NULL;
138	}
139
140	return i;
141}
142
143#ifndef DEFAULT_TX_FREE_THRESH
144#define DEFAULT_TX_FREE_THRESH 32
145#endif
146
147/* Cleanup from completed transmits. */
148static void
149virtio_xmit_cleanup(struct virtqueue *vq, uint16_t num)
150{
151	uint16_t i, used_idx, desc_idx;
152	for (i = 0; i < num; i++) {
153		struct vring_used_elem *uep;
154		struct vq_desc_extra *dxp;
155
156		used_idx = (uint16_t)(vq->vq_used_cons_idx & (vq->vq_nentries - 1));
157		uep = &vq->vq_ring.used->ring[used_idx];
158
159		desc_idx = (uint16_t) uep->id;
160		dxp = &vq->vq_descx[desc_idx];
161		vq->vq_used_cons_idx++;
162		vq_ring_free_chain(vq, desc_idx);
163
164		if (dxp->cookie != NULL) {
165			rte_pktmbuf_free(dxp->cookie);
166			dxp->cookie = NULL;
167		}
168	}
169}
170
171
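/*
 * Post a single mbuf as a receive buffer. The descriptor address is
 * backed off by vtnet_hdr_size so the device writes the virtio-net
 * header into the mbuf headroom immediately before the packet data;
 * e.g. with the default 128-byte headroom and a 12-byte mergeable
 * header the buffer starts 116 bytes into the mbuf data area (the
 * exact figures depend on the build and the negotiated features).
 */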
172static inline int
173virtqueue_enqueue_recv_refill(struct virtqueue *vq, struct rte_mbuf *cookie)
174{
175	struct vq_desc_extra *dxp;
176	struct virtio_hw *hw = vq->hw;
177	struct vring_desc *start_dp;
178	uint16_t needed = 1;
179	uint16_t head_idx, idx;
180
181	if (unlikely(vq->vq_free_cnt == 0))
182		return -ENOSPC;
183	if (unlikely(vq->vq_free_cnt < needed))
184		return -EMSGSIZE;
185
186	head_idx = vq->vq_desc_head_idx;
187	if (unlikely(head_idx >= vq->vq_nentries))
188		return -EFAULT;
189
190	idx = head_idx;
191	dxp = &vq->vq_descx[idx];
192	dxp->cookie = (void *)cookie;
193	dxp->ndescs = needed;
194
195	start_dp = vq->vq_ring.desc;
196	start_dp[idx].addr =
197		VIRTIO_MBUF_ADDR(cookie, vq) +
198		RTE_PKTMBUF_HEADROOM - hw->vtnet_hdr_size;
199	start_dp[idx].len =
200		cookie->buf_len - RTE_PKTMBUF_HEADROOM + hw->vtnet_hdr_size;
201	start_dp[idx].flags =  VRING_DESC_F_WRITE;
202	idx = start_dp[idx].next;
203	vq->vq_desc_head_idx = idx;
204	if (vq->vq_desc_head_idx == VQ_RING_DESC_CHAIN_END)
205		vq->vq_desc_tail_idx = idx;
206	vq->vq_free_cnt = (uint16_t)(vq->vq_free_cnt - needed);
207	vq_update_avail_ring(vq, head_idx);
208
209	return 0;
210}
211
212/* When doing TSO, the IP length is not included in the pseudo header
213 * checksum of the packet given to the PMD, but for virtio it is
214 * expected.
215 */
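/*
 * Illustrative example of the fix-up performed below: if the stack left
 * a pseudo-header checksum of 0x1234 (L4 length excluded) and the L4
 * length in network order is 0x0028, the one's-complement sum is
 * 0x1234 + 0x0028 = 0x125c, which is the value virtio expects. Any
 * carry out of the low 16 bits is folded back in.
 */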
216static void
217virtio_tso_fix_cksum(struct rte_mbuf *m)
218{
219	/* common case: header is not fragmented */
220	if (likely(rte_pktmbuf_data_len(m) >= m->l2_len + m->l3_len +
221			m->l4_len)) {
222		struct ipv4_hdr *iph;
223		struct ipv6_hdr *ip6h;
224		struct tcp_hdr *th;
225		uint16_t prev_cksum, new_cksum, ip_len, ip_paylen;
226		uint32_t tmp;
227
228		iph = rte_pktmbuf_mtod_offset(m, struct ipv4_hdr *, m->l2_len);
229		th = RTE_PTR_ADD(iph, m->l3_len);
230		if ((iph->version_ihl >> 4) == 4) {
231			iph->hdr_checksum = 0;
232			iph->hdr_checksum = rte_ipv4_cksum(iph);
233			ip_len = iph->total_length;
234			ip_paylen = rte_cpu_to_be_16(rte_be_to_cpu_16(ip_len) -
235				m->l3_len);
236		} else {
237			ip6h = (struct ipv6_hdr *)iph;
238			ip_paylen = ip6h->payload_len;
239		}
240
241		/* calculate the new phdr checksum not including ip_paylen */
242		prev_cksum = th->cksum;
243		tmp = prev_cksum;
244		tmp += ip_paylen;
245		tmp = (tmp & 0xffff) + (tmp >> 16);
246		new_cksum = tmp;
247
248		/* replace it in the packet */
249		th->cksum = new_cksum;
250	}
251}
252
253static inline int
254tx_offload_enabled(struct virtio_hw *hw)
255{
256	return vtpci_with_feature(hw, VIRTIO_NET_F_CSUM) ||
257		vtpci_with_feature(hw, VIRTIO_NET_F_HOST_TSO4) ||
258		vtpci_with_feature(hw, VIRTIO_NET_F_HOST_TSO6);
259}
260
261/* avoid a write operation when not needed, to lessen cache issues */
262#define ASSIGN_UNLESS_EQUAL(var, val) do {	\
263	if ((var) != (val))			\
264		(var) = (val);			\
265} while (0)
266
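/*
 * Enqueue one packet for transmission using one of three layouts:
 *  - can_push: the virtio-net header is prepended into the mbuf
 *    headroom, so header and first segment share a descriptor;
 *  - use_indirect: a single main-ring slot points at the per-slot
 *    indirect table in the reserved memzone, whose first entry is
 *    preset to the header kept in that same region;
 *  - otherwise: one slot for the header in the reserved region plus
 *    one slot per mbuf segment, chained with VRING_DESC_F_NEXT.
 */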
267static inline void
268virtqueue_enqueue_xmit(struct virtnet_tx *txvq, struct rte_mbuf *cookie,
269		       uint16_t needed, int use_indirect, int can_push)
270{
271	struct virtio_tx_region *txr = txvq->virtio_net_hdr_mz->addr;
272	struct vq_desc_extra *dxp;
273	struct virtqueue *vq = txvq->vq;
274	struct vring_desc *start_dp;
275	uint16_t seg_num = cookie->nb_segs;
276	uint16_t head_idx, idx;
277	uint16_t head_size = vq->hw->vtnet_hdr_size;
278	struct virtio_net_hdr *hdr;
279	int offload;
280
281	offload = tx_offload_enabled(vq->hw);
282	head_idx = vq->vq_desc_head_idx;
283	idx = head_idx;
284	dxp = &vq->vq_descx[idx];
285	dxp->cookie = (void *)cookie;
286	dxp->ndescs = needed;
287
288	start_dp = vq->vq_ring.desc;
289
290	if (can_push) {
291		/* prepend cannot fail, checked by caller */
292		hdr = (struct virtio_net_hdr *)
293			rte_pktmbuf_prepend(cookie, head_size);
294		/* if offload is disabled, it is not zeroed below, so do it now */
295		if (offload == 0) {
296			ASSIGN_UNLESS_EQUAL(hdr->csum_start, 0);
297			ASSIGN_UNLESS_EQUAL(hdr->csum_offset, 0);
298			ASSIGN_UNLESS_EQUAL(hdr->flags, 0);
299			ASSIGN_UNLESS_EQUAL(hdr->gso_type, 0);
300			ASSIGN_UNLESS_EQUAL(hdr->gso_size, 0);
301			ASSIGN_UNLESS_EQUAL(hdr->hdr_len, 0);
302		}
303	} else if (use_indirect) {
304		/* set up the tx ring slot to point to the indirect
305		 * descriptor list stored in the reserved region.
306		 *
307		 * the first slot of the indirect ring is already preset
308		 * to point to the header in the reserved region
309		 */
310		start_dp[idx].addr  = txvq->virtio_net_hdr_mem +
311			RTE_PTR_DIFF(&txr[idx].tx_indir, txr);
312		start_dp[idx].len   = (seg_num + 1) * sizeof(struct vring_desc);
313		start_dp[idx].flags = VRING_DESC_F_INDIRECT;
314		hdr = (struct virtio_net_hdr *)&txr[idx].tx_hdr;
315
316		/* the loop below will fill in the rest of the indirect elements */
317		start_dp = txr[idx].tx_indir;
318		idx = 1;
319	} else {
320		/* set up the first tx ring slot to point to the header
321		 * stored in the reserved region.
322		 */
323		start_dp[idx].addr  = txvq->virtio_net_hdr_mem +
324			RTE_PTR_DIFF(&txr[idx].tx_hdr, txr);
325		start_dp[idx].len   = vq->hw->vtnet_hdr_size;
326		start_dp[idx].flags = VRING_DESC_F_NEXT;
327		hdr = (struct virtio_net_hdr *)&txr[idx].tx_hdr;
328
329		idx = start_dp[idx].next;
330	}
331
332	/* Checksum Offload / TSO */
333	if (offload) {
334		if (cookie->ol_flags & PKT_TX_TCP_SEG)
335			cookie->ol_flags |= PKT_TX_TCP_CKSUM;
336
337		switch (cookie->ol_flags & PKT_TX_L4_MASK) {
338		case PKT_TX_UDP_CKSUM:
339			hdr->csum_start = cookie->l2_len + cookie->l3_len;
340			hdr->csum_offset = offsetof(struct udp_hdr,
341				dgram_cksum);
342			hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
343			break;
344
345		case PKT_TX_TCP_CKSUM:
346			hdr->csum_start = cookie->l2_len + cookie->l3_len;
347			hdr->csum_offset = offsetof(struct tcp_hdr, cksum);
348			hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
349			break;
350
351		default:
352			ASSIGN_UNLESS_EQUAL(hdr->csum_start, 0);
353			ASSIGN_UNLESS_EQUAL(hdr->csum_offset, 0);
354			ASSIGN_UNLESS_EQUAL(hdr->flags, 0);
355			break;
356		}
357
358		/* TCP Segmentation Offload */
359		if (cookie->ol_flags & PKT_TX_TCP_SEG) {
360			virtio_tso_fix_cksum(cookie);
361			hdr->gso_type = (cookie->ol_flags & PKT_TX_IPV6) ?
362				VIRTIO_NET_HDR_GSO_TCPV6 :
363				VIRTIO_NET_HDR_GSO_TCPV4;
364			hdr->gso_size = cookie->tso_segsz;
365			hdr->hdr_len =
366				cookie->l2_len +
367				cookie->l3_len +
368				cookie->l4_len;
369		} else {
370			ASSIGN_UNLESS_EQUAL(hdr->gso_type, 0);
371			ASSIGN_UNLESS_EQUAL(hdr->gso_size, 0);
372			ASSIGN_UNLESS_EQUAL(hdr->hdr_len, 0);
373		}
374	}
375
376	do {
377		start_dp[idx].addr  = VIRTIO_MBUF_DATA_DMA_ADDR(cookie, vq);
378		start_dp[idx].len   = cookie->data_len;
379		start_dp[idx].flags = cookie->next ? VRING_DESC_F_NEXT : 0;
380		idx = start_dp[idx].next;
381	} while ((cookie = cookie->next) != NULL);
382
383	if (use_indirect)
384		idx = vq->vq_ring.desc[head_idx].next;
385
386	vq->vq_desc_head_idx = idx;
387	if (vq->vq_desc_head_idx == VQ_RING_DESC_CHAIN_END)
388		vq->vq_desc_tail_idx = idx;
389	vq->vq_free_cnt = (uint16_t)(vq->vq_free_cnt - needed);
390	vq_update_avail_ring(vq, head_idx);
391}
392
393void
394virtio_dev_cq_start(struct rte_eth_dev *dev)
395{
396	struct virtio_hw *hw = dev->data->dev_private;
397
398	if (hw->cvq && hw->cvq->vq) {
399		VIRTQUEUE_DUMP((struct virtqueue *)hw->cvq->vq);
400	}
401}
402
403int
404virtio_dev_rx_queue_setup(struct rte_eth_dev *dev,
405			uint16_t queue_idx,
406			uint16_t nb_desc,
407			unsigned int socket_id __rte_unused,
408			__rte_unused const struct rte_eth_rxconf *rx_conf,
409			struct rte_mempool *mp)
410{
411	uint16_t vtpci_queue_idx = 2 * queue_idx + VTNET_SQ_RQ_QUEUE_IDX;
412	struct virtio_hw *hw = dev->data->dev_private;
413	struct virtqueue *vq = hw->vqs[vtpci_queue_idx];
414	struct virtnet_rx *rxvq;
415	int error, nbufs;
416	struct rte_mbuf *m;
417	uint16_t desc_idx;
418
419	PMD_INIT_FUNC_TRACE();
420
421	if (nb_desc == 0 || nb_desc > vq->vq_nentries)
422		nb_desc = vq->vq_nentries;
423	vq->vq_free_cnt = RTE_MIN(vq->vq_free_cnt, nb_desc);
424
425	rxvq = &vq->rxq;
426	rxvq->queue_id = queue_idx;
427	rxvq->mpool = mp;
428	if (rxvq->mpool == NULL) {
429		rte_exit(EXIT_FAILURE,
430			"Cannot allocate mbufs for rx virtqueue");
431	}
432	dev->data->rx_queues[queue_idx] = rxvq;
433
434
435	/* Allocate blank mbufs for each rx descriptor */
436	nbufs = 0;
437	error = ENOSPC;
438
439	if (hw->use_simple_rxtx) {
440		for (desc_idx = 0; desc_idx < vq->vq_nentries;
441		     desc_idx++) {
442			vq->vq_ring.avail->ring[desc_idx] = desc_idx;
443			vq->vq_ring.desc[desc_idx].flags =
444				VRING_DESC_F_WRITE;
445		}
446	}
447
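	/*
	 * The vectorized RX path may read a few entries past the ring
	 * end, so the tail of sw_ring is padded with pointers to a
	 * zeroed fake mbuf that is never handed to the application.
	 */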
448	memset(&rxvq->fake_mbuf, 0, sizeof(rxvq->fake_mbuf));
449	for (desc_idx = 0; desc_idx < RTE_PMD_VIRTIO_RX_MAX_BURST;
450	     desc_idx++) {
451		vq->sw_ring[vq->vq_nentries + desc_idx] =
452			&rxvq->fake_mbuf;
453	}
454
455	while (!virtqueue_full(vq)) {
456		m = rte_mbuf_raw_alloc(rxvq->mpool);
457		if (m == NULL)
458			break;
459
460		/* Enqueue allocated buffers */
461		if (hw->use_simple_rxtx)
462			error = virtqueue_enqueue_recv_refill_simple(vq, m);
463		else
464			error = virtqueue_enqueue_recv_refill(vq, m);
465
466		if (error) {
467			rte_pktmbuf_free(m);
468			break;
469		}
470		nbufs++;
471	}
472
473	vq_update_avail_idx(vq);
474
475	PMD_INIT_LOG(DEBUG, "Allocated %d bufs", nbufs);
476
477	virtio_rxq_vec_setup(rxvq);
478
479	VIRTQUEUE_DUMP(vq);
480
481	return 0;
482}
483
484static void
485virtio_update_rxtx_handler(struct rte_eth_dev *dev,
486			   const struct rte_eth_txconf *tx_conf)
487{
488	uint8_t use_simple_rxtx = 0;
489	struct virtio_hw *hw = dev->data->dev_private;
490
491#if defined RTE_ARCH_X86
492	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SSE3))
493		use_simple_rxtx = 1;
494#elif defined RTE_ARCH_ARM64 || defined CONFIG_RTE_ARCH_ARM
495	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON))
496		use_simple_rxtx = 1;
497#endif
498	/* Use simple rx/tx func if single segment and no offloads */
499	if (use_simple_rxtx &&
500	    (tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) == VIRTIO_SIMPLE_FLAGS &&
501	    !vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
502		PMD_INIT_LOG(INFO, "Using simple rx/tx path");
503		dev->tx_pkt_burst = virtio_xmit_pkts_simple;
504		dev->rx_pkt_burst = virtio_recv_pkts_vec;
505		hw->use_simple_rxtx = use_simple_rxtx;
506	}
507}
508
509/*
510 * struct rte_eth_dev *dev: Used to update dev
511 * uint16_t nb_desc: Defaults to values read from config space
512 * unsigned int socket_id: Used to allocate memzone
513 * const struct rte_eth_txconf *tx_conf: Used to setup tx engine
514 * uint16_t queue_idx: Just used as an index in dev txq list
515 */
516int
517virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
518			uint16_t queue_idx,
519			uint16_t nb_desc,
520			unsigned int socket_id __rte_unused,
521			const struct rte_eth_txconf *tx_conf)
522{
523	uint8_t vtpci_queue_idx = 2 * queue_idx + VTNET_SQ_TQ_QUEUE_IDX;
524	struct virtio_hw *hw = dev->data->dev_private;
525	struct virtqueue *vq = hw->vqs[vtpci_queue_idx];
526	struct virtnet_tx *txvq;
527	uint16_t tx_free_thresh;
528	uint16_t desc_idx;
529
530	PMD_INIT_FUNC_TRACE();
531
532	virtio_update_rxtx_handler(dev, tx_conf);
533
534	if (nb_desc == 0 || nb_desc > vq->vq_nentries)
535		nb_desc = vq->vq_nentries;
536	vq->vq_free_cnt = RTE_MIN(vq->vq_free_cnt, nb_desc);
537
538	txvq = &vq->txq;
539	txvq->queue_id = queue_idx;
540
541	tx_free_thresh = tx_conf->tx_free_thresh;
542	if (tx_free_thresh == 0)
543		tx_free_thresh =
544			RTE_MIN(vq->vq_nentries / 4, DEFAULT_TX_FREE_THRESH);
545
546	if (tx_free_thresh >= (vq->vq_nentries - 3)) {
547		RTE_LOG(ERR, PMD, "tx_free_thresh must be less than the "
548			"number of TX entries minus 3 (%u)."
549			" (tx_free_thresh=%u port=%u queue=%u)\n",
550			vq->vq_nentries - 3,
551			tx_free_thresh, dev->data->port_id, queue_idx);
552		return -EINVAL;
553	}
554
555	vq->vq_free_thresh = tx_free_thresh;
556
557	if (hw->use_simple_rxtx) {
558		uint16_t mid_idx  = vq->vq_nentries >> 1;
559
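		/*
		 * The avail ring is populated with indices from the upper
		 * half of the ring: each upper-half descriptor holds the
		 * virtio-net header from the reserved region and chains to
		 * its fixed partner in the lower half, which the simple TX
		 * routine later fills with the packet data.
		 */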
560		for (desc_idx = 0; desc_idx < mid_idx; desc_idx++) {
561			vq->vq_ring.avail->ring[desc_idx] =
562				desc_idx + mid_idx;
563			vq->vq_ring.desc[desc_idx + mid_idx].next =
564				desc_idx;
565			vq->vq_ring.desc[desc_idx + mid_idx].addr =
566				txvq->virtio_net_hdr_mem +
567				offsetof(struct virtio_tx_region, tx_hdr);
568			vq->vq_ring.desc[desc_idx + mid_idx].len =
569				vq->hw->vtnet_hdr_size;
570			vq->vq_ring.desc[desc_idx + mid_idx].flags =
571				VRING_DESC_F_NEXT;
572			vq->vq_ring.desc[desc_idx].flags = 0;
573		}
574		for (desc_idx = mid_idx; desc_idx < vq->vq_nentries;
575		     desc_idx++)
576			vq->vq_ring.avail->ring[desc_idx] = desc_idx;
577	}
578
579	VIRTQUEUE_DUMP(vq);
580
581	dev->data->tx_queues[queue_idx] = txvq;
582	return 0;
583}
584
585static void
586virtio_discard_rxbuf(struct virtqueue *vq, struct rte_mbuf *m)
587{
588	int error;
589	/*
590	 * Requeue the discarded mbuf. This should always be
591	 * successful since it was just dequeued.
592	 */
593	error = virtqueue_enqueue_recv_refill(vq, m);
594	if (unlikely(error)) {
595		RTE_LOG(ERR, PMD, "cannot requeue discarded mbuf\n");
596		rte_pktmbuf_free(m);
597	}
598}
599
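/*
 * Update the per-queue size histogram and multicast/broadcast counters.
 * Bin 0 counts frames shorter than 64 bytes, bin 1 exactly 64 bytes,
 * bins 2-5 cover 65-1023 bytes in power-of-two buckets (via the leading
 * zero count), bin 6 covers 1024-1518 bytes and bin 7 anything larger.
 */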
600static void
601virtio_update_packet_stats(struct virtnet_stats *stats, struct rte_mbuf *mbuf)
602{
603	uint32_t s = mbuf->pkt_len;
604	struct ether_addr *ea;
605
606	if (s == 64) {
607		stats->size_bins[1]++;
608	} else if (s > 64 && s < 1024) {
609		uint32_t bin;
610
611		/* count leading zeros to offset into the correct size bin */
612		bin = (sizeof(s) * 8) - __builtin_clz(s) - 5;
613		stats->size_bins[bin]++;
614	} else {
615		if (s < 64)
616			stats->size_bins[0]++;
617		else if (s < 1519)
618			stats->size_bins[6]++;
619		else if (s >= 1519)
620			stats->size_bins[7]++;
621	}
622
623	ea = rte_pktmbuf_mtod(mbuf, struct ether_addr *);
624	if (is_multicast_ether_addr(ea)) {
625		if (is_broadcast_ether_addr(ea))
626			stats->broadcast++;
627		else
628			stats->multicast++;
629	}
630}
631
632/* Optionally fill offload information in structure */
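/*
 * VIRTIO_NET_HDR_F_NEEDS_CSUM means the host left the L4 checksum
 * partially computed (pseudo-header only): the packet is either marked
 * PKT_RX_L4_CKSUM_NONE or the checksum is completed in software here.
 * VIRTIO_NET_HDR_F_DATA_VALID maps to PKT_RX_L4_CKSUM_GOOD, and a GSO
 * request is recorded as PKT_RX_LRO together with tso_segsz.
 */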
633static int
634virtio_rx_offload(struct rte_mbuf *m, struct virtio_net_hdr *hdr)
635{
636	struct rte_net_hdr_lens hdr_lens;
637	uint32_t hdrlen, ptype;
638	int l4_supported = 0;
639
640	/* nothing to do */
641	if (hdr->flags == 0 && hdr->gso_type == VIRTIO_NET_HDR_GSO_NONE)
642		return 0;
643
644	m->ol_flags |= PKT_RX_IP_CKSUM_UNKNOWN;
645
646	ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
647	m->packet_type = ptype;
648	if ((ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_TCP ||
649	    (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_UDP ||
650	    (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_SCTP)
651		l4_supported = 1;
652
653	if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
654		hdrlen = hdr_lens.l2_len + hdr_lens.l3_len + hdr_lens.l4_len;
655		if (hdr->csum_start <= hdrlen && l4_supported) {
656			m->ol_flags |= PKT_RX_L4_CKSUM_NONE;
657		} else {
658			/* Unknown proto or tunnel, do sw cksum. We can assume
659			 * the cksum field is in the first segment since the
660			 * buffers we provided to the host are large enough.
661			 * In case of SCTP, this will be wrong since it's a CRC
662			 * but there's nothing we can do.
663			 */
664			uint16_t csum, off;
665
666			rte_raw_cksum_mbuf(m, hdr->csum_start,
667				rte_pktmbuf_pkt_len(m) - hdr->csum_start,
668				&csum);
669			if (likely(csum != 0xffff))
670				csum = ~csum;
671			off = hdr->csum_offset + hdr->csum_start;
672			if (rte_pktmbuf_data_len(m) >= off + 1)
673				*rte_pktmbuf_mtod_offset(m, uint16_t *,
674					off) = csum;
675		}
676	} else if (hdr->flags & VIRTIO_NET_HDR_F_DATA_VALID && l4_supported) {
677		m->ol_flags |= PKT_RX_L4_CKSUM_GOOD;
678	}
679
680	/* GSO request, save required information in mbuf */
681	if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
682		/* Check unsupported modes */
683		if ((hdr->gso_type & VIRTIO_NET_HDR_GSO_ECN) ||
684		    (hdr->gso_size == 0)) {
685			return -EINVAL;
686		}
687
688		/* Update mss length in mbuf */
689		m->tso_segsz = hdr->gso_size;
690		switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
691		case VIRTIO_NET_HDR_GSO_TCPV4:
692		case VIRTIO_NET_HDR_GSO_TCPV6:
693			m->ol_flags |= PKT_RX_LRO |
694				PKT_RX_L4_CKSUM_NONE;
695			break;
696		default:
697			return -EINVAL;
698		}
699	}
700
701	return 0;
702}
703
704static inline int
705rx_offload_enabled(struct virtio_hw *hw)
706{
707	return vtpci_with_feature(hw, VIRTIO_NET_F_GUEST_CSUM) ||
708		vtpci_with_feature(hw, VIRTIO_NET_F_GUEST_TSO4) ||
709		vtpci_with_feature(hw, VIRTIO_NET_F_GUEST_TSO6);
710}
711
712#define VIRTIO_MBUF_BURST_SZ 64
713#define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
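/*
 * With the common 64-byte cache line and 16-byte struct vring_desc this
 * works out to 4 descriptors per line; the receive loop below trims the
 * burst so that the used-ring consumer index stops on such a boundary,
 * keeping consecutive bursts cache-line aligned.
 */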
714uint16_t
715virtio_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
716{
717	struct virtnet_rx *rxvq = rx_queue;
718	struct virtqueue *vq = rxvq->vq;
719	struct virtio_hw *hw;
720	struct rte_mbuf *rxm, *new_mbuf;
721	uint16_t nb_used, num, nb_rx;
722	uint32_t len[VIRTIO_MBUF_BURST_SZ];
723	struct rte_mbuf *rcv_pkts[VIRTIO_MBUF_BURST_SZ];
724	int error;
725	uint32_t i, nb_enqueued;
726	uint32_t hdr_size;
727	int offload;
728	struct virtio_net_hdr *hdr;
729
730	nb_used = VIRTQUEUE_NUSED(vq);
731
732	virtio_rmb();
733
734	num = (uint16_t)(likely(nb_used <= nb_pkts) ? nb_used : nb_pkts);
735	num = (uint16_t)(likely(num <= VIRTIO_MBUF_BURST_SZ) ? num : VIRTIO_MBUF_BURST_SZ);
736	if (likely(num > DESC_PER_CACHELINE))
737		num = num - ((vq->vq_used_cons_idx + num) % DESC_PER_CACHELINE);
738
739	num = virtqueue_dequeue_burst_rx(vq, rcv_pkts, len, num);
740	PMD_RX_LOG(DEBUG, "used:%d dequeue:%d", nb_used, num);
741
742	hw = vq->hw;
743	nb_rx = 0;
744	nb_enqueued = 0;
745	hdr_size = hw->vtnet_hdr_size;
746	offload = rx_offload_enabled(hw);
747
748	for (i = 0; i < num ; i++) {
749		rxm = rcv_pkts[i];
750
751		PMD_RX_LOG(DEBUG, "packet len:%d", len[i]);
752
753		if (unlikely(len[i] < hdr_size + ETHER_HDR_LEN)) {
754			PMD_RX_LOG(ERR, "Packet drop");
755			nb_enqueued++;
756			virtio_discard_rxbuf(vq, rxm);
757			rxvq->stats.errors++;
758			continue;
759		}
760
761		rxm->port = rxvq->port_id;
762		rxm->data_off = RTE_PKTMBUF_HEADROOM;
763		rxm->ol_flags = 0;
764		rxm->vlan_tci = 0;
765
766		rxm->nb_segs = 1;
767		rxm->next = NULL;
768		rxm->pkt_len = (uint32_t)(len[i] - hdr_size);
769		rxm->data_len = (uint16_t)(len[i] - hdr_size);
770
771		hdr = (struct virtio_net_hdr *)((char *)rxm->buf_addr +
772			RTE_PKTMBUF_HEADROOM - hdr_size);
773
774		if (hw->vlan_strip)
775			rte_vlan_strip(rxm);
776
777		if (offload && virtio_rx_offload(rxm, hdr) < 0) {
778			virtio_discard_rxbuf(vq, rxm);
779			rxvq->stats.errors++;
780			continue;
781		}
782
783		VIRTIO_DUMP_PACKET(rxm, rxm->data_len);
784
785		rx_pkts[nb_rx++] = rxm;
786
787		rxvq->stats.bytes += rx_pkts[nb_rx - 1]->pkt_len;
788		virtio_update_packet_stats(&rxvq->stats, rxm);
789	}
790
791	rxvq->stats.packets += nb_rx;
792
793	/* Allocate new mbuf for the used descriptor */
794	error = ENOSPC;
795	while (likely(!virtqueue_full(vq))) {
796		new_mbuf = rte_mbuf_raw_alloc(rxvq->mpool);
797		if (unlikely(new_mbuf == NULL)) {
798			struct rte_eth_dev *dev
799				= &rte_eth_devices[rxvq->port_id];
800			dev->data->rx_mbuf_alloc_failed++;
801			break;
802		}
803		error = virtqueue_enqueue_recv_refill(vq, new_mbuf);
804		if (unlikely(error)) {
805			rte_pktmbuf_free(new_mbuf);
806			break;
807		}
808		nb_enqueued++;
809	}
810
811	if (likely(nb_enqueued)) {
812		vq_update_avail_idx(vq);
813
814		if (unlikely(virtqueue_kick_prepare(vq))) {
815			virtqueue_notify(vq);
816			PMD_RX_LOG(DEBUG, "Notified");
817		}
818	}
819
820	return nb_rx;
821}
822
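/*
 * Receive path used when VIRTIO_NET_F_MRG_RXBUF is negotiated: the
 * num_buffers field of the first buffer's header says how many
 * descriptors the device consumed for the packet. Only that first
 * buffer carries a virtio-net header; the follow-on buffers are pure
 * payload, so their data offset is pulled back by the header size and
 * they are chained onto the head mbuf until all segments are gathered.
 */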
823uint16_t
824virtio_recv_mergeable_pkts(void *rx_queue,
825			struct rte_mbuf **rx_pkts,
826			uint16_t nb_pkts)
827{
828	struct virtnet_rx *rxvq = rx_queue;
829	struct virtqueue *vq = rxvq->vq;
830	struct virtio_hw *hw;
831	struct rte_mbuf *rxm, *new_mbuf;
832	uint16_t nb_used, num, nb_rx;
833	uint32_t len[VIRTIO_MBUF_BURST_SZ];
834	struct rte_mbuf *rcv_pkts[VIRTIO_MBUF_BURST_SZ];
835	struct rte_mbuf *prev;
836	int error;
837	uint32_t i, nb_enqueued;
838	uint32_t seg_num;
839	uint16_t extra_idx;
840	uint32_t seg_res;
841	uint32_t hdr_size;
842	int offload;
843
844	nb_used = VIRTQUEUE_NUSED(vq);
845
846	virtio_rmb();
847
848	PMD_RX_LOG(DEBUG, "used:%d", nb_used);
849
850	hw = vq->hw;
851	nb_rx = 0;
852	i = 0;
853	nb_enqueued = 0;
854	seg_num = 0;
855	extra_idx = 0;
856	seg_res = 0;
857	hdr_size = hw->vtnet_hdr_size;
858	offload = rx_offload_enabled(hw);
859
860	while (i < nb_used) {
861		struct virtio_net_hdr_mrg_rxbuf *header;
862
863		if (nb_rx == nb_pkts)
864			break;
865
866		num = virtqueue_dequeue_burst_rx(vq, rcv_pkts, len, 1);
867		if (num != 1)
868			continue;
869
870		i++;
871
872		PMD_RX_LOG(DEBUG, "dequeue:%d", num);
873		PMD_RX_LOG(DEBUG, "packet len:%d", len[0]);
874
875		rxm = rcv_pkts[0];
876
877		if (unlikely(len[0] < hdr_size + ETHER_HDR_LEN)) {
878			PMD_RX_LOG(ERR, "Packet drop");
879			nb_enqueued++;
880			virtio_discard_rxbuf(vq, rxm);
881			rxvq->stats.errors++;
882			continue;
883		}
884
885		header = (struct virtio_net_hdr_mrg_rxbuf *)((char *)rxm->buf_addr +
886			RTE_PKTMBUF_HEADROOM - hdr_size);
887		seg_num = header->num_buffers;
888
889		if (seg_num == 0)
890			seg_num = 1;
891
892		rxm->data_off = RTE_PKTMBUF_HEADROOM;
893		rxm->nb_segs = seg_num;
894		rxm->next = NULL;
895		rxm->ol_flags = 0;
896		rxm->vlan_tci = 0;
897		rxm->pkt_len = (uint32_t)(len[0] - hdr_size);
898		rxm->data_len = (uint16_t)(len[0] - hdr_size);
899
900		rxm->port = rxvq->port_id;
901		rx_pkts[nb_rx] = rxm;
902		prev = rxm;
903
904		if (offload && virtio_rx_offload(rxm, &header->hdr) < 0) {
905			virtio_discard_rxbuf(vq, rxm);
906			rxvq->stats.errors++;
907			continue;
908		}
909
910		seg_res = seg_num - 1;
911
912		while (seg_res != 0) {
913			/*
914			 * Get extra segments for current uncompleted packet.
915			 */
916			uint16_t  rcv_cnt =
917				RTE_MIN(seg_res, RTE_DIM(rcv_pkts));
918			if (likely(VIRTQUEUE_NUSED(vq) >= rcv_cnt)) {
919				uint32_t rx_num =
920					virtqueue_dequeue_burst_rx(vq,
921					rcv_pkts, len, rcv_cnt);
922				i += rx_num;
923				rcv_cnt = rx_num;
924			} else {
925				PMD_RX_LOG(ERR,
926				   "Not enough segments for packet.");
927				nb_enqueued++;
928				virtio_discard_rxbuf(vq, rxm);
929				rxvq->stats.errors++;
930				break;
931			}
932
933			extra_idx = 0;
934
935			while (extra_idx < rcv_cnt) {
936				rxm = rcv_pkts[extra_idx];
937
938				rxm->data_off = RTE_PKTMBUF_HEADROOM - hdr_size;
939				rxm->next = NULL;
940				rxm->pkt_len = (uint32_t)(len[extra_idx]);
941				rxm->data_len = (uint16_t)(len[extra_idx]);
942
943				if (prev)
944					prev->next = rxm;
945
946				prev = rxm;
947				rx_pkts[nb_rx]->pkt_len += rxm->pkt_len;
948				extra_idx++;
949			}
950			seg_res -= rcv_cnt;
951		}
952
953		if (hw->vlan_strip)
954			rte_vlan_strip(rx_pkts[nb_rx]);
955
956		VIRTIO_DUMP_PACKET(rx_pkts[nb_rx],
957			rx_pkts[nb_rx]->data_len);
958
959		rxvq->stats.bytes += rx_pkts[nb_rx]->pkt_len;
960		virtio_update_packet_stats(&rxvq->stats, rx_pkts[nb_rx]);
961		nb_rx++;
962	}
963
964	rxvq->stats.packets += nb_rx;
965
966	/* Allocate new mbuf for the used descriptor */
967	error = ENOSPC;
968	while (likely(!virtqueue_full(vq))) {
969		new_mbuf = rte_mbuf_raw_alloc(rxvq->mpool);
970		if (unlikely(new_mbuf == NULL)) {
971			struct rte_eth_dev *dev
972				= &rte_eth_devices[rxvq->port_id];
973			dev->data->rx_mbuf_alloc_failed++;
974			break;
975		}
976		error = virtqueue_enqueue_recv_refill(vq, new_mbuf);
977		if (unlikely(error)) {
978			rte_pktmbuf_free(new_mbuf);
979			break;
980		}
981		nb_enqueued++;
982	}
983
984	if (likely(nb_enqueued)) {
985		vq_update_avail_idx(vq);
986
987		if (unlikely(virtqueue_kick_prepare(vq))) {
988			virtqueue_notify(vq);
989			PMD_RX_LOG(DEBUG, "Notified");
990		}
991	}
992
993	return nb_rx;
994}
995
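/*
 * Transmit burst. The number of main-ring slots needed per packet is 1
 * when indirect descriptors are used, nb_segs when the header can be
 * pushed into the mbuf headroom, and nb_segs + 1 otherwise. When the
 * free count runs short, completed transmissions are reclaimed with
 * virtio_xmit_cleanup() before giving up on the burst.
 */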
996uint16_t
997virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
998{
999	struct virtnet_tx *txvq = tx_queue;
1000	struct virtqueue *vq = txvq->vq;
1001	struct virtio_hw *hw = vq->hw;
1002	uint16_t hdr_size = hw->vtnet_hdr_size;
1003	uint16_t nb_used, nb_tx;
1004	int error;
1005
1006	if (unlikely(nb_pkts < 1))
1007		return nb_pkts;
1008
1009	PMD_TX_LOG(DEBUG, "%d packets to xmit", nb_pkts);
1010	nb_used = VIRTQUEUE_NUSED(vq);
1011
1012	virtio_rmb();
1013	if (likely(nb_used > vq->vq_nentries - vq->vq_free_thresh))
1014		virtio_xmit_cleanup(vq, nb_used);
1015
1016	for (nb_tx = 0; nb_tx < nb_pkts; nb_tx++) {
1017		struct rte_mbuf *txm = tx_pkts[nb_tx];
1018		int can_push = 0, use_indirect = 0, slots, need;
1019
1020		/* Do VLAN tag insertion */
1021		if (unlikely(txm->ol_flags & PKT_TX_VLAN_PKT)) {
1022			error = rte_vlan_insert(&txm);
1023			if (unlikely(error)) {
1024				rte_pktmbuf_free(txm);
1025				continue;
1026			}
1027		}
1028
1029		/* optimize ring usage */
1030		if (vtpci_with_feature(hw, VIRTIO_F_ANY_LAYOUT) &&
1031		    rte_mbuf_refcnt_read(txm) == 1 &&
1032		    RTE_MBUF_DIRECT(txm) &&
1033		    txm->nb_segs == 1 &&
1034		    rte_pktmbuf_headroom(txm) >= hdr_size &&
1035		    rte_is_aligned(rte_pktmbuf_mtod(txm, char *),
1036				   __alignof__(struct virtio_net_hdr_mrg_rxbuf)))
1037			can_push = 1;
1038		else if (vtpci_with_feature(hw, VIRTIO_RING_F_INDIRECT_DESC) &&
1039			 txm->nb_segs < VIRTIO_MAX_TX_INDIRECT)
1040			use_indirect = 1;
1041
1042		/* How many main ring entries are needed for this Tx?
1043		 * any_layout => number of segments
1044		 * indirect   => 1
1045		 * default    => number of segments + 1
1046		 */
1047		slots = use_indirect ? 1 : (txm->nb_segs + !can_push);
1048		need = slots - vq->vq_free_cnt;
1049
1050		/* A positive value means free vring descriptors are needed */
1051		if (unlikely(need > 0)) {
1052			nb_used = VIRTQUEUE_NUSED(vq);
1053			virtio_rmb();
1054			need = RTE_MIN(need, (int)nb_used);
1055
1056			virtio_xmit_cleanup(vq, need);
1057			need = slots - vq->vq_free_cnt;
1058			if (unlikely(need > 0)) {
1059				PMD_TX_LOG(ERR,
1060					   "No free tx descriptors to transmit");
1061				break;
1062			}
1063		}
1064
1065		/* Enqueue Packet buffers */
1066		virtqueue_enqueue_xmit(txvq, txm, slots, use_indirect, can_push);
1067
1068		txvq->stats.bytes += txm->pkt_len;
1069		virtio_update_packet_stats(&txvq->stats, txm);
1070	}
1071
1072	txvq->stats.packets += nb_tx;
1073
1074	if (likely(nb_tx)) {
1075		vq_update_avail_idx(vq);
1076
1077		if (unlikely(virtqueue_kick_prepare(vq))) {
1078			virtqueue_notify(vq);
1079			PMD_TX_LOG(DEBUG, "Notified backend after xmit");
1080		}
1081	}
1082
1083	return nb_tx;
1084}
1085