main.c revision 9365d6cf
1/*-
2 *   BSD LICENSE
3 *
4 *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
5 *   All rights reserved.
6 *
7 *   Redistribution and use in source and binary forms, with or without
8 *   modification, are permitted provided that the following conditions
9 *   are met:
10 *
11 *     * Redistributions of source code must retain the above copyright
12 *       notice, this list of conditions and the following disclaimer.
13 *     * Redistributions in binary form must reproduce the above copyright
14 *       notice, this list of conditions and the following disclaimer in
15 *       the documentation and/or other materials provided with the
16 *       distribution.
17 *     * Neither the name of Intel Corporation nor the names of its
18 *       contributors may be used to endorse or promote products derived
19 *       from this software without specific prior written permission.
20 *
21 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
34#include <arpa/inet.h>
35#include <getopt.h>
36#include <linux/if_ether.h>
37#include <linux/if_vlan.h>
38#include <linux/virtio_net.h>
39#include <linux/virtio_ring.h>
40#include <signal.h>
41#include <stdint.h>
42#include <sys/eventfd.h>
43#include <sys/param.h>
44#include <unistd.h>
45
46#include <rte_atomic.h>
47#include <rte_cycles.h>
48#include <rte_ethdev.h>
49#include <rte_log.h>
50#include <rte_string_fns.h>
51
52#include "main.h"
53#include "virtio-net.h"
54#include "xen_vhost.h"
55
56#define MAX_QUEUES 128
57
58/* the maximum number of external ports supported */
59#define MAX_SUP_PORTS 1
60
61/*
62 * Calculate the number of buffers needed per port
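 * (RX descriptors for every VMDQ queue, plus, per switching core, one
 * in-flight packet burst, a full TX descriptor ring and the core's private
 * mempool cache).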
63 */
64#define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +		\
65							(num_switching_cores*MAX_PKT_BURST) +  			\
66							(num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
67							(num_switching_cores*MBUF_CACHE_SIZE))
68
69#define MBUF_CACHE_SIZE 64
70
71/*
72 * RX and TX Prefetch, Host, and Write-back threshold values should be
73 * carefully set for optimal performance. Consult the network
74 * controller's datasheet and supporting DPDK documentation for guidance
75 * on how these parameters should be set.
76 */
77#define RX_PTHRESH 8 /* Default values of RX prefetch threshold reg. */
78#define RX_HTHRESH 8 /* Default values of RX host threshold reg. */
79#define RX_WTHRESH 4 /* Default values of RX write-back threshold reg. */
80
81/*
82 * These default values are optimized for use with the Intel(R) 82599 10 GbE
83 * Controller and the DPDK ixgbe PMD. Consider using other values for other
84 * network controllers and/or network drivers.
85 */
86#define TX_PTHRESH 36 /* Default values of TX prefetch threshold reg. */
87#define TX_HTHRESH 0  /* Default values of TX host threshold reg. */
88#define TX_WTHRESH 0  /* Default values of TX write-back threshold reg. */
89
90#define MAX_PKT_BURST 32		/* Max burst size for RX/TX */
91#define MAX_MRG_PKT_BURST 16	/* Max burst size for mergeable RX buffers. */
92#define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
93
94/* State of virtio device. */
95#define DEVICE_NOT_READY     0
96#define DEVICE_READY         1
97#define DEVICE_SAFE_REMOVE   2
98
99/* Config_core_flag status definitions. */
100#define REQUEST_DEV_REMOVAL 1
101#define ACK_DEV_REMOVAL 0
102
103/* Configurable number of RX/TX ring descriptors */
104#define RTE_TEST_RX_DESC_DEFAULT 128
105#define RTE_TEST_TX_DESC_DEFAULT 512
106
107#define INVALID_PORT_ID 0xFF
108
109/* Max number of devices. Limited by vmdq. */
110#define MAX_DEVICES 64
111
112/* Size of buffers used for snprintfs. */
113#define MAX_PRINT_BUFF 6072
114
115
116/* Maximum long option length for option parsing. */
117#define MAX_LONG_OPT_SZ 64
118
119/* Used to compare MAC addresses. */
120#define MAC_ADDR_CMP 0xFFFFFFFFFFFF
121
122/* mask of enabled ports */
123static uint32_t enabled_port_mask = 0;
124
125/*Number of switching cores enabled*/
126static uint32_t num_switching_cores = 0;
127
128/* number of devices/queues to support*/
129static uint32_t num_queues = 0;
130uint32_t num_devices = 0;
131
132/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
133static uint32_t enable_vm2vm = 1;
134/* Enable stats. */
135static uint32_t enable_stats = 0;
136
137/* Empty VMDQ configuration structure. Filled in programmatically. */
138static const struct rte_eth_conf vmdq_conf_default = {
139	.rxmode = {
140		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
141		.split_hdr_size = 0,
142		.header_split   = 0, /**< Header Split disabled */
143		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
144		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
145		/*
146		 * VLAN stripping is necessary for 1G NICs such as the I350;
147		 * it fixes a bug where IPv4 forwarding in the guest could not
148		 * forward packets from one virtio device to another.
149		 */
150		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
151		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
152		.hw_strip_crc   = 1, /**< CRC stripped by hardware */
153	},
154
155	.txmode = {
156		.mq_mode = ETH_MQ_TX_NONE,
157	},
158	.rx_adv_conf = {
159		/*
160		 * should be overridden separately in code with
161		 * appropriate values
162		 */
163		.vmdq_rx_conf = {
164			.nb_queue_pools = ETH_8_POOLS,
165			.enable_default_pool = 0,
166			.default_pool = 0,
167			.nb_pool_maps = 0,
168			.pool_map = {{0, 0},},
169		},
170	},
171};
172
173static unsigned lcore_ids[RTE_MAX_LCORE];
174static uint8_t ports[RTE_MAX_ETHPORTS];
175static unsigned num_ports = 0; /**< The number of ports specified in command line */
176
177const uint16_t vlan_tags[] = {
178	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
179	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
180	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
181	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
182	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
183	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
184	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
185	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
186};
187
188/* ethernet addresses of ports */
189static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
190
191/* heads for the main used and free linked lists for the data path. */
192static struct virtio_net_data_ll *ll_root_used = NULL;
193static struct virtio_net_data_ll *ll_root_free = NULL;
194
195/* Array of data core structures containing information on individual core linked lists. */
196static struct lcore_info lcore_info[RTE_MAX_LCORE];
197
198/* Used for queueing bursts of TX packets. */
199struct mbuf_table {
200	unsigned len;
201	unsigned txq_id;
202	struct rte_mbuf *m_table[MAX_PKT_BURST];
203};
204
205/* TX queue for each data core. */
206struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
207
208/* Vlan header struct used to insert vlan tags on TX. */
209struct vlan_ethhdr {
210	unsigned char   h_dest[ETH_ALEN];
211	unsigned char   h_source[ETH_ALEN];
212	__be16          h_vlan_proto;
213	__be16          h_vlan_TCI;
214	__be16          h_vlan_encapsulated_proto;
215};
216
217/* Header lengths. */
218#define VLAN_HLEN       4
219#define VLAN_ETH_HLEN   18
220
221/* Per-device statistics struct */
222struct device_statistics {
223	uint64_t tx_total;
224	rte_atomic64_t rx_total;
225	uint64_t tx;
226	rte_atomic64_t rx;
227} __rte_cache_aligned;
228struct device_statistics dev_statistics[MAX_DEVICES];
229
230/*
231 * Builds up the correct configuration for VMDQ VLAN pool map
232 * according to the pool & queue limits.
233 */
234static inline int
235get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
236{
237	struct rte_eth_vmdq_rx_conf conf;
238	unsigned i;
239
240	memset(&conf, 0, sizeof(conf));
241	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
242	conf.nb_pool_maps = num_devices;
243
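	/*
	 * One pool per device: pool i accepts only VLAN tag vlan_tags[i], and the
	 * one-hot pools bitmask below steers that VLAN into pool i alone.
	 */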
244	for (i = 0; i < conf.nb_pool_maps; i++) {
245		conf.pool_map[i].vlan_id = vlan_tags[ i ];
246		conf.pool_map[i].pools = (1UL << i);
247	}
248
249	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
250	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
251		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
252	return 0;
253}
254
255/*
256 * Validate the device number against the max pool number obtained from dev_info.
257 * If the device number is invalid, print an error message and return -1.
258 * Each device must have its own pool.
259 */
260static inline int
261validate_num_devices(uint32_t max_nb_devices)
262{
263	if (num_devices > max_nb_devices) {
264		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
265		return -1;
266	}
267	return 0;
268}
269
270/*
271 * Initialises a given port using global settings and with the rx buffers
272 * coming from the mbuf_pool passed as parameter
273 */
274static inline int
275port_init(uint8_t port, struct rte_mempool *mbuf_pool)
276{
277	struct rte_eth_dev_info dev_info;
278	struct rte_eth_rxconf *rxconf;
279	struct rte_eth_conf port_conf;
280	uint16_t rx_rings, tx_rings = (uint16_t)rte_lcore_count();
281	const uint16_t rx_ring_size = RTE_TEST_RX_DESC_DEFAULT, tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
282	int retval;
283	uint16_t q;
284
285	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
286	rte_eth_dev_info_get (port, &dev_info);
287
288	/*configure the number of supported virtio devices based on VMDQ limits */
289	num_devices = dev_info.max_vmdq_pools;
290	num_queues = dev_info.max_rx_queues;
291
292	retval = validate_num_devices(MAX_DEVICES);
293	if (retval < 0)
294		return retval;
295
296	/* Get port configuration. */
297	retval = get_eth_conf(&port_conf, num_devices);
298	if (retval < 0)
299		return retval;
300
301	if (port >= rte_eth_dev_count()) return -1;
302
303	rx_rings = (uint16_t)num_queues;
304	/* Configure ethernet device. */
305	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
306	if (retval != 0)
307		return retval;
308
309	rte_eth_dev_info_get(port, &dev_info);
310	rxconf = &dev_info.default_rxconf;
311	rxconf->rx_drop_en = 1;
312	/* Setup the queues. */
313	for (q = 0; q < rx_rings; q ++) {
314		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
315						rte_eth_dev_socket_id(port), rxconf,
316						mbuf_pool);
317		if (retval < 0)
318			return retval;
319	}
320	for (q = 0; q < tx_rings; q ++) {
321		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
322						rte_eth_dev_socket_id(port),
323						NULL);
324		if (retval < 0)
325			return retval;
326	}
327
328	/* Start the device. */
329	retval  = rte_eth_dev_start(port);
330	if (retval < 0)
331		return retval;
332
333	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
334	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
335	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
336			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
337			(unsigned)port,
338			vmdq_ports_eth_addr[port].addr_bytes[0],
339			vmdq_ports_eth_addr[port].addr_bytes[1],
340			vmdq_ports_eth_addr[port].addr_bytes[2],
341			vmdq_ports_eth_addr[port].addr_bytes[3],
342			vmdq_ports_eth_addr[port].addr_bytes[4],
343			vmdq_ports_eth_addr[port].addr_bytes[5]);
344
345	return 0;
346}
347
348/*
349 * Parse the portmask provided at run time.
350 */
351static int
352parse_portmask(const char *portmask)
353{
354	char *end = NULL;
355	unsigned long pm;
356
357	errno = 0;
358
359	/* parse hexadecimal string */
360	pm = strtoul(portmask, &end, 16);
361	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
362		return -1;
363
364	if (pm == 0)
365		return -1;
366
367	return pm;
368
369}
370
371/*
372 * Parse num options at run time.
373 */
374static int
375parse_num_opt(const char *q_arg, uint32_t max_valid_value)
376{
377	char *end = NULL;
378	unsigned long num;
379
380	errno = 0;
381
382	/* parse unsigned int string */
383	num = strtoul(q_arg, &end, 10);
384	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
385		return -1;
386
387	if (num > max_valid_value)
388		return -1;
389
390	return num;
391
392}
393
394/*
395 * Display usage
396 */
397static void
398us_vhost_usage(const char *prgname)
399{
400	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK --vm2vm [0|1] --stats [0-N] --nb-devices ND\n"
401	"		-p PORTMASK: Set mask for ports to be used by application\n"
402	"		--vm2vm [0|1]: disable/enable(default) vm2vm comms\n"
403	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n",
404	       prgname);
405}
406
407/*
408 * Parse the arguments given in the command line of the application.
409 */
410static int
411us_vhost_parse_args(int argc, char **argv)
412{
413	int opt, ret;
414	int option_index;
415	unsigned i;
416	const char *prgname = argv[0];
417	static struct option long_option[] = {
418		{"vm2vm", required_argument, NULL, 0},
419		{"stats", required_argument, NULL, 0},
420		{NULL, 0, 0, 0}
421	};
422
423	/* Parse command line */
424	while ((opt = getopt_long(argc, argv, "p:",long_option, &option_index)) != EOF) {
425		switch (opt) {
426		/* Portmask */
427		case 'p':
428			ret = parse_portmask(optarg);
429			if (ret == -1) {
430				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
431				us_vhost_usage(prgname);
432				return -1;
433			}
434			enabled_port_mask = (uint32_t)ret;
			break;
435
436		case 0:
437			/* Enable/disable vm2vm comms. */
438			if (!strncmp(long_option[option_index].name, "vm2vm", MAX_LONG_OPT_SZ)) {
439				ret = parse_num_opt(optarg, 1);
440				if (ret == -1) {
441					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for vm2vm [0|1]\n");
442					us_vhost_usage(prgname);
443					return -1;
444				} else {
445					enable_vm2vm = ret;
446				}
447			}
448
449			/* Enable/disable stats. */
450			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
451				ret = parse_num_opt(optarg, INT32_MAX);
452				if (ret == -1) {
453					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
454					us_vhost_usage(prgname);
455					return -1;
456				} else {
457					enable_stats = ret;
458				}
459			}
460			break;
461
462			/* Invalid option - print options. */
463		default:
464			us_vhost_usage(prgname);
465			return -1;
466		}
467	}
468
469	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
470		if (enabled_port_mask & (1 << i))
471			ports[num_ports++] = (uint8_t)i;
472	}
473
474	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
475		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
476			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
477		return -1;
478	}
479
480	return 0;
481}
482
483/*
484 * Update the global variable num_ports and the ports array according to the
485 * number of system ports, and return the number of valid ports.
486 */
487static unsigned check_ports_num(unsigned nb_ports)
488{
489	unsigned valid_num_ports = num_ports;
490	unsigned portid;
491
492	if (num_ports > nb_ports) {
493		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
494			num_ports, nb_ports);
495		num_ports = nb_ports;
496	}
497
498	for (portid = 0; portid < num_ports; portid ++) {
499		if (ports[portid] >= nb_ports) {
500			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
501				ports[portid], (nb_ports - 1));
502			ports[portid] = INVALID_PORT_ID;
503			valid_num_ports--;
504		}
505	}
506	return valid_num_ports;
507}
508
509/*
510 * Function to convert guest physical addresses to vhost virtual addresses. This
511 * is used to convert virtio buffer addresses.
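 * Each memory region carries an address_offset (presumably the host virtual
 * base minus the guest physical base), so the translation is simply
 * vva = gpa + address_offset.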
512 */
513static inline uint64_t __attribute__((always_inline))
514gpa_to_vva(struct virtio_net *dev, uint64_t guest_pa)
515{
516	struct virtio_memory_regions *region;
517	uint32_t regionidx;
518	uint64_t vhost_va = 0;
519
520	for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
521		region = &dev->mem->regions[regionidx];
522		if ((guest_pa >= region->guest_phys_address) &&
523			(guest_pa <= region->guest_phys_address_end)) {
524			vhost_va = region->address_offset + guest_pa;
525			break;
526		}
527	}
528	RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") GPA %p| VVA %p\n",
529		dev->device_fh, (void*)(uintptr_t)guest_pa, (void*)(uintptr_t)vhost_va);
530
531	return vhost_va;
532}
533
534/*
535 * This function adds buffers to the virtio devices RX virtqueue. Buffers can
536 * be received from the physical port or from another virtio device. A packet
537 * count is returned to indicate the number of packets that were successfully
538 * added to the RX queue.
539 */
540static inline uint32_t __attribute__((always_inline))
541virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
542{
543	struct vhost_virtqueue *vq;
544	struct vring_desc *desc;
545	struct rte_mbuf *buff;
546	/* The virtio_hdr is initialised to 0. */
547	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0,0,0,0,0,0},0};
548	uint64_t buff_addr = 0;
549	uint64_t buff_hdr_addr = 0;
550	uint32_t head[MAX_PKT_BURST], packet_len = 0;
551	uint32_t head_idx, packet_success = 0;
552	uint16_t avail_idx, res_cur_idx;
553	uint16_t res_base_idx, res_end_idx;
554	uint16_t free_entries;
555	uint8_t success = 0;
556	void *userdata;
557
558	RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") virtio_dev_rx()\n", dev->device_fh);
559	vq = dev->virtqueue_rx;
560	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
561	/* Multiple data cores may want access to the available buffers, so they need to be reserved. */
562	do {
563
564		res_base_idx = vq->last_used_idx_res;
565
566		avail_idx = *((volatile uint16_t *)&vq->avail->idx);
567
568		free_entries = (avail_idx - res_base_idx);
569
570		/*check that we have enough buffers*/
571		if (unlikely(count > free_entries))
572			count = free_entries;
573
574		if (count == 0)
575			return 0;
576
577		res_end_idx = res_base_idx + count;
578		/* vq->last_used_idx_res is atomically updated. */
579		success = rte_atomic16_cmpset(&vq->last_used_idx_res, res_base_idx,
580									res_end_idx);
581	} while (unlikely(success == 0));
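	/*
	 * The window [res_base_idx, res_end_idx) now belongs to this core alone;
	 * cores that reserve later windows publish to the used ring in reservation
	 * order (see the last_used_idx wait below).
	 */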
582	res_cur_idx = res_base_idx;
583	RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") Current Index %d| End Index %d\n",
584		dev->device_fh, res_cur_idx, res_end_idx);
585
586	/* Prefetch available ring to retrieve indexes. */
587	rte_prefetch0(&vq->avail->ring[res_cur_idx & (vq->size - 1)]);
588
589	/* Retrieve all of the head indexes first to avoid caching issues. */
590	for (head_idx = 0; head_idx < count; head_idx++)
591		head[head_idx] = vq->avail->ring[(res_cur_idx + head_idx) & (vq->size - 1)];
592
593	/*Prefetch descriptor index. */
594	rte_prefetch0(&vq->desc[head[packet_success]]);
595
596	while (res_cur_idx != res_end_idx) {
597		/* Get descriptor from available ring */
598		desc = &vq->desc[head[packet_success]];
599		/* Prefetch descriptor address. */
600		rte_prefetch0(desc);
601
602		buff = pkts[packet_success];
603
604		/* Convert from gpa to vva (guest physical addr -> vhost virtual addr) */
605		buff_addr = gpa_to_vva(dev, desc->addr);
606		/* Prefetch buffer address. */
607		rte_prefetch0((void*)(uintptr_t)buff_addr);
608
609		{
610			/* Copy virtio_hdr to packet and increment buffer address */
611			buff_hdr_addr = buff_addr;
612			packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
613
614			/*
615			 * If the descriptors are chained the header and data are placed in
616			 * separate buffers.
617			 */
618			if (desc->flags & VRING_DESC_F_NEXT) {
619				desc->len = vq->vhost_hlen;
620				desc = &vq->desc[desc->next];
621				/* Buffer address translation. */
622				buff_addr = gpa_to_vva(dev, desc->addr);
623				desc->len = rte_pktmbuf_data_len(buff);
624			} else {
625				buff_addr += vq->vhost_hlen;
626				desc->len = packet_len;
627			}
628		}
629
630		/* Update used ring with desc information */
631		vq->used->ring[res_cur_idx & (vq->size - 1)].id = head[packet_success];
632		vq->used->ring[res_cur_idx & (vq->size - 1)].len = packet_len;
633
634		/* Copy mbuf data to buffer */
635		userdata = rte_pktmbuf_mtod(buff, void *);
636		rte_memcpy((void *)(uintptr_t)buff_addr, userdata, rte_pktmbuf_data_len(buff));
637
638		res_cur_idx++;
639		packet_success++;
640
641		/* Mergeable buffers are disabled, so a header is required per buffer. */
642		rte_memcpy((void *)(uintptr_t)buff_hdr_addr, (const void *)&virtio_hdr, vq->vhost_hlen);
643		if (res_cur_idx < res_end_idx) {
644			/* Prefetch descriptor index. */
645			rte_prefetch0(&vq->desc[head[packet_success]]);
646		}
647	}
648
649	rte_compiler_barrier();
650
651	/* Wait until it's our turn to add our buffer to the used ring. */
652	while (unlikely(vq->last_used_idx != res_base_idx))
653		rte_pause();
654
655	*(volatile uint16_t *)&vq->used->idx += count;
656
657	vq->last_used_idx = res_end_idx;
658
659	return count;
660}
661
662/*
663 * Compares a packet destination MAC address to a device MAC address.
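 * Both 6-byte addresses are loaded as 64-bit words and the two excess bytes
 * are masked off with MAC_ADDR_CMP; this assumes that reading 8 bytes from
 * each address is safe, which holds here since the addresses live inside
 * larger structures and packet buffers.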
664 */
665static inline int __attribute__((always_inline))
666ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
667{
668	return ((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0;
669}
670
671/*
672 * This function registers mac along with a
673 * vlan tag to a VMDQ.
674 */
675static int
676link_vmdq(struct virtio_net *dev)
677{
678	int ret;
679	struct virtio_net_data_ll *dev_ll;
680
681	dev_ll = ll_root_used;
682
683	while (dev_ll != NULL) {
684		if ((dev != dev_ll->dev) && ether_addr_cmp(&dev->mac_address, &dev_ll->dev->mac_address)) {
685			RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
686			return -1;
687		}
688		dev_ll = dev_ll->next;
689	}
690
691	/* vlan_tag currently uses the device_id. */
692	dev->vlan_tag = vlan_tags[dev->device_fh];
693	dev->vmdq_rx_q = dev->device_fh * (num_queues/num_devices);
694
695	/* Print out VMDQ registration info. */
696	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
697		dev->device_fh,
698		dev->mac_address.addr_bytes[0], dev->mac_address.addr_bytes[1],
699		dev->mac_address.addr_bytes[2], dev->mac_address.addr_bytes[3],
700		dev->mac_address.addr_bytes[4], dev->mac_address.addr_bytes[5],
701		dev->vlan_tag);
702
703	/* Register the MAC address. */
704	ret = rte_eth_dev_mac_addr_add(ports[0], &dev->mac_address, (uint32_t)dev->device_fh);
705	if (ret) {
706		RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
707										dev->device_fh);
708		return -1;
709	}
710
711	/* Enable stripping of the vlan tag as we handle routing. */
712	rte_eth_dev_set_vlan_strip_on_queue(ports[0], dev->vmdq_rx_q, 1);
713
714	rte_compiler_barrier();
715	/* Set device as ready for RX. */
716	dev->ready = DEVICE_READY;
717
718	return 0;
719}
720
721/*
722 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
723 * queue before disabling RX on the device.
724 */
725static inline void
726unlink_vmdq(struct virtio_net *dev)
727{
728	unsigned i = 0;
729	unsigned rx_count;
730	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
731
732	if (dev->ready == DEVICE_READY) {
733		/*clear MAC and VLAN settings*/
734		rte_eth_dev_mac_addr_remove(ports[0], &dev->mac_address);
735		for (i = 0; i < 6; i++)
736			dev->mac_address.addr_bytes[i] = 0;
737
738		dev->vlan_tag = 0;
739
740		/*Clear out the receive buffers*/
741		rx_count = rte_eth_rx_burst(ports[0],
742					(uint16_t)dev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
743
744		while (rx_count) {
745			for (i = 0; i < rx_count; i++)
746				rte_pktmbuf_free(pkts_burst[i]);
747
748			rx_count = rte_eth_rx_burst(ports[0],
749					(uint16_t)dev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
750		}
751
752		dev->ready = DEVICE_NOT_READY;
753	}
754}
755
756/*
757 * Check if the packet destination MAC address is for a local device. If so, put
758 * the packet on that device's RX queue. If not, return.
759 */
760static inline unsigned __attribute__((always_inline))
761virtio_tx_local(struct virtio_net *dev, struct rte_mbuf *m)
762{
763	struct virtio_net_data_ll *dev_ll;
764	struct ether_hdr *pkt_hdr;
765	uint64_t ret = 0;
766
767	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
768
769	/*get the used devices list*/
770	dev_ll = ll_root_used;
771
772	while (dev_ll != NULL) {
773		if (likely(dev_ll->dev->ready == DEVICE_READY) && ether_addr_cmp(&(pkt_hdr->d_addr),
774				          &dev_ll->dev->mac_address)) {
775
776			/* Drop the packet if the TX packet is destined for the TX device. */
777			if (dev_ll->dev->device_fh == dev->device_fh) {
778				RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") TX: "
779					"Source and destination MAC addresses are the same. "
780					"Dropping packet.\n",
781					dev_ll->dev->device_fh);
782				return 0;
783			}
784
785
786			RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") TX: "
787				"MAC address is local\n", dev_ll->dev->device_fh);
788
789			if (dev_ll->dev->remove) {
790				/*drop the packet if the device is marked for removal*/
791				RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") "
792					"Device is marked for removal\n",
793					dev_ll->dev->device_fh);
794			} else {
795				/*send the packet to the local virtio device*/
796				ret = virtio_dev_rx(dev_ll->dev, &m, 1);
797				if (enable_stats) {
798					rte_atomic64_add(&dev_statistics[dev_ll->dev->device_fh].rx_total, 1);
799					rte_atomic64_add(&dev_statistics[dev_ll->dev->device_fh].rx, ret);
800					dev_statistics[dev->device_fh].tx_total++;
801					dev_statistics[dev->device_fh].tx += ret;
802				}
803			}
804
805			return 0;
806		}
807		dev_ll = dev_ll->next;
808	}
809
810	return -1;
811}
812
813/*
814 * This function routes the TX packet to the correct interface. This may be a local device
815 * or the physical port.
816 */
817static inline void __attribute__((always_inline))
818virtio_tx_route(struct virtio_net* dev, struct rte_mbuf *m, struct rte_mempool *mbuf_pool, uint16_t vlan_tag)
819{
820	struct mbuf_table *tx_q;
821	struct vlan_ethhdr *vlan_hdr;
822	struct rte_mbuf **m_table;
823	struct rte_mbuf *mbuf;
824	unsigned len, ret;
825	const uint16_t lcore_id = rte_lcore_id();
826
827	/*check if destination is local VM*/
828	if (enable_vm2vm && (virtio_tx_local(dev, m) == 0)) {
829		return;
830	}
831
832	RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") TX: "
833		"MAC address is external\n", dev->device_fh);
834
835	/*Add packet to the port tx queue*/
836	tx_q = &lcore_tx_queue[lcore_id];
837	len = tx_q->len;
838
839	/* Allocate an mbuf and populate the structure. */
840	mbuf = rte_pktmbuf_alloc(mbuf_pool);
841	if(!mbuf)
842		return;
843
844	mbuf->data_len = m->data_len + VLAN_HLEN;
845	mbuf->pkt_len = mbuf->data_len;
846
847	/* Copy ethernet header to mbuf. */
848	rte_memcpy(rte_pktmbuf_mtod(mbuf, void*),
849			rte_pktmbuf_mtod(m, const void*), ETH_HLEN);
850
851
852	/* Set up the VLAN header. Multi-byte fields are converted to network byte order with htons(). */
853	vlan_hdr = rte_pktmbuf_mtod(mbuf, struct vlan_ethhdr *);
854	vlan_hdr->h_vlan_encapsulated_proto = vlan_hdr->h_vlan_proto;
855	vlan_hdr->h_vlan_proto = htons(ETH_P_8021Q);
856	vlan_hdr->h_vlan_TCI = htons(vlan_tag);
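	/*
	 * Resulting frame layout: dst MAC (6) | src MAC (6) | 0x8100 (2) | TCI (2) |
	 * original ethertype (2) | payload, which is why VLAN_ETH_HLEN is 18.
	 */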
857
858	/* Copy the remaining packet contents to the mbuf. */
859	rte_memcpy(rte_pktmbuf_mtod_offset(mbuf, void *, VLAN_ETH_HLEN),
860		rte_pktmbuf_mtod_offset(m, const void *, ETH_HLEN),
861		(m->data_len - ETH_HLEN));
862	tx_q->m_table[len] = mbuf;
863	len++;
864	if (enable_stats) {
865		dev_statistics[dev->device_fh].tx_total++;
866		dev_statistics[dev->device_fh].tx++;
867	}
868
869	if (unlikely(len == MAX_PKT_BURST)) {
870		m_table = (struct rte_mbuf **)tx_q->m_table;
871		ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
872		/* Free any buffers not handled by TX and update the port stats. */
873		if (unlikely(ret < len)) {
874			do {
875				rte_pktmbuf_free(m_table[ret]);
876			} while (++ret < len);
877		}
878
879		len = 0;
880	}
881
882	tx_q->len = len;
883	return;
884}
885
886static inline void __attribute__((always_inline))
887virtio_dev_tx(struct virtio_net* dev, struct rte_mempool *mbuf_pool)
888{
889	struct rte_mbuf m;
890	struct vhost_virtqueue *vq;
891	struct vring_desc *desc;
892	uint64_t buff_addr = 0;
893	uint32_t head[MAX_PKT_BURST];
894	uint32_t used_idx;
895	uint32_t i;
896	uint16_t free_entries, packet_success = 0;
897	uint16_t avail_idx;
898
899	vq = dev->virtqueue_tx;
900	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
901
902	/* If there are no available buffers then return. */
903	if (vq->last_used_idx == avail_idx)
904		return;
905
906	RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") virtio_dev_tx()\n",
907		dev->device_fh);
908
909	/* Prefetch available ring to retrieve head indexes. */
910	rte_prefetch0(&vq->avail->ring[vq->last_used_idx & (vq->size - 1)]);
911
912	/*get the number of free entries in the ring*/
913	free_entries = avail_idx - vq->last_used_idx;
914	free_entries = unlikely(free_entries < MAX_PKT_BURST) ? free_entries : MAX_PKT_BURST;
915
916	RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") Buffers available %d\n",
917		dev->device_fh, free_entries);
918	/* Retrieve all of the head indexes first to avoid caching issues. */
919	for (i = 0; i < free_entries; i++)
920		head[i] = vq->avail->ring[(vq->last_used_idx + i) & (vq->size - 1)];
921
922	/* Prefetch descriptor index. */
923	rte_prefetch0(&vq->desc[head[packet_success]]);
924
925	while (packet_success < free_entries) {
926		desc = &vq->desc[head[packet_success]];
927		/* Prefetch descriptor address. */
928		rte_prefetch0(desc);
929
930		if (packet_success < (free_entries - 1)) {
931			/* Prefetch descriptor index. */
932			rte_prefetch0(&vq->desc[head[packet_success+1]]);
933		}
934
935		/* Update used index buffer information. */
936		used_idx = vq->last_used_idx & (vq->size - 1);
937		vq->used->ring[used_idx].id = head[packet_success];
938		vq->used->ring[used_idx].len = 0;
939
940		/* Discard first buffer as it is the virtio header */
941		desc = &vq->desc[desc->next];
942
943		/* Buffer address translation. */
944		buff_addr = gpa_to_vva(dev, desc->addr);
945		/* Prefetch buffer address. */
946		rte_prefetch0((void*)(uintptr_t)buff_addr);
947
948		/* Setup dummy mbuf. This is copied to a real mbuf if transmitted out the physical port. */
949		m.data_len = desc->len;
950		m.data_off = 0;
951		m.nb_segs = 1;
952
953		virtio_tx_route(dev, &m, mbuf_pool, 0);
954
955		vq->last_used_idx++;
956		packet_success++;
957	}
958
959	rte_compiler_barrier();
960	vq->used->idx += packet_success;
961	/* Kick guest if required. */
962}
963
964/*
965 * This function is called by each data core. It handles all RX/TX registered with the
966 * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
967 * with all devices in the main linked list.
968 */
969static int
970switch_worker(void *arg)
971{
972	struct rte_mempool *mbuf_pool = arg;
973	struct virtio_net *dev = NULL;
974	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
975	struct virtio_net_data_ll *dev_ll;
976	struct mbuf_table *tx_q;
977	volatile struct lcore_ll_info *lcore_ll;
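	/* Convert the TX drain period from microseconds into TSC cycles,
	 * rounding the cycles-per-microsecond figure up. */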
978	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
979	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
980	unsigned ret, i;
981	const uint16_t lcore_id = rte_lcore_id();
982	const uint16_t num_cores = (uint16_t)rte_lcore_count();
983	uint16_t rx_count = 0;
984
985	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
986	lcore_ll = lcore_info[lcore_id].lcore_ll;
987	prev_tsc = 0;
988
989	tx_q = &lcore_tx_queue[lcore_id];
990	for (i = 0; i < num_cores; i ++) {
991		if (lcore_ids[i] == lcore_id) {
992			tx_q->txq_id = i;
993			break;
994		}
995	}
996
997	while(1) {
998		cur_tsc = rte_rdtsc();
999		/*
1000		 * TX burst queue drain
1001		 */
1002		diff_tsc = cur_tsc - prev_tsc;
1003		if (unlikely(diff_tsc > drain_tsc)) {
1004
1005			if (tx_q->len) {
1006				RTE_LOG(DEBUG, VHOST_DATA,
1007					"TX queue drained after timeout with burst size %u\n",
1008					tx_q->len);
1009
1010				/*Tx any packets in the queue*/
1011				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1012									   (struct rte_mbuf **)tx_q->m_table,
1013									   (uint16_t)tx_q->len);
1014				if (unlikely(ret < tx_q->len)) {
1015					do {
1016						rte_pktmbuf_free(tx_q->m_table[ret]);
1017					} while (++ret < tx_q->len);
1018				}
1019
1020				tx_q->len = 0;
1021			}
1022
1023			prev_tsc = cur_tsc;
1024
1025		}
1026
1027		/*
1028		 * If requested, inform the configuration core that we have finished
1029		 * walking the linked list and that no devices are in use.
1030		 */
1031		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1032			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1033
1034		/*
1035		 * Process devices
1036	 	 */
1037		dev_ll = lcore_ll->ll_root_used;
1038
1039		while (dev_ll != NULL) {
1040			/*get virtio device ID*/
1041			dev = dev_ll->dev;
1042
1043			if (unlikely(dev->remove)) {
1044				dev_ll = dev_ll->next;
1045				unlink_vmdq(dev);
1046				dev->ready = DEVICE_SAFE_REMOVE;
1047				continue;
1048			}
1049			if (likely(dev->ready == DEVICE_READY)) {
1050				/*Handle guest RX*/
1051				rx_count = rte_eth_rx_burst(ports[0],
1052					(uint16_t)dev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1053
1054				if (rx_count) {
1055					ret_count = virtio_dev_rx(dev, pkts_burst, rx_count);
1056					if (enable_stats) {
1057						rte_atomic64_add(&dev_statistics[dev_ll->dev->device_fh].rx_total, rx_count);
1058						rte_atomic64_add(&dev_statistics[dev_ll->dev->device_fh].rx, ret_count);
1059					}
1060					while (likely(rx_count)) {
1061						rx_count--;
1062						rte_pktmbuf_free_seg(pkts_burst[rx_count]);
1063					}
1064
1065				}
1066			}
1067
1068			if (likely(!dev->remove))
1069				/*Handle guest TX*/
1070				virtio_dev_tx(dev, mbuf_pool);
1071
1072			/*move to the next device in the list*/
1073			dev_ll = dev_ll->next;
1074		}
1075	}
1076
1077	return 0;
1078}
1079
1080/*
1081 * Add an entry to a used linked list. A free entry must first be found in the free linked list
1082 * using get_data_ll_free_entry();
1083 */
1084static void
1085add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, struct virtio_net_data_ll *ll_dev)
1086{
1087	struct virtio_net_data_ll *ll = *ll_root_addr;
1088
1089	/* Set next as NULL and use a compiler barrier to avoid reordering. */
1090	ll_dev->next = NULL;
1091	rte_compiler_barrier();
1092
1093	/* If ll == NULL then this is the first device. */
1094	if (ll) {
1095		/* Increment to the tail of the linked list. */
1096		while ((ll->next != NULL) )
1097			ll = ll->next;
1098
1099		ll->next = ll_dev;
1100	} else {
1101		*ll_root_addr = ll_dev;
1102	}
1103}
1104
1105/*
1106 * Remove an entry from a used linked list. The entry must then be added to the free linked list
1107 * using put_data_ll_free_entry().
1108 */
1109static void
1110rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, struct virtio_net_data_ll *ll_dev, struct virtio_net_data_ll *ll_dev_last)
1111{
1112	struct virtio_net_data_ll *ll = *ll_root_addr;
1113
1114	if (ll_dev == ll)
1115		*ll_root_addr = ll_dev->next;
1116	else
1117		ll_dev_last->next = ll_dev->next;
1118}
1119
1120/*
1121 * Find and return an entry from the free linked list.
1122 */
1123static struct virtio_net_data_ll *
1124get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
1125{
1126	struct virtio_net_data_ll *ll_free = *ll_root_addr;
1127	struct virtio_net_data_ll *ll_dev;
1128
1129	if (ll_free == NULL)
1130		return NULL;
1131
1132	ll_dev = ll_free;
1133	*ll_root_addr = ll_free->next;
1134
1135	return ll_dev;
1136}
1137
1138/*
1139 * Place an entry back on to the free linked list.
1140 */
1141static void
1142put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr, struct virtio_net_data_ll *ll_dev)
1143{
1144	struct virtio_net_data_ll *ll_free = *ll_root_addr;
1145
1146	ll_dev->next = ll_free;
1147	*ll_root_addr = ll_dev;
1148}
1149
1150/*
1151 * Creates a linked list of a given size.
1152 */
1153static struct virtio_net_data_ll *
1154alloc_data_ll(uint32_t size)
1155{
1156	struct virtio_net_data_ll *ll_new;
1157	uint32_t i;
1158
1159	/* Malloc and then chain the linked list. */
1160	ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
1161	if (ll_new == NULL) {
1162		RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
1163		return NULL;
1164	}
1165
1166	for (i = 0; i < size - 1; i++) {
1167		ll_new[i].dev = NULL;
1168		ll_new[i].next = &ll_new[i+1];
1169	}
1170	ll_new[i].next = NULL;
1171
1172	return ll_new;
1173}
1174
1175/*
1176 * Create the main linked list along with each individual core's linked list. A used and a free list
1177 * are created to manage entries.
1178 */
1179static int
1180init_data_ll (void)
1181{
1182	int lcore;
1183
1184	RTE_LCORE_FOREACH_SLAVE(lcore) {
1185		lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
1186		if (lcore_info[lcore].lcore_ll == NULL) {
1187			RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
1188			return -1;
1189		}
1190
1191		lcore_info[lcore].lcore_ll->device_num = 0;
1192		lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1193		lcore_info[lcore].lcore_ll->ll_root_used = NULL;
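		/* Size each core's free list to ceil(num_devices / num_switching_cores)
		 * so that devices can be spread evenly across the data cores. */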
1194		if (num_devices % num_switching_cores)
1195			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
1196		else
1197			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
1198	}
1199
1200	/* Allocate devices up to a maximum of MAX_DEVICES. */
1201	ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
1202
1203	return 0;
1204}
1205/*
1206 * Remove a device from its data core's linked list and from the main linked list. The
1207 * rx/tx thread must set the flag to indicate that it is safe to remove the device.
1209 */
1210static void
1211destroy_device (volatile struct virtio_net *dev)
1212{
1213	struct virtio_net_data_ll *ll_lcore_dev_cur;
1214	struct virtio_net_data_ll *ll_main_dev_cur;
1215	struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
1216	struct virtio_net_data_ll *ll_main_dev_last = NULL;
1217	int lcore;
1218
1219	dev->flags &= ~VIRTIO_DEV_RUNNING;
1220
1221	/*set the remove flag. */
1222	dev->remove = 1;
1223
1224	while(dev->ready != DEVICE_SAFE_REMOVE) {
1225		rte_pause();
1226	}
1227
1228	/* Search for entry to be removed from lcore ll */
1229	ll_lcore_dev_cur = lcore_info[dev->coreid].lcore_ll->ll_root_used;
1230	while (ll_lcore_dev_cur != NULL) {
1231		if (ll_lcore_dev_cur->dev == dev) {
1232			break;
1233		} else {
1234			ll_lcore_dev_last = ll_lcore_dev_cur;
1235			ll_lcore_dev_cur = ll_lcore_dev_cur->next;
1236		}
1237	}
1238
1239	/* Search for entry to be removed from main ll */
1240	ll_main_dev_cur = ll_root_used;
1241	ll_main_dev_last = NULL;
1242	while (ll_main_dev_cur != NULL) {
1243		if (ll_main_dev_cur->dev == dev) {
1244			break;
1245		} else {
1246			ll_main_dev_last = ll_main_dev_cur;
1247			ll_main_dev_cur = ll_main_dev_cur->next;
1248		}
1249	}
1250
1251	if (ll_lcore_dev_cur == NULL || ll_main_dev_cur == NULL) {
1252		RTE_LOG(ERR, XENHOST, "%s: could not find device in per_cpu list or main_list\n", __func__);
1253		return;
1254	}
1255
1256	/* Remove entries from the lcore and main ll. */
1257	rm_data_ll_entry(&lcore_info[ll_lcore_dev_cur->dev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
1258	rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
1259
1260	/* Set the dev_removal_flag on each lcore. */
1261	RTE_LCORE_FOREACH_SLAVE(lcore) {
1262		lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
1263	}
1264
1265	/*
1266	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
1267	 * they can no longer access the device removed from the linked lists and that the devices
1268	 * are no longer in use.
1269	 */
1270	RTE_LCORE_FOREACH_SLAVE(lcore) {
1271		while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
1272			rte_pause();
1273		}
1274	}
1275
1276	/* Add the entries back to the lcore and main free ll.*/
1277	put_data_ll_free_entry(&lcore_info[ll_lcore_dev_cur->dev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
1278	put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
1279
1280	/* Decrement number of device on the lcore. */
1281	lcore_info[ll_lcore_dev_cur->dev->coreid].lcore_ll->device_num--;
1282
1283	RTE_LOG(INFO, VHOST_DATA, "  #####(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
1284}
1285
1286/*
1287 * A new device is added to a data core. First the device is added to the main linked list
1288 * and then allocated to a specific data core.
1289 */
1290static int
1291new_device (struct virtio_net *dev)
1292{
1293	struct virtio_net_data_ll *ll_dev;
1294	int lcore, core_add = 0;
1295	uint32_t device_num_min = num_devices;
1296
1297	/* Add device to main ll */
1298	ll_dev = get_data_ll_free_entry(&ll_root_free);
1299	if (ll_dev == NULL) {
1300		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
1301			"of %d devices has been reached\n",
1302			dev->device_fh, num_devices);
1303		return -1;
1304	}
1305	ll_dev->dev = dev;
1306	add_data_ll_entry(&ll_root_used, ll_dev);
1307
1308	/*reset ready flag*/
1309	dev->ready = DEVICE_NOT_READY;
1310	dev->remove = 0;
1311
1312	/* Find a suitable lcore to add the device. */
1313	RTE_LCORE_FOREACH_SLAVE(lcore) {
1314		if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
1315			device_num_min = lcore_info[lcore].lcore_ll->device_num;
1316			core_add = lcore;
1317		}
1318	}
1319	/* Add device to lcore ll */
1320	ll_dev->dev->coreid = core_add;
1321	ll_dev = get_data_ll_free_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_free);
1322	if (ll_dev == NULL) {
1323		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
1324		destroy_device(dev);
1325		return -1;
1326	}
1327	ll_dev->dev = dev;
1328	add_data_ll_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_used, ll_dev);
1329
1330	/* Initialize device stats */
1331	memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
1332
1333	lcore_info[ll_dev->dev->coreid].lcore_ll->device_num++;
1334	dev->flags |= VIRTIO_DEV_RUNNING;
1335
1336	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, dev->coreid);
1337
1338	link_vmdq(dev);
1339
1340	return 0;
1341}
1342
1343/*
1344 * These callbacks allow devices to be added to a data core once configuration
1345 * is fully complete.
1346 */
1347static const struct virtio_net_device_ops virtio_net_device_ops =
1348{
1349	.new_device =  new_device,
1350	.destroy_device = destroy_device,
1351};
1352
1353/*
1354 * This thread wakes up periodically to print statistics if the user has
1355 * enabled them.
1356 */
1357static void
1358print_stats(void)
1359{
1360	struct virtio_net_data_ll *dev_ll;
1361	uint64_t tx_dropped, rx_dropped;
1362	uint64_t tx, tx_total, rx, rx_total;
1363	uint32_t device_fh;
1364	const char clr[] = { 27, '[', '2', 'J', '\0' };
1365	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1366
1367	while(1) {
1368		sleep(enable_stats);
1369
1370		/* Clear screen and move to top left */
1371		printf("%s%s", clr, top_left);
1372
1373		printf("\nDevice statistics ====================================");
1374
1375		dev_ll = ll_root_used;
1376		while (dev_ll != NULL) {
1377			device_fh = (uint32_t)dev_ll->dev->device_fh;
1378			tx_total = dev_statistics[device_fh].tx_total;
1379			tx = dev_statistics[device_fh].tx;
1380			tx_dropped = tx_total - tx;
1381			rx_total = rte_atomic64_read(&dev_statistics[device_fh].rx_total);
1382			rx = rte_atomic64_read(&dev_statistics[device_fh].rx);
1383			rx_dropped = rx_total - rx;
1384
1385			printf("\nStatistics for device %"PRIu32" ------------------------------"
1386					"\nTX total: 		%"PRIu64""
1387					"\nTX dropped: 		%"PRIu64""
1388					"\nTX successful: 		%"PRIu64""
1389					"\nRX total: 		%"PRIu64""
1390					"\nRX dropped: 		%"PRIu64""
1391					"\nRX successful: 		%"PRIu64"",
1392					device_fh,
1393					tx_total,
1394					tx_dropped,
1395					tx,
1396					rx_total,
1397					rx_dropped,
1398					rx);
1399
1400			dev_ll = dev_ll->next;
1401		}
1402		printf("\n======================================================\n");
1403	}
1404}
1405
1406
1407int init_virtio_net(struct virtio_net_device_ops const * const ops);
1408
1409/*
1410 * Main function, does initialisation and calls the per-lcore functions.
1411 */
1412int
1413main(int argc, char *argv[])
1414{
1415	struct rte_mempool *mbuf_pool;
1416	unsigned lcore_id, core_id = 0;
1417	unsigned nb_ports, valid_num_ports;
1418	int ret;
1419	uint8_t portid;
1420	static pthread_t tid;
1421	char thread_name[RTE_MAX_THREAD_NAME_LEN];
1422
1423	/* init EAL */
1424	ret = rte_eal_init(argc, argv);
1425	if (ret < 0)
1426		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1427	argc -= ret;
1428	argv += ret;
1429
1430	/* parse app arguments */
1431	ret = us_vhost_parse_args(argc, argv);
1432	if (ret < 0)
1433		rte_exit(EXIT_FAILURE, "Invalid argument\n");
1434
1435	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++)
1436		if (rte_lcore_is_enabled(lcore_id))
1437			lcore_ids[core_id ++] = lcore_id;
1438
1439	if (rte_lcore_count() > RTE_MAX_LCORE)
1440		rte_exit(EXIT_FAILURE,"Not enough cores\n");
1441
1442	/* Set the number of switching cores available. */
1443	num_switching_cores = rte_lcore_count()-1;
1444
1445	/* Get the number of physical ports. */
1446	nb_ports = rte_eth_dev_count();
1447
1448	/*
1449	 * Update the global variable num_ports and the global ports array,
1450	 * and get the number of valid ports based on the number of system ports.
1451	 */
1452	valid_num_ports = check_ports_num(nb_ports);
1453
1454	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
1455		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1456			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1457		return -1;
1458	}
1459
1460	/* Create the mbuf pool. */
1461	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL",
1462		NUM_MBUFS_PER_PORT * valid_num_ports, MBUF_CACHE_SIZE, 0,
1463		RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id());
1464	if (mbuf_pool == NULL)
1465		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1466
1467	/* initialize all ports */
1468	for (portid = 0; portid < nb_ports; portid++) {
1469		/* skip ports that are not enabled */
1470		if ((enabled_port_mask & (1 << portid)) == 0) {
1471			RTE_LOG(INFO, VHOST_PORT, "Skipping disabled port %d\n", portid);
1472			continue;
1473		}
1474		if (port_init(portid, mbuf_pool) != 0)
1475			rte_exit(EXIT_FAILURE, "Cannot initialize network ports\n");
1476	}
1477
1478	/* Initialise all linked lists. */
1479	if (init_data_ll() == -1)
1480		rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
1481
1482	/* Initialize device stats */
1483	memset(&dev_statistics, 0, sizeof(dev_statistics));
1484
1485	/* Enable stats if the user option is set. */
1486	if (enable_stats) {
1487		ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
1488		if (ret != 0)
1489			rte_exit(EXIT_FAILURE,
1490				"Cannot create print-stats thread\n");
1491
1492		/* Set thread_name for aid in debugging. */
1493		snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-xen-stats");
1494		ret = rte_thread_setname(tid, thread_name);
1495		if (ret != 0)
1496			RTE_LOG(DEBUG, VHOST_CONFIG,
1497				"Cannot set print-stats name\n");
1498	}
1499
1500	/* Launch all data cores. */
1501	RTE_LCORE_FOREACH_SLAVE(lcore_id) {
1502		rte_eal_remote_launch(switch_worker, mbuf_pool, lcore_id);
1503	}
1504
1505	init_virtio_xen(&virtio_net_device_ops);
1506
1507	virtio_monitor_loop();
1508	return 0;
1509}
1510