main.c revision 39157ec0
1/*-
2 *   BSD LICENSE
3 *
4 *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
5 *   All rights reserved.
6 *
7 *   Redistribution and use in source and binary forms, with or without
8 *   modification, are permitted provided that the following conditions
9 *   are met:
10 *
11 *     * Redistributions of source code must retain the above copyright
12 *       notice, this list of conditions and the following disclaimer.
13 *     * Redistributions in binary form must reproduce the above copyright
14 *       notice, this list of conditions and the following disclaimer in
15 *       the documentation and/or other materials provided with the
16 *       distribution.
17 *     * Neither the name of Intel Corporation nor the names of its
18 *       contributors may be used to endorse or promote products derived
19 *       from this software without specific prior written permission.
20 *
21 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
34#include <arpa/inet.h>
35#include <getopt.h>
36#include <linux/if_ether.h>
37#include <linux/if_vlan.h>
38#include <linux/virtio_net.h>
39#include <linux/virtio_ring.h>
40#include <signal.h>
41#include <stdint.h>
42#include <sys/eventfd.h>
43#include <sys/param.h>
44#include <unistd.h>
45
46#include <rte_atomic.h>
47#include <rte_cycles.h>
48#include <rte_ethdev.h>
49#include <rte_log.h>
50#include <rte_string_fns.h>
51#include <rte_malloc.h>
52#include <rte_virtio_net.h>
53#include <rte_ip.h>
54#include <rte_tcp.h>
55
56#include "main.h"
57
58#ifndef MAX_QUEUES
59#define MAX_QUEUES 128
60#endif
61
62/* the maximum number of external ports supported */
63#define MAX_SUP_PORTS 1
64
65#define MBUF_CACHE_SIZE	128
66#define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
67
68#define MAX_PKT_BURST 32		/* Max burst size for RX/TX */
69#define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
70
71#define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
72#define BURST_RX_RETRIES 4		/* Number of retries on RX. */
73
74#define JUMBO_FRAME_MAX_SIZE    0x2600
75
76/* State of virtio device. */
77#define DEVICE_MAC_LEARNING 0
78#define DEVICE_RX			1
79#define DEVICE_SAFE_REMOVE	2
80
81/* Configurable number of RX/TX ring descriptors */
82#define RTE_TEST_RX_DESC_DEFAULT 1024
83#define RTE_TEST_TX_DESC_DEFAULT 512
84
85#define INVALID_PORT_ID 0xFF
86
87/* Max number of devices. Limited by vmdq. */
88#define MAX_DEVICES 64
89
90/* Size of buffers used for snprintfs. */
91#define MAX_PRINT_BUFF 6072
92
93/* Maximum long option length for option parsing. */
94#define MAX_LONG_OPT_SZ 64
95
96/* mask of enabled ports */
97static uint32_t enabled_port_mask = 0;
98
99/* Promiscuous mode */
100static uint32_t promiscuous;
101
102/* number of devices/queues to support */
103static uint32_t num_queues = 0;
104static uint32_t num_devices;
105
106static struct rte_mempool *mbuf_pool;
107static int mergeable;
108
109/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
110typedef enum {
111	VM2VM_DISABLED = 0,
112	VM2VM_SOFTWARE = 1,
113	VM2VM_HARDWARE = 2,
114	VM2VM_LAST
115} vm2vm_type;
116static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
117
118/* Enable stats. */
119static uint32_t enable_stats = 0;
120/* Enable retries on RX. */
121static uint32_t enable_retry = 1;
122
123/* Disable TX checksum offload */
124static uint32_t enable_tx_csum;
125
126/* Disable TSO offload */
127static uint32_t enable_tso;
128
129static int client_mode;
130static int dequeue_zero_copy;
131
132/* Specify the timeout (in microseconds) between retries on RX. */
133static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
134/* Specify the number of retries on RX. */
135static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
136
137/* Socket file paths. Can be set by user */
138static char *socket_files;
139static int nb_sockets;
140
141/* Empty VMDQ configuration structure. Filled in programmatically. */
142static struct rte_eth_conf vmdq_conf_default = {
143	.rxmode = {
144		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
145		.split_hdr_size = 0,
146		.header_split   = 0, /**< Header Split disabled */
147		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
148		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
149		/*
150		 * VLAN stripping is necessary for 1G NICs such as I350;
151		 * it fixes a bug where IPv4 forwarding in the guest could
152		 * not forward packets from one virtio dev to another virtio dev.
153		 */
154		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
155		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
156		.hw_strip_crc   = 1, /**< CRC stripped by hardware */
157	},
158
159	.txmode = {
160		.mq_mode = ETH_MQ_TX_NONE,
161	},
162	.rx_adv_conf = {
163		/*
164		 * should be overridden separately in code with
165		 * appropriate values
166		 */
167		.vmdq_rx_conf = {
168			.nb_queue_pools = ETH_8_POOLS,
169			.enable_default_pool = 0,
170			.default_pool = 0,
171			.nb_pool_maps = 0,
172			.pool_map = {{0, 0},},
173		},
174	},
175};
176
177static unsigned lcore_ids[RTE_MAX_LCORE];
178static uint8_t ports[RTE_MAX_ETHPORTS];
179static unsigned num_ports = 0; /**< The number of ports specified in command line */
180static uint16_t num_pf_queues, num_vmdq_queues;
181static uint16_t vmdq_pool_base, vmdq_queue_base;
182static uint16_t queues_per_pool;
183
184const uint16_t vlan_tags[] = {
185	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
186	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
187	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
188	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
189	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
190	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
191	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
192	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
193};
194
195/* ethernet addresses of ports */
196static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
197
198static struct vhost_dev_tailq_list vhost_dev_list =
199	TAILQ_HEAD_INITIALIZER(vhost_dev_list);
200
201static struct lcore_info lcore_info[RTE_MAX_LCORE];
202
203/* Used for queueing bursts of TX packets. */
204struct mbuf_table {
205	unsigned len;
206	unsigned txq_id;
207	struct rte_mbuf *m_table[MAX_PKT_BURST];
208};
209
210/* TX queue for each data core. */
211struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
212
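/*
 * TSC ticks per microsecond (rounded up) multiplied by BURST_TX_DRAIN_US:
 * the interval after which a partially filled TX queue is flushed.
 */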
213#define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
214				 / US_PER_S * BURST_TX_DRAIN_US)
215#define VLAN_HLEN       4
216
217/*
218 * Builds up the correct configuration for VMDQ VLAN pool map
219 * according to the pool & queue limits.
220 */
221static inline int
222get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
223{
224	struct rte_eth_vmdq_rx_conf conf;
225	struct rte_eth_vmdq_rx_conf *def_conf =
226		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
227	unsigned i;
228
229	memset(&conf, 0, sizeof(conf));
230	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
231	conf.nb_pool_maps = num_devices;
232	conf.enable_loop_back = def_conf->enable_loop_back;
233	conf.rx_mode = def_conf->rx_mode;
234
235	for (i = 0; i < conf.nb_pool_maps; i++) {
236		conf.pool_map[i].vlan_id = vlan_tags[ i ];
237		conf.pool_map[i].pools = (1UL << i);
238	}
239
240	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
241	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
242		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
243	return 0;
244}
245
246/*
247 * Validate the device number against the max pool number obtained from
248 * dev_info. If the device number is invalid, print an error message and
249 * return -1. Each device must have its own pool.
250 */
251static inline int
252validate_num_devices(uint32_t max_nb_devices)
253{
254	if (num_devices > max_nb_devices) {
255		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
256		return -1;
257	}
258	return 0;
259}
260
261/*
262 * Initialises a given port using global settings and with the rx buffers
263 * coming from the mbuf_pool passed as parameter
264 */
265static inline int
266port_init(uint8_t port)
267{
268	struct rte_eth_dev_info dev_info;
269	struct rte_eth_conf port_conf;
270	struct rte_eth_rxconf *rxconf;
271	struct rte_eth_txconf *txconf;
272	int16_t rx_rings, tx_rings;
273	uint16_t rx_ring_size, tx_ring_size;
274	int retval;
275	uint16_t q;
276
277	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
278	rte_eth_dev_info_get(port, &dev_info);
279
280	rxconf = &dev_info.default_rxconf;
281	txconf = &dev_info.default_txconf;
282	rxconf->rx_drop_en = 1;
283
284	/* Enable vlan offload */
285	txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;
286
287	/* Configure the number of supported virtio devices based on VMDQ limits. */
288	num_devices = dev_info.max_vmdq_pools;
289
290	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
291	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
292
293	/*
294	 * When dequeue zero copy is enabled, guest Tx used vring will be
295	 * updated only when corresponding mbuf is freed. Thus, the nb_tx_desc
296	 * (tx_ring_size here) must be small enough so that the driver will
297	 * hit the free threshold easily and free mbufs timely. Otherwise,
298	 * guest Tx vring would be starved.
299	 */
300	if (dequeue_zero_copy)
301		tx_ring_size = 64;
302
303	tx_rings = (uint16_t)rte_lcore_count();
304
305	retval = validate_num_devices(MAX_DEVICES);
306	if (retval < 0)
307		return retval;
308
309	/* Get port configuration. */
310	retval = get_eth_conf(&port_conf, num_devices);
311	if (retval < 0)
312		return retval;
313	/* NIC queues are divided into pf queues and vmdq queues.  */
314	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
315	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
316	num_vmdq_queues = num_devices * queues_per_pool;
317	num_queues = num_pf_queues + num_vmdq_queues;
318	vmdq_queue_base = dev_info.vmdq_queue_base;
319	vmdq_pool_base  = dev_info.vmdq_pool_base;
320	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
321		num_pf_queues, num_devices, queues_per_pool);
322
323	if (port >= rte_eth_dev_count()) return -1;
324
325	if (enable_tx_csum == 0)
326		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_CSUM);
327
328	if (enable_tso == 0) {
329		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4);
330		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO6);
331		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_GUEST_TSO4);
332		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_GUEST_TSO6);
333	}
334
335	rx_rings = (uint16_t)dev_info.max_rx_queues;
336	/* Configure ethernet device. */
337	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
338	if (retval != 0) {
339		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
340			port, strerror(-retval));
341		return retval;
342	}
343
344	/* Setup the queues. */
345	for (q = 0; q < rx_rings; q ++) {
346		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
347						rte_eth_dev_socket_id(port),
348						rxconf,
349						mbuf_pool);
350		if (retval < 0) {
351			RTE_LOG(ERR, VHOST_PORT,
352				"Failed to setup rx queue %u of port %u: %s.\n",
353				q, port, strerror(-retval));
354			return retval;
355		}
356	}
357	for (q = 0; q < tx_rings; q ++) {
358		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
359						rte_eth_dev_socket_id(port),
360						txconf);
361		if (retval < 0) {
362			RTE_LOG(ERR, VHOST_PORT,
363				"Failed to setup tx queue %u of port %u: %s.\n",
364				q, port, strerror(-retval));
365			return retval;
366		}
367	}
368
369	/* Start the device. */
370	retval  = rte_eth_dev_start(port);
371	if (retval < 0) {
372		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
373			port, strerror(-retval));
374		return retval;
375	}
376
377	if (promiscuous)
378		rte_eth_promiscuous_enable(port);
379
380	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
381	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
382	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
383			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
384			(unsigned)port,
385			vmdq_ports_eth_addr[port].addr_bytes[0],
386			vmdq_ports_eth_addr[port].addr_bytes[1],
387			vmdq_ports_eth_addr[port].addr_bytes[2],
388			vmdq_ports_eth_addr[port].addr_bytes[3],
389			vmdq_ports_eth_addr[port].addr_bytes[4],
390			vmdq_ports_eth_addr[port].addr_bytes[5]);
391
392	return 0;
393}
394
395/*
396 * Set socket file path.
397 */
398static int
399us_vhost_parse_socket_path(const char *q_arg)
400{
401	/* reject paths too long to fit in a PATH_MAX-sized buffer */
402	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
403		return -1;
404
405	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
406	snprintf(socket_files + nb_sockets * PATH_MAX, PATH_MAX, "%s", q_arg);
407	nb_sockets++;
408
409	return 0;
410}
411
412/*
413 * Parse the portmask provided at run time.
414 */
415static int
416parse_portmask(const char *portmask)
417{
418	char *end = NULL;
419	unsigned long pm;
420
421	errno = 0;
422
423	/* parse hexadecimal string */
424	pm = strtoul(portmask, &end, 16);
425	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
426		return -1;
427
428	if (pm == 0)
429		return -1;
430
431	return pm;
432
433}
434
435/*
436 * Parse num options at run time.
437 */
438static int
439parse_num_opt(const char *q_arg, uint32_t max_valid_value)
440{
441	char *end = NULL;
442	unsigned long num;
443
444	errno = 0;
445
446	/* parse unsigned int string */
447	num = strtoul(q_arg, &end, 10);
448	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
449		return -1;
450
451	if (num > max_valid_value)
452		return -1;
453
454	return num;
455
456}
457
458/*
459 * Display usage
460 */
461static void
462us_vhost_usage(const char *prgname)
463{
464	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
465	"		--vm2vm [0|1|2]\n"
466	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
467	"		--socket-file <path>\n"
468	"		--nb-devices ND\n"
469	"		-p PORTMASK: Set mask for ports to be used by application\n"
470	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
471	"		--rx-retry [0|1]: disable/enable(default) retries on RX. Enable retries when the destination queue is full\n"
472	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only takes effect if RX retries are enabled\n"
473	"		--rx-retry-num [0-N]: the number of retries on RX. Only takes effect if RX retries are enabled\n"
474	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
475	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
476	"		--socket-file: The path of the socket file.\n"
477	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
478	"		--tso [0|1] disable/enable TCP segmentation offload.\n"
479	"		--client register a vhost-user socket in client mode.\n"
480	"		--dequeue-zero-copy enables dequeue zero copy\n",
481	       prgname);
482}
483
484/*
485 * Parse the arguments given in the command line of the application.
486 */
487static int
488us_vhost_parse_args(int argc, char **argv)
489{
490	int opt, ret;
491	int option_index;
492	unsigned i;
493	const char *prgname = argv[0];
494	static struct option long_option[] = {
495		{"vm2vm", required_argument, NULL, 0},
496		{"rx-retry", required_argument, NULL, 0},
497		{"rx-retry-delay", required_argument, NULL, 0},
498		{"rx-retry-num", required_argument, NULL, 0},
499		{"mergeable", required_argument, NULL, 0},
500		{"stats", required_argument, NULL, 0},
501		{"socket-file", required_argument, NULL, 0},
502		{"tx-csum", required_argument, NULL, 0},
503		{"tso", required_argument, NULL, 0},
504		{"client", no_argument, &client_mode, 1},
505		{"dequeue-zero-copy", no_argument, &dequeue_zero_copy, 1},
506		{NULL, 0, 0, 0},
507	};
508
509	/* Parse command line */
510	while ((opt = getopt_long(argc, argv, "p:P",
511			long_option, &option_index)) != EOF) {
512		switch (opt) {
513		/* Portmask */
514		case 'p':
515			enabled_port_mask = parse_portmask(optarg);
516			if (enabled_port_mask == 0) {
517				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
518				us_vhost_usage(prgname);
519				return -1;
520			}
521			break;
522
523		case 'P':
524			promiscuous = 1;
525			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
526				ETH_VMDQ_ACCEPT_BROADCAST |
527				ETH_VMDQ_ACCEPT_MULTICAST;
528			rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);
529
530			break;
531
532		case 0:
533			/* Enable/disable vm2vm comms. */
534			if (!strncmp(long_option[option_index].name, "vm2vm",
535				MAX_LONG_OPT_SZ)) {
536				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
537				if (ret == -1) {
538					RTE_LOG(INFO, VHOST_CONFIG,
539						"Invalid argument for "
540						"vm2vm [0|1|2]\n");
541					us_vhost_usage(prgname);
542					return -1;
543				} else {
544					vm2vm_mode = (vm2vm_type)ret;
545				}
546			}
547
548			/* Enable/disable retries on RX. */
549			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
550				ret = parse_num_opt(optarg, 1);
551				if (ret == -1) {
552					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
553					us_vhost_usage(prgname);
554					return -1;
555				} else {
556					enable_retry = ret;
557				}
558			}
559
560			/* Enable/disable TX checksum offload. */
561			if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
562				ret = parse_num_opt(optarg, 1);
563				if (ret == -1) {
564					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
565					us_vhost_usage(prgname);
566					return -1;
567				} else
568					enable_tx_csum = ret;
569			}
570
571			/* Enable/disable TSO offload. */
572			if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
573				ret = parse_num_opt(optarg, 1);
574				if (ret == -1) {
575					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
576					us_vhost_usage(prgname);
577					return -1;
578				} else
579					enable_tso = ret;
580			}
581
582			/* Specify the retry delay time (in microseconds) on RX. */
583			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
584				ret = parse_num_opt(optarg, INT32_MAX);
585				if (ret == -1) {
586					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
587					us_vhost_usage(prgname);
588					return -1;
589				} else {
590					burst_rx_delay_time = ret;
591				}
592			}
593
594			/* Specify the number of retries on RX. */
595			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
596				ret = parse_num_opt(optarg, INT32_MAX);
597				if (ret == -1) {
598					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
599					us_vhost_usage(prgname);
600					return -1;
601				} else {
602					burst_rx_retry_num = ret;
603				}
604			}
605
606			/* Enable/disable RX mergeable buffers. */
607			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
608				ret = parse_num_opt(optarg, 1);
609				if (ret == -1) {
610					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
611					us_vhost_usage(prgname);
612					return -1;
613				} else {
614					mergeable = !!ret;
615					if (ret) {
616						vmdq_conf_default.rxmode.jumbo_frame = 1;
617						vmdq_conf_default.rxmode.max_rx_pkt_len
618							= JUMBO_FRAME_MAX_SIZE;
619					}
620				}
621			}
622
623			/* Enable/disable stats. */
624			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
625				ret = parse_num_opt(optarg, INT32_MAX);
626				if (ret == -1) {
627					RTE_LOG(INFO, VHOST_CONFIG,
628						"Invalid argument for stats [0..N]\n");
629					us_vhost_usage(prgname);
630					return -1;
631				} else {
632					enable_stats = ret;
633				}
634			}
635
636			/* Set socket file path. */
637			if (!strncmp(long_option[option_index].name,
638						"socket-file", MAX_LONG_OPT_SZ)) {
639				if (us_vhost_parse_socket_path(optarg) == -1) {
640					RTE_LOG(INFO, VHOST_CONFIG,
641					"Invalid argument for socket name (Max %d characters)\n",
642					PATH_MAX);
643					us_vhost_usage(prgname);
644					return -1;
645				}
646			}
647
648			break;
649
650			/* Invalid option - print options. */
651		default:
652			us_vhost_usage(prgname);
653			return -1;
654		}
655	}
656
657	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
658		if (enabled_port_mask & (1 << i))
659			ports[num_ports++] = (uint8_t)i;
660	}
661
662	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
663		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
664			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
665		return -1;
666	}
667
668	return 0;
669}
670
671/*
672 * Update the global variable num_ports and the array ports according to the
673 * number of ports in the system, and return the number of valid ports.
674 */
675static unsigned check_ports_num(unsigned nb_ports)
676{
677	unsigned valid_num_ports = num_ports;
678	unsigned portid;
679
680	if (num_ports > nb_ports) {
681		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
682			num_ports, nb_ports);
683		num_ports = nb_ports;
684	}
685
686	for (portid = 0; portid < num_ports; portid ++) {
687		if (ports[portid] >= nb_ports) {
688			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
689				ports[portid], (nb_ports - 1));
690			ports[portid] = INVALID_PORT_ID;
691			valid_num_ports--;
692		}
693	}
694	return valid_num_ports;
695}
696
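/*
 * Search the global vhost device list for a device that is ready for RX
 * and whose learned MAC address matches @mac; return NULL if none matches.
 */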
697static inline struct vhost_dev *__attribute__((always_inline))
698find_vhost_dev(struct ether_addr *mac)
699{
700	struct vhost_dev *vdev;
701
702	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
703		if (vdev->ready == DEVICE_RX &&
704		    is_same_ether_addr(mac, &vdev->mac_address))
705			return vdev;
706	}
707
708	return NULL;
709}
710
711/*
712 * This function learns the MAC address of the device and registers it, along
713 * with a VLAN tag, with a VMDQ pool.
714 */
715static int
716link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
717{
718	struct ether_hdr *pkt_hdr;
719	int i, ret;
720
721	/* Learn MAC address of guest device from packet */
722	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
723
724	if (find_vhost_dev(&pkt_hdr->s_addr)) {
725		RTE_LOG(ERR, VHOST_DATA,
726			"(%d) device is using a registered MAC!\n",
727			vdev->vid);
728		return -1;
729	}
730
731	for (i = 0; i < ETHER_ADDR_LEN; i++)
732		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
733
734	/* vlan_tag currently uses the device_id. */
735	vdev->vlan_tag = vlan_tags[vdev->vid];
736
737	/* Print out VMDQ registration info. */
738	RTE_LOG(INFO, VHOST_DATA,
739		"(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
740		vdev->vid,
741		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
742		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
743		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
744		vdev->vlan_tag);
745
746	/* Register the MAC address. */
747	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
748				(uint32_t)vdev->vid + vmdq_pool_base);
749	if (ret)
750		RTE_LOG(ERR, VHOST_DATA,
751			"(%d) failed to add device MAC address to VMDQ\n",
752			vdev->vid);
753
754	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
755
756	/* Set device as ready for RX. */
757	vdev->ready = DEVICE_RX;
758
759	return 0;
760}
761
762/*
763 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
764 * queue before disabling RX on the device.
765 */
766static inline void
767unlink_vmdq(struct vhost_dev *vdev)
768{
769	unsigned i = 0;
770	unsigned rx_count;
771	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
772
773	if (vdev->ready == DEVICE_RX) {
774		/* Clear MAC and VLAN settings. */
775		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
776		for (i = 0; i < 6; i++)
777			vdev->mac_address.addr_bytes[i] = 0;
778
779		vdev->vlan_tag = 0;
780
781		/* Clear out the receive buffers. */
782		rx_count = rte_eth_rx_burst(ports[0],
783					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
784
785		while (rx_count) {
786			for (i = 0; i < rx_count; i++)
787				rte_pktmbuf_free(pkts_burst[i]);
788
789			rx_count = rte_eth_rx_burst(ports[0],
790					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
791		}
792
793		vdev->ready = DEVICE_MAC_LEARNING;
794	}
795}
796
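/*
 * Enqueue a single packet to the RX virtqueue of the destination vhost
 * device. If stats are enabled, update the destination device's RX
 * counters and the source device's TX counters.
 */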
797static inline void __attribute__((always_inline))
798virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
799	    struct rte_mbuf *m)
800{
801	uint16_t ret;
802
803	ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
804	if (enable_stats) {
805		rte_atomic64_inc(&dst_vdev->stats.rx_total_atomic);
806		rte_atomic64_add(&dst_vdev->stats.rx_atomic, ret);
807		src_vdev->stats.tx_total++;
808		src_vdev->stats.tx += ret;
809	}
810}
811
812/*
813 * Check if the packet destination MAC address is for a local device. If so,
814 * put the packet on that device's RX queue. If not, return.
815 */
816static inline int __attribute__((always_inline))
817virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
818{
819	struct ether_hdr *pkt_hdr;
820	struct vhost_dev *dst_vdev;
821
822	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
823
824	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
825	if (!dst_vdev)
826		return -1;
827
828	if (vdev->vid == dst_vdev->vid) {
829		RTE_LOG(DEBUG, VHOST_DATA,
830			"(%d) TX: src and dst MAC are the same. Dropping packet.\n",
831			vdev->vid);
832		return 0;
833	}
834
835	RTE_LOG(DEBUG, VHOST_DATA,
836		"(%d) TX: MAC address is local\n", dst_vdev->vid);
837
838	if (unlikely(dst_vdev->remove)) {
839		RTE_LOG(DEBUG, VHOST_DATA,
840			"(%d) device is marked for removal\n", dst_vdev->vid);
841		return 0;
842	}
843
844	virtio_xmit(dst_vdev, vdev, m);
845	return 0;
846}
847
848/*
849 * Check if the destination MAC of a packet belongs to a local VM and,
850 * if it does, get its VLAN tag and the length offset.
851 */
852static inline int __attribute__((always_inline))
853find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
854	uint32_t *offset, uint16_t *vlan_tag)
855{
856	struct vhost_dev *dst_vdev;
857	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
858
859	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
860	if (!dst_vdev)
861		return 0;
862
863	if (vdev->vid == dst_vdev->vid) {
864		RTE_LOG(DEBUG, VHOST_DATA,
865			"(%d) TX: src and dst MAC are the same. Dropping packet.\n",
866			vdev->vid);
867		return -1;
868	}
869
870	/*
871	 * HW VLAN stripping reduces the packet length by the
872	 * length of the VLAN tag, so restore the packet length
873	 * by adding it back.
874	 */
875	*offset  = VLAN_HLEN;
876	*vlan_tag = vlan_tags[vdev->vid];
877
878	RTE_LOG(DEBUG, VHOST_DATA,
879		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
880		vdev->vid, dst_vdev->vid, *vlan_tag);
881
882	return 0;
883}
884
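/*
 * Compute the pseudo-header checksum for the L3 header, used to seed the
 * TCP checksum field when checksum/TSO offload is requested.
 */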
885static uint16_t
886get_psd_sum(void *l3_hdr, uint64_t ol_flags)
887{
888	if (ol_flags & PKT_TX_IPV4)
889		return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
890	else /* assume ethertype == ETHER_TYPE_IPv6 */
891		return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
892}
893
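/*
 * Prepare a packet for TSO/checksum offload: for IPv4, clear the header
 * checksum and request IP checksum offload; in all cases seed the TCP
 * checksum field with the pseudo-header checksum required by the hardware.
 */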
894static void virtio_tx_offload(struct rte_mbuf *m)
895{
896	void *l3_hdr;
897	struct ipv4_hdr *ipv4_hdr = NULL;
898	struct tcp_hdr *tcp_hdr = NULL;
899	struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
900
901	l3_hdr = (char *)eth_hdr + m->l2_len;
902
903	if (m->ol_flags & PKT_TX_IPV4) {
904		ipv4_hdr = l3_hdr;
905		ipv4_hdr->hdr_checksum = 0;
906		m->ol_flags |= PKT_TX_IP_CKSUM;
907	}
908
909	tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
910	tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
911}
912
913static inline void
914free_pkts(struct rte_mbuf **pkts, uint16_t n)
915{
916	while (n--)
917		rte_pktmbuf_free(pkts[n]);
918}
919
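/*
 * Transmit the queued packets on the physical port, free any packets that
 * could not be sent, and reset the queue length.
 */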
920static inline void __attribute__((always_inline))
921do_drain_mbuf_table(struct mbuf_table *tx_q)
922{
923	uint16_t count;
924
925	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
926				 tx_q->m_table, tx_q->len);
927	if (unlikely(count < tx_q->len))
928		free_pkts(&tx_q->m_table[count], tx_q->len - count);
929
930	tx_q->len = 0;
931}
932
933/*
934 * This function routes the TX packet to the correct interface. This
935 * may be a local device or the physical port.
936 */
937static inline void __attribute__((always_inline))
938virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
939{
940	struct mbuf_table *tx_q;
941	unsigned offset = 0;
942	const uint16_t lcore_id = rte_lcore_id();
943	struct ether_hdr *nh;
944
945
946	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
947	if (unlikely(is_broadcast_ether_addr(&nh->d_addr))) {
948		struct vhost_dev *vdev2;
949
950		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
951			if (vdev2 != vdev)
952				virtio_xmit(vdev2, vdev, m);
953		}
954		goto queue2nic;
955	}
956
957	/* Check if the destination is a local VM. */
958	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
959		rte_pktmbuf_free(m);
960		return;
961	}
962
963	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
964		if (unlikely(find_local_dest(vdev, m, &offset,
965					     &vlan_tag) != 0)) {
966			rte_pktmbuf_free(m);
967			return;
968		}
969	}
970
971	RTE_LOG(DEBUG, VHOST_DATA,
972		"(%d) TX: MAC address is external\n", vdev->vid);
973
974queue2nic:
975
976	/* Add packet to the port TX queue. */
977	tx_q = &lcore_tx_queue[lcore_id];
978
979	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
980	if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
981		/* Guest has inserted the vlan tag. */
982		struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
983		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
984		if ((vm2vm_mode == VM2VM_HARDWARE) &&
985			(vh->vlan_tci != vlan_tag_be))
986			vh->vlan_tci = vlan_tag_be;
987	} else {
988		m->ol_flags |= PKT_TX_VLAN_PKT;
989
990		/*
991		 * Find the right seg to adjust the data len when offset is
992		 * bigger than tail room size.
993		 */
994		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
995			if (likely(offset <= rte_pktmbuf_tailroom(m)))
996				m->data_len += offset;
997			else {
998				struct rte_mbuf *seg = m;
999
1000				while ((seg->next != NULL) &&
1001					(offset > rte_pktmbuf_tailroom(seg)))
1002					seg = seg->next;
1003
1004				seg->data_len += offset;
1005			}
1006			m->pkt_len += offset;
1007		}
1008
1009		m->vlan_tci = vlan_tag;
1010	}
1011
1012	if (m->ol_flags & PKT_TX_TCP_SEG)
1013		virtio_tx_offload(m);
1014
1015	tx_q->m_table[tx_q->len++] = m;
1016	if (enable_stats) {
1017		vdev->stats.tx_total++;
1018		vdev->stats.tx++;
1019	}
1020
1021	if (unlikely(tx_q->len == MAX_PKT_BURST))
1022		do_drain_mbuf_table(tx_q);
1023}
1024
1025
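/*
 * Flush the per-lcore TX queue if packets have been queued for longer than
 * MBUF_TABLE_DRAIN_TSC, so that a partially filled burst is not held back
 * indefinitely.
 */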
1026static inline void __attribute__((always_inline))
1027drain_mbuf_table(struct mbuf_table *tx_q)
1028{
1029	static uint64_t prev_tsc;
1030	uint64_t cur_tsc;
1031
1032	if (tx_q->len == 0)
1033		return;
1034
1035	cur_tsc = rte_rdtsc();
1036	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1037		prev_tsc = cur_tsc;
1038
1039		RTE_LOG(DEBUG, VHOST_DATA,
1040			"TX queue drained after timeout with burst size %u\n",
1041			tx_q->len);
1042		do_drain_mbuf_table(tx_q);
1043	}
1044}
1045
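/*
 * Receive a burst of packets from the VMDQ RX queue bound to this vhost
 * device and enqueue them to the device's RX virtqueue, optionally waiting
 * and retrying when the virtqueue is short on free slots.
 */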
1046static inline void __attribute__((always_inline))
1047drain_eth_rx(struct vhost_dev *vdev)
1048{
1049	uint16_t rx_count, enqueue_count;
1050	struct rte_mbuf *pkts[MAX_PKT_BURST];
1051
1052	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1053				    pkts, MAX_PKT_BURST);
1054	if (!rx_count)
1055		return;
1056
1057	/*
1058	 * When "enable_retry" is set, wait and retry when there are not
1059	 * enough free slots in the queue to hold @rx_count packets,
1060	 * to reduce packet loss.
1061	 */
1062	if (enable_retry &&
1063	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1064			VIRTIO_RXQ))) {
1065		uint32_t retry;
1066
1067		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1068			rte_delay_us(burst_rx_delay_time);
1069			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1070					VIRTIO_RXQ))
1071				break;
1072		}
1073	}
1074
1075	enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1076						pkts, rx_count);
1077	if (enable_stats) {
1078		rte_atomic64_add(&vdev->stats.rx_total_atomic, rx_count);
1079		rte_atomic64_add(&vdev->stats.rx_atomic, enqueue_count);
1080	}
1081
1082	free_pkts(pkts, rx_count);
1083}
1084
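/*
 * Dequeue a burst of packets from the guest's TX virtqueue, set up the VMDQ
 * mapping from the first packet if the device is still in MAC-learning
 * state, and route each packet to its destination.
 */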
1085static inline void __attribute__((always_inline))
1086drain_virtio_tx(struct vhost_dev *vdev)
1087{
1088	struct rte_mbuf *pkts[MAX_PKT_BURST];
1089	uint16_t count;
1090	uint16_t i;
1091
1092	count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ, mbuf_pool,
1093					pkts, MAX_PKT_BURST);
1094
1095	/* setup VMDq for the first packet */
1096	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1097		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1098			free_pkts(pkts, count);
1099	}
1100
1101	for (i = 0; i < count; ++i)
1102		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1103}
1104
1105/*
1106 * Main function of vhost-switch. It basically does:
1107 *
1108 * for each vhost device {
1109 *    - drain_eth_rx()
1110 *
1111 *      Which drains the host eth Rx queue linked to the vhost device and
1112 *      delivers all of the packets to the guest virtio Rx ring associated
1113 *      with this vhost device.
1114 *
1115 *    - drain_virtio_tx()
1116 *
1117 *      Which drains the guest virtio Tx queue and delivers all of the
1118 *      packets to the target, which could be another vhost device or the
1119 *      physical eth dev. The routing is done in function "virtio_tx_route".
1120 * }
1121 */
1122static int
1123switch_worker(void *arg __rte_unused)
1124{
1125	unsigned i;
1126	unsigned lcore_id = rte_lcore_id();
1127	struct vhost_dev *vdev;
1128	struct mbuf_table *tx_q;
1129
1130	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1131
1132	tx_q = &lcore_tx_queue[lcore_id];
1133	for (i = 0; i < rte_lcore_count(); i++) {
1134		if (lcore_ids[i] == lcore_id) {
1135			tx_q->txq_id = i;
1136			break;
1137		}
1138	}
1139
1140	while(1) {
1141		drain_mbuf_table(tx_q);
1142
1143		/*
1144		 * If requested, inform the configuration core that we have
1145		 * exited the linked list and that no devices are in use.
1146		 */
1147		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1148			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1149
1150		/*
1151		 * Process vhost devices
1152		 */
1153		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1154			      lcore_vdev_entry) {
1155			if (unlikely(vdev->remove)) {
1156				unlink_vmdq(vdev);
1157				vdev->ready = DEVICE_SAFE_REMOVE;
1158				continue;
1159			}
1160
1161			if (likely(vdev->ready == DEVICE_RX))
1162				drain_eth_rx(vdev);
1163
1164			if (likely(!vdev->remove))
1165				drain_virtio_tx(vdev);
1166		}
1167	}
1168
1169	return 0;
1170}
1171
1172/*
1173 * Remove a device from the specific data core linked list and from the
1174 * main linked list. Synchronization occurs through the use of the
1175 * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1176 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1177 */
1178static void
1179destroy_device(int vid)
1180{
1181	struct vhost_dev *vdev = NULL;
1182	int lcore;
1183
1184	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1185		if (vdev->vid == vid)
1186			break;
1187	}
1188	if (!vdev)
1189		return;
1190	/* Set the remove flag. */
1191	vdev->remove = 1;
1192	while(vdev->ready != DEVICE_SAFE_REMOVE) {
1193		rte_pause();
1194	}
1195
1196	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1197		     lcore_vdev_entry);
1198	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1199
1200
1201	/* Set the dev_removal_flag on each lcore. */
1202	RTE_LCORE_FOREACH_SLAVE(lcore)
1203		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1204
1205	/*
1206	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1207	 * we can be sure that they can no longer access the device removed
1208	 * from the linked lists and that the devices are no longer in use.
1209	 */
1210	RTE_LCORE_FOREACH_SLAVE(lcore) {
1211		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1212			rte_pause();
1213	}
1214
1215	lcore_info[vdev->coreid].device_num--;
1216
1217	RTE_LOG(INFO, VHOST_DATA,
1218		"(%d) device has been removed from data core\n",
1219		vdev->vid);
1220
1221	rte_free(vdev);
1222}
1223
1224/*
1225 * A new device is added to a data core. First the device is added to the
1226 * main linked list and then allocated to a specific data core.
1227 */
1228static int
1229new_device(int vid)
1230{
1231	int lcore, core_add = 0;
1232	uint32_t device_num_min = num_devices;
1233	struct vhost_dev *vdev;
1234
1235	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1236	if (vdev == NULL) {
1237		RTE_LOG(INFO, VHOST_DATA,
1238			"(%d) couldn't allocate memory for vhost dev\n",
1239			vid);
1240		return -1;
1241	}
1242	vdev->vid = vid;
1243
1244	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1245	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1246
1247	/*reset ready flag*/
1248	/* Reset ready flag. */
1249	vdev->remove = 0;
1250
1251	/* Find a suitable lcore to add the device. */
1252	RTE_LCORE_FOREACH_SLAVE(lcore) {
1253		if (lcore_info[lcore].device_num < device_num_min) {
1254			device_num_min = lcore_info[lcore].device_num;
1255			core_add = lcore;
1256		}
1257	}
1258	vdev->coreid = core_add;
1259
1260	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1261			  lcore_vdev_entry);
1262	lcore_info[vdev->coreid].device_num++;
1263
1264	/* Disable notifications. */
1265	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1266	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1267
1268	RTE_LOG(INFO, VHOST_DATA,
1269		"(%d) device has been added to data core %d\n",
1270		vid, vdev->coreid);
1271
1272	return 0;
1273}
1274
1275/*
1276 * These callbacks allow devices to be added to the data core when
1277 * configuration has been fully completed.
1278 */
1279static const struct virtio_net_device_ops virtio_net_device_ops =
1280{
1281	.new_device =  new_device,
1282	.destroy_device = destroy_device,
1283};
1284
1285/*
1286 * This thread wakes up periodically to print statistics if the user has
1287 * enabled them.
1288 */
1289static void
1290print_stats(void)
1291{
1292	struct vhost_dev *vdev;
1293	uint64_t tx_dropped, rx_dropped;
1294	uint64_t tx, tx_total, rx, rx_total;
1295	const char clr[] = { 27, '[', '2', 'J', '\0' };
1296	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1297
1298	while(1) {
1299		sleep(enable_stats);
1300
1301		/* Clear screen and move to top left */
1302		printf("%s%s\n", clr, top_left);
1303		printf("Device statistics =================================\n");
1304
1305		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1306			tx_total   = vdev->stats.tx_total;
1307			tx         = vdev->stats.tx;
1308			tx_dropped = tx_total - tx;
1309
1310			rx_total   = rte_atomic64_read(&vdev->stats.rx_total_atomic);
1311			rx         = rte_atomic64_read(&vdev->stats.rx_atomic);
1312			rx_dropped = rx_total - rx;
1313
1314			printf("Statistics for device %d\n"
1315				"-----------------------\n"
1316				"TX total:              %" PRIu64 "\n"
1317				"TX dropped:            %" PRIu64 "\n"
1318				"TX successful:         %" PRIu64 "\n"
1319				"RX total:              %" PRIu64 "\n"
1320				"RX dropped:            %" PRIu64 "\n"
1321				"RX successful:         %" PRIu64 "\n",
1322				vdev->vid,
1323				tx_total, tx_dropped, tx,
1324				rx_total, rx_dropped, rx);
1325		}
1326
1327		printf("===================================================\n");
1328	}
1329}
1330
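/*
 * Unregister the vhost driver for each of the first @socket_num socket
 * files.
 */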
1331static void
1332unregister_drivers(int socket_num)
1333{
1334	int i, ret;
1335
1336	for (i = 0; i < socket_num; i++) {
1337		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1338		if (ret != 0)
1339			RTE_LOG(ERR, VHOST_CONFIG,
1340				"Fail to unregister vhost driver for %s.\n",
1341				socket_files + i * PATH_MAX);
1342	}
1343}
1344
1345/* When we receive an INT signal, unregister the vhost driver. */
1346static void
1347sigint_handler(__rte_unused int signum)
1348{
1349	/* Unregister vhost driver. */
1350	unregister_drivers(nb_sockets);
1351
1352	exit(0);
1353}
1354
1355/*
1356 * While creating an mbuf pool, one key thing is to figure out how
1357 * many mbuf entries are enough for our use. FYI, here are some
1358 * guidelines:
1359 *
1360 * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage
1361 *
1362 * - For each switch core (a CPU core that does the packet switching), we
1363 *   also need to make some reservation for receiving the packets from the
1364 *   virtio Tx queue. How many is enough depends on the usage. It's normally
1365 *   a simple calculation like the following:
1366 *
1367 *       MAX_PKT_BURST * max packet size / mbuf size
1368 *
1369 *   So, we definitely need to allocate more mbufs when TSO is enabled.
1370 *
1371 * - Similarly, for each switching core, we should reserve @nr_rx_desc
1372 *   mbufs for receiving the packets from the physical NIC device.
1373 *
1374 * - We also need to make sure that, for each switch core, we have allocated
1375 *   enough mbufs to fill up the mbuf cache.
1376 */
1377static void
1378create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1379	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1380{
1381	uint32_t nr_mbufs;
1382	uint32_t nr_mbufs_per_core;
1383	uint32_t mtu = 1500;
1384
1385	if (mergeable)
1386		mtu = 9000;
1387	if (enable_tso)
1388		mtu = 64 * 1024;
1389
1390	nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1391			(mbuf_size - RTE_PKTMBUF_HEADROOM);
1392	nr_mbufs_per_core += nr_rx_desc;
1393	nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1394
1395	nr_mbufs  = nr_queues * nr_rx_desc;
1396	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1397	nr_mbufs *= nr_port;
1398
1399	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1400					    nr_mbuf_cache, 0, mbuf_size,
1401					    rte_socket_id());
1402	if (mbuf_pool == NULL)
1403		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1404}
1405
1406/*
1407 * Main function, does initialisation and calls the per-lcore functions.
1408 */
1409int
1410main(int argc, char *argv[])
1411{
1412	unsigned lcore_id, core_id = 0;
1413	unsigned nb_ports, valid_num_ports;
1414	int ret, i;
1415	uint8_t portid;
1416	static pthread_t tid;
1417	char thread_name[RTE_MAX_THREAD_NAME_LEN];
1418	uint64_t flags = 0;
1419
1420	signal(SIGINT, sigint_handler);
1421
1422	/* init EAL */
1423	ret = rte_eal_init(argc, argv);
1424	if (ret < 0)
1425		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1426	argc -= ret;
1427	argv += ret;
1428
1429	/* parse app arguments */
1430	ret = us_vhost_parse_args(argc, argv);
1431	if (ret < 0)
1432		rte_exit(EXIT_FAILURE, "Invalid argument\n");
1433
1434	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1435		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1436
1437		if (rte_lcore_is_enabled(lcore_id))
1438			lcore_ids[core_id++] = lcore_id;
1439	}
1440
1441	if (rte_lcore_count() > RTE_MAX_LCORE)
1442		rte_exit(EXIT_FAILURE, "Not enough cores\n");
1443
1444	/* Get the number of physical ports. */
1445	nb_ports = rte_eth_dev_count();
1446
1447	/*
1448	 * Update the global variable num_ports and the global array ports,
1449	 * and get the number of valid ports according to the system port count.
1450	 */
1451	valid_num_ports = check_ports_num(nb_ports);
1452
1453	if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
1454		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1455			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1456		return -1;
1457	}
1458
1459	/*
1460	 * FIXME: here we are trying to allocate mbufs big enough for
1461	 * @MAX_QUEUES, but the truth is we're never going to use that
1462	 * many queues here. We probably should only do allocation for
1463	 * those queues we are going to use.
1464	 */
1465	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1466			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1467
1468	if (vm2vm_mode == VM2VM_HARDWARE) {
1469		/* Enable VT loop back to let L2 switch to do it. */
1470		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1471		RTE_LOG(DEBUG, VHOST_CONFIG,
1472			"Enable loop back for L2 switch in vmdq.\n");
1473	}
1474
1475	/* initialize all ports */
1476	for (portid = 0; portid < nb_ports; portid++) {
1477		/* skip ports that are not enabled */
1478		if ((enabled_port_mask & (1 << portid)) == 0) {
1479			RTE_LOG(INFO, VHOST_PORT,
1480				"Skipping disabled port %d\n", portid);
1481			continue;
1482		}
1483		if (port_init(portid) != 0)
1484			rte_exit(EXIT_FAILURE,
1485				"Cannot initialize network ports\n");
1486	}
1487
1488	/* Enable stats if the user option is set. */
1489	if (enable_stats) {
1490		ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
1491		if (ret != 0)
1492			rte_exit(EXIT_FAILURE,
1493				"Cannot create print-stats thread\n");
1494
1495		/* Set thread_name to aid in debugging. */
1496		snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
1497		ret = rte_thread_setname(tid, thread_name);
1498		if (ret != 0)
1499			RTE_LOG(DEBUG, VHOST_CONFIG,
1500				"Cannot set print-stats name\n");
1501	}
1502
1503	/* Launch all data cores. */
1504	RTE_LCORE_FOREACH_SLAVE(lcore_id)
1505		rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1506
1507	if (mergeable == 0)
1508		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
1509
1510	if (client_mode)
1511		flags |= RTE_VHOST_USER_CLIENT;
1512
1513	if (dequeue_zero_copy)
1514		flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
1515
1516	/* Register vhost user driver to handle vhost messages. */
1517	for (i = 0; i < nb_sockets; i++) {
1518		ret = rte_vhost_driver_register
1519				(socket_files + i * PATH_MAX, flags);
1520		if (ret != 0) {
1521			unregister_drivers(i);
1522			rte_exit(EXIT_FAILURE,
1523				"vhost driver register failure.\n");
1524		}
1525	}
1526
1527	rte_vhost_driver_callback_register(&virtio_net_device_ops);
1528
1529	rte_vhost_driver_session_start();
1530	return 0;
1531
1532}
1533