/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_mbuf.h>
#include <rte_malloc.h>
#include <rte_ip.h>
#include <rte_udp.h>
#include <rte_tcp.h>

#include "main.h"
#include "rte_virtio_net.h"
#include "vxlan.h"
#include "vxlan_setup.h"

#define IPV4_HEADER_LEN 20
#define UDP_HEADER_LEN  8
#define VXLAN_HEADER_LEN 8

#define IP_VERSION 0x40
#define IP_HDRLEN  0x05 /* default IP header length == five 32-bit words. */
#define IP_DEFTTL  64   /* from RFC 1340. */
#define IP_VHL_DEF (IP_VERSION | IP_HDRLEN)

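/*
 * IPv4 Don't-Fragment flag for the fragment_offset field. The constant is
 * pre-swapped so it reads as 0x4000 (the DF bit) in network byte order.
 */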
#define IP_DN_FRAGMENT_FLAG 0x0040

/* 48-bit mask used to compare MAC addresses. */
#define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

/* Default inner VLAN ID */
#define INNER_VLAN_ID 100

/* VXLAN device */
struct vxlan_conf vxdev;

struct ipv4_hdr app_ip_hdr[VXLAN_N_PORTS];
struct ether_hdr app_l2_hdr[VXLAN_N_PORTS];

/* Local VTEP IP address */
uint8_t vxlan_multicast_ips[2][4] = { {239, 1, 1, 1 }, {239, 1, 2, 1 } };

/* Remote VTEP IP address */
uint8_t vxlan_overlay_ips[2][4] = { {192, 168, 10, 1}, {192, 168, 30, 1} };

/* Remote VTEP MAC address */
uint8_t peer_mac[6] = {0x00, 0x11, 0x01, 0x00, 0x00, 0x01};

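/*
 * The three NIC tunnel filter modes selectable at run time via filter_idx:
 * inner MAC + VNI, inner MAC + inner VLAN + VNI, and
 * outer MAC + VNI + inner MAC.
 */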
/* VXLAN RX filter type */
uint8_t tep_filter_type[] = {RTE_TUNNEL_FILTER_IMAC_TENID,
			RTE_TUNNEL_FILTER_IMAC_IVLAN_TENID,
			RTE_TUNNEL_FILTER_OMAC_TENID_IMAC,};

/* Options for configuring ethernet port */
static const struct rte_eth_conf port_conf = {
	.rxmode = {
		.split_hdr_size = 0,
		.header_split   = 0, /**< Header Split disabled */
		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
		.hw_strip_crc   = 1, /**< CRC stripped by hardware */
	},
	.txmode = {
		.mq_mode = ETH_MQ_TX_NONE,
	},
};

/**
 * One or two devices that belong to the same tenant ID can be
 * assigned to a VM.
 */
const uint16_t tenant_id_conf[] = {
	1000, 1000, 1001, 1001, 1002, 1002, 1003, 1003,
	1004, 1004, 1005, 1005, 1006, 1006, 1007, 1007,
	1008, 1008, 1009, 1009, 1010, 1010, 1011, 1011,
	1012, 1012, 1013, 1013, 1014, 1014, 1015, 1015,
	1016, 1016, 1017, 1017, 1018, 1018, 1019, 1019,
	1020, 1020, 1021, 1021, 1022, 1022, 1023, 1023,
	1024, 1024, 1025, 1025, 1026, 1026, 1027, 1027,
	1028, 1028, 1029, 1029, 1030, 1030, 1031, 1031,
};

/**
 * Initialises a given port using global settings, with the RX buffers
 * coming from the mbuf_pool passed as a parameter.
 */
int
vxlan_port_init(uint8_t port, struct rte_mempool *mbuf_pool)
{
	int retval;
	uint16_t q;
	struct rte_eth_dev_info dev_info;
	uint16_t rx_rings, tx_rings = (uint16_t)rte_lcore_count();
	const uint16_t rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
	const uint16_t tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
	struct rte_eth_udp_tunnel tunnel_udp;
	struct rte_eth_rxconf *rxconf;
	struct rte_eth_txconf *txconf;
	struct vxlan_conf *pconf = &vxdev;

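	/*
	 * udp_port is the UDP destination port that identifies VXLAN
	 * traffic (IANA default 4789), typically set on the command line.
	 */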
	pconf->dst_port = udp_port;

	/* Validate the port number before querying the device. */
	if (port >= rte_eth_dev_count())
		return -1;

	rte_eth_dev_info_get(port, &dev_info);

	if (dev_info.max_rx_queues > MAX_QUEUES) {
		rte_exit(EXIT_FAILURE,
			"please define MAX_QUEUES to be no less than %u in %s\n",
			dev_info.max_rx_queues, __FILE__);
	}

	rxconf = &dev_info.default_rxconf;
	txconf = &dev_info.default_txconf;
	txconf->txq_flags = 0;

	rx_rings = nb_devices;

	/* Configure ethernet device. */
	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
	if (retval != 0)
		return retval;

	/* Setup the queues. */
	for (q = 0; q < rx_rings; q++) {
		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
						rte_eth_dev_socket_id(port),
						rxconf,
						mbuf_pool);
		if (retval < 0)
			return retval;
	}
	for (q = 0; q < tx_rings; q++) {
		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
						rte_eth_dev_socket_id(port),
						txconf);
		if (retval < 0)
			return retval;
	}

	/* Start the device. */
	retval = rte_eth_dev_start(port);
	if (retval < 0)
		return retval;

	/* Configure UDP port for UDP tunneling */
	tunnel_udp.udp_port = udp_port;
	tunnel_udp.prot_type = RTE_TUNNEL_TYPE_VXLAN;
	retval = rte_eth_dev_udp_tunnel_port_add(port, &tunnel_udp);
	if (retval < 0)
		return retval;
	rte_eth_macaddr_get(port, &ports_eth_addr[port]);
	RTE_LOG(INFO, PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
			(unsigned)port,
			ports_eth_addr[port].addr_bytes[0],
			ports_eth_addr[port].addr_bytes[1],
			ports_eth_addr[port].addr_bytes[2],
			ports_eth_addr[port].addr_bytes[3],
			ports_eth_addr[port].addr_bytes[4],
			ports_eth_addr[port].addr_bytes[5]);

	if (tso_segsz != 0) {
		/* dev_info was filled in above and TX offload capabilities
		 * do not change at run time, so reuse it here instead of
		 * shadowing it with a second query. */
		if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) == 0)
			RTE_LOG(WARNING, PORT,
				"hardware TSO offload is not supported\n");
	}
	return 0;
}
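
/*
 * Usage sketch (illustrative only; the pool name and sizing below are
 * assumptions, not taken from this file):
 *
 *	struct rte_mempool *pool = rte_pktmbuf_pool_create("mbuf_pool",
 *			16 * 1024, 32, 0, RTE_MBUF_DEFAULT_BUF_SIZE,
 *			rte_socket_id());
 *	if (pool == NULL || vxlan_port_init(0, pool) != 0)
 *		rte_exit(EXIT_FAILURE, "port 0 init failed\n");
 */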
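
/*
 * RX path hook: strips the outer Ethernet/IP/UDP/VXLAN headers when
 * decapsulation is enabled (rx_decap); returns a negative value when
 * decapsulation() rejects the packet.
 */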
static int
vxlan_rx_process(struct rte_mbuf *pkt)
{
	int ret = 0;

	if (rx_decap)
		ret = decapsulation(pkt);

	return ret;
}

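/*
 * TX path hook: prepends the outer Ethernet/IP/UDP/VXLAN headers when
 * encapsulation is enabled (tx_encap).
 */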
static void
vxlan_tx_process(uint8_t queue_id, struct rte_mbuf *pkt)
{
	if (tx_encap)
		encapsulation(pkt, queue_id);
}

/*
 * This function learns the MAC address of the device and sets up the
 * initial L2 and L3 header information.
 */
int
vxlan_link(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	int i, ret;
	struct ether_hdr *pkt_hdr;
	uint64_t portid = vdev->vid;
	struct ipv4_hdr *ip;

	struct rte_eth_tunnel_filter_conf tunnel_filter_conf;

	if (unlikely(portid >= VXLAN_N_PORTS)) {
		RTE_LOG(INFO, VHOST_DATA,
			"(%d) WARNING: Not configuring device, "
			"as we already have %d ports for VXLAN.\n",
			vdev->vid, VXLAN_N_PORTS);
		return -1;
	}

	/* Learn MAC address of guest device from packet */
	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
	if (is_same_ether_addr(&(pkt_hdr->s_addr), &vdev->mac_address)) {
		RTE_LOG(INFO, VHOST_DATA,
			"(%d) WARNING: This device is using an existing"
			" MAC address and has not been registered.\n",
			vdev->vid);
		return -1;
	}

	for (i = 0; i < ETHER_ADDR_LEN; i++) {
		vdev->mac_address.addr_bytes[i] =
			vxdev.port[portid].vport_mac.addr_bytes[i] =
			pkt_hdr->s_addr.addr_bytes[i];
		vxdev.port[portid].peer_mac.addr_bytes[i] = peer_mac[i];
	}

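	/*
	 * Program a NIC tunnel filter so that packets matching this inner
	 * MAC (and VNI) are steered to the device's RX queue.
	 */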
	memset(&tunnel_filter_conf, 0,
		sizeof(struct rte_eth_tunnel_filter_conf));

	ether_addr_copy(&ports_eth_addr[0], &tunnel_filter_conf.outer_mac);
	tunnel_filter_conf.filter_type = tep_filter_type[filter_idx];

	/* inner MAC */
	ether_addr_copy(&vdev->mac_address, &tunnel_filter_conf.inner_mac);

	tunnel_filter_conf.queue_id = vdev->rx_q;
	tunnel_filter_conf.tenant_id = tenant_id_conf[vdev->rx_q];

	if (tep_filter_type[filter_idx] == RTE_TUNNEL_FILTER_IMAC_IVLAN_TENID)
		tunnel_filter_conf.inner_vlan = INNER_VLAN_ID;

	tunnel_filter_conf.tunnel_type = RTE_TUNNEL_TYPE_VXLAN;

	ret = rte_eth_dev_filter_ctrl(ports[0],
		RTE_ETH_FILTER_TUNNEL,
		RTE_ETH_FILTER_ADD,
		&tunnel_filter_conf);
	if (ret) {
		RTE_LOG(ERR, VHOST_DATA,
			"%d Failed to add device MAC address to cloud filter\n",
			vdev->rx_q);
		return -1;
	}

	/* Print out inner MAC and VNI info. */
	RTE_LOG(INFO, VHOST_DATA,
		"(%d) MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VNI %d registered\n",
		vdev->rx_q,
		vdev->mac_address.addr_bytes[0],
		vdev->mac_address.addr_bytes[1],
		vdev->mac_address.addr_bytes[2],
		vdev->mac_address.addr_bytes[3],
		vdev->mac_address.addr_bytes[4],
		vdev->mac_address.addr_bytes[5],
		tenant_id_conf[vdev->rx_q]);

	vxdev.port[portid].vport_id = portid;

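	/*
	 * Assemble the VTEP addresses one byte at a time; on a little-endian
	 * host the resulting 32-bit values are already in network byte order.
	 */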
	for (i = 0; i < 4; i++) {
		/* Local VTEP IP */
		vxdev.port_ip |= vxlan_multicast_ips[portid][i] << (8 * i);
		/* Remote VTEP IP */
		vxdev.port[portid].peer_ip |=
			vxlan_overlay_ips[portid][i] << (8 * i);
	}

	vxdev.out_key = tenant_id_conf[vdev->rx_q];
	ether_addr_copy(&vxdev.port[portid].peer_mac,
			&app_l2_hdr[portid].d_addr);
	ether_addr_copy(&ports_eth_addr[0],
			&app_l2_hdr[portid].s_addr);
	app_l2_hdr[portid].ether_type = rte_cpu_to_be_16(ETHER_TYPE_IPv4);

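	/*
	 * Pre-build the outer IPv4 header template used at encapsulation
	 * time; total_length and hdr_checksum are expected to be completed
	 * per packet.
	 */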
	ip = &app_ip_hdr[portid];
	ip->version_ihl = IP_VHL_DEF;
	ip->type_of_service = 0;
	ip->total_length = 0;
	ip->packet_id = 0;
	ip->fragment_offset = IP_DN_FRAGMENT_FLAG;
	ip->time_to_live = IP_DEFTTL;
	ip->next_proto_id = IPPROTO_UDP;
	ip->hdr_checksum = 0;
	ip->src_addr = vxdev.port_ip;
	ip->dst_addr = vxdev.port[portid].peer_ip;

	/* Set device as ready for RX. */
	vdev->ready = DEVICE_RX;

	return 0;
}

/**
 * Removes the cloud filter. Ensures that nothing is adding buffers to the
 * RX queue before disabling RX on the device.
 */
void
vxlan_unlink(struct vhost_dev *vdev)
{
	unsigned i = 0, rx_count;
	int ret;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
	struct rte_eth_tunnel_filter_conf tunnel_filter_conf;

	if (vdev->ready == DEVICE_RX) {
		memset(&tunnel_filter_conf, 0,
			sizeof(struct rte_eth_tunnel_filter_conf));

		ether_addr_copy(&ports_eth_addr[0], &tunnel_filter_conf.outer_mac);
		ether_addr_copy(&vdev->mac_address, &tunnel_filter_conf.inner_mac);
		tunnel_filter_conf.tenant_id = tenant_id_conf[vdev->rx_q];
		tunnel_filter_conf.filter_type = tep_filter_type[filter_idx];

		if (tep_filter_type[filter_idx] ==
			RTE_TUNNEL_FILTER_IMAC_IVLAN_TENID)
			tunnel_filter_conf.inner_vlan = INNER_VLAN_ID;

		tunnel_filter_conf.queue_id = vdev->rx_q;
		tunnel_filter_conf.tunnel_type = RTE_TUNNEL_TYPE_VXLAN;

		ret = rte_eth_dev_filter_ctrl(ports[0],
				RTE_ETH_FILTER_TUNNEL,
				RTE_ETH_FILTER_DELETE,
				&tunnel_filter_conf);
		if (ret) {
			RTE_LOG(ERR, VHOST_DATA,
				"%d Failed to remove device MAC address from cloud filter\n",
				vdev->rx_q);
			return;
		}
		for (i = 0; i < ETHER_ADDR_LEN; i++)
			vdev->mac_address.addr_bytes[i] = 0;

		/* Clear out the receive buffers */
		rx_count = rte_eth_rx_burst(ports[0],
				(uint16_t)vdev->rx_q,
				pkts_burst, MAX_PKT_BURST);

		while (rx_count) {
			for (i = 0; i < rx_count; i++)
				rte_pktmbuf_free(pkts_burst[i]);

			rx_count = rte_eth_rx_burst(ports[0],
					(uint16_t)vdev->rx_q,
					pkts_burst, MAX_PKT_BURST);
		}
		vdev->ready = DEVICE_MAC_LEARNING;
	}
}

/* Transmit packets after encapsulating */
int
vxlan_tx_pkts(uint8_t port_id, uint16_t queue_id,
		struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
{
	int ret = 0;
	uint16_t i;

	for (i = 0; i < nb_pkts; i++)
		vxlan_tx_process(queue_id, tx_pkts[i]);

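	/*
	 * rte_eth_tx_burst() returns the number of packets actually queued
	 * for transmission; any unsent mbufs remain owned by the caller.
	 */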
	ret = rte_eth_tx_burst(port_id, queue_id, tx_pkts, nb_pkts);

	return ret;
}

/* Check for decapsulation and pass packets directly to VIRTIO device */
int
vxlan_rx_pkts(int vid, struct rte_mbuf **pkts_burst, uint32_t rx_count)
{
	uint32_t i = 0;
	uint32_t count = 0;
	int ret;
	struct rte_mbuf *pkts_valid[rx_count];

	for (i = 0; i < rx_count; i++) {
		if (enable_stats) {
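			/*
			 * Each (flags & BAD) != 0 comparison yields 0 or 1,
			 * so a flagged checksum error bumps the counter by
			 * exactly one.
			 */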
			rte_atomic64_add(
				&dev_statistics[vid].rx_bad_ip_csum,
				(pkts_burst[i]->ol_flags & PKT_RX_IP_CKSUM_BAD)
				!= 0);
			rte_atomic64_add(
				&dev_statistics[vid].rx_bad_ip_csum,
				(pkts_burst[i]->ol_flags & PKT_RX_L4_CKSUM_BAD)
				!= 0);
		}
		ret = vxlan_rx_process(pkts_burst[i]);
		if (unlikely(ret < 0))
			continue;

		pkts_valid[count] = pkts_burst[i];
		count++;
	}

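	/*
	 * rte_vhost_enqueue_burst() returns the number of packets actually
	 * placed on the guest's RX virtqueue.
	 */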
	ret = rte_vhost_enqueue_burst(vid, VIRTIO_RXQ, pkts_valid, count);
	return ret;
}