rte_eth_bond_pmd.c revision aab0c291
1/*-
2 *   BSD LICENSE
3 *
4 *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
5 *   All rights reserved.
6 *
7 *   Redistribution and use in source and binary forms, with or without
8 *   modification, are permitted provided that the following conditions
9 *   are met:
10 *
11 *     * Redistributions of source code must retain the above copyright
12 *       notice, this list of conditions and the following disclaimer.
13 *     * Redistributions in binary form must reproduce the above copyright
14 *       notice, this list of conditions and the following disclaimer in
15 *       the documentation and/or other materials provided with the
16 *       distribution.
17 *     * Neither the name of Intel Corporation nor the names of its
18 *       contributors may be used to endorse or promote products derived
19 *       from this software without specific prior written permission.
20 *
21 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33#include <stdlib.h>
34#include <netinet/in.h>
35
36#include <rte_mbuf.h>
37#include <rte_malloc.h>
38#include <rte_ethdev.h>
39#include <rte_tcp.h>
40#include <rte_udp.h>
41#include <rte_ip.h>
42#include <rte_ip_frag.h>
43#include <rte_devargs.h>
44#include <rte_kvargs.h>
45#include <rte_vdev.h>
46#include <rte_alarm.h>
47#include <rte_cycles.h>
48
49#include "rte_eth_bond.h"
50#include "rte_eth_bond_private.h"
51#include "rte_eth_bond_8023ad_private.h"
52
53#define REORDER_PERIOD_MS 10
54
55#define HASH_L4_PORTS(h) ((h)->src_port ^ (h)->dst_port)
56
57/* Table for statistics in mode 5 TLB */
58static uint64_t tlb_last_obytets[RTE_MAX_ETHPORTS];
59
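/* Return the length of any VLAN header(s) following the Ethernet header and
 * advance *proto to the encapsulated EtherType; up to two stacked VLAN tags
 * (QinQ) are handled. */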
60static inline size_t
61get_vlan_offset(struct ether_hdr *eth_hdr, uint16_t *proto)
62{
63	size_t vlan_offset = 0;
64
65	if (rte_cpu_to_be_16(ETHER_TYPE_VLAN) == *proto) {
66		struct vlan_hdr *vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1);
67
68		vlan_offset = sizeof(struct vlan_hdr);
69		*proto = vlan_hdr->eth_proto;
70
71		if (rte_cpu_to_be_16(ETHER_TYPE_VLAN) == *proto) {
72			vlan_hdr = vlan_hdr + 1;
73			*proto = vlan_hdr->eth_proto;
74			vlan_offset += sizeof(struct vlan_hdr);
75		}
76	}
77	return vlan_offset;
78}
79
80static uint16_t
81bond_ethdev_rx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
82{
83	struct bond_dev_private *internals;
84
85	uint16_t num_rx_slave = 0;
86	uint16_t num_rx_total = 0;
87
88	int i;
89
90	/* Cast to structure containing the bonded device's port id and queue id */
91	struct bond_rx_queue *bd_rx_q = (struct bond_rx_queue *)queue;
92
93	internals = bd_rx_q->dev_private;
94
95
96	for (i = 0; i < internals->active_slave_count && nb_pkts; i++) {
97		/* Offset of pointer to *bufs increases as packets are received
98		 * from other slaves */
99		num_rx_slave = rte_eth_rx_burst(internals->active_slaves[i],
100				bd_rx_q->queue_id, bufs + num_rx_total, nb_pkts);
101		if (num_rx_slave) {
102			num_rx_total += num_rx_slave;
103			nb_pkts -= num_rx_slave;
104		}
105	}
106
107	return num_rx_total;
108}
109
110static uint16_t
111bond_ethdev_rx_burst_active_backup(void *queue, struct rte_mbuf **bufs,
112		uint16_t nb_pkts)
113{
114	struct bond_dev_private *internals;
115
116	/* Cast to structure containing the bonded device's port id and queue id */
117	struct bond_rx_queue *bd_rx_q = (struct bond_rx_queue *)queue;
118
119	internals = bd_rx_q->dev_private;
120
121	return rte_eth_rx_burst(internals->current_primary_port,
122			bd_rx_q->queue_id, bufs, nb_pkts);
123}
124
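/* Return true for untagged slow-protocol frames (LACP or marker PDUs) that
 * must be consumed by the mode 4 state machine instead of the application. */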
125static inline uint8_t
126is_lacp_packets(uint16_t ethertype, uint8_t subtype, uint16_t vlan_tci)
127{
128	const uint16_t ether_type_slow_be = rte_be_to_cpu_16(ETHER_TYPE_SLOW);
129
130	return !vlan_tci && (ethertype == ether_type_slow_be &&
131		(subtype == SLOW_SUBTYPE_MARKER || subtype == SLOW_SUBTYPE_LACP));
132}
133
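/* Receive burst for mode 4 (802.3AD): slow-protocol frames are diverted to the
 * mode 4 state machine, while frames from non-collecting slaves and (when not
 * promiscuous) frames not addressed to the bond are dropped from the returned
 * array. */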
134static uint16_t
135bond_ethdev_rx_burst_8023ad(void *queue, struct rte_mbuf **bufs,
136		uint16_t nb_pkts)
137{
138	/* Cast to structure containing the bonded device's port id and queue id */
139	struct bond_rx_queue *bd_rx_q = (struct bond_rx_queue *)queue;
140	struct bond_dev_private *internals = bd_rx_q->dev_private;
141	struct ether_addr bond_mac;
142
143	struct ether_hdr *hdr;
144
145	const uint16_t ether_type_slow_be = rte_be_to_cpu_16(ETHER_TYPE_SLOW);
146	uint16_t num_rx_total = 0;	/* Total number of received packets */
147	uint8_t slaves[RTE_MAX_ETHPORTS];
148	uint8_t slave_count;
149
150	uint8_t collecting;  /* current slave collecting status */
151	const uint8_t promisc = internals->promiscuous_en;
152	uint8_t i, j, k;
153	uint8_t subtype;
154
155	rte_eth_macaddr_get(internals->port_id, &bond_mac);
156	/* Copy slave list to protect against slave up/down changes during rx
157	 * bursting */
158	slave_count = internals->active_slave_count;
159	memcpy(slaves, internals->active_slaves,
160			sizeof(internals->active_slaves[0]) * slave_count);
161
162	for (i = 0; i < slave_count && num_rx_total < nb_pkts; i++) {
163		j = num_rx_total;
164		collecting = ACTOR_STATE(&mode_8023ad_ports[slaves[i]], COLLECTING);
165
166		/* Read packets from this slave */
167		num_rx_total += rte_eth_rx_burst(slaves[i], bd_rx_q->queue_id,
168				&bufs[num_rx_total], nb_pkts - num_rx_total);
169
170		for (k = j; k < 2 && k < num_rx_total; k++)
171			rte_prefetch0(rte_pktmbuf_mtod(bufs[k], void *));
172
173		/* Handle slow protocol packets. */
174		while (j < num_rx_total) {
175			if (j + 3 < num_rx_total)
176				rte_prefetch0(rte_pktmbuf_mtod(bufs[j + 3], void *));
177
178			hdr = rte_pktmbuf_mtod(bufs[j], struct ether_hdr *);
179			subtype = ((struct slow_protocol_frame *)hdr)->slow_protocol.subtype;
180
181			/* Remove packet from array if it is a slow packet or the slave is
182			 * not in collecting state, or the bonding interface is not in
183			 * promiscuous mode and the packet address does not match. */
184			if (unlikely(is_lacp_packets(hdr->ether_type, subtype, bufs[j]->vlan_tci) ||
185				!collecting || (!promisc &&
186					!is_multicast_ether_addr(&hdr->d_addr) &&
187					!is_same_ether_addr(&bond_mac, &hdr->d_addr)))) {
188
189				if (hdr->ether_type == ether_type_slow_be) {
190					bond_mode_8023ad_handle_slow_pkt(internals, slaves[i],
191						bufs[j]);
192				} else
193					rte_pktmbuf_free(bufs[j]);
194
195				/* Packet is managed by mode 4 or dropped, shift the array */
196				num_rx_total--;
197				if (j < num_rx_total) {
198					memmove(&bufs[j], &bufs[j + 1], sizeof(bufs[0]) *
199						(num_rx_total - j));
200				}
201			} else
202				j++;
203		}
204	}
205
206	return num_rx_total;
207}
208
209#if defined(RTE_LIBRTE_BOND_DEBUG_ALB) || defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1)
210uint32_t burstnumberRX;
211uint32_t burstnumberTX;
212
213#ifdef RTE_LIBRTE_BOND_DEBUG_ALB
214
215static void
216arp_op_name(uint16_t arp_op, char *buf)
217{
218	switch (arp_op) {
219	case ARP_OP_REQUEST:
220		snprintf(buf, sizeof("ARP Request"), "%s", "ARP Request");
221		return;
222	case ARP_OP_REPLY:
223		snprintf(buf, sizeof("ARP Reply"), "%s", "ARP Reply");
224		return;
225	case ARP_OP_REVREQUEST:
226		snprintf(buf, sizeof("Reverse ARP Request"), "%s",
227				"Reverse ARP Request");
228		return;
229	case ARP_OP_REVREPLY:
230		snprintf(buf, sizeof("Reverse ARP Reply"), "%s",
231				"Reverse ARP Reply");
232		return;
233	case ARP_OP_INVREQUEST:
234		snprintf(buf, sizeof("Peer Identify Request"), "%s",
235				"Peer Identify Request");
236		return;
237	case ARP_OP_INVREPLY:
238		snprintf(buf, sizeof("Peer Identify Reply"), "%s",
239				"Peer Identify Reply");
240		return;
241	default:
242		break;
243	}
244	snprintf(buf, sizeof("Unknown"), "%s", "Unknown");
245	return;
246}
247#endif
248#define MaxIPv4String	16
249static void
250ipv4_addr_to_dot(uint32_t be_ipv4_addr, char *buf, uint8_t buf_size)
251{
252	uint32_t ipv4_addr;
253
254	ipv4_addr = rte_be_to_cpu_32(be_ipv4_addr);
255	snprintf(buf, buf_size, "%d.%d.%d.%d", (ipv4_addr >> 24) & 0xFF,
256		(ipv4_addr >> 16) & 0xFF, (ipv4_addr >> 8) & 0xFF,
257		ipv4_addr & 0xFF);
258}
259
260#define MAX_CLIENTS_NUMBER	128
261uint8_t active_clients;
262struct client_stats_t {
263	uint8_t port;
264	uint32_t ipv4_addr;
265	uint32_t ipv4_rx_packets;
266	uint32_t ipv4_tx_packets;
267};
268struct client_stats_t client_stats[MAX_CLIENTS_NUMBER];
269
270static void
271update_client_stats(uint32_t addr, uint8_t port, uint32_t *TXorRXindicator)
272{
273	int i = 0;
274
275	for (; i < MAX_CLIENTS_NUMBER; i++)	{
276		if ((client_stats[i].ipv4_addr == addr) && (client_stats[i].port == port))	{
277			/* Known client: update its RX or TX packet count */
278			if (TXorRXindicator == &burstnumberRX)
279				client_stats[i].ipv4_rx_packets++;
280			else
281				client_stats[i].ipv4_tx_packets++;
282			return;
283		}
284	}
285	/* New client: insert it into the table and update its stats */
286	if (TXorRXindicator == &burstnumberRX)
287		client_stats[active_clients].ipv4_rx_packets++;
288	else
289		client_stats[active_clients].ipv4_tx_packets++;
290	client_stats[active_clients].ipv4_addr = addr;
291	client_stats[active_clients].port = port;
292	active_clients++;
293
294}
295
296#ifdef RTE_LIBRTE_BOND_DEBUG_ALB
297#define MODE6_DEBUG(info, src_ip, dst_ip, eth_h, arp_op, port, burstnumber)	\
298		RTE_LOG(DEBUG, PMD, \
299		"%s " \
300		"port:%d " \
301		"SrcMAC:%02X:%02X:%02X:%02X:%02X:%02X " \
302		"SrcIP:%s " \
303		"DstMAC:%02X:%02X:%02X:%02X:%02X:%02X " \
304		"DstIP:%s " \
305		"%s " \
306		"%d\n", \
307		info, \
308		port, \
309		eth_h->s_addr.addr_bytes[0], \
310		eth_h->s_addr.addr_bytes[1], \
311		eth_h->s_addr.addr_bytes[2], \
312		eth_h->s_addr.addr_bytes[3], \
313		eth_h->s_addr.addr_bytes[4], \
314		eth_h->s_addr.addr_bytes[5], \
315		src_ip, \
316		eth_h->d_addr.addr_bytes[0], \
317		eth_h->d_addr.addr_bytes[1], \
318		eth_h->d_addr.addr_bytes[2], \
319		eth_h->d_addr.addr_bytes[3], \
320		eth_h->d_addr.addr_bytes[4], \
321		eth_h->d_addr.addr_bytes[5], \
322		dst_ip, \
323		arp_op, \
324		++burstnumber)
325#endif
326
327static void
328mode6_debug(const char __attribute__((unused)) *info, struct ether_hdr *eth_h,
329		uint8_t port, uint32_t __attribute__((unused)) *burstnumber)
330{
331	struct ipv4_hdr *ipv4_h;
332#ifdef RTE_LIBRTE_BOND_DEBUG_ALB
333	struct arp_hdr *arp_h;
334	char dst_ip[16];
335	char ArpOp[24];
336	char buf[16];
337#endif
338	char src_ip[16];
339
340	uint16_t ether_type = eth_h->ether_type;
341	uint16_t offset = get_vlan_offset(eth_h, &ether_type);
342
343#ifdef RTE_LIBRTE_BOND_DEBUG_ALB
344	snprintf(buf, 16, "%s", info);
345#endif
346
347	if (ether_type == rte_cpu_to_be_16(ETHER_TYPE_IPv4)) {
348		ipv4_h = (struct ipv4_hdr *)((char *)(eth_h + 1) + offset);
349		ipv4_addr_to_dot(ipv4_h->src_addr, src_ip, MaxIPv4String);
350#ifdef RTE_LIBRTE_BOND_DEBUG_ALB
351		ipv4_addr_to_dot(ipv4_h->dst_addr, dst_ip, MaxIPv4String);
352		MODE6_DEBUG(buf, src_ip, dst_ip, eth_h, "", port, *burstnumber);
353#endif
354		update_client_stats(ipv4_h->src_addr, port, burstnumber);
355	}
356#ifdef RTE_LIBRTE_BOND_DEBUG_ALB
357	else if (ether_type == rte_cpu_to_be_16(ETHER_TYPE_ARP)) {
358		arp_h = (struct arp_hdr *)((char *)(eth_h + 1) + offset);
359		ipv4_addr_to_dot(arp_h->arp_data.arp_sip, src_ip, MaxIPv4String);
360		ipv4_addr_to_dot(arp_h->arp_data.arp_tip, dst_ip, MaxIPv4String);
361		arp_op_name(rte_be_to_cpu_16(arp_h->arp_op), ArpOp);
362		MODE6_DEBUG(buf, src_ip, dst_ip, eth_h, ArpOp, port, *burstnumber);
363	}
364#endif
365}
366#endif
367
368static uint16_t
369bond_ethdev_rx_burst_alb(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
370{
371	struct bond_tx_queue *bd_tx_q = (struct bond_tx_queue *)queue;
372	struct bond_dev_private *internals = bd_tx_q->dev_private;
373	struct ether_hdr *eth_h;
374	uint16_t ether_type, offset;
375	uint16_t nb_recv_pkts;
376	int i;
377
378	nb_recv_pkts = bond_ethdev_rx_burst(queue, bufs, nb_pkts);
379
380	for (i = 0; i < nb_recv_pkts; i++) {
381		eth_h = rte_pktmbuf_mtod(bufs[i], struct ether_hdr *);
382		ether_type = eth_h->ether_type;
383		offset = get_vlan_offset(eth_h, &ether_type);
384
385		if (ether_type == rte_cpu_to_be_16(ETHER_TYPE_ARP)) {
386#if defined(RTE_LIBRTE_BOND_DEBUG_ALB) || defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1)
387			mode6_debug("RX ARP:", eth_h, bufs[i]->port, &burstnumberRX);
388#endif
389			bond_mode_alb_arp_recv(eth_h, offset, internals);
390		}
391#if defined(RTE_LIBRTE_BOND_DEBUG_ALB) || defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1)
392		else if (ether_type == rte_cpu_to_be_16(ETHER_TYPE_IPv4))
393			mode6_debug("RX IPv4:", eth_h, bufs[i]->port, &burstnumberRX);
394#endif
395	}
396
397	return nb_recv_pkts;
398}
399
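/* Transmit burst for round robin mode: packets are spread evenly across the
 * active slaves, and the starting slave is rotated between calls. */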
400static uint16_t
401bond_ethdev_tx_burst_round_robin(void *queue, struct rte_mbuf **bufs,
402		uint16_t nb_pkts)
403{
404	struct bond_dev_private *internals;
405	struct bond_tx_queue *bd_tx_q;
406
407	struct rte_mbuf *slave_bufs[RTE_MAX_ETHPORTS][nb_pkts];
408	uint16_t slave_nb_pkts[RTE_MAX_ETHPORTS] = { 0 };
409
410	uint8_t num_of_slaves;
411	uint8_t slaves[RTE_MAX_ETHPORTS];
412
413	uint16_t num_tx_total = 0, num_tx_slave;
414
415	static int slave_idx = 0;
416	int i, cslave_idx = 0, tx_fail_total = 0;
417
418	bd_tx_q = (struct bond_tx_queue *)queue;
419	internals = bd_tx_q->dev_private;
420
421	/* Copy slave list to protect against slave up/down changes during tx
422	 * bursting */
423	num_of_slaves = internals->active_slave_count;
424	memcpy(slaves, internals->active_slaves,
425			sizeof(internals->active_slaves[0]) * num_of_slaves);
426
427	if (num_of_slaves < 1)
428		return num_tx_total;
429
430	/* Populate each slave's mbuf array with the packets to be sent on it */
431	for (i = 0; i < nb_pkts; i++) {
432		cslave_idx = (slave_idx + i) % num_of_slaves;
433		slave_bufs[cslave_idx][(slave_nb_pkts[cslave_idx])++] = bufs[i];
434	}
435
436	/* increment current slave index so the next call to tx burst starts on the
437	 * next slave */
438	slave_idx = ++cslave_idx;
439
440	/* Send packet burst on each slave device */
441	for (i = 0; i < num_of_slaves; i++) {
442		if (slave_nb_pkts[i] > 0) {
443			num_tx_slave = rte_eth_tx_burst(slaves[i], bd_tx_q->queue_id,
444					slave_bufs[i], slave_nb_pkts[i]);
445
446			/* if tx burst fails move packets to end of bufs */
447			if (unlikely(num_tx_slave < slave_nb_pkts[i])) {
448				int tx_fail_slave = slave_nb_pkts[i] - num_tx_slave;
449
450				tx_fail_total += tx_fail_slave;
451
452				memcpy(&bufs[nb_pkts - tx_fail_total],
453						&slave_bufs[i][num_tx_slave],
454						tx_fail_slave * sizeof(bufs[0]));
455			}
456			num_tx_total += num_tx_slave;
457		}
458	}
459
460	return num_tx_total;
461}
462
463static uint16_t
464bond_ethdev_tx_burst_active_backup(void *queue,
465		struct rte_mbuf **bufs, uint16_t nb_pkts)
466{
467	struct bond_dev_private *internals;
468	struct bond_tx_queue *bd_tx_q;
469
470	bd_tx_q = (struct bond_tx_queue *)queue;
471	internals = bd_tx_q->dev_private;
472
473	if (internals->active_slave_count < 1)
474		return 0;
475
476	return rte_eth_tx_burst(internals->current_primary_port, bd_tx_q->queue_id,
477			bufs, nb_pkts);
478}
479
480static inline uint16_t
481ether_hash(struct ether_hdr *eth_hdr)
482{
483	unaligned_uint16_t *word_src_addr =
484		(unaligned_uint16_t *)eth_hdr->s_addr.addr_bytes;
485	unaligned_uint16_t *word_dst_addr =
486		(unaligned_uint16_t *)eth_hdr->d_addr.addr_bytes;
487
488	return (word_src_addr[0] ^ word_dst_addr[0]) ^
489			(word_src_addr[1] ^ word_dst_addr[1]) ^
490			(word_src_addr[2] ^ word_dst_addr[2]);
491}
492
493static inline uint32_t
494ipv4_hash(struct ipv4_hdr *ipv4_hdr)
495{
496	return ipv4_hdr->src_addr ^ ipv4_hdr->dst_addr;
497}
498
499static inline uint32_t
500ipv6_hash(struct ipv6_hdr *ipv6_hdr)
501{
502	unaligned_uint32_t *word_src_addr =
503		(unaligned_uint32_t *)&(ipv6_hdr->src_addr[0]);
504	unaligned_uint32_t *word_dst_addr =
505		(unaligned_uint32_t *)&(ipv6_hdr->dst_addr[0]);
506
507	return (word_src_addr[0] ^ word_dst_addr[0]) ^
508			(word_src_addr[1] ^ word_dst_addr[1]) ^
509			(word_src_addr[2] ^ word_dst_addr[2]) ^
510			(word_src_addr[3] ^ word_dst_addr[3]);
511}
512
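/* Layer 2 transmit hash policy: pick a slave from the source and destination
 * MAC addresses only. */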
513uint16_t
514xmit_l2_hash(const struct rte_mbuf *buf, uint8_t slave_count)
515{
516	struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(buf, struct ether_hdr *);
517
518	uint32_t hash = ether_hash(eth_hdr);
519
520	return (hash ^= hash >> 8) % slave_count;
521}
522
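/* Layer 2+3 transmit hash policy: combine the MAC hash with an IPv4/IPv6
 * source/destination address hash. */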
523uint16_t
524xmit_l23_hash(const struct rte_mbuf *buf, uint8_t slave_count)
525{
526	struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(buf, struct ether_hdr *);
527	uint16_t proto = eth_hdr->ether_type;
528	size_t vlan_offset = get_vlan_offset(eth_hdr, &proto);
529	uint32_t hash, l3hash = 0;
530
531	hash = ether_hash(eth_hdr);
532
533	if (rte_cpu_to_be_16(ETHER_TYPE_IPv4) == proto) {
534		struct ipv4_hdr *ipv4_hdr = (struct ipv4_hdr *)
535				((char *)(eth_hdr + 1) + vlan_offset);
536		l3hash = ipv4_hash(ipv4_hdr);
537
538	} else if (rte_cpu_to_be_16(ETHER_TYPE_IPv6) == proto) {
539		struct ipv6_hdr *ipv6_hdr = (struct ipv6_hdr *)
540				((char *)(eth_hdr + 1) + vlan_offset);
541		l3hash = ipv6_hash(ipv6_hdr);
542	}
543
544	hash = hash ^ l3hash;
545	hash ^= hash >> 16;
546	hash ^= hash >> 8;
547
548	return hash % slave_count;
549}
550
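/* Layer 3+4 transmit hash policy: hash IP addresses and TCP/UDP ports; the
 * port hash is skipped for fragmented IPv4 packets. */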
551uint16_t
552xmit_l34_hash(const struct rte_mbuf *buf, uint8_t slave_count)
553{
554	struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(buf, struct ether_hdr *);
555	uint16_t proto = eth_hdr->ether_type;
556	size_t vlan_offset = get_vlan_offset(eth_hdr, &proto);
557
558	struct udp_hdr *udp_hdr = NULL;
559	struct tcp_hdr *tcp_hdr = NULL;
560	uint32_t hash, l3hash = 0, l4hash = 0;
561
562	if (rte_cpu_to_be_16(ETHER_TYPE_IPv4) == proto) {
563		struct ipv4_hdr *ipv4_hdr = (struct ipv4_hdr *)
564				((char *)(eth_hdr + 1) + vlan_offset);
565		size_t ip_hdr_offset;
566
567		l3hash = ipv4_hash(ipv4_hdr);
568
569		/* there is no L4 header in a fragmented packet */
570		if (likely(rte_ipv4_frag_pkt_is_fragmented(ipv4_hdr) == 0)) {
571			ip_hdr_offset = (ipv4_hdr->version_ihl & IPV4_HDR_IHL_MASK) *
572					IPV4_IHL_MULTIPLIER;
573
574			if (ipv4_hdr->next_proto_id == IPPROTO_TCP) {
575				tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr +
576						ip_hdr_offset);
577				l4hash = HASH_L4_PORTS(tcp_hdr);
578			} else if (ipv4_hdr->next_proto_id == IPPROTO_UDP) {
579				udp_hdr = (struct udp_hdr *)((char *)ipv4_hdr +
580						ip_hdr_offset);
581				l4hash = HASH_L4_PORTS(udp_hdr);
582			}
583		}
584	} else if  (rte_cpu_to_be_16(ETHER_TYPE_IPv6) == proto) {
585		struct ipv6_hdr *ipv6_hdr = (struct ipv6_hdr *)
586				((char *)(eth_hdr + 1) + vlan_offset);
587		l3hash = ipv6_hash(ipv6_hdr);
588
589		if (ipv6_hdr->proto == IPPROTO_TCP) {
590			tcp_hdr = (struct tcp_hdr *)(ipv6_hdr + 1);
591			l4hash = HASH_L4_PORTS(tcp_hdr);
592		} else if (ipv6_hdr->proto == IPPROTO_UDP) {
593			udp_hdr = (struct udp_hdr *)(ipv6_hdr + 1);
594			l4hash = HASH_L4_PORTS(udp_hdr);
595		}
596	}
597
598	hash = l3hash ^ l4hash;
599	hash ^= hash >> 16;
600	hash ^= hash >> 8;
601
602	return hash % slave_count;
603}
604
605struct bwg_slave {
606	uint64_t bwg_left_int;
607	uint64_t bwg_left_remainder;
608	uint8_t slave;
609};
610
611void
612bond_tlb_activate_slave(struct bond_dev_private *internals) {
613	int i;
614
615	for (i = 0; i < internals->active_slave_count; i++) {
616		tlb_last_obytets[internals->active_slaves[i]] = 0;
617	}
618}
619
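/* qsort comparator ordering slaves by remaining bandwidth, largest first
 * (integer part compared before the remainder). */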
620static int
621bandwidth_cmp(const void *a, const void *b)
622{
623	const struct bwg_slave *bwg_a = a;
624	const struct bwg_slave *bwg_b = b;
625	int64_t diff = (int64_t)bwg_b->bwg_left_int - (int64_t)bwg_a->bwg_left_int;
626	int64_t diff2 = (int64_t)bwg_b->bwg_left_remainder -
627			(int64_t)bwg_a->bwg_left_remainder;
628	if (diff > 0)
629		return 1;
630	else if (diff < 0)
631		return -1;
632	else if (diff2 > 0)
633		return 1;
634	else if (diff2 < 0)
635		return -1;
636	else
637		return 0;
638}
639
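/* Estimate the share of a slave's link bandwidth still unused after 'load'
 * bytes were sent in the current measurement window; the result is split into
 * an integer part and a remainder for stable sorting. */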
640static void
641bandwidth_left(uint8_t port_id, uint64_t load, uint8_t update_idx,
642		struct bwg_slave *bwg_slave)
643{
644	struct rte_eth_link link_status;
645
646	rte_eth_link_get(port_id, &link_status);
647	uint64_t link_bwg = link_status.link_speed * 1000000ULL / 8;
648	if (link_bwg == 0)
649		return;
650	link_bwg = link_bwg * (update_idx+1) * REORDER_PERIOD_MS;
651	bwg_slave->bwg_left_int = (link_bwg - 1000*load) / link_bwg;
652	bwg_slave->bwg_left_remainder = (link_bwg - 1000*load) % link_bwg;
653}
654
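/* Periodic alarm callback for TLB mode: recompute each active slave's unused
 * bandwidth and reorder tlb_slaves_order so the least loaded slaves come
 * first; the alarm is then re-armed. */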
655static void
656bond_ethdev_update_tlb_slave_cb(void *arg)
657{
658	struct bond_dev_private *internals = arg;
659	struct rte_eth_stats slave_stats;
660	struct bwg_slave bwg_array[RTE_MAX_ETHPORTS];
661	uint8_t slave_count;
662	uint64_t tx_bytes;
663
664	uint8_t update_stats = 0;
665	uint8_t i, slave_id;
666
667	internals->slave_update_idx++;
668
669
670	if (internals->slave_update_idx >= REORDER_PERIOD_MS)
671		update_stats = 1;
672
673	for (i = 0; i < internals->active_slave_count; i++) {
674		slave_id = internals->active_slaves[i];
675		rte_eth_stats_get(slave_id, &slave_stats);
676		tx_bytes = slave_stats.obytes - tlb_last_obytets[slave_id];
677		bandwidth_left(slave_id, tx_bytes,
678				internals->slave_update_idx, &bwg_array[i]);
679		bwg_array[i].slave = slave_id;
680
681		if (update_stats) {
682			tlb_last_obytets[slave_id] = slave_stats.obytes;
683		}
684	}
685
686	if (update_stats == 1)
687		internals->slave_update_idx = 0;
688
689	slave_count = i;
690	qsort(bwg_array, slave_count, sizeof(bwg_array[0]), bandwidth_cmp);
691	for (i = 0; i < slave_count; i++)
692		internals->tlb_slaves_order[i] = bwg_array[i].slave;
693
694	rte_eal_alarm_set(REORDER_PERIOD_MS * 1000, bond_ethdev_update_tlb_slave_cb,
695			(struct bond_dev_private *)internals);
696}
697
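/* Transmit burst for TLB mode (mode 5): slaves are tried in tlb_slaves_order;
 * packets still carrying the primary slave's source MAC are rewritten to the
 * transmitting slave's address. */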
698static uint16_t
699bond_ethdev_tx_burst_tlb(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
700{
701	struct bond_tx_queue *bd_tx_q = (struct bond_tx_queue *)queue;
702	struct bond_dev_private *internals = bd_tx_q->dev_private;
703
704	struct rte_eth_dev *primary_port =
705			&rte_eth_devices[internals->primary_port];
706	uint16_t num_tx_total = 0;
707	uint8_t i, j;
708
709	uint8_t num_of_slaves = internals->active_slave_count;
710	uint8_t slaves[RTE_MAX_ETHPORTS];
711
712	struct ether_hdr *ether_hdr;
713	struct ether_addr primary_slave_addr;
714	struct ether_addr active_slave_addr;
715
716	if (num_of_slaves < 1)
717		return num_tx_total;
718
719	memcpy(slaves, internals->tlb_slaves_order,
720				sizeof(internals->tlb_slaves_order[0]) * num_of_slaves);
721
722
723	ether_addr_copy(primary_port->data->mac_addrs, &primary_slave_addr);
724
725	if (nb_pkts > 3) {
726		for (i = 0; i < 3; i++)
727			rte_prefetch0(rte_pktmbuf_mtod(bufs[i], void*));
728	}
729
730	for (i = 0; i < num_of_slaves; i++) {
731		rte_eth_macaddr_get(slaves[i], &active_slave_addr);
732		for (j = num_tx_total; j < nb_pkts; j++) {
733			if (j + 3 < nb_pkts)
734				rte_prefetch0(rte_pktmbuf_mtod(bufs[j+3], void*));
735
736			ether_hdr = rte_pktmbuf_mtod(bufs[j], struct ether_hdr *);
737			if (is_same_ether_addr(&ether_hdr->s_addr, &primary_slave_addr))
738				ether_addr_copy(&active_slave_addr, &ether_hdr->s_addr);
739#if defined(RTE_LIBRTE_BOND_DEBUG_ALB) || defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1)
740			mode6_debug("TX IPv4:", ether_hdr, slaves[i], &burstnumberTX);
741#endif
742		}
743
744		num_tx_total += rte_eth_tx_burst(slaves[i], bd_tx_q->queue_id,
745				bufs + num_tx_total, nb_pkts - num_tx_total);
746
747		if (num_tx_total == nb_pkts)
748			break;
749	}
750
751	return num_tx_total;
752}
753
754void
755bond_tlb_disable(struct bond_dev_private *internals)
756{
757	rte_eal_alarm_cancel(bond_ethdev_update_tlb_slave_cb, internals);
758}
759
760void
761bond_tlb_enable(struct bond_dev_private *internals)
762{
763	bond_ethdev_update_tlb_slave_cb(internals);
764}
765
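/* Transmit burst for ALB mode (mode 6): ARP frames are steered and rewritten
 * by the ALB logic, pending ARP update frames are flushed, and all other
 * traffic falls back to the TLB policy. */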
766static uint16_t
767bond_ethdev_tx_burst_alb(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
768{
769	struct bond_tx_queue *bd_tx_q = (struct bond_tx_queue *)queue;
770	struct bond_dev_private *internals = bd_tx_q->dev_private;
771
772	struct ether_hdr *eth_h;
773	uint16_t ether_type, offset;
774
775	struct client_data *client_info;
776
777	/*
778	 * We create transmit buffers for every slave plus one extra buffer for the
779	 * TLB policy. In the worst case every packet will be sent on one port.
780	 */
781	struct rte_mbuf *slave_bufs[RTE_MAX_ETHPORTS + 1][nb_pkts];
782	uint16_t slave_bufs_pkts[RTE_MAX_ETHPORTS + 1] = { 0 };
783
784	/*
785	 * We create separate transmit buffers for update packets as they won't be
786	 * counted in num_tx_total.
787	 */
788	struct rte_mbuf *update_bufs[RTE_MAX_ETHPORTS][ALB_HASH_TABLE_SIZE];
789	uint16_t update_bufs_pkts[RTE_MAX_ETHPORTS] = { 0 };
790
791	struct rte_mbuf *upd_pkt;
792	size_t pkt_size;
793
794	uint16_t num_send, num_not_send = 0;
795	uint16_t num_tx_total = 0;
796	uint8_t slave_idx;
797
798	int i, j;
799
800	/* Search tx buffer for ARP packets and forward them to alb */
801	for (i = 0; i < nb_pkts; i++) {
802		eth_h = rte_pktmbuf_mtod(bufs[i], struct ether_hdr *);
803		ether_type = eth_h->ether_type;
804		offset = get_vlan_offset(eth_h, &ether_type);
805
806		if (ether_type == rte_cpu_to_be_16(ETHER_TYPE_ARP)) {
807			slave_idx = bond_mode_alb_arp_xmit(eth_h, offset, internals);
808
809			/* Change src mac in eth header */
810			rte_eth_macaddr_get(slave_idx, &eth_h->s_addr);
811
812			/* Add packet to slave tx buffer */
813			slave_bufs[slave_idx][slave_bufs_pkts[slave_idx]] = bufs[i];
814			slave_bufs_pkts[slave_idx]++;
815		} else {
816			/* If packet is not ARP, send it with TLB policy */
817			slave_bufs[RTE_MAX_ETHPORTS][slave_bufs_pkts[RTE_MAX_ETHPORTS]] =
818					bufs[i];
819			slave_bufs_pkts[RTE_MAX_ETHPORTS]++;
820		}
821	}
822
823	/* Update connected client ARP tables */
824	if (internals->mode6.ntt) {
825		for (i = 0; i < ALB_HASH_TABLE_SIZE; i++) {
826			client_info = &internals->mode6.client_table[i];
827
828			if (client_info->in_use) {
829				/* Allocate new packet to send ARP update on current slave */
830				upd_pkt = rte_pktmbuf_alloc(internals->mode6.mempool);
831				if (upd_pkt == NULL) {
832					RTE_LOG(ERR, PMD, "Failed to allocate ARP packet from pool\n");
833					continue;
834				}
835				pkt_size = sizeof(struct ether_hdr) + sizeof(struct arp_hdr)
836						+ client_info->vlan_count * sizeof(struct vlan_hdr);
837				upd_pkt->data_len = pkt_size;
838				upd_pkt->pkt_len = pkt_size;
839
840				slave_idx = bond_mode_alb_arp_upd(client_info, upd_pkt,
841						internals);
842
843				/* Add packet to update tx buffer */
844				update_bufs[slave_idx][update_bufs_pkts[slave_idx]] = upd_pkt;
845				update_bufs_pkts[slave_idx]++;
846			}
847		}
848		internals->mode6.ntt = 0;
849	}
850
851	/* Send ARP packets on proper slaves */
852	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
853		if (slave_bufs_pkts[i] > 0) {
854			num_send = rte_eth_tx_burst(i, bd_tx_q->queue_id,
855					slave_bufs[i], slave_bufs_pkts[i]);
856			for (j = 0; j < slave_bufs_pkts[i] - num_send; j++) {
857				bufs[nb_pkts - 1 - num_not_send - j] =
858						slave_bufs[i][nb_pkts - 1 - j];
859			}
860
861			num_tx_total += num_send;
862			num_not_send += slave_bufs_pkts[i] - num_send;
863
864#if defined(RTE_LIBRTE_BOND_DEBUG_ALB) || defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1)
865			/* Print TX stats including update packets */
866			for (j = 0; j < slave_bufs_pkts[i]; j++) {
867				eth_h = rte_pktmbuf_mtod(slave_bufs[i][j], struct ether_hdr *);
868				mode6_debug("TX ARP:", eth_h, i, &burstnumberTX);
869			}
870#endif
871		}
872	}
873
874	/* Send update packets on proper slaves */
875	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
876		if (update_bufs_pkts[i] > 0) {
877			num_send = rte_eth_tx_burst(i, bd_tx_q->queue_id, update_bufs[i],
878					update_bufs_pkts[i]);
879			for (j = num_send; j < update_bufs_pkts[i]; j++) {
880				rte_pktmbuf_free(update_bufs[i][j]);
881			}
882#if defined(RTE_LIBRTE_BOND_DEBUG_ALB) || defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1)
883			for (j = 0; j < update_bufs_pkts[i]; j++) {
884				eth_h = rte_pktmbuf_mtod(update_bufs[i][j], struct ether_hdr *);
885				mode6_debug("TX ARPupd:", eth_h, i, &burstnumberTX);
886			}
887#endif
888		}
889	}
890
891	/* Send non-ARP packets using tlb policy */
892	if (slave_bufs_pkts[RTE_MAX_ETHPORTS] > 0) {
893		num_send = bond_ethdev_tx_burst_tlb(queue,
894				slave_bufs[RTE_MAX_ETHPORTS],
895				slave_bufs_pkts[RTE_MAX_ETHPORTS]);
896
897		for (j = 0; j < slave_bufs_pkts[RTE_MAX_ETHPORTS]; j++) {
898			bufs[nb_pkts - 1 - num_not_send - j] =
899					slave_bufs[RTE_MAX_ETHPORTS][nb_pkts - 1 - j];
900		}
901
902		num_tx_total += num_send;
903		num_not_send += slave_bufs_pkts[RTE_MAX_ETHPORTS] - num_send;
904	}
905
906	return num_tx_total;
907}
908
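/* Transmit burst for balance mode: packets are distributed across the active
 * slaves using the configured transmit hash policy. */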
909static uint16_t
910bond_ethdev_tx_burst_balance(void *queue, struct rte_mbuf **bufs,
911		uint16_t nb_pkts)
912{
913	struct bond_dev_private *internals;
914	struct bond_tx_queue *bd_tx_q;
915
916	uint8_t num_of_slaves;
917	uint8_t slaves[RTE_MAX_ETHPORTS];
918
919	uint16_t num_tx_total = 0, num_tx_slave = 0, tx_fail_total = 0;
920
921	int i, op_slave_id;
922
923	struct rte_mbuf *slave_bufs[RTE_MAX_ETHPORTS][nb_pkts];
924	uint16_t slave_nb_pkts[RTE_MAX_ETHPORTS] = { 0 };
925
926	bd_tx_q = (struct bond_tx_queue *)queue;
927	internals = bd_tx_q->dev_private;
928
929	/* Copy slave list to protect against slave up/down changes during tx
930	 * bursting */
931	num_of_slaves = internals->active_slave_count;
932	memcpy(slaves, internals->active_slaves,
933			sizeof(internals->active_slaves[0]) * num_of_slaves);
934
935	if (num_of_slaves < 1)
936		return num_tx_total;
937
938	/* Populate each slave's mbuf array with the packets to be sent on it */
939	for (i = 0; i < nb_pkts; i++) {
940		/* Select output slave using hash based on xmit policy */
941		op_slave_id = internals->xmit_hash(bufs[i], num_of_slaves);
942
943		/* Populate slave mbuf arrays with mbufs for that slave */
944		slave_bufs[op_slave_id][slave_nb_pkts[op_slave_id]++] = bufs[i];
945	}
946
947	/* Send packet burst on each slave device */
948	for (i = 0; i < num_of_slaves; i++) {
949		if (slave_nb_pkts[i] > 0) {
950			num_tx_slave = rte_eth_tx_burst(slaves[i], bd_tx_q->queue_id,
951					slave_bufs[i], slave_nb_pkts[i]);
952
953			/* if tx burst fails move packets to end of bufs */
954			if (unlikely(num_tx_slave < slave_nb_pkts[i])) {
955				int slave_tx_fail_count = slave_nb_pkts[i] - num_tx_slave;
956
957				tx_fail_total += slave_tx_fail_count;
958				memcpy(&bufs[nb_pkts - tx_fail_total],
959						&slave_bufs[i][num_tx_slave],
960						slave_tx_fail_count * sizeof(bufs[0]));
961			}
962
963			num_tx_total += num_tx_slave;
964		}
965	}
966
967	return num_tx_total;
968}
969
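/* Transmit burst for mode 4 (802.3AD): slow-protocol frames queued by the
 * state machine are sent first; data packets are hashed only across slaves in
 * the DISTRIBUTING state. */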
970static uint16_t
971bond_ethdev_tx_burst_8023ad(void *queue, struct rte_mbuf **bufs,
972		uint16_t nb_pkts)
973{
974	struct bond_dev_private *internals;
975	struct bond_tx_queue *bd_tx_q;
976
977	uint8_t num_of_slaves;
978	uint8_t slaves[RTE_MAX_ETHPORTS];
979	/* positions in the slaves array, not port IDs */
980	uint8_t distributing_offsets[RTE_MAX_ETHPORTS];
981	uint8_t distributing_count;
982
983	uint16_t num_tx_slave, num_tx_total = 0, num_tx_fail_total = 0;
984	uint16_t i, j, op_slave_idx;
985	const uint16_t buffs_size = nb_pkts + BOND_MODE_8023AX_SLAVE_TX_PKTS + 1;
986
987	/* Allocate extra buffer space for the slow packets used in 802.3AD mode. */
988	struct rte_mbuf *slave_bufs[RTE_MAX_ETHPORTS][buffs_size];
989	void *slow_pkts[BOND_MODE_8023AX_SLAVE_TX_PKTS] = { NULL };
990
991	/* Total amount of packets in slave_bufs */
992	uint16_t slave_nb_pkts[RTE_MAX_ETHPORTS] = { 0 };
993	/* Slow packets placed in each slave */
994	uint8_t slave_slow_nb_pkts[RTE_MAX_ETHPORTS] = { 0 };
995
996	bd_tx_q = (struct bond_tx_queue *)queue;
997	internals = bd_tx_q->dev_private;
998
999	/* Copy slave list to protect against slave up/down changes during tx
1000	 * bursting */
1001	num_of_slaves = internals->active_slave_count;
1002	if (num_of_slaves < 1)
1003		return num_tx_total;
1004
1005	memcpy(slaves, internals->active_slaves, sizeof(slaves[0]) * num_of_slaves);
1006
1007	distributing_count = 0;
1008	for (i = 0; i < num_of_slaves; i++) {
1009		struct port *port = &mode_8023ad_ports[slaves[i]];
1010
1011		slave_slow_nb_pkts[i] = rte_ring_dequeue_burst(port->tx_ring,
1012				slow_pkts, BOND_MODE_8023AX_SLAVE_TX_PKTS);
1013		slave_nb_pkts[i] = slave_slow_nb_pkts[i];
1014
1015		for (j = 0; j < slave_slow_nb_pkts[i]; j++)
1016			slave_bufs[i][j] = slow_pkts[j];
1017
1018		if (ACTOR_STATE(port, DISTRIBUTING))
1019			distributing_offsets[distributing_count++] = i;
1020	}
1021
1022	if (likely(distributing_count > 0)) {
1023		/* Populate each slave's mbuf array with the packets to be sent on it */
1024		for (i = 0; i < nb_pkts; i++) {
1025			/* Select output slave using hash based on xmit policy */
1026			op_slave_idx = internals->xmit_hash(bufs[i], distributing_count);
1027
1028			/* Populate slave mbuf arrays with mbufs for that slave. Use only
1029			 * slaves that are currently distributing. */
1030			uint8_t slave_offset = distributing_offsets[op_slave_idx];
1031			slave_bufs[slave_offset][slave_nb_pkts[slave_offset]] = bufs[i];
1032			slave_nb_pkts[slave_offset]++;
1033		}
1034	}
1035
1036	/* Send packet burst on each slave device */
1037	for (i = 0; i < num_of_slaves; i++) {
1038		if (slave_nb_pkts[i] == 0)
1039			continue;
1040
1041		num_tx_slave = rte_eth_tx_burst(slaves[i], bd_tx_q->queue_id,
1042				slave_bufs[i], slave_nb_pkts[i]);
1043
1044		/* If tx burst fails drop slow packets */
1045		for ( ; num_tx_slave < slave_slow_nb_pkts[i]; num_tx_slave++)
1046			rte_pktmbuf_free(slave_bufs[i][num_tx_slave]);
1047
1048		num_tx_total += num_tx_slave - slave_slow_nb_pkts[i];
1049		num_tx_fail_total += slave_nb_pkts[i] - num_tx_slave;
1050
1051		/* If tx burst fails move packets to end of bufs */
1052		if (unlikely(num_tx_slave < slave_nb_pkts[i])) {
1053			uint16_t j = nb_pkts - num_tx_fail_total;
1054			for ( ; num_tx_slave < slave_nb_pkts[i]; j++, num_tx_slave++)
1055				bufs[j] = slave_bufs[i][num_tx_slave];
1056		}
1057	}
1058
1059	return num_tx_total;
1060}
1061
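/* Transmit burst for broadcast mode: every packet is sent on every active
 * slave, with mbuf reference counts bumped accordingly. */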
1062static uint16_t
1063bond_ethdev_tx_burst_broadcast(void *queue, struct rte_mbuf **bufs,
1064		uint16_t nb_pkts)
1065{
1066	struct bond_dev_private *internals;
1067	struct bond_tx_queue *bd_tx_q;
1068
1069	uint8_t tx_failed_flag = 0, num_of_slaves;
1070	uint8_t slaves[RTE_MAX_ETHPORTS];
1071
1072	uint16_t max_nb_of_tx_pkts = 0;
1073
1074	int slave_tx_total[RTE_MAX_ETHPORTS];
1075	int i, most_successful_tx_slave = -1;
1076
1077	bd_tx_q = (struct bond_tx_queue *)queue;
1078	internals = bd_tx_q->dev_private;
1079
1080	/* Copy slave list to protect against slave up/down changes during tx
1081	 * bursting */
1082	num_of_slaves = internals->active_slave_count;
1083	memcpy(slaves, internals->active_slaves,
1084			sizeof(internals->active_slaves[0]) * num_of_slaves);
1085
1086	if (num_of_slaves < 1)
1087		return 0;
1088
1089	/* Increment reference count on mbufs */
1090	for (i = 0; i < nb_pkts; i++)
1091		rte_mbuf_refcnt_update(bufs[i], num_of_slaves - 1);
1092
1093	/* Transmit burst on each active slave */
1094	for (i = 0; i < num_of_slaves; i++) {
1095		slave_tx_total[i] = rte_eth_tx_burst(slaves[i], bd_tx_q->queue_id,
1096					bufs, nb_pkts);
1097
1098		if (unlikely(slave_tx_total[i] < nb_pkts))
1099			tx_failed_flag = 1;
1100
1101		/* record the value and slave index for the slave which transmits the
1102		 * maximum number of packets */
1103		if (slave_tx_total[i] > max_nb_of_tx_pkts) {
1104			max_nb_of_tx_pkts = slave_tx_total[i];
1105			most_successful_tx_slave = i;
1106		}
1107	}
1108
1109	/* if slaves fail to transmit packets from burst, the calling application
1110	 * is not expected to know about multiple references to packets so we must
1111	 * handle failures of all packets except those of the most successful slave
1112	 */
1113	if (unlikely(tx_failed_flag))
1114		for (i = 0; i < num_of_slaves; i++)
1115			if (i != most_successful_tx_slave)
1116				while (slave_tx_total[i] < nb_pkts)
1117					rte_pktmbuf_free(bufs[slave_tx_total[i]++]);
1118
1119	return max_nb_of_tx_pkts;
1120}
1121
1122void
1123link_properties_set(struct rte_eth_dev *bonded_eth_dev,
1124		struct rte_eth_link *slave_dev_link)
1125{
1126	struct rte_eth_link *bonded_dev_link = &bonded_eth_dev->data->dev_link;
1127	struct bond_dev_private *internals = bonded_eth_dev->data->dev_private;
1128
1129	if (slave_dev_link->link_status &&
1130		bonded_eth_dev->data->dev_started) {
1131		bonded_dev_link->link_duplex = slave_dev_link->link_duplex;
1132		bonded_dev_link->link_speed = slave_dev_link->link_speed;
1133
1134		internals->link_props_set = 1;
1135	}
1136}
1137
1138void
1139link_properties_reset(struct rte_eth_dev *bonded_eth_dev)
1140{
1141	struct bond_dev_private *internals = bonded_eth_dev->data->dev_private;
1142
1143	memset(&(bonded_eth_dev->data->dev_link), 0,
1144			sizeof(bonded_eth_dev->data->dev_link));
1145
1146	internals->link_props_set = 0;
1147}
1148
1149int
1150link_properties_valid(struct rte_eth_link *bonded_dev_link,
1151		struct rte_eth_link *slave_dev_link)
1152{
1153	if (bonded_dev_link->link_duplex != slave_dev_link->link_duplex ||
1154		bonded_dev_link->link_speed !=  slave_dev_link->link_speed)
1155		return -1;
1156
1157	return 0;
1158}
1159
1160int
1161mac_address_get(struct rte_eth_dev *eth_dev, struct ether_addr *dst_mac_addr)
1162{
1163	struct ether_addr *mac_addr;
1164
1165	if (eth_dev == NULL) {
1166		RTE_LOG(ERR, PMD, "%s: NULL pointer eth_dev specified\n", __func__);
1167		return -1;
1168	}
1169
1170	if (dst_mac_addr == NULL) {
1171		RTE_LOG(ERR, PMD, "%s: NULL pointer MAC specified\n", __func__);
1172		return -1;
1173	}
1174
1175	mac_addr = eth_dev->data->mac_addrs;
1176
1177	ether_addr_copy(mac_addr, dst_mac_addr);
1178	return 0;
1179}
1180
1181int
1182mac_address_set(struct rte_eth_dev *eth_dev, struct ether_addr *new_mac_addr)
1183{
1184	struct ether_addr *mac_addr;
1185
1186	if (eth_dev == NULL) {
1187		RTE_BOND_LOG(ERR, "NULL pointer eth_dev specified");
1188		return -1;
1189	}
1190
1191	if (new_mac_addr == NULL) {
1192		RTE_BOND_LOG(ERR, "NULL pointer MAC specified");
1193		return -1;
1194	}
1195
1196	mac_addr = eth_dev->data->mac_addrs;
1197
1198	/* If the new MAC is different from the current MAC then update it */
1199	if (memcmp(mac_addr, new_mac_addr, sizeof(*mac_addr)) != 0)
1200		memcpy(mac_addr, new_mac_addr, sizeof(*mac_addr));
1201
1202	return 0;
1203}
1204
1205int
1206mac_address_slaves_update(struct rte_eth_dev *bonded_eth_dev)
1207{
1208	struct bond_dev_private *internals = bonded_eth_dev->data->dev_private;
1209	int i;
1210
1211	/* Update the slave devices' MAC addresses */
1212	if (internals->slave_count < 1)
1213		return -1;
1214
1215	switch (internals->mode) {
1216	case BONDING_MODE_ROUND_ROBIN:
1217	case BONDING_MODE_BALANCE:
1218	case BONDING_MODE_BROADCAST:
1219		for (i = 0; i < internals->slave_count; i++) {
1220			if (mac_address_set(&rte_eth_devices[internals->slaves[i].port_id],
1221					bonded_eth_dev->data->mac_addrs)) {
1222				RTE_BOND_LOG(ERR, "Failed to update port Id %d MAC address",
1223						internals->slaves[i].port_id);
1224				return -1;
1225			}
1226		}
1227		break;
1228	case BONDING_MODE_8023AD:
1229		bond_mode_8023ad_mac_address_update(bonded_eth_dev);
1230		break;
1231	case BONDING_MODE_ACTIVE_BACKUP:
1232	case BONDING_MODE_TLB:
1233	case BONDING_MODE_ALB:
1234	default:
1235		for (i = 0; i < internals->slave_count; i++) {
1236			if (internals->slaves[i].port_id ==
1237					internals->current_primary_port) {
1238				if (mac_address_set(&rte_eth_devices[internals->primary_port],
1239						bonded_eth_dev->data->mac_addrs)) {
1240					RTE_BOND_LOG(ERR, "Failed to update port Id %d MAC address",
1241							internals->current_primary_port);
1242					return -1;
1243				}
1244			} else {
1245				if (mac_address_set(
1246						&rte_eth_devices[internals->slaves[i].port_id],
1247						&internals->slaves[i].persisted_mac_addr)) {
1248					RTE_BOND_LOG(ERR, "Failed to update port Id %d MAC address",
1249							internals->slaves[i].port_id);
1250					return -1;
1251				}
1252			}
1253		}
1254	}
1255
1256	return 0;
1257}
1258
1259int
1260bond_ethdev_mode_set(struct rte_eth_dev *eth_dev, int mode)
1261{
1262	struct bond_dev_private *internals;
1263
1264	internals = eth_dev->data->dev_private;
1265
1266	switch (mode) {
1267	case BONDING_MODE_ROUND_ROBIN:
1268		eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_round_robin;
1269		eth_dev->rx_pkt_burst = bond_ethdev_rx_burst;
1270		break;
1271	case BONDING_MODE_ACTIVE_BACKUP:
1272		eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_active_backup;
1273		eth_dev->rx_pkt_burst = bond_ethdev_rx_burst_active_backup;
1274		break;
1275	case BONDING_MODE_BALANCE:
1276		eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_balance;
1277		eth_dev->rx_pkt_burst = bond_ethdev_rx_burst;
1278		break;
1279	case BONDING_MODE_BROADCAST:
1280		eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_broadcast;
1281		eth_dev->rx_pkt_burst = bond_ethdev_rx_burst;
1282		break;
1283	case BONDING_MODE_8023AD:
1284		if (bond_mode_8023ad_enable(eth_dev) != 0)
1285			return -1;
1286
1287		eth_dev->rx_pkt_burst = bond_ethdev_rx_burst_8023ad;
1288		eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_8023ad;
1289		RTE_LOG(WARNING, PMD,
1290				"Using mode 4, it is necessary to do TX burst and RX burst "
1291				"at least every 100ms.\n");
1292		break;
1293	case BONDING_MODE_TLB:
1294		eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_tlb;
1295		eth_dev->rx_pkt_burst = bond_ethdev_rx_burst_active_backup;
1296		break;
1297	case BONDING_MODE_ALB:
1298		if (bond_mode_alb_enable(eth_dev) != 0)
1299			return -1;
1300
1301		eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_alb;
1302		eth_dev->rx_pkt_burst = bond_ethdev_rx_burst_alb;
1303		break;
1304	default:
1305		return -1;
1306	}
1307
1308	internals->mode = mode;
1309
1310	return 0;
1311}
1312
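/* (Re)configure a slave port to match the bonded device: propagate RSS and
 * VLAN settings, create any missing Rx/Tx queues, start the port and sync the
 * RSS RETA if needed. */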
1313int
1314slave_configure(struct rte_eth_dev *bonded_eth_dev,
1315		struct rte_eth_dev *slave_eth_dev)
1316{
1317	struct bond_rx_queue *bd_rx_q;
1318	struct bond_tx_queue *bd_tx_q;
1319
1320	uint16_t old_nb_tx_queues = slave_eth_dev->data->nb_tx_queues;
1321	uint16_t old_nb_rx_queues = slave_eth_dev->data->nb_rx_queues;
1322	int errval;
1323	uint16_t q_id;
1324
1325	/* Stop slave */
1326	rte_eth_dev_stop(slave_eth_dev->data->port_id);
1327
1328	/* Enable interrupts on slave device if supported */
1329	if (slave_eth_dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC)
1330		slave_eth_dev->data->dev_conf.intr_conf.lsc = 1;
1331
1332	/* If RSS is enabled for bonding, try to enable it for slaves  */
1333	if (bonded_eth_dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS_FLAG) {
1334		if (bonded_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len
1335				!= 0) {
1336			slave_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len =
1337					bonded_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len;
1338			slave_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key =
1339					bonded_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key;
1340		} else {
1341			slave_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key = NULL;
1342		}
1343
1344		slave_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf =
1345				bonded_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf;
1346		slave_eth_dev->data->dev_conf.rxmode.mq_mode =
1347				bonded_eth_dev->data->dev_conf.rxmode.mq_mode;
1348	}
1349
1350	slave_eth_dev->data->dev_conf.rxmode.hw_vlan_filter =
1351			bonded_eth_dev->data->dev_conf.rxmode.hw_vlan_filter;
1352
1353	/* Configure device */
1354	errval = rte_eth_dev_configure(slave_eth_dev->data->port_id,
1355			bonded_eth_dev->data->nb_rx_queues,
1356			bonded_eth_dev->data->nb_tx_queues,
1357			&(slave_eth_dev->data->dev_conf));
1358	if (errval != 0) {
1359		RTE_BOND_LOG(ERR, "Cannot configure slave device: port %u , err (%d)",
1360				slave_eth_dev->data->port_id, errval);
1361		return errval;
1362	}
1363
1364	/* Setup Rx Queues */
1365	/* Use existing queues, if any */
1366	for (q_id = old_nb_rx_queues;
1367	     q_id < bonded_eth_dev->data->nb_rx_queues; q_id++) {
1368		bd_rx_q = (struct bond_rx_queue *)bonded_eth_dev->data->rx_queues[q_id];
1369
1370		errval = rte_eth_rx_queue_setup(slave_eth_dev->data->port_id, q_id,
1371				bd_rx_q->nb_rx_desc,
1372				rte_eth_dev_socket_id(slave_eth_dev->data->port_id),
1373				&(bd_rx_q->rx_conf), bd_rx_q->mb_pool);
1374		if (errval != 0) {
1375			RTE_BOND_LOG(ERR,
1376					"rte_eth_rx_queue_setup: port=%d queue_id %d, err (%d)",
1377					slave_eth_dev->data->port_id, q_id, errval);
1378			return errval;
1379		}
1380	}
1381
1382	/* Setup Tx Queues */
1383	/* Use existing queues, if any */
1384	for (q_id = old_nb_tx_queues;
1385	     q_id < bonded_eth_dev->data->nb_tx_queues; q_id++) {
1386		bd_tx_q = (struct bond_tx_queue *)bonded_eth_dev->data->tx_queues[q_id];
1387
1388		errval = rte_eth_tx_queue_setup(slave_eth_dev->data->port_id, q_id,
1389				bd_tx_q->nb_tx_desc,
1390				rte_eth_dev_socket_id(slave_eth_dev->data->port_id),
1391				&bd_tx_q->tx_conf);
1392		if (errval != 0) {
1393			RTE_BOND_LOG(ERR,
1394					"rte_eth_tx_queue_setup: port=%d queue_id %d, err (%d)",
1395					slave_eth_dev->data->port_id, q_id, errval);
1396			return errval;
1397		}
1398	}
1399
1400	/* Start device */
1401	errval = rte_eth_dev_start(slave_eth_dev->data->port_id);
1402	if (errval != 0) {
1403		RTE_BOND_LOG(ERR, "rte_eth_dev_start: port=%u, err (%d)",
1404				slave_eth_dev->data->port_id, errval);
1405		return -1;
1406	}
1407
1408	/* If RSS is enabled for bonding, synchronize RETA */
1409	if (bonded_eth_dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS) {
1410		int i;
1411		struct bond_dev_private *internals;
1412
1413		internals = bonded_eth_dev->data->dev_private;
1414
1415		for (i = 0; i < internals->slave_count; i++) {
1416			if (internals->slaves[i].port_id == slave_eth_dev->data->port_id) {
1417				errval = rte_eth_dev_rss_reta_update(
1418						slave_eth_dev->data->port_id,
1419						&internals->reta_conf[0],
1420						internals->slaves[i].reta_size);
1421				if (errval != 0) {
1422					RTE_LOG(WARNING, PMD,
1423							"rte_eth_dev_rss_reta_update on slave port %d fails (err %d)."
1424							" RSS Configuration for bonding may be inconsistent.\n",
1425							slave_eth_dev->data->port_id, errval);
1426				}
1427				break;
1428			}
1429		}
1430	}
1431
1432	/* If the lsc interrupt is enabled, check the slave's initial link status */
1433	if (slave_eth_dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC) {
1434		slave_eth_dev->dev_ops->link_update(slave_eth_dev, 0);
1435		bond_ethdev_lsc_event_callback(slave_eth_dev->data->port_id,
1436			RTE_ETH_EVENT_INTR_LSC, &bonded_eth_dev->data->port_id);
1437	}
1438
1439	return 0;
1440}
1441
1442void
1443slave_remove(struct bond_dev_private *internals,
1444		struct rte_eth_dev *slave_eth_dev)
1445{
1446	uint8_t i;
1447
1448	for (i = 0; i < internals->slave_count; i++)
1449		if (internals->slaves[i].port_id ==
1450				slave_eth_dev->data->port_id)
1451			break;
1452
1453	if (i < (internals->slave_count - 1))
1454		memmove(&internals->slaves[i], &internals->slaves[i + 1],
1455				sizeof(internals->slaves[0]) *
1456				(internals->slave_count - i - 1));
1457
1458	internals->slave_count--;
1459}
1460
1461static void
1462bond_ethdev_slave_link_status_change_monitor(void *cb_arg);
1463
1464void
1465slave_add(struct bond_dev_private *internals,
1466		struct rte_eth_dev *slave_eth_dev)
1467{
1468	struct bond_slave_details *slave_details =
1469			&internals->slaves[internals->slave_count];
1470
1471	slave_details->port_id = slave_eth_dev->data->port_id;
1472	slave_details->last_link_status = 0;
1473
1474	/* Mark slave devices that don't support interrupts so we can
1475	 * compensate when we start the bond
1476	 */
1477	if (!(slave_eth_dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC)) {
1478		slave_details->link_status_poll_enabled = 1;
1479	}
1480
1481	slave_details->link_status_wait_to_complete = 0;
1482	/* Record the slave's original MAC address so it can be reapplied later */
1483	memcpy(&(slave_details->persisted_mac_addr), slave_eth_dev->data->mac_addrs,
1484			sizeof(struct ether_addr));
1485}
1486
1487void
1488bond_ethdev_primary_set(struct bond_dev_private *internals,
1489		uint8_t slave_port_id)
1490{
1491	int i;
1492
1493	if (internals->active_slave_count < 1)
1494		internals->current_primary_port = slave_port_id;
1495	else
1496		/* Search bonded device slave ports for new proposed primary port */
1497		for (i = 0; i < internals->active_slave_count; i++) {
1498			if (internals->active_slaves[i] == slave_port_id)
1499				internals->current_primary_port = slave_port_id;
1500		}
1501}
1502
1503static void
1504bond_ethdev_promiscuous_enable(struct rte_eth_dev *eth_dev);
1505
1506static int
1507bond_ethdev_start(struct rte_eth_dev *eth_dev)
1508{
1509	struct bond_dev_private *internals;
1510	int i;
1511
1512	/* slave eth dev will be started by bonded device */
1513	if (check_for_bonded_ethdev(eth_dev)) {
1514		RTE_BOND_LOG(ERR, "User tried to explicitly start a slave eth_dev (%d)",
1515				eth_dev->data->port_id);
1516		return -1;
1517	}
1518
1519	eth_dev->data->dev_link.link_status = ETH_LINK_DOWN;
1520	eth_dev->data->dev_started = 1;
1521
1522	internals = eth_dev->data->dev_private;
1523
1524	if (internals->slave_count == 0) {
1525		RTE_BOND_LOG(ERR, "Cannot start port since there are no slave devices");
1526		return -1;
1527	}
1528
1529	if (internals->user_defined_mac == 0) {
1530		struct ether_addr *new_mac_addr = NULL;
1531
1532		for (i = 0; i < internals->slave_count; i++)
1533			if (internals->slaves[i].port_id == internals->primary_port)
1534				new_mac_addr = &internals->slaves[i].persisted_mac_addr;
1535
1536		if (new_mac_addr == NULL)
1537			return -1;
1538
1539		if (mac_address_set(eth_dev, new_mac_addr) != 0) {
1540			RTE_BOND_LOG(ERR, "bonded port (%d) failed to update MAC address",
1541					eth_dev->data->port_id);
1542			return -1;
1543		}
1544	}
1545
1546	/* Update all slave devices' MAC addresses */
1547	if (mac_address_slaves_update(eth_dev) != 0)
1548		return -1;
1549
1550	/* If the bonded device is configured in promiscuous mode then re-apply the config */
1551	if (internals->promiscuous_en)
1552		bond_ethdev_promiscuous_enable(eth_dev);
1553
1554	/* Reconfigure each slave device if starting bonded device */
1555	for (i = 0; i < internals->slave_count; i++) {
1556		if (slave_configure(eth_dev,
1557				&(rte_eth_devices[internals->slaves[i].port_id])) != 0) {
1558			RTE_BOND_LOG(ERR,
1559					"bonded port (%d) failed to reconfigure slave device (%d)",
1560					eth_dev->data->port_id, internals->slaves[i].port_id);
1561			return -1;
1562		}
1563		/* We will need to poll for link status if any slave doesn't
1564		 * support interrupts
1565		 */
1566		if (internals->slaves[i].link_status_poll_enabled)
1567			internals->link_status_polling_enabled = 1;
1568	}
1569	/* start polling if needed */
1570	if (internals->link_status_polling_enabled) {
1571		rte_eal_alarm_set(
1572			internals->link_status_polling_interval_ms * 1000,
1573			bond_ethdev_slave_link_status_change_monitor,
1574			(void *)&rte_eth_devices[internals->port_id]);
1575	}
1576
1577	if (internals->user_defined_primary_port)
1578		bond_ethdev_primary_set(internals, internals->primary_port);
1579
1580	if (internals->mode == BONDING_MODE_8023AD)
1581		bond_mode_8023ad_start(eth_dev);
1582
1583	if (internals->mode == BONDING_MODE_TLB ||
1584			internals->mode == BONDING_MODE_ALB)
1585		bond_tlb_enable(internals);
1586
1587	return 0;
1588}
1589
1590static void
1591bond_ethdev_free_queues(struct rte_eth_dev *dev)
1592{
1593	uint8_t i;
1594
1595	if (dev->data->rx_queues != NULL) {
1596		for (i = 0; i < dev->data->nb_rx_queues; i++) {
1597			rte_free(dev->data->rx_queues[i]);
1598			dev->data->rx_queues[i] = NULL;
1599		}
1600		dev->data->nb_rx_queues = 0;
1601	}
1602
1603	if (dev->data->tx_queues != NULL) {
1604		for (i = 0; i < dev->data->nb_tx_queues; i++) {
1605			rte_free(dev->data->tx_queues[i]);
1606			dev->data->tx_queues[i] = NULL;
1607		}
1608		dev->data->nb_tx_queues = 0;
1609	}
1610}
1611
1612void
1613bond_ethdev_stop(struct rte_eth_dev *eth_dev)
1614{
1615	struct bond_dev_private *internals = eth_dev->data->dev_private;
1616	uint8_t i;
1617
1618	if (internals->mode == BONDING_MODE_8023AD) {
1619		struct port *port;
1620		void *pkt = NULL;
1621
1622		bond_mode_8023ad_stop(eth_dev);
1623
1624		/* Discard all messages to/from mode 4 state machines */
1625		for (i = 0; i < internals->active_slave_count; i++) {
1626			port = &mode_8023ad_ports[internals->active_slaves[i]];
1627
1628			RTE_ASSERT(port->rx_ring != NULL);
1629			while (rte_ring_dequeue(port->rx_ring, &pkt) != -ENOENT)
1630				rte_pktmbuf_free(pkt);
1631
1632			RTE_ASSERT(port->tx_ring != NULL);
1633			while (rte_ring_dequeue(port->tx_ring, &pkt) != -ENOENT)
1634				rte_pktmbuf_free(pkt);
1635		}
1636	}
1637
1638	if (internals->mode == BONDING_MODE_TLB ||
1639			internals->mode == BONDING_MODE_ALB) {
1640		bond_tlb_disable(internals);
1641		for (i = 0; i < internals->active_slave_count; i++)
1642			tlb_last_obytets[internals->active_slaves[i]] = 0;
1643	}
1644
1645	internals->active_slave_count = 0;
1646	internals->link_status_polling_enabled = 0;
1647	for (i = 0; i < internals->slave_count; i++)
1648		internals->slaves[i].last_link_status = 0;
1649
1650	eth_dev->data->dev_link.link_status = ETH_LINK_DOWN;
1651	eth_dev->data->dev_started = 0;
1652}
1653
1654void
1655bond_ethdev_close(struct rte_eth_dev *dev)
1656{
1657	struct bond_dev_private *internals = dev->data->dev_private;
1658
1659	bond_ethdev_free_queues(dev);
1660	rte_bitmap_reset(internals->vlan_filter_bmp);
1661}
1662
1663/* forward declaration */
1664static int bond_ethdev_configure(struct rte_eth_dev *dev);
1665
1666static void
1667bond_ethdev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
1668{
1669	struct bond_dev_private *internals = dev->data->dev_private;
1670
1671	dev_info->max_mac_addrs = 1;
1672
1673	dev_info->max_rx_pktlen = internals->candidate_max_rx_pktlen
1674				  ? internals->candidate_max_rx_pktlen
1675				  : ETHER_MAX_JUMBO_FRAME_LEN;
1676
1677	dev_info->max_rx_queues = (uint16_t)128;
1678	dev_info->max_tx_queues = (uint16_t)512;
1679
1680	dev_info->min_rx_bufsize = 0;
1681	dev_info->pci_dev = NULL;
1682
1683	dev_info->rx_offload_capa = internals->rx_offload_capa;
1684	dev_info->tx_offload_capa = internals->tx_offload_capa;
1685	dev_info->flow_type_rss_offloads = internals->flow_type_rss_offloads;
1686
1687	dev_info->reta_size = internals->reta_size;
1688}
1689
1690static int
1691bond_ethdev_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on)
1692{
1693	int res;
1694	uint8_t i;
1695	struct bond_dev_private *internals = dev->data->dev_private;
1696
1697	/* don't do this while a slave is being added */
1698	rte_spinlock_lock(&internals->lock);
1699
1700	if (on)
1701		rte_bitmap_set(internals->vlan_filter_bmp, vlan_id);
1702	else
1703		rte_bitmap_clear(internals->vlan_filter_bmp, vlan_id);
1704
1705	for (i = 0; i < internals->slave_count; i++) {
1706		uint8_t port_id = internals->slaves[i].port_id;
1707
1708		res = rte_eth_dev_vlan_filter(port_id, vlan_id, on);
1709		if (res == ENOTSUP)
1710			RTE_LOG(WARNING, PMD,
1711				"Setting VLAN filter on slave port %u not supported.\n",
1712				port_id);
1713	}
1714
1715	rte_spinlock_unlock(&internals->lock);
1716	return 0;
1717}
1718
1719static int
1720bond_ethdev_rx_queue_setup(struct rte_eth_dev *dev, uint16_t rx_queue_id,
1721		uint16_t nb_rx_desc, unsigned int socket_id __rte_unused,
1722		const struct rte_eth_rxconf *rx_conf, struct rte_mempool *mb_pool)
1723{
1724	struct bond_rx_queue *bd_rx_q = (struct bond_rx_queue *)
1725			rte_zmalloc_socket(NULL, sizeof(struct bond_rx_queue),
1726					0, dev->data->numa_node);
1727	if (bd_rx_q == NULL)
1728		return -1;
1729
1730	bd_rx_q->queue_id = rx_queue_id;
1731	bd_rx_q->dev_private = dev->data->dev_private;
1732
1733	bd_rx_q->nb_rx_desc = nb_rx_desc;
1734
1735	memcpy(&(bd_rx_q->rx_conf), rx_conf, sizeof(struct rte_eth_rxconf));
1736	bd_rx_q->mb_pool = mb_pool;
1737
1738	dev->data->rx_queues[rx_queue_id] = bd_rx_q;
1739
1740	return 0;
1741}
1742
1743static int
1744bond_ethdev_tx_queue_setup(struct rte_eth_dev *dev, uint16_t tx_queue_id,
1745		uint16_t nb_tx_desc, unsigned int socket_id __rte_unused,
1746		const struct rte_eth_txconf *tx_conf)
1747{
1748	struct bond_tx_queue *bd_tx_q  = (struct bond_tx_queue *)
1749			rte_zmalloc_socket(NULL, sizeof(struct bond_tx_queue),
1750					0, dev->data->numa_node);
1751
1752	if (bd_tx_q == NULL)
1753		return -1;
1754
1755	bd_tx_q->queue_id = tx_queue_id;
1756	bd_tx_q->dev_private = dev->data->dev_private;
1757
1758	bd_tx_q->nb_tx_desc = nb_tx_desc;
1759	memcpy(&(bd_tx_q->tx_conf), tx_conf, sizeof(bd_tx_q->tx_conf));
1760
1761	dev->data->tx_queues[tx_queue_id] = bd_tx_q;
1762
1763	return 0;
1764}
1765
1766static void
1767bond_ethdev_rx_queue_release(void *queue)
1768{
1769	if (queue == NULL)
1770		return;
1771
1772	rte_free(queue);
1773}
1774
1775static void
1776bond_ethdev_tx_queue_release(void *queue)
1777{
1778	if (queue == NULL)
1779		return;
1780
1781	rte_free(queue);
1782}
1783
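/*
 * Alarm callback used for link status polling.  When the internals lock
 * can be taken (i.e. the device is not being configured) it polls the
 * link status of every slave with polling enabled, invokes the LSC
 * handler for slaves whose status changed, and re-arms itself every
 * link_status_polling_interval_ms while at least one slave is polled.
 */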
1784static void
1785bond_ethdev_slave_link_status_change_monitor(void *cb_arg)
1786{
1787	struct rte_eth_dev *bonded_ethdev, *slave_ethdev;
1788	struct bond_dev_private *internals;
1789
1790	/* Default value for polling slave found is true as we don't want to
1791	 * disable the polling thread if we cannot get the lock */
1792	int i, polling_slave_found = 1;
1793
1794	if (cb_arg == NULL)
1795		return;
1796
1797	bonded_ethdev = (struct rte_eth_dev *)cb_arg;
1798	internals = (struct bond_dev_private *)bonded_ethdev->data->dev_private;
1799
1800	if (!bonded_ethdev->data->dev_started ||
1801		!internals->link_status_polling_enabled)
1802		return;
1803
1804	/* If device is currently being configured then don't check slaves link
1805	 * status, wait until next period */
1806	if (rte_spinlock_trylock(&internals->lock)) {
1807		if (internals->slave_count > 0)
1808			polling_slave_found = 0;
1809
1810		for (i = 0; i < internals->slave_count; i++) {
1811			if (!internals->slaves[i].link_status_poll_enabled)
1812				continue;
1813
1814			slave_ethdev = &rte_eth_devices[internals->slaves[i].port_id];
1815			polling_slave_found = 1;
1816
1817			/* Update slave link status */
1818			(*slave_ethdev->dev_ops->link_update)(slave_ethdev,
1819					internals->slaves[i].link_status_wait_to_complete);
1820
1821			/* if link status has changed since last checked then call lsc
1822			 * event callback */
1823			if (slave_ethdev->data->dev_link.link_status !=
1824					internals->slaves[i].last_link_status) {
1825				internals->slaves[i].last_link_status =
1826						slave_ethdev->data->dev_link.link_status;
1827
1828				bond_ethdev_lsc_event_callback(internals->slaves[i].port_id,
1829						RTE_ETH_EVENT_INTR_LSC,
1830						&bonded_ethdev->data->port_id);
1831			}
1832		}
1833		rte_spinlock_unlock(&internals->lock);
1834	}
1835
1836	if (polling_slave_found)
1837		/* Set alarm to continue monitoring link status of slave ethdevs */
1838		rte_eal_alarm_set(internals->link_status_polling_interval_ms * 1000,
1839				bond_ethdev_slave_link_status_change_monitor, cb_arg);
1840}
1841
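/* The bonded device reports link up as soon as any active slave is up. */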
1842static int
1843bond_ethdev_link_update(struct rte_eth_dev *bonded_eth_dev,
1844		int wait_to_complete)
1845{
1846	struct bond_dev_private *internals = bonded_eth_dev->data->dev_private;
1847
1848	if (!bonded_eth_dev->data->dev_started ||
1849		internals->active_slave_count == 0) {
1850		bonded_eth_dev->data->dev_link.link_status = ETH_LINK_DOWN;
1851		return 0;
1852	} else {
1853		struct rte_eth_dev *slave_eth_dev;
1854		int i, link_up = 0;
1855
1856		for (i = 0; i < internals->active_slave_count; i++) {
1857			slave_eth_dev = &rte_eth_devices[internals->active_slaves[i]];
1858
1859			(*slave_eth_dev->dev_ops->link_update)(slave_eth_dev,
1860					wait_to_complete);
1861			if (slave_eth_dev->data->dev_link.link_status == ETH_LINK_UP) {
1862				link_up = 1;
1863				break;
1864			}
1865		}
1866
1867		bonded_eth_dev->data->dev_link.link_status = link_up;
1868	}
1869
1870	return 0;
1871}
1872
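/* Sum the basic and per-queue statistics of all slaves into *stats. */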
1873static void
1874bond_ethdev_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
1875{
1876	struct bond_dev_private *internals = dev->data->dev_private;
1877	struct rte_eth_stats slave_stats;
1878	int i, j;
1879
1880	for (i = 0; i < internals->slave_count; i++) {
1881		rte_eth_stats_get(internals->slaves[i].port_id, &slave_stats);
1882
1883		stats->ipackets += slave_stats.ipackets;
1884		stats->opackets += slave_stats.opackets;
1885		stats->ibytes += slave_stats.ibytes;
1886		stats->obytes += slave_stats.obytes;
1887		stats->imissed += slave_stats.imissed;
1888		stats->ierrors += slave_stats.ierrors;
1889		stats->oerrors += slave_stats.oerrors;
1890		stats->rx_nombuf += slave_stats.rx_nombuf;
1891
1892		for (j = 0; j < RTE_ETHDEV_QUEUE_STAT_CNTRS; j++) {
1893			stats->q_ipackets[j] += slave_stats.q_ipackets[j];
1894			stats->q_opackets[j] += slave_stats.q_opackets[j];
1895			stats->q_ibytes[j] += slave_stats.q_ibytes[j];
1896			stats->q_obytes[j] += slave_stats.q_obytes[j];
1897			stats->q_errors[j] += slave_stats.q_errors[j];
1898		}
1899
1900	}
1901}
1902
1903static void
1904bond_ethdev_stats_reset(struct rte_eth_dev *dev)
1905{
1906	struct bond_dev_private *internals = dev->data->dev_private;
1907	int i;
1908
1909	for (i = 0; i < internals->slave_count; i++)
1910		rte_eth_stats_reset(internals->slaves[i].port_id);
1911}
1912
1913static void
1914bond_ethdev_promiscuous_enable(struct rte_eth_dev *eth_dev)
1915{
1916	struct bond_dev_private *internals = eth_dev->data->dev_private;
1917	int i;
1918
1919	internals->promiscuous_en = 1;
1920
1921	switch (internals->mode) {
1922	/* Promiscuous mode is propagated to all slaves */
1923	case BONDING_MODE_ROUND_ROBIN:
1924	case BONDING_MODE_BALANCE:
1925	case BONDING_MODE_BROADCAST:
1926		for (i = 0; i < internals->slave_count; i++)
1927			rte_eth_promiscuous_enable(internals->slaves[i].port_id);
1928		break;
1929	/* In mode 4 promiscuous mode is managed when a slave is added/removed */
1930	case BONDING_MODE_8023AD:
1931		break;
1932	/* Promiscuous mode is propagated only to primary slave */
1933	case BONDING_MODE_ACTIVE_BACKUP:
1934	case BONDING_MODE_TLB:
1935	case BONDING_MODE_ALB:
1936	default:
1937		rte_eth_promiscuous_enable(internals->current_primary_port);
1938	}
1939}
1940
1941static void
1942bond_ethdev_promiscuous_disable(struct rte_eth_dev *dev)
1943{
1944	struct bond_dev_private *internals = dev->data->dev_private;
1945	int i;
1946
1947	internals->promiscuous_en = 0;
1948
1949	switch (internals->mode) {
1950	/* Promiscuous mode is propagated to all slaves */
1951	case BONDING_MODE_ROUND_ROBIN:
1952	case BONDING_MODE_BALANCE:
1953	case BONDING_MODE_BROADCAST:
1954		for (i = 0; i < internals->slave_count; i++)
1955			rte_eth_promiscuous_disable(internals->slaves[i].port_id);
1956		break;
1957	/* In mode 4 promiscuous mode is managed when a slave is added/removed */
1958	case BONDING_MODE_8023AD:
1959		break;
1960	/* Promiscuous mode is propagated only to primary slave */
1961	case BONDING_MODE_ACTIVE_BACKUP:
1962	case BONDING_MODE_TLB:
1963	case BONDING_MODE_ALB:
1964	default:
1965		rte_eth_promiscuous_disable(internals->current_primary_port);
1966	}
1967}
1968
1969static void
1970bond_ethdev_delayed_lsc_propagation(void *arg)
1971{
1972	if (arg == NULL)
1973		return;
1974
1975	_rte_eth_dev_callback_process((struct rte_eth_dev *)arg,
1976			RTE_ETH_EVENT_INTR_LSC, NULL);
1977}
1978
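/*
 * Handle a link status change on a slave port.  On link up the slave is
 * activated (becoming primary if it is the first active slave or the
 * user-nominated primary); on link down it is deactivated and a new
 * primary is chosen.  When the bonded link status itself changes, the LSC
 * event is propagated to the application, optionally delayed by
 * link_up_delay_ms / link_down_delay_ms.
 */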
1979void
1980bond_ethdev_lsc_event_callback(uint8_t port_id, enum rte_eth_event_type type,
1981		void *param)
1982{
1983	struct rte_eth_dev *bonded_eth_dev, *slave_eth_dev;
1984	struct bond_dev_private *internals;
1985	struct rte_eth_link link;
1986
1987	int i, valid_slave = 0;
1988	uint8_t active_pos;
1989	uint8_t lsc_flag = 0;
1990
1991	if (type != RTE_ETH_EVENT_INTR_LSC || param == NULL)
1992		return;
1993
1994	bonded_eth_dev = &rte_eth_devices[*(uint8_t *)param];
1995	slave_eth_dev = &rte_eth_devices[port_id];
1996
1997	if (check_for_bonded_ethdev(bonded_eth_dev))
1998		return;
1999
2000	internals = bonded_eth_dev->data->dev_private;
2001
2002	/* If the device isn't started don't handle interrupts */
2003	if (!bonded_eth_dev->data->dev_started)
2004		return;
2005
2006	/* verify that port_id is a valid slave of bonded port */
2007	for (i = 0; i < internals->slave_count; i++) {
2008		if (internals->slaves[i].port_id == port_id) {
2009			valid_slave = 1;
2010			break;
2011		}
2012	}
2013
2014	if (!valid_slave)
2015		return;
2016
2017	/* Search for port in active port list */
2018	active_pos = find_slave_by_id(internals->active_slaves,
2019			internals->active_slave_count, port_id);
2020
2021	rte_eth_link_get_nowait(port_id, &link);
2022	if (link.link_status) {
2023		if (active_pos < internals->active_slave_count)
2024			return;
2025
2026		/* if no active slave ports then set this port to be primary port */
2027		if (internals->active_slave_count < 1) {
2028			/* If first active slave, then change link status */
2029			bonded_eth_dev->data->dev_link.link_status = ETH_LINK_UP;
2030			internals->current_primary_port = port_id;
2031			lsc_flag = 1;
2032
2033			mac_address_slaves_update(bonded_eth_dev);
2034
2035			/* Inherit eth dev link properties from first active slave */
2036			link_properties_set(bonded_eth_dev,
2037					&(slave_eth_dev->data->dev_link));
2038		} else {
2039			if (link_properties_valid(
2040				&bonded_eth_dev->data->dev_link, &link) != 0) {
2041				slave_eth_dev->data->dev_flags &=
2042					(~RTE_ETH_DEV_BONDED_SLAVE);
2043				RTE_LOG(ERR, PMD,
2044					"port %u invalid speed/duplex\n",
2045					port_id);
2046				return;
2047			}
2048		}
2049
2050		activate_slave(bonded_eth_dev, port_id);
2051
2052		/* If this slave is the user-defined primary port then make it primary again */
2053		if (internals->user_defined_primary_port &&
2054				internals->primary_port == port_id)
2055			bond_ethdev_primary_set(internals, port_id);
2056	} else {
2057		if (active_pos == internals->active_slave_count)
2058			return;
2059
2060		/* Remove from active slave list */
2061		deactivate_slave(bonded_eth_dev, port_id);
2062
2063		/* No active slaves, change link status to down and reset other
2064		 * link properties */
2065		if (internals->active_slave_count < 1) {
2066			lsc_flag = 1;
2067			bonded_eth_dev->data->dev_link.link_status = ETH_LINK_DOWN;
2068
2069			link_properties_reset(bonded_eth_dev);
2070		}
2071
2072		/* Update primary id, take first active slave from list or if none
2073		 * available fall back to the configured primary port */
2074		if (port_id == internals->current_primary_port) {
2075			if (internals->active_slave_count > 0)
2076				bond_ethdev_primary_set(internals,
2077						internals->active_slaves[0]);
2078			else
2079				internals->current_primary_port = internals->primary_port;
2080		}
2081	}
2082
2083	if (lsc_flag) {
2084		/* Cancel any possible outstanding interrupts if delays are enabled */
2085		if (internals->link_up_delay_ms > 0 ||
2086			internals->link_down_delay_ms > 0)
2087			rte_eal_alarm_cancel(bond_ethdev_delayed_lsc_propagation,
2088					bonded_eth_dev);
2089
2090		if (bonded_eth_dev->data->dev_link.link_status) {
2091			if (internals->link_up_delay_ms > 0)
2092				rte_eal_alarm_set(internals->link_up_delay_ms * 1000,
2093						bond_ethdev_delayed_lsc_propagation,
2094						(void *)bonded_eth_dev);
2095			else
2096				_rte_eth_dev_callback_process(bonded_eth_dev,
2097						RTE_ETH_EVENT_INTR_LSC, NULL);
2098
2099		} else {
2100			if (internals->link_down_delay_ms > 0)
2101				rte_eal_alarm_set(internals->link_down_delay_ms * 1000,
2102						bond_ethdev_delayed_lsc_propagation,
2103						(void *)bonded_eth_dev);
2104			else
2105				_rte_eth_dev_callback_process(bonded_eth_dev,
2106						RTE_ETH_EVENT_INTR_LSC, NULL);
2107		}
2108	}
2109}
2110
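/*
 * Update the RSS redirection table.  The caller's table must match the
 * bonded device's reta_size; it is stored, replicated across the rest of
 * the internal table and then pushed to every slave using that slave's
 * own reta_size.
 */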
2111static int
2112bond_ethdev_rss_reta_update(struct rte_eth_dev *dev,
2113		struct rte_eth_rss_reta_entry64 *reta_conf, uint16_t reta_size)
2114{
2115	unsigned i, j;
2116	int result = 0;
2117	int slave_reta_size;
2118	unsigned reta_count;
2119	struct bond_dev_private *internals = dev->data->dev_private;
2120
2121	if (reta_size != internals->reta_size)
2122		return -EINVAL;
2123
2124	 /* Copy RETA table */
2125	reta_count = reta_size / RTE_RETA_GROUP_SIZE;
2126
2127	for (i = 0; i < reta_count; i++) {
2128		internals->reta_conf[i].mask = reta_conf[i].mask;
2129		for (j = 0; j < RTE_RETA_GROUP_SIZE; j++)
2130			if ((reta_conf[i].mask >> j) & 0x01)
2131				internals->reta_conf[i].reta[j] = reta_conf[i].reta[j];
2132	}
2133
2134	/* Fill rest of array */
2135	for (; i < RTE_DIM(internals->reta_conf); i += reta_count)
2136		memcpy(&internals->reta_conf[i], &internals->reta_conf[0],
2137				sizeof(internals->reta_conf[0]) * reta_count);
2138
2139	/* Propagate RETA over slaves */
2140	for (i = 0; i < internals->slave_count; i++) {
2141		slave_reta_size = internals->slaves[i].reta_size;
2142		result = rte_eth_dev_rss_reta_update(internals->slaves[i].port_id,
2143				&internals->reta_conf[0], slave_reta_size);
2144		if (result < 0)
2145			return result;
2146	}
2147
2148	return 0;
2149}
2150
2151static int
2152bond_ethdev_rss_reta_query(struct rte_eth_dev *dev,
2153		struct rte_eth_rss_reta_entry64 *reta_conf, uint16_t reta_size)
2154{
2155	int i, j;
2156	struct bond_dev_private *internals = dev->data->dev_private;
2157
2158	if (reta_size != internals->reta_size)
2159		return -EINVAL;
2160
2161	 /* Copy RETA table */
2162	for (i = 0; i < reta_size / RTE_RETA_GROUP_SIZE; i++)
2163		for (j = 0; j < RTE_RETA_GROUP_SIZE; j++)
2164			if ((reta_conf[i].mask >> j) & 0x01)
2165				reta_conf[i].reta[j] = internals->reta_conf[i].reta[j];
2166
2167	return 0;
2168}
2169
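/*
 * Update the RSS hash configuration.  The requested hash functions are
 * masked against the RSS offloads the bonded device advertises, the key
 * (when supplied and small enough) is stored, and the resulting
 * configuration is applied to every slave.
 */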
2170static int
2171bond_ethdev_rss_hash_update(struct rte_eth_dev *dev,
2172		struct rte_eth_rss_conf *rss_conf)
2173{
2174	int i, result = 0;
2175	struct bond_dev_private *internals = dev->data->dev_private;
2176	struct rte_eth_rss_conf bond_rss_conf;
2177
2178	memcpy(&bond_rss_conf, rss_conf, sizeof(struct rte_eth_rss_conf));
2179
2180	bond_rss_conf.rss_hf &= internals->flow_type_rss_offloads;
2181
2182	if (bond_rss_conf.rss_hf != 0)
2183		dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf = bond_rss_conf.rss_hf;
2184
2185	if (bond_rss_conf.rss_key && bond_rss_conf.rss_key_len <
2186			sizeof(internals->rss_key)) {
2187		if (bond_rss_conf.rss_key_len == 0)
2188			bond_rss_conf.rss_key_len = 40;
2189		internals->rss_key_len = bond_rss_conf.rss_key_len;
2190		memcpy(internals->rss_key, bond_rss_conf.rss_key,
2191				internals->rss_key_len);
2192	}
2193
2194	for (i = 0; i < internals->slave_count; i++) {
2195		result = rte_eth_dev_rss_hash_update(internals->slaves[i].port_id,
2196				&bond_rss_conf);
2197		if (result < 0)
2198			return result;
2199	}
2200
2201	return 0;
2202}
2203
2204static int
2205bond_ethdev_rss_hash_conf_get(struct rte_eth_dev *dev,
2206		struct rte_eth_rss_conf *rss_conf)
2207{
2208	struct bond_dev_private *internals = dev->data->dev_private;
2209
2210	rss_conf->rss_hf = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf;
2211	rss_conf->rss_key_len = internals->rss_key_len;
2212	if (rss_conf->rss_key)
2213		memcpy(rss_conf->rss_key, internals->rss_key, internals->rss_key_len);
2214
2215	return 0;
2216}
2217
2218const struct eth_dev_ops default_dev_ops = {
2219	.dev_start            = bond_ethdev_start,
2220	.dev_stop             = bond_ethdev_stop,
2221	.dev_close            = bond_ethdev_close,
2222	.dev_configure        = bond_ethdev_configure,
2223	.dev_infos_get        = bond_ethdev_info,
2224	.vlan_filter_set      = bond_ethdev_vlan_filter_set,
2225	.rx_queue_setup       = bond_ethdev_rx_queue_setup,
2226	.tx_queue_setup       = bond_ethdev_tx_queue_setup,
2227	.rx_queue_release     = bond_ethdev_rx_queue_release,
2228	.tx_queue_release     = bond_ethdev_tx_queue_release,
2229	.link_update          = bond_ethdev_link_update,
2230	.stats_get            = bond_ethdev_stats_get,
2231	.stats_reset          = bond_ethdev_stats_reset,
2232	.promiscuous_enable   = bond_ethdev_promiscuous_enable,
2233	.promiscuous_disable  = bond_ethdev_promiscuous_disable,
2234	.reta_update          = bond_ethdev_rss_reta_update,
2235	.reta_query           = bond_ethdev_rss_reta_query,
2236	.rss_hash_update      = bond_ethdev_rss_hash_update,
2237	.rss_hash_conf_get    = bond_ethdev_rss_hash_conf_get
2238};
2239
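/*
 * Probe callback of the bonding vdev driver: parse the mandatory bonding
 * mode and the optional socket id from the device arguments, create the
 * bonded ethdev and keep the remaining kvargs so slaves and other options
 * can be applied later in bond_ethdev_configure().
 */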
2240static int
2241bond_probe(const char *name, const char *params)
2242{
2243	struct bond_dev_private *internals;
2244	struct rte_kvargs *kvlist;
2245	uint8_t bonding_mode, socket_id;
2246	int  arg_count, port_id;
2247
2248	RTE_LOG(INFO, EAL, "Initializing pmd_bond for %s\n", name);
2249
2250	kvlist = rte_kvargs_parse(params, pmd_bond_init_valid_arguments);
2251	if (kvlist == NULL)
2252		return -1;
2253
2254	/* Parse link bonding mode */
2255	if (rte_kvargs_count(kvlist, PMD_BOND_MODE_KVARG) == 1) {
2256		if (rte_kvargs_process(kvlist, PMD_BOND_MODE_KVARG,
2257				&bond_ethdev_parse_slave_mode_kvarg,
2258				&bonding_mode) != 0) {
2259			RTE_LOG(ERR, EAL, "Invalid mode for bonded device %s\n",
2260					name);
2261			goto parse_error;
2262		}
2263	} else {
2264		RTE_LOG(ERR, EAL, "Mode must be specified exactly once for bonded "
2265				"device %s\n", name);
2266		goto parse_error;
2267	}
2268
2269	/* Parse socket id to create bonding device on */
2270	arg_count = rte_kvargs_count(kvlist, PMD_BOND_SOCKET_ID_KVARG);
2271	if (arg_count == 1) {
2272		if (rte_kvargs_process(kvlist, PMD_BOND_SOCKET_ID_KVARG,
2273				&bond_ethdev_parse_socket_id_kvarg, &socket_id)
2274				!= 0) {
2275			RTE_LOG(ERR, EAL, "Invalid socket Id specified for "
2276					"bonded device %s\n", name);
2277			goto parse_error;
2278		}
2279	} else if (arg_count > 1) {
2280		RTE_LOG(ERR, EAL, "Socket Id can be specified only once for "
2281				"bonded device %s\n", name);
2282		goto parse_error;
2283	} else {
2284		socket_id = rte_socket_id();
2285	}
2286
2287	/* Create link bonding eth device */
2288	port_id = rte_eth_bond_create(name, bonding_mode, socket_id);
2289	if (port_id < 0) {
2290		RTE_LOG(ERR, EAL, "Failed to create bonded device %s in mode %u on "
2291				"socket %u.\n",	name, bonding_mode, socket_id);
2292		goto parse_error;
2293	}
2294	internals = rte_eth_devices[port_id].data->dev_private;
2295	internals->kvlist = kvlist;
2296
2297	RTE_LOG(INFO, EAL, "Created bonded device %s on port %d in mode %u on "
2298			"socket %u.\n",	name, port_id, bonding_mode, socket_id);
2299	return 0;
2300
2301parse_error:
2302	rte_kvargs_free(kvlist);
2303
2304	return -1;
2305}
2306
2307static int
2308bond_remove(const char *name)
2309{
2310	int  ret;
2311
2312	if (name == NULL)
2313		return -EINVAL;
2314
2315	RTE_LOG(INFO, EAL, "Uninitializing pmd_bond for %s\n", name);
2316
2317	/* free link bonding eth device */
2318	ret = rte_eth_bond_free(name);
2319	if (ret < 0)
2320		RTE_LOG(ERR, EAL, "Failed to free %s\n", name);
2321
2322	return ret;
2323}
2324
2325/* This part resolves the slave port ids after all the other pdevs and vdevs
2326 * have been allocated */
2327static int
2328bond_ethdev_configure(struct rte_eth_dev *dev)
2329{
2330	char *name = dev->data->name;
2331	struct bond_dev_private *internals = dev->data->dev_private;
2332	struct rte_kvargs *kvlist = internals->kvlist;
2333	int arg_count;
2334	uint8_t port_id = dev - rte_eth_devices;
2335
2336	static const uint8_t default_rss_key[40] = {
2337		0x6D, 0x5A, 0x56, 0xDA, 0x25, 0x5B, 0x0E, 0xC2, 0x41, 0x67, 0x25, 0x3D,
2338		0x43, 0xA3, 0x8F, 0xB0, 0xD0, 0xCA, 0x2B, 0xCB, 0xAE, 0x7B, 0x30, 0xB4,
2339		0x77, 0xCB, 0x2D, 0xA3, 0x80, 0x30, 0xF2, 0x0C, 0x6A, 0x42, 0xB7, 0x3B,
2340		0xBE, 0xAC, 0x01, 0xFA
2341	};
2342
2343	unsigned i, j;
2344
2345	/* If RSS is enabled, fill table and key with default values */
2346	if (dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS) {
2347		dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key = internals->rss_key;
2348		dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len = 0;
2349		memcpy(internals->rss_key, default_rss_key, 40);
2350
2351		for (i = 0; i < RTE_DIM(internals->reta_conf); i++) {
2352			internals->reta_conf[i].mask = ~0LL;
2353			for (j = 0; j < RTE_RETA_GROUP_SIZE; j++)
2354				internals->reta_conf[i].reta[j] = j % dev->data->nb_rx_queues;
2355		}
2356	}
2357
2358	/* set the max_rx_pktlen */
2359	internals->max_rx_pktlen = internals->candidate_max_rx_pktlen;
2360
2361	/*
2362	 * if no kvlist, it means that this bonded device has been created
2363	 * through the bonding api.
2364	 */
2365	if (!kvlist)
2366		return 0;
2367
2368	/* Parse MAC address for bonded device */
2369	arg_count = rte_kvargs_count(kvlist, PMD_BOND_MAC_ADDR_KVARG);
2370	if (arg_count == 1) {
2371		struct ether_addr bond_mac;
2372
2373		if (rte_kvargs_process(kvlist, PMD_BOND_MAC_ADDR_KVARG,
2374				&bond_ethdev_parse_bond_mac_addr_kvarg, &bond_mac) < 0) {
2375			RTE_LOG(INFO, EAL, "Invalid mac address for bonded device %s\n",
2376					name);
2377			return -1;
2378		}
2379
2380		/* Set MAC address */
2381		if (rte_eth_bond_mac_address_set(port_id, &bond_mac) != 0) {
2382			RTE_LOG(ERR, EAL,
2383					"Failed to set mac address on bonded device %s\n",
2384					name);
2385			return -1;
2386		}
2387	} else if (arg_count > 1) {
2388		RTE_LOG(ERR, EAL,
2389				"MAC address can be specified only once for bonded device %s\n",
2390				name);
2391		return -1;
2392	}
2393
2394	/* Parse/set balance mode transmit policy */
2395	arg_count = rte_kvargs_count(kvlist, PMD_BOND_XMIT_POLICY_KVARG);
2396	if (arg_count == 1) {
2397		uint8_t xmit_policy;
2398
2399		if (rte_kvargs_process(kvlist, PMD_BOND_XMIT_POLICY_KVARG,
2400				&bond_ethdev_parse_balance_xmit_policy_kvarg, &xmit_policy) !=
2401						0) {
2402			RTE_LOG(INFO, EAL,
2403					"Invalid xmit policy specified for bonded device %s\n",
2404					name);
2405			return -1;
2406		}
2407
2408		/* Set balance mode transmit policy */
2409		if (rte_eth_bond_xmit_policy_set(port_id, xmit_policy) != 0) {
2410			RTE_LOG(ERR, EAL,
2411					"Failed to set balance xmit policy on bonded device %s\n",
2412					name);
2413			return -1;
2414		}
2415	} else if (arg_count > 1) {
2416		RTE_LOG(ERR, EAL,
2417				"Transmit policy can be specified only once for bonded device"
2418				" %s\n", name);
2419		return -1;
2420	}
2421
2422	/* Parse/add slave ports to bonded device */
2423	if (rte_kvargs_count(kvlist, PMD_BOND_SLAVE_PORT_KVARG) > 0) {
2424		struct bond_ethdev_slave_ports slave_ports;
2425		unsigned i;
2426
2427		memset(&slave_ports, 0, sizeof(slave_ports));
2428
2429		if (rte_kvargs_process(kvlist, PMD_BOND_SLAVE_PORT_KVARG,
2430				&bond_ethdev_parse_slave_port_kvarg, &slave_ports) != 0) {
2431			RTE_LOG(ERR, EAL,
2432					"Failed to parse slave ports for bonded device %s\n",
2433					name);
2434			return -1;
2435		}
2436
2437		for (i = 0; i < slave_ports.slave_count; i++) {
2438			if (rte_eth_bond_slave_add(port_id, slave_ports.slaves[i]) != 0) {
2439				RTE_LOG(ERR, EAL,
2440						"Failed to add port %d as slave to bonded device %s\n",
2441						slave_ports.slaves[i], name);
2442			}
2443		}
2444
2445	} else {
2446		RTE_LOG(INFO, EAL, "No slaves specified for bonded device %s\n", name);
2447		return -1;
2448	}
2449
2450	/* Parse/set primary slave port id*/
2451	arg_count = rte_kvargs_count(kvlist, PMD_BOND_PRIMARY_SLAVE_KVARG);
2452	if (arg_count == 1) {
2453		uint8_t primary_slave_port_id;
2454
2455		if (rte_kvargs_process(kvlist,
2456				PMD_BOND_PRIMARY_SLAVE_KVARG,
2457				&bond_ethdev_parse_primary_slave_port_id_kvarg,
2458				&primary_slave_port_id) < 0) {
2459			RTE_LOG(INFO, EAL,
2460					"Invalid primary slave port id specified for bonded device"
2461					" %s\n", name);
2462			return -1;
2463		}
2464
2465		/* Set primary slave port id */
2466		if (rte_eth_bond_primary_set(port_id, (uint8_t)primary_slave_port_id)
2467				!= 0) {
2468			RTE_LOG(ERR, EAL,
2469					"Failed to set primary slave port %d on bonded device %s\n",
2470					primary_slave_port_id, name);
2471			return -1;
2472		}
2473	} else if (arg_count > 1) {
2474		RTE_LOG(INFO, EAL,
2475				"Primary slave can be specified only once for bonded device"
2476				" %s\n", name);
2477		return -1;
2478	}
2479
2480	/* Parse link status monitor polling interval */
2481	arg_count = rte_kvargs_count(kvlist, PMD_BOND_LSC_POLL_PERIOD_KVARG);
2482	if (arg_count == 1) {
2483		uint32_t lsc_poll_interval_ms;
2484
2485		if (rte_kvargs_process(kvlist,
2486				PMD_BOND_LSC_POLL_PERIOD_KVARG,
2487				&bond_ethdev_parse_time_ms_kvarg,
2488				&lsc_poll_interval_ms) < 0) {
2489			RTE_LOG(INFO, EAL,
2490					"Invalid lsc polling interval value specified for bonded"
2491					" device %s\n", name);
2492			return -1;
2493		}
2494
2495		if (rte_eth_bond_link_monitoring_set(port_id, lsc_poll_interval_ms)
2496				!= 0) {
2497			RTE_LOG(ERR, EAL,
2498					"Failed to set lsc monitor polling interval (%u ms) on"
2499					" bonded device %s\n", lsc_poll_interval_ms, name);
2500			return -1;
2501		}
2502	} else if (arg_count > 1) {
2503		RTE_LOG(INFO, EAL,
2504				"LSC polling interval can be specified only once for bonded"
2505				" device %s\n", name);
2506		return -1;
2507	}
2508
2509	/* Parse link up interrupt propagation delay */
2510	arg_count = rte_kvargs_count(kvlist, PMD_BOND_LINK_UP_PROP_DELAY_KVARG);
2511	if (arg_count == 1) {
2512		uint32_t link_up_delay_ms;
2513
2514		if (rte_kvargs_process(kvlist,
2515				PMD_BOND_LINK_UP_PROP_DELAY_KVARG,
2516				&bond_ethdev_parse_time_ms_kvarg,
2517				&link_up_delay_ms) < 0) {
2518			RTE_LOG(INFO, EAL,
2519					"Invalid link up propagation delay value specified for"
2520					" bonded device %s\n", name);
2521			return -1;
2522		}
2523
2524		/* Set link up propagation delay */
2525		if (rte_eth_bond_link_up_prop_delay_set(port_id, link_up_delay_ms)
2526				!= 0) {
2527			RTE_LOG(ERR, EAL,
2528					"Failed to set link up propagation delay (%u ms) on bonded"
2529					" device %s\n", link_up_delay_ms, name);
2530			return -1;
2531		}
2532	} else if (arg_count > 1) {
2533		RTE_LOG(INFO, EAL,
2534				"Link up propagation delay can be specified only once for"
2535				" bonded device %s\n", name);
2536		return -1;
2537	}
2538
2539	/* Parse link down interrupt propagation delay */
2540	arg_count = rte_kvargs_count(kvlist, PMD_BOND_LINK_DOWN_PROP_DELAY_KVARG);
2541	if (arg_count == 1) {
2542		uint32_t link_down_delay_ms;
2543
2544		if (rte_kvargs_process(kvlist,
2545				PMD_BOND_LINK_DOWN_PROP_DELAY_KVARG,
2546				&bond_ethdev_parse_time_ms_kvarg,
2547				&link_down_delay_ms) < 0) {
2548			RTE_LOG(INFO, EAL,
2549					"Invalid link down propagation delay value specified for"
2550					" bonded device %s\n", name);
2551			return -1;
2552		}
2553
2554		/* Set link down propagation delay */
2555		if (rte_eth_bond_link_down_prop_delay_set(port_id, link_down_delay_ms)
2556				!= 0) {
2557			RTE_LOG(ERR, EAL,
2558					"Failed to set link down propagation delay (%u ms) on"
2559					" bonded device %s\n", link_down_delay_ms, name);
2560			return -1;
2561		}
2562	} else if (arg_count > 1) {
2563		RTE_LOG(INFO, EAL,
2564				"Link down propagation delay can be specified only once for"
2565				" bonded device %s\n", name);
2566		return -1;
2567	}
2568
2569	return 0;
2570}
2571
2572static struct rte_vdev_driver bond_drv = {
2573	.probe = bond_probe,
2574	.remove = bond_remove,
2575};
2576
2577RTE_PMD_REGISTER_VDEV(net_bonding, bond_drv);
2578RTE_PMD_REGISTER_ALIAS(net_bonding, eth_bond);
2579
2580RTE_PMD_REGISTER_PARAM_STRING(net_bonding,
2581	"slave=<ifc> "
2582	"primary=<ifc> "
2583	"mode=[0-6] "
2584	"xmit_policy=[l2 | l23 | l34] "
2585	"socket_id=<int> "
2586	"mac=<mac addr> "
2587	"lsc_poll_period_ms=<int> "
2588	"up_delay=<int> "
2589	"down_delay=<int>");
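/*
 * Example EAL vdev argument (illustrative only, device names and values
 * are hypothetical):
 *   --vdev 'net_bonding0,mode=1,slave=0000:02:00.0,slave=0000:02:00.1,primary=0000:02:00.0,socket_id=0'
 */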
2590