rte_eth_bond_pmd.c revision 3d9b7210
1/*-
2 *   BSD LICENSE
3 *
4 *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
5 *   All rights reserved.
6 *
7 *   Redistribution and use in source and binary forms, with or without
8 *   modification, are permitted provided that the following conditions
9 *   are met:
10 *
11 *     * Redistributions of source code must retain the above copyright
12 *       notice, this list of conditions and the following disclaimer.
13 *     * Redistributions in binary form must reproduce the above copyright
14 *       notice, this list of conditions and the following disclaimer in
15 *       the documentation and/or other materials provided with the
16 *       distribution.
17 *     * Neither the name of Intel Corporation nor the names of its
18 *       contributors may be used to endorse or promote products derived
19 *       from this software without specific prior written permission.
20 *
21 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33#include <stdlib.h>
34#include <netinet/in.h>
35
36#include <rte_mbuf.h>
37#include <rte_malloc.h>
38#include <rte_ethdev.h>
39#include <rte_tcp.h>
40#include <rte_udp.h>
41#include <rte_ip.h>
42#include <rte_ip_frag.h>
43#include <rte_devargs.h>
44#include <rte_kvargs.h>
45#include <rte_vdev.h>
46#include <rte_alarm.h>
47#include <rte_cycles.h>
48
49#include "rte_eth_bond.h"
50#include "rte_eth_bond_private.h"
51#include "rte_eth_bond_8023ad_private.h"
52
53#define REORDER_PERIOD_MS 10
54
55#define HASH_L4_PORTS(h) ((h)->src_port ^ (h)->dst_port)
56
57/* Table for statistics in mode 5 TLB */
58static uint64_t tlb_last_obytets[RTE_MAX_ETHPORTS];
59
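/* Return the total size of any VLAN headers (up to two, for QinQ) that follow
 * the Ethernet header, and update *proto to the encapsulated EtherType. */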
60static inline size_t
61get_vlan_offset(struct ether_hdr *eth_hdr, uint16_t *proto)
62{
63	size_t vlan_offset = 0;
64
65	if (rte_cpu_to_be_16(ETHER_TYPE_VLAN) == *proto) {
66		struct vlan_hdr *vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1);
67
68		vlan_offset = sizeof(struct vlan_hdr);
69		*proto = vlan_hdr->eth_proto;
70
71		if (rte_cpu_to_be_16(ETHER_TYPE_VLAN) == *proto) {
72			vlan_hdr = vlan_hdr + 1;
73			*proto = vlan_hdr->eth_proto;
74			vlan_offset += sizeof(struct vlan_hdr);
75		}
76	}
77	return vlan_offset;
78}
79
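/* Burst RX used by the round-robin, balance and broadcast modes (and by the
 * ALB RX path): poll each active slave in turn until nb_pkts packets have
 * been received or every slave has been polled once. */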
80static uint16_t
81bond_ethdev_rx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
82{
83	struct bond_dev_private *internals;
84
85	uint16_t num_rx_slave = 0;
86	uint16_t num_rx_total = 0;
87
88	int i;
89
90	/* Cast to structure containing the bonded device's port id and queue id */
91	struct bond_rx_queue *bd_rx_q = (struct bond_rx_queue *)queue;
92
93	internals = bd_rx_q->dev_private;
94
95
96	for (i = 0; i < internals->active_slave_count && nb_pkts; i++) {
97		/* Offset of pointer to *bufs increases as packets are received
98		 * from other slaves */
99		num_rx_slave = rte_eth_rx_burst(internals->active_slaves[i],
100				bd_rx_q->queue_id, bufs + num_rx_total, nb_pkts);
101		if (num_rx_slave) {
102			num_rx_total += num_rx_slave;
103			nb_pkts -= num_rx_slave;
104		}
105	}
106
107	return num_rx_total;
108}
109
110static uint16_t
111bond_ethdev_rx_burst_active_backup(void *queue, struct rte_mbuf **bufs,
112		uint16_t nb_pkts)
113{
114	struct bond_dev_private *internals;
115
116	/* Cast to structure containing the bonded device's port id and queue id */
117	struct bond_rx_queue *bd_rx_q = (struct bond_rx_queue *)queue;
118
119	internals = bd_rx_q->dev_private;
120
121	return rte_eth_rx_burst(internals->current_primary_port,
122			bd_rx_q->queue_id, bufs, nb_pkts);
123}
124
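/* Return non-zero if the frame is an untagged slow protocol frame carrying a
 * LACP or marker PDU. */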
125static inline uint8_t
126is_lacp_packets(uint16_t ethertype, uint8_t subtype, uint16_t vlan_tci)
127{
128	const uint16_t ether_type_slow_be = rte_be_to_cpu_16(ETHER_TYPE_SLOW);
129
130	return !vlan_tci && (ethertype == ether_type_slow_be &&
131		(subtype == SLOW_SUBTYPE_MARKER || subtype == SLOW_SUBTYPE_LACP));
132}
133
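/* Burst RX for mode 4 (802.3AD): receive from every active slave, divert slow
 * protocol (LACP/marker) frames to the mode 4 state machine, and drop frames
 * received on slaves that are not collecting, or unicast frames that do not
 * match the bonded MAC when promiscuous mode is off. */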
134static uint16_t
135bond_ethdev_rx_burst_8023ad(void *queue, struct rte_mbuf **bufs,
136		uint16_t nb_pkts)
137{
138	/* Cast to structure containing the bonded device's port id and queue id */
139	struct bond_rx_queue *bd_rx_q = (struct bond_rx_queue *)queue;
140	struct bond_dev_private *internals = bd_rx_q->dev_private;
141	struct ether_addr bond_mac;
142
143	struct ether_hdr *hdr;
144
145	const uint16_t ether_type_slow_be = rte_be_to_cpu_16(ETHER_TYPE_SLOW);
146	uint16_t num_rx_total = 0;	/* Total number of received packets */
147	uint8_t slaves[RTE_MAX_ETHPORTS];
148	uint8_t slave_count;
149
150	uint8_t collecting;  /* current slave collecting status */
151	const uint8_t promisc = internals->promiscuous_en;
152	uint8_t i, j, k;
153	uint8_t subtype;
154
155	rte_eth_macaddr_get(internals->port_id, &bond_mac);
156	/* Copy slave list to protect against slave up/down changes during rx
157	 * bursting */
158	slave_count = internals->active_slave_count;
159	memcpy(slaves, internals->active_slaves,
160			sizeof(internals->active_slaves[0]) * slave_count);
161
162	for (i = 0; i < slave_count && num_rx_total < nb_pkts; i++) {
163		j = num_rx_total;
164		collecting = ACTOR_STATE(&mode_8023ad_ports[slaves[i]], COLLECTING);
165
166		/* Read packets from this slave */
167		num_rx_total += rte_eth_rx_burst(slaves[i], bd_rx_q->queue_id,
168				&bufs[num_rx_total], nb_pkts - num_rx_total);
169
170		for (k = j; k < 2 && k < num_rx_total; k++)
171			rte_prefetch0(rte_pktmbuf_mtod(bufs[k], void *));
172
173		/* Handle slow protocol packets. */
174		while (j < num_rx_total) {
175			if (j + 3 < num_rx_total)
176				rte_prefetch0(rte_pktmbuf_mtod(bufs[j + 3], void *));
177
178			hdr = rte_pktmbuf_mtod(bufs[j], struct ether_hdr *);
179			subtype = ((struct slow_protocol_frame *)hdr)->slow_protocol.subtype;
180
181			/* Remove the packet from the array if it is a slow packet, the slave
182			 * is not in collecting state, or the bonding interface is not in
183			 * promiscuous mode and the packet address does not match. */
184			if (unlikely(is_lacp_packets(hdr->ether_type, subtype, bufs[j]->vlan_tci) ||
185				!collecting || (!promisc &&
186					!is_multicast_ether_addr(&hdr->d_addr) &&
187					!is_same_ether_addr(&bond_mac, &hdr->d_addr)))) {
188
189				if (hdr->ether_type == ether_type_slow_be) {
190					bond_mode_8023ad_handle_slow_pkt(internals, slaves[i],
191						bufs[j]);
192				} else
193					rte_pktmbuf_free(bufs[j]);
194
195				/* Packet is managed by mode 4 or dropped, shift the array */
196				num_rx_total--;
197				if (j < num_rx_total) {
198					memmove(&bufs[j], &bufs[j + 1], sizeof(bufs[0]) *
199						(num_rx_total - j));
200				}
201			} else
202				j++;
203		}
204	}
205
206	return num_rx_total;
207}
208
209#if defined(RTE_LIBRTE_BOND_DEBUG_ALB) || defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1)
210uint32_t burstnumberRX;
211uint32_t burstnumberTX;
212
213#ifdef RTE_LIBRTE_BOND_DEBUG_ALB
214
215static void
216arp_op_name(uint16_t arp_op, char *buf)
217{
218	switch (arp_op) {
219	case ARP_OP_REQUEST:
220		snprintf(buf, sizeof("ARP Request"), "%s", "ARP Request");
221		return;
222	case ARP_OP_REPLY:
223		snprintf(buf, sizeof("ARP Reply"), "%s", "ARP Reply");
224		return;
225	case ARP_OP_REVREQUEST:
226		snprintf(buf, sizeof("Reverse ARP Request"), "%s",
227				"Reverse ARP Request");
228		return;
229	case ARP_OP_REVREPLY:
230		snprintf(buf, sizeof("Reverse ARP Reply"), "%s",
231				"Reverse ARP Reply");
232		return;
233	case ARP_OP_INVREQUEST:
234		snprintf(buf, sizeof("Peer Identify Request"), "%s",
235				"Peer Identify Request");
236		return;
237	case ARP_OP_INVREPLY:
238		snprintf(buf, sizeof("Peer Identify Reply"), "%s",
239				"Peer Identify Reply");
240		return;
241	default:
242		break;
243	}
244	snprintf(buf, sizeof("Unknown"), "%s", "Unknown");
245	return;
246}
247#endif
248#define MaxIPv4String	16
249static void
250ipv4_addr_to_dot(uint32_t be_ipv4_addr, char *buf, uint8_t buf_size)
251{
252	uint32_t ipv4_addr;
253
254	ipv4_addr = rte_be_to_cpu_32(be_ipv4_addr);
255	snprintf(buf, buf_size, "%d.%d.%d.%d", (ipv4_addr >> 24) & 0xFF,
256		(ipv4_addr >> 16) & 0xFF, (ipv4_addr >> 8) & 0xFF,
257		ipv4_addr & 0xFF);
258}
259
260#define MAX_CLIENTS_NUMBER	128
261uint8_t active_clients;
262struct client_stats_t {
263	uint8_t port;
264	uint32_t ipv4_addr;
265	uint32_t ipv4_rx_packets;
266	uint32_t ipv4_tx_packets;
267};
268struct client_stats_t client_stats[MAX_CLIENTS_NUMBER];
269
270static void
271update_client_stats(uint32_t addr, uint8_t port, uint32_t *TXorRXindicator)
272{
273	int i = 0;
274
275	for (; i < MAX_CLIENTS_NUMBER; i++)	{
276		if ((client_stats[i].ipv4_addr == addr) && (client_stats[i].port == port))	{
277			/* Just update the RX or TX packet count for this client */
278			if (TXorRXindicator == &burstnumberRX)
279				client_stats[i].ipv4_rx_packets++;
280			else
281				client_stats[i].ipv4_tx_packets++;
282			return;
283		}
284	}
285	/* We have a new client. Insert it into the table and update its stats */
	if (active_clients == MAX_CLIENTS_NUMBER)
		return;	/* client table is full, drop the new client to avoid overflow */
286	if (TXorRXindicator == &burstnumberRX)
287		client_stats[active_clients].ipv4_rx_packets++;
288	else
289		client_stats[active_clients].ipv4_tx_packets++;
290	client_stats[active_clients].ipv4_addr = addr;
291	client_stats[active_clients].port = port;
292	active_clients++;
293
294}
295
296#ifdef RTE_LIBRTE_BOND_DEBUG_ALB
297#define MODE6_DEBUG(info, src_ip, dst_ip, eth_h, arp_op, port, burstnumber)	\
298		RTE_LOG(DEBUG, PMD, \
299		"%s " \
300		"port:%d " \
301		"SrcMAC:%02X:%02X:%02X:%02X:%02X:%02X " \
302		"SrcIP:%s " \
303		"DstMAC:%02X:%02X:%02X:%02X:%02X:%02X " \
304		"DstIP:%s " \
305		"%s " \
306		"%d\n", \
307		info, \
308		port, \
309		eth_h->s_addr.addr_bytes[0], \
310		eth_h->s_addr.addr_bytes[1], \
311		eth_h->s_addr.addr_bytes[2], \
312		eth_h->s_addr.addr_bytes[3], \
313		eth_h->s_addr.addr_bytes[4], \
314		eth_h->s_addr.addr_bytes[5], \
315		src_ip, \
316		eth_h->d_addr.addr_bytes[0], \
317		eth_h->d_addr.addr_bytes[1], \
318		eth_h->d_addr.addr_bytes[2], \
319		eth_h->d_addr.addr_bytes[3], \
320		eth_h->d_addr.addr_bytes[4], \
321		eth_h->d_addr.addr_bytes[5], \
322		dst_ip, \
323		arp_op, \
324		++burstnumber)
325#endif
326
327static void
328mode6_debug(const char __attribute__((unused)) *info, struct ether_hdr *eth_h,
329		uint8_t port, uint32_t __attribute__((unused)) *burstnumber)
330{
331	struct ipv4_hdr *ipv4_h;
332#ifdef RTE_LIBRTE_BOND_DEBUG_ALB
333	struct arp_hdr *arp_h;
334	char dst_ip[16];
335	char ArpOp[24];
336	char buf[16];
337#endif
338	char src_ip[16];
339
340	uint16_t ether_type = eth_h->ether_type;
341	uint16_t offset = get_vlan_offset(eth_h, &ether_type);
342
343#ifdef RTE_LIBRTE_BOND_DEBUG_ALB
344	snprintf(buf, 16, "%s", info);
345#endif
346
347	if (ether_type == rte_cpu_to_be_16(ETHER_TYPE_IPv4)) {
348		ipv4_h = (struct ipv4_hdr *)((char *)(eth_h + 1) + offset);
349		ipv4_addr_to_dot(ipv4_h->src_addr, src_ip, MaxIPv4String);
350#ifdef RTE_LIBRTE_BOND_DEBUG_ALB
351		ipv4_addr_to_dot(ipv4_h->dst_addr, dst_ip, MaxIPv4String);
352		MODE6_DEBUG(buf, src_ip, dst_ip, eth_h, "", port, *burstnumber);
353#endif
354		update_client_stats(ipv4_h->src_addr, port, burstnumber);
355	}
356#ifdef RTE_LIBRTE_BOND_DEBUG_ALB
357	else if (ether_type == rte_cpu_to_be_16(ETHER_TYPE_ARP)) {
358		arp_h = (struct arp_hdr *)((char *)(eth_h + 1) + offset);
359		ipv4_addr_to_dot(arp_h->arp_data.arp_sip, src_ip, MaxIPv4String);
360		ipv4_addr_to_dot(arp_h->arp_data.arp_tip, dst_ip, MaxIPv4String);
361		arp_op_name(rte_be_to_cpu_16(arp_h->arp_op), ArpOp);
362		MODE6_DEBUG(buf, src_ip, dst_ip, eth_h, ArpOp, port, *burstnumber);
363	}
364#endif
365}
366#endif
367
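/* Burst RX for mode 6 (ALB): receive as in round-robin mode and hand any ARP
 * packets to the ALB logic so the client table stays up to date. */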
368static uint16_t
369bond_ethdev_rx_burst_alb(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
370{
371	struct bond_tx_queue *bd_tx_q = (struct bond_tx_queue *)queue;
372	struct bond_dev_private *internals = bd_tx_q->dev_private;
373	struct ether_hdr *eth_h;
374	uint16_t ether_type, offset;
375	uint16_t nb_recv_pkts;
376	int i;
377
378	nb_recv_pkts = bond_ethdev_rx_burst(queue, bufs, nb_pkts);
379
380	for (i = 0; i < nb_recv_pkts; i++) {
381		eth_h = rte_pktmbuf_mtod(bufs[i], struct ether_hdr *);
382		ether_type = eth_h->ether_type;
383		offset = get_vlan_offset(eth_h, &ether_type);
384
385		if (ether_type == rte_cpu_to_be_16(ETHER_TYPE_ARP)) {
386#if defined(RTE_LIBRTE_BOND_DEBUG_ALB) || defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1)
387			mode6_debug("RX ARP:", eth_h, bufs[i]->port, &burstnumberRX);
388#endif
389			bond_mode_alb_arp_recv(eth_h, offset, internals);
390		}
391#if defined(RTE_LIBRTE_BOND_DEBUG_ALB) || defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1)
392		else if (ether_type == rte_cpu_to_be_16(ETHER_TYPE_IPv4))
393			mode6_debug("RX IPv4:", eth_h, bufs[i]->port, &burstnumberRX);
394#endif
395	}
396
397	return nb_recv_pkts;
398}
399
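/* Burst TX for round-robin mode: spread the packets over the active slaves in
 * round-robin order; packets that a slave fails to send are moved to the end
 * of bufs so the caller can retry them. */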
400static uint16_t
401bond_ethdev_tx_burst_round_robin(void *queue, struct rte_mbuf **bufs,
402		uint16_t nb_pkts)
403{
404	struct bond_dev_private *internals;
405	struct bond_tx_queue *bd_tx_q;
406
407	struct rte_mbuf *slave_bufs[RTE_MAX_ETHPORTS][nb_pkts];
408	uint16_t slave_nb_pkts[RTE_MAX_ETHPORTS] = { 0 };
409
410	uint8_t num_of_slaves;
411	uint8_t slaves[RTE_MAX_ETHPORTS];
412
413	uint16_t num_tx_total = 0, num_tx_slave;
414
415	static int slave_idx = 0;
416	int i, cslave_idx = 0, tx_fail_total = 0;
417
418	bd_tx_q = (struct bond_tx_queue *)queue;
419	internals = bd_tx_q->dev_private;
420
421	/* Copy slave list to protect against slave up/down changes during tx
422	 * bursting */
423	num_of_slaves = internals->active_slave_count;
424	memcpy(slaves, internals->active_slaves,
425			sizeof(internals->active_slaves[0]) * num_of_slaves);
426
427	if (num_of_slaves < 1)
428		return num_tx_total;
429
430	/* Populate the per-slave mbuf arrays with the packets to be sent on each slave */
431	for (i = 0; i < nb_pkts; i++) {
432		cslave_idx = (slave_idx + i) % num_of_slaves;
433		slave_bufs[cslave_idx][(slave_nb_pkts[cslave_idx])++] = bufs[i];
434	}
435
436	/* increment current slave index so the next call to tx burst starts on the
437	 * next slave */
438	slave_idx = ++cslave_idx;
439
440	/* Send packet burst on each slave device */
441	for (i = 0; i < num_of_slaves; i++) {
442		if (slave_nb_pkts[i] > 0) {
443			num_tx_slave = rte_eth_tx_burst(slaves[i], bd_tx_q->queue_id,
444					slave_bufs[i], slave_nb_pkts[i]);
445
446			/* if the tx burst fails, move the unsent packets to the end of bufs */
447			if (unlikely(num_tx_slave < slave_nb_pkts[i])) {
448				int tx_fail_slave = slave_nb_pkts[i] - num_tx_slave;
449
450				tx_fail_total += tx_fail_slave;
451
452				memcpy(&bufs[nb_pkts - tx_fail_total],
453						&slave_bufs[i][num_tx_slave],
454						tx_fail_slave * sizeof(bufs[0]));
455			}
456			num_tx_total += num_tx_slave;
457		}
458	}
459
460	return num_tx_total;
461}
462
463static uint16_t
464bond_ethdev_tx_burst_active_backup(void *queue,
465		struct rte_mbuf **bufs, uint16_t nb_pkts)
466{
467	struct bond_dev_private *internals;
468	struct bond_tx_queue *bd_tx_q;
469
470	bd_tx_q = (struct bond_tx_queue *)queue;
471	internals = bd_tx_q->dev_private;
472
473	if (internals->active_slave_count < 1)
474		return 0;
475
476	return rte_eth_tx_burst(internals->current_primary_port, bd_tx_q->queue_id,
477			bufs, nb_pkts);
478}
479
480static inline uint16_t
481ether_hash(struct ether_hdr *eth_hdr)
482{
483	unaligned_uint16_t *word_src_addr =
484		(unaligned_uint16_t *)eth_hdr->s_addr.addr_bytes;
485	unaligned_uint16_t *word_dst_addr =
486		(unaligned_uint16_t *)eth_hdr->d_addr.addr_bytes;
487
488	return (word_src_addr[0] ^ word_dst_addr[0]) ^
489			(word_src_addr[1] ^ word_dst_addr[1]) ^
490			(word_src_addr[2] ^ word_dst_addr[2]);
491}
492
493static inline uint32_t
494ipv4_hash(struct ipv4_hdr *ipv4_hdr)
495{
496	return ipv4_hdr->src_addr ^ ipv4_hdr->dst_addr;
497}
498
499static inline uint32_t
500ipv6_hash(struct ipv6_hdr *ipv6_hdr)
501{
502	unaligned_uint32_t *word_src_addr =
503		(unaligned_uint32_t *)&(ipv6_hdr->src_addr[0]);
504	unaligned_uint32_t *word_dst_addr =
505		(unaligned_uint32_t *)&(ipv6_hdr->dst_addr[0]);
506
507	return (word_src_addr[0] ^ word_dst_addr[0]) ^
508			(word_src_addr[1] ^ word_dst_addr[1]) ^
509			(word_src_addr[2] ^ word_dst_addr[2]) ^
510			(word_src_addr[3] ^ word_dst_addr[3]);
511}
512
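/* Layer 2 transmit hash policy: hash on the source and destination MAC
 * addresses and reduce the result modulo the slave count. */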
513uint16_t
514xmit_l2_hash(const struct rte_mbuf *buf, uint8_t slave_count)
515{
516	struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(buf, struct ether_hdr *);
517
518	uint32_t hash = ether_hash(eth_hdr);
519
520	return (hash ^= hash >> 8) % slave_count;
521}
522
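/* Layer 2+3 transmit hash policy: combine the MAC address hash with a hash of
 * the IPv4/IPv6 source and destination addresses. */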
523uint16_t
524xmit_l23_hash(const struct rte_mbuf *buf, uint8_t slave_count)
525{
526	struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(buf, struct ether_hdr *);
527	uint16_t proto = eth_hdr->ether_type;
528	size_t vlan_offset = get_vlan_offset(eth_hdr, &proto);
529	uint32_t hash, l3hash = 0;
530
531	hash = ether_hash(eth_hdr);
532
533	if (rte_cpu_to_be_16(ETHER_TYPE_IPv4) == proto) {
534		struct ipv4_hdr *ipv4_hdr = (struct ipv4_hdr *)
535				((char *)(eth_hdr + 1) + vlan_offset);
536		l3hash = ipv4_hash(ipv4_hdr);
537
538	} else if (rte_cpu_to_be_16(ETHER_TYPE_IPv6) == proto) {
539		struct ipv6_hdr *ipv6_hdr = (struct ipv6_hdr *)
540				((char *)(eth_hdr + 1) + vlan_offset);
541		l3hash = ipv6_hash(ipv6_hdr);
542	}
543
544	hash = hash ^ l3hash;
545	hash ^= hash >> 16;
546	hash ^= hash >> 8;
547
548	return hash % slave_count;
549}
550
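/* Layer 3+4 transmit hash policy: hash on the IP source/destination addresses
 * and, for unfragmented TCP/UDP packets, the L4 ports. */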
551uint16_t
552xmit_l34_hash(const struct rte_mbuf *buf, uint8_t slave_count)
553{
554	struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(buf, struct ether_hdr *);
555	uint16_t proto = eth_hdr->ether_type;
556	size_t vlan_offset = get_vlan_offset(eth_hdr, &proto);
557
558	struct udp_hdr *udp_hdr = NULL;
559	struct tcp_hdr *tcp_hdr = NULL;
560	uint32_t hash, l3hash = 0, l4hash = 0;
561
562	if (rte_cpu_to_be_16(ETHER_TYPE_IPv4) == proto) {
563		struct ipv4_hdr *ipv4_hdr = (struct ipv4_hdr *)
564				((char *)(eth_hdr + 1) + vlan_offset);
565		size_t ip_hdr_offset;
566
567		l3hash = ipv4_hash(ipv4_hdr);
568
569		/* there is no L4 header in a fragmented packet */
570		if (likely(rte_ipv4_frag_pkt_is_fragmented(ipv4_hdr) == 0)) {
571			ip_hdr_offset = (ipv4_hdr->version_ihl & IPV4_HDR_IHL_MASK) *
572					IPV4_IHL_MULTIPLIER;
573
574			if (ipv4_hdr->next_proto_id == IPPROTO_TCP) {
575				tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr +
576						ip_hdr_offset);
577				l4hash = HASH_L4_PORTS(tcp_hdr);
578			} else if (ipv4_hdr->next_proto_id == IPPROTO_UDP) {
579				udp_hdr = (struct udp_hdr *)((char *)ipv4_hdr +
580						ip_hdr_offset);
581				l4hash = HASH_L4_PORTS(udp_hdr);
582			}
583		}
584	} else if  (rte_cpu_to_be_16(ETHER_TYPE_IPv6) == proto) {
585		struct ipv6_hdr *ipv6_hdr = (struct ipv6_hdr *)
586				((char *)(eth_hdr + 1) + vlan_offset);
587		l3hash = ipv6_hash(ipv6_hdr);
588
589		if (ipv6_hdr->proto == IPPROTO_TCP) {
590			tcp_hdr = (struct tcp_hdr *)(ipv6_hdr + 1);
591			l4hash = HASH_L4_PORTS(tcp_hdr);
592		} else if (ipv6_hdr->proto == IPPROTO_UDP) {
593			udp_hdr = (struct udp_hdr *)(ipv6_hdr + 1);
594			l4hash = HASH_L4_PORTS(udp_hdr);
595		}
596	}
597
598	hash = l3hash ^ l4hash;
599	hash ^= hash >> 16;
600	hash ^= hash >> 8;
601
602	return hash % slave_count;
603}
604
605struct bwg_slave {
606	uint64_t bwg_left_int;
607	uint64_t bwg_left_remainder;
608	uint8_t slave;
609};
610
611void
612bond_tlb_activate_slave(struct bond_dev_private *internals) {
613	int i;
614
615	for (i = 0; i < internals->active_slave_count; i++) {
616		tlb_last_obytets[internals->active_slaves[i]] = 0;
617	}
618}
619
620static int
621bandwidth_cmp(const void *a, const void *b)
622{
623	const struct bwg_slave *bwg_a = a;
624	const struct bwg_slave *bwg_b = b;
625	int64_t diff = (int64_t)bwg_b->bwg_left_int - (int64_t)bwg_a->bwg_left_int;
626	int64_t diff2 = (int64_t)bwg_b->bwg_left_remainder -
627			(int64_t)bwg_a->bwg_left_remainder;
628	if (diff > 0)
629		return 1;
630	else if (diff < 0)
631		return -1;
632	else if (diff2 > 0)
633		return 1;
634	else if (diff2 < 0)
635		return -1;
636	else
637		return 0;
638}
639
640static void
641bandwidth_left(uint8_t port_id, uint64_t load, uint8_t update_idx,
642		struct bwg_slave *bwg_slave)
643{
644	struct rte_eth_link link_status;
645
646	rte_eth_link_get(port_id, &link_status);
647	uint64_t link_bwg = link_status.link_speed * 1000000ULL / 8;
648	if (link_bwg == 0)
649		return;
650	link_bwg = link_bwg * (update_idx+1) * REORDER_PERIOD_MS;
651	bwg_slave->bwg_left_int = (link_bwg - 1000*load) / link_bwg;
652	bwg_slave->bwg_left_remainder = (link_bwg - 1000*load) % link_bwg;
653}
654
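/* Alarm callback run every REORDER_PERIOD_MS: estimate the bandwidth left on
 * each active slave and re-order tlb_slaves_order so that TLB transmission
 * prefers the least loaded slaves. */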
655static void
656bond_ethdev_update_tlb_slave_cb(void *arg)
657{
658	struct bond_dev_private *internals = arg;
659	struct rte_eth_stats slave_stats;
660	struct bwg_slave bwg_array[RTE_MAX_ETHPORTS];
661	uint8_t slave_count;
662	uint64_t tx_bytes;
663
664	uint8_t update_stats = 0;
665	uint8_t i, slave_id;
666
667	internals->slave_update_idx++;
668
669
670	if (internals->slave_update_idx >= REORDER_PERIOD_MS)
671		update_stats = 1;
672
673	for (i = 0; i < internals->active_slave_count; i++) {
674		slave_id = internals->active_slaves[i];
675		rte_eth_stats_get(slave_id, &slave_stats);
676		tx_bytes = slave_stats.obytes - tlb_last_obytets[slave_id];
677		bandwidth_left(slave_id, tx_bytes,
678				internals->slave_update_idx, &bwg_array[i]);
679		bwg_array[i].slave = slave_id;
680
681		if (update_stats) {
682			tlb_last_obytets[slave_id] = slave_stats.obytes;
683		}
684	}
685
686	if (update_stats == 1)
687		internals->slave_update_idx = 0;
688
689	slave_count = i;
690	qsort(bwg_array, slave_count, sizeof(bwg_array[0]), bandwidth_cmp);
691	for (i = 0; i < slave_count; i++)
692		internals->tlb_slaves_order[i] = bwg_array[i].slave;
693
694	rte_eal_alarm_set(REORDER_PERIOD_MS * 1000, bond_ethdev_update_tlb_slave_cb,
695			(struct bond_dev_private *)internals);
696}
697
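/* Burst TX for mode 5 (TLB): walk the slaves in tlb_slaves_order, rewriting
 * the source MAC of packets that carry the primary slave's address so that
 * each slave transmits with its own MAC. */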
698static uint16_t
699bond_ethdev_tx_burst_tlb(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
700{
701	struct bond_tx_queue *bd_tx_q = (struct bond_tx_queue *)queue;
702	struct bond_dev_private *internals = bd_tx_q->dev_private;
703
704	struct rte_eth_dev *primary_port =
705			&rte_eth_devices[internals->primary_port];
706	uint16_t num_tx_total = 0;
707	uint8_t i, j;
708
709	uint8_t num_of_slaves = internals->active_slave_count;
710	uint8_t slaves[RTE_MAX_ETHPORTS];
711
712	struct ether_hdr *ether_hdr;
713	struct ether_addr primary_slave_addr;
714	struct ether_addr active_slave_addr;
715
716	if (num_of_slaves < 1)
717		return num_tx_total;
718
719	memcpy(slaves, internals->tlb_slaves_order,
720				sizeof(internals->tlb_slaves_order[0]) * num_of_slaves);
721
722
723	ether_addr_copy(primary_port->data->mac_addrs, &primary_slave_addr);
724
725	if (nb_pkts > 3) {
726		for (i = 0; i < 3; i++)
727			rte_prefetch0(rte_pktmbuf_mtod(bufs[i], void*));
728	}
729
730	for (i = 0; i < num_of_slaves; i++) {
731		rte_eth_macaddr_get(slaves[i], &active_slave_addr);
732		for (j = num_tx_total; j < nb_pkts; j++) {
733			if (j + 3 < nb_pkts)
734				rte_prefetch0(rte_pktmbuf_mtod(bufs[j+3], void*));
735
736			ether_hdr = rte_pktmbuf_mtod(bufs[j], struct ether_hdr *);
737			if (is_same_ether_addr(&ether_hdr->s_addr, &primary_slave_addr))
738				ether_addr_copy(&active_slave_addr, &ether_hdr->s_addr);
739#if defined(RTE_LIBRTE_BOND_DEBUG_ALB) || defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1)
740			mode6_debug("TX IPv4:", ether_hdr, slaves[i], &burstnumberTX);
741#endif
742		}
743
744		num_tx_total += rte_eth_tx_burst(slaves[i], bd_tx_q->queue_id,
745				bufs + num_tx_total, nb_pkts - num_tx_total);
746
747		if (num_tx_total == nb_pkts)
748			break;
749	}
750
751	return num_tx_total;
752}
753
754void
755bond_tlb_disable(struct bond_dev_private *internals)
756{
757	rte_eal_alarm_cancel(bond_ethdev_update_tlb_slave_cb, internals);
758}
759
760void
761bond_tlb_enable(struct bond_dev_private *internals)
762{
763	bond_ethdev_update_tlb_slave_cb(internals);
764}
765
766static uint16_t
767bond_ethdev_tx_burst_alb(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
768{
769	struct bond_tx_queue *bd_tx_q = (struct bond_tx_queue *)queue;
770	struct bond_dev_private *internals = bd_tx_q->dev_private;
771
772	struct ether_hdr *eth_h;
773	uint16_t ether_type, offset;
774
775	struct client_data *client_info;
776
777	/*
778	 * We create transmit buffers for every slave and one additional to send
779	 * through TLB. In the worst case every packet will be sent on one port.
780	 */
781	struct rte_mbuf *slave_bufs[RTE_MAX_ETHPORTS + 1][nb_pkts];
782	uint16_t slave_bufs_pkts[RTE_MAX_ETHPORTS + 1] = { 0 };
783
784	/*
785	 * We create separate transmit buffers for update packets as they won't be
786	 * counted in num_tx_total.
787	 */
788	struct rte_mbuf *update_bufs[RTE_MAX_ETHPORTS][ALB_HASH_TABLE_SIZE];
789	uint16_t update_bufs_pkts[RTE_MAX_ETHPORTS] = { 0 };
790
791	struct rte_mbuf *upd_pkt;
792	size_t pkt_size;
793
794	uint16_t num_send, num_not_send = 0;
795	uint16_t num_tx_total = 0;
796	uint8_t slave_idx;
797
798	int i, j;
799
800	/* Search the tx buffer for ARP packets and forward them to the ALB logic */
801	for (i = 0; i < nb_pkts; i++) {
802		eth_h = rte_pktmbuf_mtod(bufs[i], struct ether_hdr *);
803		ether_type = eth_h->ether_type;
804		offset = get_vlan_offset(eth_h, &ether_type);
805
806		if (ether_type == rte_cpu_to_be_16(ETHER_TYPE_ARP)) {
807			slave_idx = bond_mode_alb_arp_xmit(eth_h, offset, internals);
808
809			/* Change src mac in eth header */
810			rte_eth_macaddr_get(slave_idx, &eth_h->s_addr);
811
812			/* Add packet to slave tx buffer */
813			slave_bufs[slave_idx][slave_bufs_pkts[slave_idx]] = bufs[i];
814			slave_bufs_pkts[slave_idx]++;
815		} else {
816			/* If packet is not ARP, send it with TLB policy */
817			slave_bufs[RTE_MAX_ETHPORTS][slave_bufs_pkts[RTE_MAX_ETHPORTS]] =
818					bufs[i];
819			slave_bufs_pkts[RTE_MAX_ETHPORTS]++;
820		}
821	}
822
823	/* Update connected client ARP tables */
824	if (internals->mode6.ntt) {
825		for (i = 0; i < ALB_HASH_TABLE_SIZE; i++) {
826			client_info = &internals->mode6.client_table[i];
827
828			if (client_info->in_use) {
829				/* Allocate new packet to send ARP update on current slave */
830				upd_pkt = rte_pktmbuf_alloc(internals->mode6.mempool);
831				if (upd_pkt == NULL) {
832					RTE_LOG(ERR, PMD, "Failed to allocate ARP packet from pool\n");
833					continue;
834				}
835				pkt_size = sizeof(struct ether_hdr) + sizeof(struct arp_hdr)
836						+ client_info->vlan_count * sizeof(struct vlan_hdr);
837				upd_pkt->data_len = pkt_size;
838				upd_pkt->pkt_len = pkt_size;
839
840				slave_idx = bond_mode_alb_arp_upd(client_info, upd_pkt,
841						internals);
842
843				/* Add packet to update tx buffer */
844				update_bufs[slave_idx][update_bufs_pkts[slave_idx]] = upd_pkt;
845				update_bufs_pkts[slave_idx]++;
846			}
847		}
848		internals->mode6.ntt = 0;
849	}
850
851	/* Send ARP packets on proper slaves */
852	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
853		if (slave_bufs_pkts[i] > 0) {
854			num_send = rte_eth_tx_burst(i, bd_tx_q->queue_id,
855					slave_bufs[i], slave_bufs_pkts[i]);
856			for (j = 0; j < slave_bufs_pkts[i] - num_send; j++) {
857				bufs[nb_pkts - 1 - num_not_send - j] =
858						slave_bufs[i][nb_pkts - 1 - j];
859			}
860
861			num_tx_total += num_send;
862			num_not_send += slave_bufs_pkts[i] - num_send;
863
864#if defined(RTE_LIBRTE_BOND_DEBUG_ALB) || defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1)
865			/* Print TX stats, including update packets */
866			for (j = 0; j < slave_bufs_pkts[i]; j++) {
867				eth_h = rte_pktmbuf_mtod(slave_bufs[i][j], struct ether_hdr *);
868				mode6_debug("TX ARP:", eth_h, i, &burstnumberTX);
869			}
870#endif
871		}
872	}
873
874	/* Send update packets on proper slaves */
875	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
876		if (update_bufs_pkts[i] > 0) {
877			num_send = rte_eth_tx_burst(i, bd_tx_q->queue_id, update_bufs[i],
878					update_bufs_pkts[i]);
879			for (j = num_send; j < update_bufs_pkts[i]; j++) {
880				rte_pktmbuf_free(update_bufs[i][j]);
881			}
882#if defined(RTE_LIBRTE_BOND_DEBUG_ALB) || defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1)
883			for (j = 0; j < update_bufs_pkts[i]; j++) {
884				eth_h = rte_pktmbuf_mtod(update_bufs[i][j], struct ether_hdr *);
885				mode6_debug("TX ARPupd:", eth_h, i, &burstnumberTX);
886			}
887#endif
888		}
889	}
890
891	/* Send non-ARP packets using tlb policy */
892	if (slave_bufs_pkts[RTE_MAX_ETHPORTS] > 0) {
893		num_send = bond_ethdev_tx_burst_tlb(queue,
894				slave_bufs[RTE_MAX_ETHPORTS],
895				slave_bufs_pkts[RTE_MAX_ETHPORTS]);
896
897		for (j = 0; j < slave_bufs_pkts[RTE_MAX_ETHPORTS]; j++) {
898			bufs[nb_pkts - 1 - num_not_send - j] =
899					slave_bufs[RTE_MAX_ETHPORTS][nb_pkts - 1 - j];
900		}
901
902		num_tx_total += num_send;
903		num_not_send += slave_bufs_pkts[RTE_MAX_ETHPORTS] - num_send;
904	}
905
906	return num_tx_total;
907}
908
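/* Burst TX for balance mode: use the configured transmit hash policy to pick
 * an output slave for each packet, then burst the per-slave arrays. */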
909static uint16_t
910bond_ethdev_tx_burst_balance(void *queue, struct rte_mbuf **bufs,
911		uint16_t nb_pkts)
912{
913	struct bond_dev_private *internals;
914	struct bond_tx_queue *bd_tx_q;
915
916	uint8_t num_of_slaves;
917	uint8_t slaves[RTE_MAX_ETHPORTS];
918
919	uint16_t num_tx_total = 0, num_tx_slave = 0, tx_fail_total = 0;
920
921	int i, op_slave_id;
922
923	struct rte_mbuf *slave_bufs[RTE_MAX_ETHPORTS][nb_pkts];
924	uint16_t slave_nb_pkts[RTE_MAX_ETHPORTS] = { 0 };
925
926	bd_tx_q = (struct bond_tx_queue *)queue;
927	internals = bd_tx_q->dev_private;
928
929	/* Copy slave list to protect against slave up/down changes during tx
930	 * bursting */
931	num_of_slaves = internals->active_slave_count;
932	memcpy(slaves, internals->active_slaves,
933			sizeof(internals->active_slaves[0]) * num_of_slaves);
934
935	if (num_of_slaves < 1)
936		return num_tx_total;
937
938	/* Populate the per-slave mbuf arrays with the packets to be sent on each slave */
939	for (i = 0; i < nb_pkts; i++) {
940		/* Select output slave using hash based on xmit policy */
941		op_slave_id = internals->xmit_hash(bufs[i], num_of_slaves);
942
943		/* Populate slave mbuf arrays with mbufs for that slave */
944		slave_bufs[op_slave_id][slave_nb_pkts[op_slave_id]++] = bufs[i];
945	}
946
947	/* Send packet burst on each slave device */
948	for (i = 0; i < num_of_slaves; i++) {
949		if (slave_nb_pkts[i] > 0) {
950			num_tx_slave = rte_eth_tx_burst(slaves[i], bd_tx_q->queue_id,
951					slave_bufs[i], slave_nb_pkts[i]);
952
953			/* if the tx burst fails, move the unsent packets to the end of bufs */
954			if (unlikely(num_tx_slave < slave_nb_pkts[i])) {
955				int slave_tx_fail_count = slave_nb_pkts[i] - num_tx_slave;
956
957				tx_fail_total += slave_tx_fail_count;
958				memcpy(&bufs[nb_pkts - tx_fail_total],
959						&slave_bufs[i][num_tx_slave],
960						slave_tx_fail_count * sizeof(bufs[0]));
961			}
962
963			num_tx_total += num_tx_slave;
964		}
965	}
966
967	return num_tx_total;
968}
969
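/* Burst TX for mode 4 (802.3AD): first drain each slave's slow protocol TX
 * ring, then distribute the data packets across the slaves that are currently
 * in the DISTRIBUTING state using the configured transmit hash. */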
970static uint16_t
971bond_ethdev_tx_burst_8023ad(void *queue, struct rte_mbuf **bufs,
972		uint16_t nb_pkts)
973{
974	struct bond_dev_private *internals;
975	struct bond_tx_queue *bd_tx_q;
976
977	uint8_t num_of_slaves;
978	uint8_t slaves[RTE_MAX_ETHPORTS];
979	/* positions in the slaves array, not port IDs */
980	uint8_t distributing_offsets[RTE_MAX_ETHPORTS];
981	uint8_t distributing_count;
982
983	uint16_t num_tx_slave, num_tx_total = 0, num_tx_fail_total = 0;
984	uint16_t i, j, op_slave_idx;
985	const uint16_t buffs_size = nb_pkts + BOND_MODE_8023AX_SLAVE_TX_PKTS + 1;
986
987	/* Allocate room for additional slow protocol packets used in 802.3AD mode. */
988	struct rte_mbuf *slave_bufs[RTE_MAX_ETHPORTS][buffs_size];
989	void *slow_pkts[BOND_MODE_8023AX_SLAVE_TX_PKTS] = { NULL };
990
991	/* Total number of packets in slave_bufs */
992	uint16_t slave_nb_pkts[RTE_MAX_ETHPORTS] = { 0 };
993	/* Number of slow packets placed in each slave's buffer */
994	uint8_t slave_slow_nb_pkts[RTE_MAX_ETHPORTS] = { 0 };
995
996	bd_tx_q = (struct bond_tx_queue *)queue;
997	internals = bd_tx_q->dev_private;
998
999	/* Copy slave list to protect against slave up/down changes during tx
1000	 * bursting */
1001	num_of_slaves = internals->active_slave_count;
1002	if (num_of_slaves < 1)
1003		return num_tx_total;
1004
1005	memcpy(slaves, internals->active_slaves, sizeof(slaves[0]) * num_of_slaves);
1006
1007	distributing_count = 0;
1008	for (i = 0; i < num_of_slaves; i++) {
1009		struct port *port = &mode_8023ad_ports[slaves[i]];
1010
1011		slave_slow_nb_pkts[i] = rte_ring_dequeue_burst(port->tx_ring,
1012				slow_pkts, BOND_MODE_8023AX_SLAVE_TX_PKTS);
1013		slave_nb_pkts[i] = slave_slow_nb_pkts[i];
1014
1015		for (j = 0; j < slave_slow_nb_pkts[i]; j++)
1016			slave_bufs[i][j] = slow_pkts[j];
1017
1018		if (ACTOR_STATE(port, DISTRIBUTING))
1019			distributing_offsets[distributing_count++] = i;
1020	}
1021
1022	if (likely(distributing_count > 0)) {
1023		/* Populate the per-slave mbuf arrays with the packets to be sent on each slave */
1024		for (i = 0; i < nb_pkts; i++) {
1025			/* Select output slave using hash based on xmit policy */
1026			op_slave_idx = internals->xmit_hash(bufs[i], distributing_count);
1027
1028			/* Populate slave mbuf arrays with mbufs for that slave. Use only
1029			 * slaves that are currently distributing. */
1030			uint8_t slave_offset = distributing_offsets[op_slave_idx];
1031			slave_bufs[slave_offset][slave_nb_pkts[slave_offset]] = bufs[i];
1032			slave_nb_pkts[slave_offset]++;
1033		}
1034	}
1035
1036	/* Send packet burst on each slave device */
1037	for (i = 0; i < num_of_slaves; i++) {
1038		if (slave_nb_pkts[i] == 0)
1039			continue;
1040
1041		num_tx_slave = rte_eth_tx_burst(slaves[i], bd_tx_q->queue_id,
1042				slave_bufs[i], slave_nb_pkts[i]);
1043
1044		/* If tx burst fails drop slow packets */
1045		for ( ; num_tx_slave < slave_slow_nb_pkts[i]; num_tx_slave++)
1046			rte_pktmbuf_free(slave_bufs[i][num_tx_slave]);
1047
1048		num_tx_total += num_tx_slave - slave_slow_nb_pkts[i];
1049		num_tx_fail_total += slave_nb_pkts[i] - num_tx_slave;
1050
1051		/* If tx burst fails move packets to end of bufs */
1052		if (unlikely(num_tx_slave < slave_nb_pkts[i])) {
1053			uint16_t j = nb_pkts - num_tx_fail_total;
1054			for ( ; num_tx_slave < slave_nb_pkts[i]; j++, num_tx_slave++)
1055				bufs[j] = slave_bufs[i][num_tx_slave];
1056		}
1057	}
1058
1059	return num_tx_total;
1060}
1061
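/* Burst TX for broadcast mode: bump the mbuf reference counts and transmit
 * the whole burst on every active slave, reporting the count sent by the most
 * successful slave. */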
1062static uint16_t
1063bond_ethdev_tx_burst_broadcast(void *queue, struct rte_mbuf **bufs,
1064		uint16_t nb_pkts)
1065{
1066	struct bond_dev_private *internals;
1067	struct bond_tx_queue *bd_tx_q;
1068
1069	uint8_t tx_failed_flag = 0, num_of_slaves;
1070	uint8_t slaves[RTE_MAX_ETHPORTS];
1071
1072	uint16_t max_nb_of_tx_pkts = 0;
1073
1074	int slave_tx_total[RTE_MAX_ETHPORTS];
1075	int i, most_successful_tx_slave = -1;
1076
1077	bd_tx_q = (struct bond_tx_queue *)queue;
1078	internals = bd_tx_q->dev_private;
1079
1080	/* Copy slave list to protect against slave up/down changes during tx
1081	 * bursting */
1082	num_of_slaves = internals->active_slave_count;
1083	memcpy(slaves, internals->active_slaves,
1084			sizeof(internals->active_slaves[0]) * num_of_slaves);
1085
1086	if (num_of_slaves < 1)
1087		return 0;
1088
1089	/* Increment reference count on mbufs */
1090	for (i = 0; i < nb_pkts; i++)
1091		rte_mbuf_refcnt_update(bufs[i], num_of_slaves - 1);
1092
1093	/* Transmit burst on each active slave */
1094	for (i = 0; i < num_of_slaves; i++) {
1095		slave_tx_total[i] = rte_eth_tx_burst(slaves[i], bd_tx_q->queue_id,
1096					bufs, nb_pkts);
1097
1098		if (unlikely(slave_tx_total[i] < nb_pkts))
1099			tx_failed_flag = 1;
1100
1101		/* record the value and slave index for the slave which transmits the
1102		 * maximum number of packets */
1103		if (slave_tx_total[i] > max_nb_of_tx_pkts) {
1104			max_nb_of_tx_pkts = slave_tx_total[i];
1105			most_successful_tx_slave = i;
1106		}
1107	}
1108
1109	/* if slaves fail to transmit packets from burst, the calling application
1110	 * is not expected to know about multiple references to packets so we must
1111	 * handle failures of all packets except those of the most successful slave
1112	 */
1113	if (unlikely(tx_failed_flag))
1114		for (i = 0; i < num_of_slaves; i++)
1115			if (i != most_successful_tx_slave)
1116				while (slave_tx_total[i] < nb_pkts)
1117					rte_pktmbuf_free(bufs[slave_tx_total[i]++]);
1118
1119	return max_nb_of_tx_pkts;
1120}
1121
1122void
1123link_properties_set(struct rte_eth_dev *bonded_eth_dev,
1124		struct rte_eth_link *slave_dev_link)
1125{
1126	struct rte_eth_link *bonded_dev_link = &bonded_eth_dev->data->dev_link;
1127	struct bond_dev_private *internals = bonded_eth_dev->data->dev_private;
1128
1129	if (slave_dev_link->link_status &&
1130		bonded_eth_dev->data->dev_started) {
1131		bonded_dev_link->link_duplex = slave_dev_link->link_duplex;
1132		bonded_dev_link->link_speed = slave_dev_link->link_speed;
1133
1134		internals->link_props_set = 1;
1135	}
1136}
1137
1138void
1139link_properties_reset(struct rte_eth_dev *bonded_eth_dev)
1140{
1141	struct bond_dev_private *internals = bonded_eth_dev->data->dev_private;
1142
1143	memset(&(bonded_eth_dev->data->dev_link), 0,
1144			sizeof(bonded_eth_dev->data->dev_link));
1145
1146	internals->link_props_set = 0;
1147}
1148
1149int
1150link_properties_valid(struct rte_eth_link *bonded_dev_link,
1151		struct rte_eth_link *slave_dev_link)
1152{
1153	if (bonded_dev_link->link_duplex != slave_dev_link->link_duplex ||
1154		bonded_dev_link->link_speed !=  slave_dev_link->link_speed)
1155		return -1;
1156
1157	return 0;
1158}
1159
1160int
1161mac_address_get(struct rte_eth_dev *eth_dev, struct ether_addr *dst_mac_addr)
1162{
1163	struct ether_addr *mac_addr;
1164
1165	if (eth_dev == NULL) {
1166		RTE_LOG(ERR, PMD, "%s: NULL pointer eth_dev specified\n", __func__);
1167		return -1;
1168	}
1169
1170	if (dst_mac_addr == NULL) {
1171		RTE_LOG(ERR, PMD, "%s: NULL pointer MAC specified\n", __func__);
1172		return -1;
1173	}
1174
1175	mac_addr = eth_dev->data->mac_addrs;
1176
1177	ether_addr_copy(mac_addr, dst_mac_addr);
1178	return 0;
1179}
1180
1181int
1182mac_address_set(struct rte_eth_dev *eth_dev, struct ether_addr *new_mac_addr)
1183{
1184	struct ether_addr *mac_addr;
1185
1186	if (eth_dev == NULL) {
1187		RTE_BOND_LOG(ERR, "NULL pointer eth_dev specified");
1188		return -1;
1189	}
1190
1191	if (new_mac_addr == NULL) {
1192		RTE_BOND_LOG(ERR, "NULL pointer MAC specified");
1193		return -1;
1194	}
1195
1196	mac_addr = eth_dev->data->mac_addrs;
1197
1198	/* If the new MAC is different from the current MAC then update it */
1199	if (memcmp(mac_addr, new_mac_addr, sizeof(*mac_addr)) != 0)
1200		memcpy(mac_addr, new_mac_addr, sizeof(*mac_addr));
1201
1202	return 0;
1203}
1204
1205int
1206mac_address_slaves_update(struct rte_eth_dev *bonded_eth_dev)
1207{
1208	struct bond_dev_private *internals = bonded_eth_dev->data->dev_private;
1209	int i;
1210
1211	/* Update slave devices MAC addresses */
1212	if (internals->slave_count < 1)
1213		return -1;
1214
1215	switch (internals->mode) {
1216	case BONDING_MODE_ROUND_ROBIN:
1217	case BONDING_MODE_BALANCE:
1218	case BONDING_MODE_BROADCAST:
1219		for (i = 0; i < internals->slave_count; i++) {
1220			if (mac_address_set(&rte_eth_devices[internals->slaves[i].port_id],
1221					bonded_eth_dev->data->mac_addrs)) {
1222				RTE_BOND_LOG(ERR, "Failed to update port Id %d MAC address",
1223						internals->slaves[i].port_id);
1224				return -1;
1225			}
1226		}
1227		break;
1228	case BONDING_MODE_8023AD:
1229		bond_mode_8023ad_mac_address_update(bonded_eth_dev);
1230		break;
1231	case BONDING_MODE_ACTIVE_BACKUP:
1232	case BONDING_MODE_TLB:
1233	case BONDING_MODE_ALB:
1234	default:
1235		for (i = 0; i < internals->slave_count; i++) {
1236			if (internals->slaves[i].port_id ==
1237					internals->current_primary_port) {
1238				if (mac_address_set(&rte_eth_devices[internals->primary_port],
1239						bonded_eth_dev->data->mac_addrs)) {
1240					RTE_BOND_LOG(ERR, "Failed to update port Id %d MAC address",
1241							internals->current_primary_port);
1242					return -1;
1243				}
1244			} else {
1245				if (mac_address_set(
1246						&rte_eth_devices[internals->slaves[i].port_id],
1247						&internals->slaves[i].persisted_mac_addr)) {
1248					RTE_BOND_LOG(ERR, "Failed to update port Id %d MAC address",
1249							internals->slaves[i].port_id);
1250					return -1;
1251				}
1252			}
1253		}
1254	}
1255
1256	return 0;
1257}
1258
1259int
1260bond_ethdev_mode_set(struct rte_eth_dev *eth_dev, int mode)
1261{
1262	struct bond_dev_private *internals;
1263
1264	internals = eth_dev->data->dev_private;
1265
1266	switch (mode) {
1267	case BONDING_MODE_ROUND_ROBIN:
1268		eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_round_robin;
1269		eth_dev->rx_pkt_burst = bond_ethdev_rx_burst;
1270		break;
1271	case BONDING_MODE_ACTIVE_BACKUP:
1272		eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_active_backup;
1273		eth_dev->rx_pkt_burst = bond_ethdev_rx_burst_active_backup;
1274		break;
1275	case BONDING_MODE_BALANCE:
1276		eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_balance;
1277		eth_dev->rx_pkt_burst = bond_ethdev_rx_burst;
1278		break;
1279	case BONDING_MODE_BROADCAST:
1280		eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_broadcast;
1281		eth_dev->rx_pkt_burst = bond_ethdev_rx_burst;
1282		break;
1283	case BONDING_MODE_8023AD:
1284		if (bond_mode_8023ad_enable(eth_dev) != 0)
1285			return -1;
1286
1287		eth_dev->rx_pkt_burst = bond_ethdev_rx_burst_8023ad;
1288		eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_8023ad;
1289		RTE_LOG(WARNING, PMD,
1290				"Using mode 4, it is necessary to invoke the TX and RX burst "
1291				"functions at least every 100ms.\n");
1292		break;
1293	case BONDING_MODE_TLB:
1294		eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_tlb;
1295		eth_dev->rx_pkt_burst = bond_ethdev_rx_burst_active_backup;
1296		break;
1297	case BONDING_MODE_ALB:
1298		if (bond_mode_alb_enable(eth_dev) != 0)
1299			return -1;
1300
1301		eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_alb;
1302		eth_dev->rx_pkt_burst = bond_ethdev_rx_burst_alb;
1303		break;
1304	default:
1305		return -1;
1306	}
1307
1308	internals->mode = mode;
1309
1310	return 0;
1311}
1312
1313int
1314slave_configure(struct rte_eth_dev *bonded_eth_dev,
1315		struct rte_eth_dev *slave_eth_dev)
1316{
1317	struct bond_rx_queue *bd_rx_q;
1318	struct bond_tx_queue *bd_tx_q;
1319
1320	uint16_t old_nb_tx_queues = slave_eth_dev->data->nb_tx_queues;
1321	uint16_t old_nb_rx_queues = slave_eth_dev->data->nb_rx_queues;
1322	int errval;
1323	uint16_t q_id;
1324
1325	/* Stop slave */
1326	rte_eth_dev_stop(slave_eth_dev->data->port_id);
1327
1328	/* Enable interrupts on slave device if supported */
1329	if (slave_eth_dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC)
1330		slave_eth_dev->data->dev_conf.intr_conf.lsc = 1;
1331
1332	/* If RSS is enabled for bonding, try to enable it for slaves  */
1333	if (bonded_eth_dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS_FLAG) {
1334		if (bonded_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len
1335				!= 0) {
1336			slave_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len =
1337					bonded_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len;
1338			slave_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key =
1339					bonded_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key;
1340		} else {
1341			slave_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key = NULL;
1342		}
1343
1344		slave_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf =
1345				bonded_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf;
1346		slave_eth_dev->data->dev_conf.rxmode.mq_mode =
1347				bonded_eth_dev->data->dev_conf.rxmode.mq_mode;
1348	}
1349
1350	slave_eth_dev->data->dev_conf.rxmode.hw_vlan_filter =
1351			bonded_eth_dev->data->dev_conf.rxmode.hw_vlan_filter;
1352
1353	/* Configure device */
1354	errval = rte_eth_dev_configure(slave_eth_dev->data->port_id,
1355			bonded_eth_dev->data->nb_rx_queues,
1356			bonded_eth_dev->data->nb_tx_queues,
1357			&(slave_eth_dev->data->dev_conf));
1358	if (errval != 0) {
1359		RTE_BOND_LOG(ERR, "Cannot configure slave device: port %u , err (%d)",
1360				slave_eth_dev->data->port_id, errval);
1361		return errval;
1362	}
1363
1364	/* Setup Rx Queues */
1365	/* Use existing queues, if any */
1366	for (q_id = old_nb_rx_queues;
1367	     q_id < bonded_eth_dev->data->nb_rx_queues; q_id++) {
1368		bd_rx_q = (struct bond_rx_queue *)bonded_eth_dev->data->rx_queues[q_id];
1369
1370		errval = rte_eth_rx_queue_setup(slave_eth_dev->data->port_id, q_id,
1371				bd_rx_q->nb_rx_desc,
1372				rte_eth_dev_socket_id(slave_eth_dev->data->port_id),
1373				&(bd_rx_q->rx_conf), bd_rx_q->mb_pool);
1374		if (errval != 0) {
1375			RTE_BOND_LOG(ERR,
1376					"rte_eth_rx_queue_setup: port=%d queue_id %d, err (%d)",
1377					slave_eth_dev->data->port_id, q_id, errval);
1378			return errval;
1379		}
1380	}
1381
1382	/* Setup Tx Queues */
1383	/* Use existing queues, if any */
1384	for (q_id = old_nb_tx_queues;
1385	     q_id < bonded_eth_dev->data->nb_tx_queues; q_id++) {
1386		bd_tx_q = (struct bond_tx_queue *)bonded_eth_dev->data->tx_queues[q_id];
1387
1388		errval = rte_eth_tx_queue_setup(slave_eth_dev->data->port_id, q_id,
1389				bd_tx_q->nb_tx_desc,
1390				rte_eth_dev_socket_id(slave_eth_dev->data->port_id),
1391				&bd_tx_q->tx_conf);
1392		if (errval != 0) {
1393			RTE_BOND_LOG(ERR,
1394					"rte_eth_tx_queue_setup: port=%d queue_id %d, err (%d)",
1395					slave_eth_dev->data->port_id, q_id, errval);
1396			return errval;
1397		}
1398	}
1399
1400	/* Start device */
1401	errval = rte_eth_dev_start(slave_eth_dev->data->port_id);
1402	if (errval != 0) {
1403		RTE_BOND_LOG(ERR, "rte_eth_dev_start: port=%u, err (%d)",
1404				slave_eth_dev->data->port_id, errval);
1405		return -1;
1406	}
1407
1408	/* If RSS is enabled for bonding, synchronize RETA */
1409	if (bonded_eth_dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS) {
1410		int i;
1411		struct bond_dev_private *internals;
1412
1413		internals = bonded_eth_dev->data->dev_private;
1414
1415		for (i = 0; i < internals->slave_count; i++) {
1416			if (internals->slaves[i].port_id == slave_eth_dev->data->port_id) {
1417				errval = rte_eth_dev_rss_reta_update(
1418						slave_eth_dev->data->port_id,
1419						&internals->reta_conf[0],
1420						internals->slaves[i].reta_size);
1421				if (errval != 0) {
1422					RTE_LOG(WARNING, PMD,
1423							"rte_eth_dev_rss_reta_update on slave port %d fails (err %d)."
1424							" RSS Configuration for bonding may be inconsistent.\n",
1425							slave_eth_dev->data->port_id, errval);
1426				}
1427				break;
1428			}
1429		}
1430	}
1431
1432	/* If the lsc interrupt is set, check the slave's initial link status */
1433	if (slave_eth_dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC)
1434		bond_ethdev_lsc_event_callback(slave_eth_dev->data->port_id,
1435			RTE_ETH_EVENT_INTR_LSC, &bonded_eth_dev->data->port_id);
1436
1437	return 0;
1438}
1439
1440void
1441slave_remove(struct bond_dev_private *internals,
1442		struct rte_eth_dev *slave_eth_dev)
1443{
1444	uint8_t i;
1445
1446	for (i = 0; i < internals->slave_count; i++)
1447		if (internals->slaves[i].port_id ==
1448				slave_eth_dev->data->port_id)
1449			break;
1450
1451	if (i < (internals->slave_count - 1))
1452		memmove(&internals->slaves[i], &internals->slaves[i + 1],
1453				sizeof(internals->slaves[0]) *
1454				(internals->slave_count - i - 1));
1455
1456	internals->slave_count--;
1457}
1458
1459static void
1460bond_ethdev_slave_link_status_change_monitor(void *cb_arg);
1461
1462void
1463slave_add(struct bond_dev_private *internals,
1464		struct rte_eth_dev *slave_eth_dev)
1465{
1466	struct bond_slave_details *slave_details =
1467			&internals->slaves[internals->slave_count];
1468
1469	slave_details->port_id = slave_eth_dev->data->port_id;
1470	slave_details->last_link_status = 0;
1471
1472	/* Mark slave devices that don't support interrupts so we can
1473	 * compensate when we start the bond
1474	 */
1475	if (!(slave_eth_dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC)) {
1476		slave_details->link_status_poll_enabled = 1;
1477	}
1478
1479	slave_details->link_status_wait_to_complete = 0;
1480	/* save the slave's current MAC address so it can be restored later */
1481	memcpy(&(slave_details->persisted_mac_addr), slave_eth_dev->data->mac_addrs,
1482			sizeof(struct ether_addr));
1483}
1484
1485void
1486bond_ethdev_primary_set(struct bond_dev_private *internals,
1487		uint8_t slave_port_id)
1488{
1489	int i;
1490
1491	if (internals->active_slave_count < 1)
1492		internals->current_primary_port = slave_port_id;
1493	else
1494		/* Search bonded device slave ports for new proposed primary port */
1495		for (i = 0; i < internals->active_slave_count; i++) {
1496			if (internals->active_slaves[i] == slave_port_id)
1497				internals->current_primary_port = slave_port_id;
1498		}
1499}
1500
1501static void
1502bond_ethdev_promiscuous_enable(struct rte_eth_dev *eth_dev);
1503
1504static int
1505bond_ethdev_start(struct rte_eth_dev *eth_dev)
1506{
1507	struct bond_dev_private *internals;
1508	int i;
1509
1510	/* slave eth dev will be started by bonded device */
1511	if (check_for_bonded_ethdev(eth_dev)) {
1512		RTE_BOND_LOG(ERR, "User tried to explicitly start a slave eth_dev (%d)",
1513				eth_dev->data->port_id);
1514		return -1;
1515	}
1516
1517	eth_dev->data->dev_link.link_status = ETH_LINK_DOWN;
1518	eth_dev->data->dev_started = 1;
1519
1520	internals = eth_dev->data->dev_private;
1521
1522	if (internals->slave_count == 0) {
1523		RTE_BOND_LOG(ERR, "Cannot start port since there are no slave devices");
1524		return -1;
1525	}
1526
1527	if (internals->user_defined_mac == 0) {
1528		struct ether_addr *new_mac_addr = NULL;
1529
1530		for (i = 0; i < internals->slave_count; i++)
1531			if (internals->slaves[i].port_id == internals->primary_port)
1532				new_mac_addr = &internals->slaves[i].persisted_mac_addr;
1533
1534		if (new_mac_addr == NULL)
1535			return -1;
1536
1537		if (mac_address_set(eth_dev, new_mac_addr) != 0) {
1538			RTE_BOND_LOG(ERR, "bonded port (%d) failed to update MAC address",
1539					eth_dev->data->port_id);
1540			return -1;
1541		}
1542	}
1543
1544	/* Update all slave devices' MACs */
1545	if (mac_address_slaves_update(eth_dev) != 0)
1546		return -1;
1547
1548	/* If the bonded device is configured in promiscuous mode then re-apply the config */
1549	if (internals->promiscuous_en)
1550		bond_ethdev_promiscuous_enable(eth_dev);
1551
1552	/* Reconfigure each slave device if starting bonded device */
1553	for (i = 0; i < internals->slave_count; i++) {
1554		if (slave_configure(eth_dev,
1555				&(rte_eth_devices[internals->slaves[i].port_id])) != 0) {
1556			RTE_BOND_LOG(ERR,
1557					"bonded port (%d) failed to reconfigure slave device (%d)",
1558					eth_dev->data->port_id, internals->slaves[i].port_id);
1559			return -1;
1560		}
1561		/* We will need to poll for link status if any slave doesn't
1562		 * support interrupts
1563		 */
1564		if (internals->slaves[i].link_status_poll_enabled)
1565			internals->link_status_polling_enabled = 1;
1566	}
1567	/* start polling if needed */
1568	if (internals->link_status_polling_enabled) {
1569		rte_eal_alarm_set(
1570			internals->link_status_polling_interval_ms * 1000,
1571			bond_ethdev_slave_link_status_change_monitor,
1572			(void *)&rte_eth_devices[internals->port_id]);
1573	}
1574
1575	if (internals->user_defined_primary_port)
1576		bond_ethdev_primary_set(internals, internals->primary_port);
1577
1578	if (internals->mode == BONDING_MODE_8023AD)
1579		bond_mode_8023ad_start(eth_dev);
1580
1581	if (internals->mode == BONDING_MODE_TLB ||
1582			internals->mode == BONDING_MODE_ALB)
1583		bond_tlb_enable(internals);
1584
1585	return 0;
1586}
1587
1588static void
1589bond_ethdev_free_queues(struct rte_eth_dev *dev)
1590{
1591	uint8_t i;
1592
1593	if (dev->data->rx_queues != NULL) {
1594		for (i = 0; i < dev->data->nb_rx_queues; i++) {
1595			rte_free(dev->data->rx_queues[i]);
1596			dev->data->rx_queues[i] = NULL;
1597		}
1598		dev->data->nb_rx_queues = 0;
1599	}
1600
1601	if (dev->data->tx_queues != NULL) {
1602		for (i = 0; i < dev->data->nb_tx_queues; i++) {
1603			rte_free(dev->data->tx_queues[i]);
1604			dev->data->tx_queues[i] = NULL;
1605		}
1606		dev->data->nb_tx_queues = 0;
1607	}
1608}
1609
1610void
1611bond_ethdev_stop(struct rte_eth_dev *eth_dev)
1612{
1613	struct bond_dev_private *internals = eth_dev->data->dev_private;
1614	uint8_t i;
1615
1616	if (internals->mode == BONDING_MODE_8023AD) {
1617		struct port *port;
1618		void *pkt = NULL;
1619
1620		bond_mode_8023ad_stop(eth_dev);
1621
1622		/* Discard all messages to/from mode 4 state machines */
1623		for (i = 0; i < internals->active_slave_count; i++) {
1624			port = &mode_8023ad_ports[internals->active_slaves[i]];
1625
1626			RTE_ASSERT(port->rx_ring != NULL);
1627			while (rte_ring_dequeue(port->rx_ring, &pkt) != -ENOENT)
1628				rte_pktmbuf_free(pkt);
1629
1630			RTE_ASSERT(port->tx_ring != NULL);
1631			while (rte_ring_dequeue(port->tx_ring, &pkt) != -ENOENT)
1632				rte_pktmbuf_free(pkt);
1633		}
1634	}
1635
1636	if (internals->mode == BONDING_MODE_TLB ||
1637			internals->mode == BONDING_MODE_ALB) {
1638		bond_tlb_disable(internals);
1639		for (i = 0; i < internals->active_slave_count; i++)
1640			tlb_last_obytets[internals->active_slaves[i]] = 0;
1641	}
1642
1643	internals->active_slave_count = 0;
1644	internals->link_status_polling_enabled = 0;
1645	for (i = 0; i < internals->slave_count; i++)
1646		internals->slaves[i].last_link_status = 0;
1647
1648	eth_dev->data->dev_link.link_status = ETH_LINK_DOWN;
1649	eth_dev->data->dev_started = 0;
1650}
1651
1652void
1653bond_ethdev_close(struct rte_eth_dev *dev)
1654{
1655	struct bond_dev_private *internals = dev->data->dev_private;
1656
1657	bond_ethdev_free_queues(dev);
1658	rte_bitmap_reset(internals->vlan_filter_bmp);
1659}
1660
1661/* forward declaration */
1662static int bond_ethdev_configure(struct rte_eth_dev *dev);
1663
1664static void
1665bond_ethdev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
1666{
1667	struct bond_dev_private *internals = dev->data->dev_private;
1668
1669	dev_info->max_mac_addrs = 1;
1670
1671	dev_info->max_rx_pktlen = internals->candidate_max_rx_pktlen ?
1672				  internals->candidate_max_rx_pktlen : 2048;
1673
1674	dev_info->max_rx_queues = (uint16_t)128;
1675	dev_info->max_tx_queues = (uint16_t)512;
1676
1677	dev_info->min_rx_bufsize = 0;
1678	dev_info->pci_dev = NULL;
1679
1680	dev_info->rx_offload_capa = internals->rx_offload_capa;
1681	dev_info->tx_offload_capa = internals->tx_offload_capa;
1682	dev_info->flow_type_rss_offloads = internals->flow_type_rss_offloads;
1683
1684	dev_info->reta_size = internals->reta_size;
1685}
1686
1687static int
1688bond_ethdev_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on)
1689{
1690	int res;
1691	uint8_t i;
1692	struct bond_dev_private *internals = dev->data->dev_private;
1693
1694	/* don't do this while a slave is being added */
1695	rte_spinlock_lock(&internals->lock);
1696
1697	if (on)
1698		rte_bitmap_set(internals->vlan_filter_bmp, vlan_id);
1699	else
1700		rte_bitmap_clear(internals->vlan_filter_bmp, vlan_id);
1701
1702	for (i = 0; i < internals->slave_count; i++) {
1703		uint8_t port_id = internals->slaves[i].port_id;
1704
1705		res = rte_eth_dev_vlan_filter(port_id, vlan_id, on);
1706		if (res == ENOTSUP)
1707			RTE_LOG(WARNING, PMD,
1708				"Setting VLAN filter on slave port %u not supported.\n",
1709				port_id);
1710	}
1711
1712	rte_spinlock_unlock(&internals->lock);
1713	return 0;
1714}
1715
1716static int
1717bond_ethdev_rx_queue_setup(struct rte_eth_dev *dev, uint16_t rx_queue_id,
1718		uint16_t nb_rx_desc, unsigned int socket_id __rte_unused,
1719		const struct rte_eth_rxconf *rx_conf, struct rte_mempool *mb_pool)
1720{
1721	struct bond_rx_queue *bd_rx_q = (struct bond_rx_queue *)
1722			rte_zmalloc_socket(NULL, sizeof(struct bond_rx_queue),
1723					0, dev->data->numa_node);
1724	if (bd_rx_q == NULL)
1725		return -1;
1726
1727	bd_rx_q->queue_id = rx_queue_id;
1728	bd_rx_q->dev_private = dev->data->dev_private;
1729
1730	bd_rx_q->nb_rx_desc = nb_rx_desc;
1731
1732	memcpy(&(bd_rx_q->rx_conf), rx_conf, sizeof(struct rte_eth_rxconf));
1733	bd_rx_q->mb_pool = mb_pool;
1734
1735	dev->data->rx_queues[rx_queue_id] = bd_rx_q;
1736
1737	return 0;
1738}
1739
1740static int
1741bond_ethdev_tx_queue_setup(struct rte_eth_dev *dev, uint16_t tx_queue_id,
1742		uint16_t nb_tx_desc, unsigned int socket_id __rte_unused,
1743		const struct rte_eth_txconf *tx_conf)
1744{
1745	struct bond_tx_queue *bd_tx_q  = (struct bond_tx_queue *)
1746			rte_zmalloc_socket(NULL, sizeof(struct bond_tx_queue),
1747					0, dev->data->numa_node);
1748
1749	if (bd_tx_q == NULL)
1750		return -1;
1751
1752	bd_tx_q->queue_id = tx_queue_id;
1753	bd_tx_q->dev_private = dev->data->dev_private;
1754
1755	bd_tx_q->nb_tx_desc = nb_tx_desc;
1756	memcpy(&(bd_tx_q->tx_conf), tx_conf, sizeof(bd_tx_q->tx_conf));
1757
1758	dev->data->tx_queues[tx_queue_id] = bd_tx_q;
1759
1760	return 0;
1761}
1762
1763static void
1764bond_ethdev_rx_queue_release(void *queue)
1765{
1766	if (queue == NULL)
1767		return;
1768
1769	rte_free(queue);
1770}
1771
1772static void
1773bond_ethdev_tx_queue_release(void *queue)
1774{
1775	if (queue == NULL)
1776		return;
1777
1778	rte_free(queue);
1779}
1780
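/* Alarm callback used when link status polling is enabled: refresh each
 * slave's link status through its link_update op, raise an LSC event for any
 * slave whose status changed since the last poll, and re-arm the alarm. */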
1781static void
1782bond_ethdev_slave_link_status_change_monitor(void *cb_arg)
1783{
1784	struct rte_eth_dev *bonded_ethdev, *slave_ethdev;
1785	struct bond_dev_private *internals;
1786
1787	/* Default value for polling slave found is true as we don't want to
1788	 * disable the polling thread if we cannot get the lock */
1789	int i, polling_slave_found = 1;
1790
1791	if (cb_arg == NULL)
1792		return;
1793
1794	bonded_ethdev = (struct rte_eth_dev *)cb_arg;
1795	internals = (struct bond_dev_private *)bonded_ethdev->data->dev_private;
1796
1797	if (!bonded_ethdev->data->dev_started ||
1798		!internals->link_status_polling_enabled)
1799		return;
1800
1801	/* If the device is currently being configured then don't check the
1802	 * slaves' link status; wait until the next period */
1803	if (rte_spinlock_trylock(&internals->lock)) {
1804		if (internals->slave_count > 0)
1805			polling_slave_found = 0;
1806
1807		for (i = 0; i < internals->slave_count; i++) {
1808			if (!internals->slaves[i].link_status_poll_enabled)
1809				continue;
1810
1811			slave_ethdev = &rte_eth_devices[internals->slaves[i].port_id];
1812			polling_slave_found = 1;
1813
1814			/* Update slave link status */
1815			(*slave_ethdev->dev_ops->link_update)(slave_ethdev,
1816					internals->slaves[i].link_status_wait_to_complete);
1817
1818			/* if link status has changed since last checked then call lsc
1819			 * event callback */
1820			if (slave_ethdev->data->dev_link.link_status !=
1821					internals->slaves[i].last_link_status) {
1822				internals->slaves[i].last_link_status =
1823						slave_ethdev->data->dev_link.link_status;
1824
1825				bond_ethdev_lsc_event_callback(internals->slaves[i].port_id,
1826						RTE_ETH_EVENT_INTR_LSC,
1827						&bonded_ethdev->data->port_id);
1828			}
1829		}
1830		rte_spinlock_unlock(&internals->lock);
1831	}
1832
1833	if (polling_slave_found)
1834		/* Set alarm to continue monitoring link status of slave ethdevs */
1835		rte_eal_alarm_set(internals->link_status_polling_interval_ms * 1000,
1836				bond_ethdev_slave_link_status_change_monitor, cb_arg);
1837}
1838
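/* The bonded device reports link up as soon as one active slave reports link
 * up; with no active slaves, or when the device is stopped, it reports link
 * down. */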
1839static int
1840bond_ethdev_link_update(struct rte_eth_dev *bonded_eth_dev,
1841		int wait_to_complete)
1842{
1843	struct bond_dev_private *internals = bonded_eth_dev->data->dev_private;
1844
1845	if (!bonded_eth_dev->data->dev_started ||
1846		internals->active_slave_count == 0) {
1847		bonded_eth_dev->data->dev_link.link_status = ETH_LINK_DOWN;
1848		return 0;
1849	} else {
1850		struct rte_eth_dev *slave_eth_dev;
1851		int i, link_up = 0;
1852
1853		for (i = 0; i < internals->active_slave_count; i++) {
1854			slave_eth_dev = &rte_eth_devices[internals->active_slaves[i]];
1855
1856			(*slave_eth_dev->dev_ops->link_update)(slave_eth_dev,
1857					wait_to_complete);
1858			if (slave_eth_dev->data->dev_link.link_status == ETH_LINK_UP) {
1859				link_up = 1;
1860				break;
1861			}
1862		}
1863
1864		bonded_eth_dev->data->dev_link.link_status = link_up;
1865	}
1866
1867	return 0;
1868}
1869
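/* Aggregate statistics by summing the counters of all slave ports. */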
1870static void
1871bond_ethdev_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
1872{
1873	struct bond_dev_private *internals = dev->data->dev_private;
1874	struct rte_eth_stats slave_stats;
1875	int i, j;
1876
1877	for (i = 0; i < internals->slave_count; i++) {
1878		rte_eth_stats_get(internals->slaves[i].port_id, &slave_stats);
1879
1880		stats->ipackets += slave_stats.ipackets;
1881		stats->opackets += slave_stats.opackets;
1882		stats->ibytes += slave_stats.ibytes;
1883		stats->obytes += slave_stats.obytes;
1884		stats->imissed += slave_stats.imissed;
1885		stats->ierrors += slave_stats.ierrors;
1886		stats->oerrors += slave_stats.oerrors;
1887		stats->rx_nombuf += slave_stats.rx_nombuf;
1888
1889		for (j = 0; j < RTE_ETHDEV_QUEUE_STAT_CNTRS; j++) {
1890			stats->q_ipackets[j] += slave_stats.q_ipackets[j];
1891			stats->q_opackets[j] += slave_stats.q_opackets[j];
1892			stats->q_ibytes[j] += slave_stats.q_ibytes[j];
1893			stats->q_obytes[j] += slave_stats.q_obytes[j];
1894			stats->q_errors[j] += slave_stats.q_errors[j];
1895		}
1896
1897	}
1898}
1899
1900static void
1901bond_ethdev_stats_reset(struct rte_eth_dev *dev)
1902{
1903	struct bond_dev_private *internals = dev->data->dev_private;
1904	int i;
1905
1906	for (i = 0; i < internals->slave_count; i++)
1907		rte_eth_stats_reset(internals->slaves[i].port_id);
1908}
1909
1910static void
1911bond_ethdev_promiscuous_enable(struct rte_eth_dev *eth_dev)
1912{
1913	struct bond_dev_private *internals = eth_dev->data->dev_private;
1914	int i;
1915
1916	internals->promiscuous_en = 1;
1917
1918	switch (internals->mode) {
1919	/* Promiscuous mode is propagated to all slaves */
1920	case BONDING_MODE_ROUND_ROBIN:
1921	case BONDING_MODE_BALANCE:
1922	case BONDING_MODE_BROADCAST:
1923		for (i = 0; i < internals->slave_count; i++)
1924			rte_eth_promiscuous_enable(internals->slaves[i].port_id);
1925		break;
1926	/* In mode 4 promiscuous mode is managed when a slave is added/removed */
1927	case BONDING_MODE_8023AD:
1928		break;
1929	/* Promiscuous mode is propagated only to primary slave */
1930	case BONDING_MODE_ACTIVE_BACKUP:
1931	case BONDING_MODE_TLB:
1932	case BONDING_MODE_ALB:
1933	default:
1934		rte_eth_promiscuous_enable(internals->current_primary_port);
1935	}
1936}
1937
1938static void
1939bond_ethdev_promiscuous_disable(struct rte_eth_dev *dev)
1940{
1941	struct bond_dev_private *internals = dev->data->dev_private;
1942	int i;
1943
1944	internals->promiscuous_en = 0;
1945
1946	switch (internals->mode) {
1947	/* Promiscuous mode is propagated to all slaves */
1948	case BONDING_MODE_ROUND_ROBIN:
1949	case BONDING_MODE_BALANCE:
1950	case BONDING_MODE_BROADCAST:
1951		for (i = 0; i < internals->slave_count; i++)
1952			rte_eth_promiscuous_disable(internals->slaves[i].port_id);
1953		break;
1954	/* In mode 4 promiscuous mode is managed when a slave is added/removed */
1955	case BONDING_MODE_8023AD:
1956		break;
1957	/* Promiscuous mode is propagated only to primary slave */
1958	case BONDING_MODE_ACTIVE_BACKUP:
1959	case BONDING_MODE_TLB:
1960	case BONDING_MODE_ALB:
1961	default:
1962		rte_eth_promiscuous_disable(internals->current_primary_port);
1963	}
1964}
1965
1966static void
1967bond_ethdev_delayed_lsc_propagation(void *arg)
1968{
1969	if (arg == NULL)
1970		return;
1971
1972	_rte_eth_dev_callback_process((struct rte_eth_dev *)arg,
1973			RTE_ETH_EVENT_INTR_LSC, NULL);
1974}
1975
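/* Link status change callback registered on each slave port. It activates or
 * deactivates the slave within the bonded device, updates the primary port
 * and the bonded link status, and propagates the LSC event to the
 * application, optionally delayed by the configured up/down delays. */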
1976void
1977bond_ethdev_lsc_event_callback(uint8_t port_id, enum rte_eth_event_type type,
1978		void *param)
1979{
1980	struct rte_eth_dev *bonded_eth_dev, *slave_eth_dev;
1981	struct bond_dev_private *internals;
1982	struct rte_eth_link link;
1983
1984	int i, valid_slave = 0;
1985	uint8_t active_pos;
1986	uint8_t lsc_flag = 0;
1987
1988	if (type != RTE_ETH_EVENT_INTR_LSC || param == NULL)
1989		return;
1990
1991	bonded_eth_dev = &rte_eth_devices[*(uint8_t *)param];
1992	slave_eth_dev = &rte_eth_devices[port_id];
1993
1994	if (check_for_bonded_ethdev(bonded_eth_dev))
1995		return;
1996
1997	internals = bonded_eth_dev->data->dev_private;
1998
1999	/* If the device isn't started don't handle interrupts */
2000	if (!bonded_eth_dev->data->dev_started)
2001		return;
2002
2003	/* verify that port_id is a valid slave of bonded port */
2004	for (i = 0; i < internals->slave_count; i++) {
2005		if (internals->slaves[i].port_id == port_id) {
2006			valid_slave = 1;
2007			break;
2008		}
2009	}
2010
2011	if (!valid_slave)
2012		return;
2013
2014	/* Search for port in active port list */
2015	active_pos = find_slave_by_id(internals->active_slaves,
2016			internals->active_slave_count, port_id);
2017
2018	rte_eth_link_get_nowait(port_id, &link);
2019	if (link.link_status) {
2020		if (active_pos < internals->active_slave_count)
2021			return;
2022
2023		/* if no active slave ports then set this port to be primary port */
2024		if (internals->active_slave_count < 1) {
2025			/* If first active slave, then change link status */
2026			bonded_eth_dev->data->dev_link.link_status = ETH_LINK_UP;
2027			internals->current_primary_port = port_id;
2028			lsc_flag = 1;
2029
2030			mac_address_slaves_update(bonded_eth_dev);
2031
2032			/* Inherit eth dev link properties from first active slave */
2033			link_properties_set(bonded_eth_dev,
2034					&(slave_eth_dev->data->dev_link));
2035		} else {
2036			if (link_properties_valid(
2037				&bonded_eth_dev->data->dev_link, &link) != 0) {
2038				slave_eth_dev->data->dev_flags &=
2039					(~RTE_ETH_DEV_BONDED_SLAVE);
2040				RTE_LOG(ERR, PMD,
2041					"port %u invalid speed/duplex\n",
2042					port_id);
2043				return;
2044			}
2045		}
2046
2047		activate_slave(bonded_eth_dev, port_id);
2048
2049		/* If user has defined the primary port then default to using it */
2050		if (internals->user_defined_primary_port &&
2051				internals->primary_port == port_id)
2052			bond_ethdev_primary_set(internals, port_id);
2053	} else {
2054		if (active_pos == internals->active_slave_count)
2055			return;
2056
2057		/* Remove from active slave list */
2058		deactivate_slave(bonded_eth_dev, port_id);
2059
2060		/* No active slaves, change link status to down and reset other
2061		 * link properties */
2062		if (internals->active_slave_count < 1) {
2063			lsc_flag = 1;
2064			bonded_eth_dev->data->dev_link.link_status = ETH_LINK_DOWN;
2065
2066			link_properties_reset(bonded_eth_dev);
2067		}
2068
2069		/* Update primary id, take first active slave from list or if none
2070		 * available set to -1 */
2071		if (port_id == internals->current_primary_port) {
2072			if (internals->active_slave_count > 0)
2073				bond_ethdev_primary_set(internals,
2074						internals->active_slaves[0]);
2075			else
2076				internals->current_primary_port = internals->primary_port;
2077		}
2078	}
2079
2080	if (lsc_flag) {
2081		/* Cancel any possible outstanding interrupts if delays are enabled */
2082		if (internals->link_up_delay_ms > 0 ||
2083			internals->link_down_delay_ms > 0)
2084			rte_eal_alarm_cancel(bond_ethdev_delayed_lsc_propagation,
2085					bonded_eth_dev);
2086
2087		if (bonded_eth_dev->data->dev_link.link_status) {
2088			if (internals->link_up_delay_ms > 0)
2089				rte_eal_alarm_set(internals->link_up_delay_ms * 1000,
2090						bond_ethdev_delayed_lsc_propagation,
2091						(void *)bonded_eth_dev);
2092			else
2093				_rte_eth_dev_callback_process(bonded_eth_dev,
2094						RTE_ETH_EVENT_INTR_LSC, NULL);
2095
2096		} else {
2097			if (internals->link_down_delay_ms > 0)
2098				rte_eal_alarm_set(internals->link_down_delay_ms * 1000,
2099						bond_ethdev_delayed_lsc_propagation,
2100						(void *)bonded_eth_dev);
2101			else
2102				_rte_eth_dev_callback_process(bonded_eth_dev,
2103						RTE_ETH_EVENT_INTR_LSC, NULL);
2104		}
2105	}
2106}
2107
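/* Store the new RETA entries, replicate them across the whole local table
 * and push the table to every slave using that slave's own RETA size. */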
2108static int
2109bond_ethdev_rss_reta_update(struct rte_eth_dev *dev,
2110		struct rte_eth_rss_reta_entry64 *reta_conf, uint16_t reta_size)
2111{
2112	unsigned i, j;
2113	int result = 0;
2114	int slave_reta_size;
2115	unsigned reta_count;
2116	struct bond_dev_private *internals = dev->data->dev_private;
2117
2118	if (reta_size != internals->reta_size)
2119		return -EINVAL;
2120
2121	 /* Copy RETA table */
2122	reta_count = reta_size / RTE_RETA_GROUP_SIZE;
2123
2124	for (i = 0; i < reta_count; i++) {
2125		internals->reta_conf[i].mask = reta_conf[i].mask;
2126		for (j = 0; j < RTE_RETA_GROUP_SIZE; j++)
2127			if ((reta_conf[i].mask >> j) & 0x01)
2128				internals->reta_conf[i].reta[j] = reta_conf[i].reta[j];
2129	}
2130
2131	/* Fill rest of array */
2132	for (; i < RTE_DIM(internals->reta_conf); i += reta_count)
2133		memcpy(&internals->reta_conf[i], &internals->reta_conf[0],
2134				sizeof(internals->reta_conf[0]) * reta_count);
2135
2136	/* Propagate RETA over slaves */
2137	for (i = 0; i < internals->slave_count; i++) {
2138		slave_reta_size = internals->slaves[i].reta_size;
2139		result = rte_eth_dev_rss_reta_update(internals->slaves[i].port_id,
2140				&internals->reta_conf[0], slave_reta_size);
2141		if (result < 0)
2142			return result;
2143	}
2144
2145	return 0;
2146}
2147
2148static int
2149bond_ethdev_rss_reta_query(struct rte_eth_dev *dev,
2150		struct rte_eth_rss_reta_entry64 *reta_conf, uint16_t reta_size)
2151{
2152	int i, j;
2153	struct bond_dev_private *internals = dev->data->dev_private;
2154
2155	if (reta_size != internals->reta_size)
2156		return -EINVAL;
2157
2158	 /* Copy RETA table */
2159	for (i = 0; i < reta_size / RTE_RETA_GROUP_SIZE; i++)
2160		for (j = 0; j < RTE_RETA_GROUP_SIZE; j++)
2161			if ((reta_conf[i].mask >> j) & 0x01)
2162				reta_conf[i].reta[j] = internals->reta_conf[i].reta[j];
2163
2164	return 0;
2165}
2166
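/* Update the RSS hash configuration, masked by the RSS offloads supported by
 * the bonded device, and propagate it to every slave. */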
2167static int
2168bond_ethdev_rss_hash_update(struct rte_eth_dev *dev,
2169		struct rte_eth_rss_conf *rss_conf)
2170{
2171	int i, result = 0;
2172	struct bond_dev_private *internals = dev->data->dev_private;
2173	struct rte_eth_rss_conf bond_rss_conf;
2174
2175	memcpy(&bond_rss_conf, rss_conf, sizeof(struct rte_eth_rss_conf));
2176
2177	bond_rss_conf.rss_hf &= internals->flow_type_rss_offloads;
2178
2179	if (bond_rss_conf.rss_hf != 0)
2180		dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf = bond_rss_conf.rss_hf;
2181
2182	if (bond_rss_conf.rss_key && bond_rss_conf.rss_key_len <
2183			sizeof(internals->rss_key)) {
2184		if (bond_rss_conf.rss_key_len == 0)
2185			bond_rss_conf.rss_key_len = 40;
2186		internals->rss_key_len = bond_rss_conf.rss_key_len;
2187		memcpy(internals->rss_key, bond_rss_conf.rss_key,
2188				internals->rss_key_len);
2189	}
2190
2191	for (i = 0; i < internals->slave_count; i++) {
2192		result = rte_eth_dev_rss_hash_update(internals->slaves[i].port_id,
2193				&bond_rss_conf);
2194		if (result < 0)
2195			return result;
2196	}
2197
2198	return 0;
2199}
2200
2201static int
2202bond_ethdev_rss_hash_conf_get(struct rte_eth_dev *dev,
2203		struct rte_eth_rss_conf *rss_conf)
2204{
2205	struct bond_dev_private *internals = dev->data->dev_private;
2206
2207	rss_conf->rss_hf = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf;
2208	rss_conf->rss_key_len = internals->rss_key_len;
2209	if (rss_conf->rss_key)
2210		memcpy(rss_conf->rss_key, internals->rss_key, internals->rss_key_len);
2211
2212	return 0;
2213}
2214
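/* eth_dev operations exposed by the bonding PMD; the symbol is non-static so
 * that it can be installed on the ethdev when the bonded device is created. */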
2215const struct eth_dev_ops default_dev_ops = {
2216	.dev_start            = bond_ethdev_start,
2217	.dev_stop             = bond_ethdev_stop,
2218	.dev_close            = bond_ethdev_close,
2219	.dev_configure        = bond_ethdev_configure,
2220	.dev_infos_get        = bond_ethdev_info,
2221	.vlan_filter_set      = bond_ethdev_vlan_filter_set,
2222	.rx_queue_setup       = bond_ethdev_rx_queue_setup,
2223	.tx_queue_setup       = bond_ethdev_tx_queue_setup,
2224	.rx_queue_release     = bond_ethdev_rx_queue_release,
2225	.tx_queue_release     = bond_ethdev_tx_queue_release,
2226	.link_update          = bond_ethdev_link_update,
2227	.stats_get            = bond_ethdev_stats_get,
2228	.stats_reset          = bond_ethdev_stats_reset,
2229	.promiscuous_enable   = bond_ethdev_promiscuous_enable,
2230	.promiscuous_disable  = bond_ethdev_promiscuous_disable,
2231	.reta_update          = bond_ethdev_rss_reta_update,
2232	.reta_query           = bond_ethdev_rss_reta_query,
2233	.rss_hash_update      = bond_ethdev_rss_hash_update,
2234	.rss_hash_conf_get    = bond_ethdev_rss_hash_conf_get
2235};
2236
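/* vdev probe handler: parse the mandatory mode kvarg and the optional
 * socket_id kvarg, create the bonded ethdev, and stash the remaining kvargs
 * so bond_ethdev_configure() can apply them once all devices exist. */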
2237static int
2238bond_probe(const char *name, const char *params)
2239{
2240	struct bond_dev_private *internals;
2241	struct rte_kvargs *kvlist;
2242	uint8_t bonding_mode, socket_id;
2243	int  arg_count, port_id;
2244
2245	RTE_LOG(INFO, EAL, "Initializing pmd_bond for %s\n", name);
2246
2247	kvlist = rte_kvargs_parse(params, pmd_bond_init_valid_arguments);
2248	if (kvlist == NULL)
2249		return -1;
2250
2251	/* Parse link bonding mode */
2252	if (rte_kvargs_count(kvlist, PMD_BOND_MODE_KVARG) == 1) {
2253		if (rte_kvargs_process(kvlist, PMD_BOND_MODE_KVARG,
2254				&bond_ethdev_parse_slave_mode_kvarg,
2255				&bonding_mode) != 0) {
2256			RTE_LOG(ERR, EAL, "Invalid mode for bonded device %s\n",
2257					name);
2258			goto parse_error;
2259		}
2260	} else {
2261		RTE_LOG(ERR, EAL, "Mode must be specified exactly once for bonded "
2262				"device %s\n", name);
2263		goto parse_error;
2264	}
2265
2266	/* Parse socket id to create bonding device on */
2267	arg_count = rte_kvargs_count(kvlist, PMD_BOND_SOCKET_ID_KVARG);
2268	if (arg_count == 1) {
2269		if (rte_kvargs_process(kvlist, PMD_BOND_SOCKET_ID_KVARG,
2270				&bond_ethdev_parse_socket_id_kvarg, &socket_id)
2271				!= 0) {
2272			RTE_LOG(ERR, EAL, "Invalid socket id specified for "
2273					"bonded device %s\n", name);
2274			goto parse_error;
2275		}
2276	} else if (arg_count > 1) {
2277		RTE_LOG(ERR, EAL, "Socket id can be specified only once for "
2278				"bonded device %s\n", name);
2279		goto parse_error;
2280	} else {
2281		socket_id = rte_socket_id();
2282	}
2283
2284	/* Create link bonding eth device */
2285	port_id = rte_eth_bond_create(name, bonding_mode, socket_id);
2286	if (port_id < 0) {
2287		RTE_LOG(ERR, EAL, "Failed to create bonded device %s in mode %u on "
2288				"socket %u.\n",	name, bonding_mode, socket_id);
2289		goto parse_error;
2290	}
2291	internals = rte_eth_devices[port_id].data->dev_private;
2292	internals->kvlist = kvlist;
2293
2294	RTE_LOG(INFO, EAL, "Created bonded device %s on port %d in mode %u on "
2295			"socket %u.\n",	name, port_id, bonding_mode, socket_id);
2296	return 0;
2297
2298parse_error:
2299	rte_kvargs_free(kvlist);
2300
2301	return -1;
2302}
2303
2304static int
2305bond_remove(const char *name)
2306{
2307	int  ret;
2308
2309	if (name == NULL)
2310		return -EINVAL;
2311
2312	RTE_LOG(INFO, EAL, "Uninitializing pmd_bond for %s\n", name);
2313
2314	/* free link bonding eth device */
2315	ret = rte_eth_bond_free(name);
2316	if (ret < 0)
2317		RTE_LOG(ERR, EAL, "Failed to free %s\n", name);
2318
2319	return ret;
2320}
2321
2322/* This resolves the slave port ids after all the other pdevs and vdevs
2323 * have been allocated. */
2324static int
2325bond_ethdev_configure(struct rte_eth_dev *dev)
2326{
2327	char *name = dev->data->name;
2328	struct bond_dev_private *internals = dev->data->dev_private;
2329	struct rte_kvargs *kvlist = internals->kvlist;
2330	int arg_count;
2331	uint8_t port_id = dev - rte_eth_devices;
2332
2333	static const uint8_t default_rss_key[40] = {
2334		0x6D, 0x5A, 0x56, 0xDA, 0x25, 0x5B, 0x0E, 0xC2, 0x41, 0x67, 0x25, 0x3D,
2335		0x43, 0xA3, 0x8F, 0xB0, 0xD0, 0xCA, 0x2B, 0xCB, 0xAE, 0x7B, 0x30, 0xB4,
2336		0x77, 0xCB, 0x2D, 0xA3, 0x80, 0x30, 0xF2, 0x0C, 0x6A, 0x42, 0xB7, 0x3B,
2337		0xBE, 0xAC, 0x01, 0xFA
2338	};
2339
2340	unsigned i, j;
2341
2342	/* If RSS is enabled, fill table and key with default values */
2343	if (dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS) {
2344		dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key = internals->rss_key;
2345		dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len = 0;
2346		memcpy(internals->rss_key, default_rss_key, 40);
2347
2348		for (i = 0; i < RTE_DIM(internals->reta_conf); i++) {
2349			internals->reta_conf[i].mask = ~0LL;
2350			for (j = 0; j < RTE_RETA_GROUP_SIZE; j++)
2351				internals->reta_conf[i].reta[j] = j % dev->data->nb_rx_queues;
2352		}
2353	}
2354
2355	/* set the max_rx_pktlen */
2356	internals->max_rx_pktlen = internals->candidate_max_rx_pktlen;
2357
2358	/*
2359	 * if no kvlist, it means that this bonded device has been created
2360	 * through the bonding api.
2361	 */
2362	if (!kvlist)
2363		return 0;
2364
2365	/* Parse MAC address for bonded device */
2366	arg_count = rte_kvargs_count(kvlist, PMD_BOND_MAC_ADDR_KVARG);
2367	if (arg_count == 1) {
2368		struct ether_addr bond_mac;
2369
2370		if (rte_kvargs_process(kvlist, PMD_BOND_MAC_ADDR_KVARG,
2371				&bond_ethdev_parse_bond_mac_addr_kvarg, &bond_mac) < 0) {
2372			RTE_LOG(INFO, EAL, "Invalid MAC address for bonded device %s\n",
2373					name);
2374			return -1;
2375		}
2376
2377		/* Set MAC address */
2378		if (rte_eth_bond_mac_address_set(port_id, &bond_mac) != 0) {
2379			RTE_LOG(ERR, EAL,
2380					"Failed to set MAC address on bonded device %s\n",
2381					name);
2382			return -1;
2383		}
2384	} else if (arg_count > 1) {
2385		RTE_LOG(ERR, EAL,
2386				"MAC address can be specified only once for bonded device %s\n",
2387				name);
2388		return -1;
2389	}
2390
2391	/* Parse/set balance mode transmit policy */
2392	arg_count = rte_kvargs_count(kvlist, PMD_BOND_XMIT_POLICY_KVARG);
2393	if (arg_count == 1) {
2394		uint8_t xmit_policy;
2395
2396		if (rte_kvargs_process(kvlist, PMD_BOND_XMIT_POLICY_KVARG,
2397				&bond_ethdev_parse_balance_xmit_policy_kvarg, &xmit_policy) !=
2398						0) {
2399			RTE_LOG(INFO, EAL,
2400					"Invalid xmit policy specified for bonded device %s\n",
2401					name);
2402			return -1;
2403		}
2404
2405		/* Set balance mode transmit policy */
2406		if (rte_eth_bond_xmit_policy_set(port_id, xmit_policy) != 0) {
2407			RTE_LOG(ERR, EAL,
2408					"Failed to set balance xmit policy on bonded device %s\n",
2409					name);
2410			return -1;
2411		}
2412	} else if (arg_count > 1) {
2413		RTE_LOG(ERR, EAL,
2414				"Transmit policy can be specified only once for bonded device"
2415				" %s\n", name);
2416		return -1;
2417	}
2418
2419	/* Parse/add slave ports to bonded device */
2420	if (rte_kvargs_count(kvlist, PMD_BOND_SLAVE_PORT_KVARG) > 0) {
2421		struct bond_ethdev_slave_ports slave_ports;
2422		unsigned i;
2423
2424		memset(&slave_ports, 0, sizeof(slave_ports));
2425
2426		if (rte_kvargs_process(kvlist, PMD_BOND_SLAVE_PORT_KVARG,
2427				&bond_ethdev_parse_slave_port_kvarg, &slave_ports) != 0) {
2428			RTE_LOG(ERR, EAL,
2429					"Failed to parse slave ports for bonded device %s\n",
2430					name);
2431			return -1;
2432		}
2433
2434		for (i = 0; i < slave_ports.slave_count; i++) {
2435			if (rte_eth_bond_slave_add(port_id, slave_ports.slaves[i]) != 0) {
2436				RTE_LOG(ERR, EAL,
2437						"Failed to add port %d as slave to bonded device %s\n",
2438						slave_ports.slaves[i], name);
2439			}
2440		}
2441
2442	} else {
2443		RTE_LOG(INFO, EAL, "No slaves specified for bonded device %s\n", name);
2444		return -1;
2445	}
2446
2447	/* Parse/set primary slave port id*/
2448	arg_count = rte_kvargs_count(kvlist, PMD_BOND_PRIMARY_SLAVE_KVARG);
2449	if (arg_count == 1) {
2450		uint8_t primary_slave_port_id;
2451
2452		if (rte_kvargs_process(kvlist,
2453				PMD_BOND_PRIMARY_SLAVE_KVARG,
2454				&bond_ethdev_parse_primary_slave_port_id_kvarg,
2455				&primary_slave_port_id) < 0) {
2456			RTE_LOG(INFO, EAL,
2457					"Invalid primary slave port id specified for bonded device"
2458					" %s\n", name);
2459			return -1;
2460		}
2461
2462		/* Set primary slave port id */
2463		if (rte_eth_bond_primary_set(port_id, (uint8_t)primary_slave_port_id)
2464				!= 0) {
2465			RTE_LOG(ERR, EAL,
2466					"Failed to set primary slave port %d on bonded device %s\n",
2467					primary_slave_port_id, name);
2468			return -1;
2469		}
2470	} else if (arg_count > 1) {
2471		RTE_LOG(INFO, EAL,
2472				"Primary slave can be specified only once for bonded device"
2473				" %s\n", name);
2474		return -1;
2475	}
2476
2477	/* Parse link status monitor polling interval */
2478	arg_count = rte_kvargs_count(kvlist, PMD_BOND_LSC_POLL_PERIOD_KVARG);
2479	if (arg_count == 1) {
2480		uint32_t lsc_poll_interval_ms;
2481
2482		if (rte_kvargs_process(kvlist,
2483				PMD_BOND_LSC_POLL_PERIOD_KVARG,
2484				&bond_ethdev_parse_time_ms_kvarg,
2485				&lsc_poll_interval_ms) < 0) {
2486			RTE_LOG(INFO, EAL,
2487					"Invalid lsc polling interval value specified for bonded"
2488					" device %s\n", name);
2489			return -1;
2490		}
2491
2492		if (rte_eth_bond_link_monitoring_set(port_id, lsc_poll_interval_ms)
2493				!= 0) {
2494			RTE_LOG(ERR, EAL,
2495					"Failed to set lsc monitor polling interval (%u ms) on"
2496					" bonded device %s\n", lsc_poll_interval_ms, name);
2497			return -1;
2498		}
2499	} else if (arg_count > 1) {
2500		RTE_LOG(INFO, EAL,
2501				"LSC polling interval can be specified only once for bonded"
2502				" device %s\n", name);
2503		return -1;
2504	}
2505
2506	/* Parse link up interrupt propagation delay */
2507	arg_count = rte_kvargs_count(kvlist, PMD_BOND_LINK_UP_PROP_DELAY_KVARG);
2508	if (arg_count == 1) {
2509		uint32_t link_up_delay_ms;
2510
2511		if (rte_kvargs_process(kvlist,
2512				PMD_BOND_LINK_UP_PROP_DELAY_KVARG,
2513				&bond_ethdev_parse_time_ms_kvarg,
2514				&link_up_delay_ms) < 0) {
2515			RTE_LOG(INFO, EAL,
2516					"Invalid link up propagation delay value specified for"
2517					" bonded device %s\n", name);
2518			return -1;
2519		}
2520
2521		/* Set link up propagation delay */
2522		if (rte_eth_bond_link_up_prop_delay_set(port_id, link_up_delay_ms)
2523				!= 0) {
2524			RTE_LOG(ERR, EAL,
2525					"Failed to set link up propagation delay (%u ms) on bonded"
2526					" device %s\n", link_up_delay_ms, name);
2527			return -1;
2528		}
2529	} else if (arg_count > 1) {
2530		RTE_LOG(INFO, EAL,
2531				"Link up propagation delay can be specified only once for"
2532				" bonded device %s\n", name);
2533		return -1;
2534	}
2535
2536	/* Parse link down interrupt propagation delay */
2537	arg_count = rte_kvargs_count(kvlist, PMD_BOND_LINK_DOWN_PROP_DELAY_KVARG);
2538	if (arg_count == 1) {
2539		uint32_t link_down_delay_ms;
2540
2541		if (rte_kvargs_process(kvlist,
2542				PMD_BOND_LINK_DOWN_PROP_DELAY_KVARG,
2543				&bond_ethdev_parse_time_ms_kvarg,
2544				&link_down_delay_ms) < 0) {
2545			RTE_LOG(INFO, EAL,
2546					"Invalid link down propagation delay value specified for"
2547					" bonded device %s\n", name);
2548			return -1;
2549		}
2550
2551		/* Set link down propagation delay */
2552		if (rte_eth_bond_link_down_prop_delay_set(port_id, link_down_delay_ms)
2553				!= 0) {
2554			RTE_LOG(ERR, EAL,
2555					"Failed to set link down propagation delay (%u ms) on"
2556					" bonded device %s\n", link_down_delay_ms, name);
2557			return -1;
2558		}
2559	} else if (arg_count > 1) {
2560		RTE_LOG(INFO, EAL,
2561				"Link down propagation delay can be specified only once for"
2562				" bonded device %s\n", name);
2563		return -1;
2564	}
2565
2566	return 0;
2567}
2568
2569static struct rte_vdev_driver bond_drv = {
2570	.probe = bond_probe,
2571	.remove = bond_remove,
2572};
2573
2574RTE_PMD_REGISTER_VDEV(net_bonding, bond_drv);
2575RTE_PMD_REGISTER_ALIAS(net_bonding, eth_bond);
2576
2577RTE_PMD_REGISTER_PARAM_STRING(net_bonding,
2578	"slave=<ifc> "
2579	"primary=<ifc> "
2580	"mode=[0-6] "
2581	"xmit_policy=[l2 | l23 | l34] "
2582	"socket_id=<int> "
2583	"mac=<mac addr> "
2584	"lsc_poll_period_ms=<int> "
2585	"up_delay=<int> "
2586	"down_delay=<int>");
2587
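/*
 * Example (illustrative only; port/device names and values below are
 * placeholders): a bonded device can be created from the EAL command line,
 * e.g. two slaves bonded in balance mode (2) with the l34 transmit policy:
 *
 *   --vdev 'net_bonding0,mode=2,slave=0000:04:00.0,slave=0000:04:00.1,xmit_policy=l34'
 */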