mlx5_rxq.c revision 3d9b7210
1/*-
2 *   BSD LICENSE
3 *
4 *   Copyright 2015 6WIND S.A.
5 *   Copyright 2015 Mellanox.
6 *
7 *   Redistribution and use in source and binary forms, with or without
8 *   modification, are permitted provided that the following conditions
9 *   are met:
10 *
11 *     * Redistributions of source code must retain the above copyright
12 *       notice, this list of conditions and the following disclaimer.
13 *     * Redistributions in binary form must reproduce the above copyright
14 *       notice, this list of conditions and the following disclaimer in
15 *       the documentation and/or other materials provided with the
16 *       distribution.
17 *     * Neither the name of 6WIND S.A. nor the names of its
18 *       contributors may be used to endorse or promote products derived
19 *       from this software without specific prior written permission.
20 *
21 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
34#include <stddef.h>
35#include <assert.h>
36#include <errno.h>
37#include <string.h>
38#include <stdint.h>
39
40/* Verbs header. */
41/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
42#ifdef PEDANTIC
43#pragma GCC diagnostic ignored "-Wpedantic"
44#endif
45#include <infiniband/verbs.h>
46#include <infiniband/arch.h>
47#include <infiniband/mlx5_hw.h>
48#ifdef PEDANTIC
49#pragma GCC diagnostic error "-Wpedantic"
50#endif
51
52/* DPDK headers don't like -pedantic. */
53#ifdef PEDANTIC
54#pragma GCC diagnostic ignored "-Wpedantic"
55#endif
56#include <rte_mbuf.h>
57#include <rte_malloc.h>
58#include <rte_ethdev.h>
59#include <rte_common.h>
60#ifdef PEDANTIC
61#pragma GCC diagnostic error "-Wpedantic"
62#endif
63
64#include "mlx5.h"
65#include "mlx5_rxtx.h"
66#include "mlx5_utils.h"
67#include "mlx5_autoconf.h"
68#include "mlx5_defs.h"
69
70/* Initialization data for hash RX queues. */
71const struct hash_rxq_init hash_rxq_init[] = {
72	[HASH_RXQ_TCPV4] = {
73		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
74				IBV_EXP_RX_HASH_DST_IPV4 |
75				IBV_EXP_RX_HASH_SRC_PORT_TCP |
76				IBV_EXP_RX_HASH_DST_PORT_TCP),
77		.dpdk_rss_hf = ETH_RSS_NONFRAG_IPV4_TCP,
78		.flow_priority = 0,
79		.flow_spec.tcp_udp = {
80			.type = IBV_EXP_FLOW_SPEC_TCP,
81			.size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
82		},
83		.underlayer = &hash_rxq_init[HASH_RXQ_IPV4],
84	},
85	[HASH_RXQ_UDPV4] = {
86		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
87				IBV_EXP_RX_HASH_DST_IPV4 |
88				IBV_EXP_RX_HASH_SRC_PORT_UDP |
89				IBV_EXP_RX_HASH_DST_PORT_UDP),
90		.dpdk_rss_hf = ETH_RSS_NONFRAG_IPV4_UDP,
91		.flow_priority = 0,
92		.flow_spec.tcp_udp = {
93			.type = IBV_EXP_FLOW_SPEC_UDP,
94			.size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
95		},
96		.underlayer = &hash_rxq_init[HASH_RXQ_IPV4],
97	},
98	[HASH_RXQ_IPV4] = {
99		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
100				IBV_EXP_RX_HASH_DST_IPV4),
101		.dpdk_rss_hf = (ETH_RSS_IPV4 |
102				ETH_RSS_FRAG_IPV4),
103		.flow_priority = 1,
104		.flow_spec.ipv4 = {
105			.type = IBV_EXP_FLOW_SPEC_IPV4,
106			.size = sizeof(hash_rxq_init[0].flow_spec.ipv4),
107		},
108		.underlayer = &hash_rxq_init[HASH_RXQ_ETH],
109	},
110	[HASH_RXQ_TCPV6] = {
111		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
112				IBV_EXP_RX_HASH_DST_IPV6 |
113				IBV_EXP_RX_HASH_SRC_PORT_TCP |
114				IBV_EXP_RX_HASH_DST_PORT_TCP),
115		.dpdk_rss_hf = ETH_RSS_NONFRAG_IPV6_TCP,
116		.flow_priority = 0,
117		.flow_spec.tcp_udp = {
118			.type = IBV_EXP_FLOW_SPEC_TCP,
119			.size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
120		},
121		.underlayer = &hash_rxq_init[HASH_RXQ_IPV6],
122	},
123	[HASH_RXQ_UDPV6] = {
124		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
125				IBV_EXP_RX_HASH_DST_IPV6 |
126				IBV_EXP_RX_HASH_SRC_PORT_UDP |
127				IBV_EXP_RX_HASH_DST_PORT_UDP),
128		.dpdk_rss_hf = ETH_RSS_NONFRAG_IPV6_UDP,
129		.flow_priority = 0,
130		.flow_spec.tcp_udp = {
131			.type = IBV_EXP_FLOW_SPEC_UDP,
132			.size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
133		},
134		.underlayer = &hash_rxq_init[HASH_RXQ_IPV6],
135	},
136	[HASH_RXQ_IPV6] = {
137		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
138				IBV_EXP_RX_HASH_DST_IPV6),
139		.dpdk_rss_hf = (ETH_RSS_IPV6 |
140				ETH_RSS_FRAG_IPV6),
141		.flow_priority = 1,
142		.flow_spec.ipv6 = {
143			.type = IBV_EXP_FLOW_SPEC_IPV6,
144			.size = sizeof(hash_rxq_init[0].flow_spec.ipv6),
145		},
146		.underlayer = &hash_rxq_init[HASH_RXQ_ETH],
147	},
148	[HASH_RXQ_ETH] = {
149		.hash_fields = 0,
150		.dpdk_rss_hf = 0,
151		.flow_priority = 2,
152		.flow_spec.eth = {
153			.type = IBV_EXP_FLOW_SPEC_ETH,
154			.size = sizeof(hash_rxq_init[0].flow_spec.eth),
155		},
156		.underlayer = NULL,
157	},
158};
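/*
 * Note: the underlayer pointers above chain each type to the next less
 * specific one (e.g. TCPv4 -> IPv4 -> ETH); priv_flow_attr() below walks
 * this chain to lay out one flow specification per protocol layer,
 * outermost layer first.
 */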
159
160/* Number of entries in hash_rxq_init[]. */
161const unsigned int hash_rxq_init_n = RTE_DIM(hash_rxq_init);
162
163/* Initialization data for hash RX queue indirection tables. */
164static const struct ind_table_init ind_table_init[] = {
165	{
166		.max_size = -1u, /* Superseded by HW limitations. */
167		.hash_types =
168			1 << HASH_RXQ_TCPV4 |
169			1 << HASH_RXQ_UDPV4 |
170			1 << HASH_RXQ_IPV4 |
171			1 << HASH_RXQ_TCPV6 |
172			1 << HASH_RXQ_UDPV6 |
173			1 << HASH_RXQ_IPV6 |
174			0,
175		.hash_types_n = 6,
176	},
177	{
178		.max_size = 1,
179		.hash_types = 1 << HASH_RXQ_ETH,
180		.hash_types_n = 1,
181	},
182};
183
184#define IND_TABLE_INIT_N RTE_DIM(ind_table_init)
185
186/* Default RSS hash key also used for ConnectX-3. */
187uint8_t rss_hash_default_key[] = {
188	0x2c, 0xc6, 0x81, 0xd1,
189	0x5b, 0xdb, 0xf4, 0xf7,
190	0xfc, 0xa2, 0x83, 0x19,
191	0xdb, 0x1a, 0x3e, 0x94,
192	0x6b, 0x9e, 0x38, 0xd9,
193	0x2c, 0x9c, 0x03, 0xd1,
194	0xad, 0x99, 0x44, 0xa7,
195	0xd9, 0x56, 0x3d, 0x59,
196	0x06, 0x3c, 0x25, 0xf3,
197	0xfc, 0x1f, 0xdc, 0x2a,
198};
199
200/* Length of the default RSS hash key. */
201const size_t rss_hash_default_key_len = sizeof(rss_hash_default_key);
202
203/**
204 * Populate flow steering rule for a given hash RX queue type using
205 * information from hash_rxq_init[]. Nothing is written to flow_attr when
206 * flow_attr_size is not large enough, but the required size is still returned.
207 *
208 * @param priv
209 *   Pointer to private structure.
210 * @param[out] flow_attr
211 *   Pointer to flow attribute structure to fill. The allocated area
212 *   must be large enough to hold the structure and all flow specifications.
213 * @param flow_attr_size
214 *   Entire size of flow_attr and trailing room for flow specifications.
215 * @param type
216 *   Hash RX queue type to use for flow steering rule.
217 *
218 * @return
219 *   Total size of the flow attribute buffer. No errors are defined.
220 */
221size_t
222priv_flow_attr(struct priv *priv, struct ibv_exp_flow_attr *flow_attr,
223	       size_t flow_attr_size, enum hash_rxq_type type)
224{
225	size_t offset = sizeof(*flow_attr);
226	const struct hash_rxq_init *init = &hash_rxq_init[type];
227
228	assert(priv != NULL);
229	assert((size_t)type < RTE_DIM(hash_rxq_init));
230	do {
231		offset += init->flow_spec.hdr.size;
232		init = init->underlayer;
233	} while (init != NULL);
234	if (offset > flow_attr_size)
235		return offset;
236	flow_attr_size = offset;
237	init = &hash_rxq_init[type];
238	*flow_attr = (struct ibv_exp_flow_attr){
239		.type = IBV_EXP_FLOW_ATTR_NORMAL,
240		/* Priorities < 3 are reserved for flow director. */
241		.priority = init->flow_priority + 3,
242		.num_of_specs = 0,
243		.port = priv->port,
244		.flags = 0,
245	};
246	do {
247		offset -= init->flow_spec.hdr.size;
248		memcpy((void *)((uintptr_t)flow_attr + offset),
249		       &init->flow_spec,
250		       init->flow_spec.hdr.size);
251		++flow_attr->num_of_specs;
252		init = init->underlayer;
253	} while (init != NULL);
254	return flow_attr_size;
255}
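/*
 * Usage sketch (illustrative, not part of the original file): since nothing
 * is written when flow_attr_size is too small, the required size can be
 * queried first with a zero-sized buffer:
 *
 *	size_t size = priv_flow_attr(priv, NULL, 0, HASH_RXQ_TCPV4);
 *	struct ibv_exp_flow_attr *attr = rte_malloc(__func__, size, 0);
 *
 *	if (attr != NULL)
 *		priv_flow_attr(priv, attr, size, HASH_RXQ_TCPV4);
 */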
256
257/**
258 * Convert hash type position in indirection table initializer to
259 * hash RX queue type.
260 *
261 * @param table
262 *   Indirection table initializer.
263 * @param pos
264 *   Hash type position.
265 *
266 * @return
267 *   Hash RX queue type.
268 */
269static enum hash_rxq_type
270hash_rxq_type_from_pos(const struct ind_table_init *table, unsigned int pos)
271{
272	enum hash_rxq_type type = HASH_RXQ_TCPV4;
273
274	assert(pos < table->hash_types_n);
275	do {
276		if ((table->hash_types & (1 << type)) && (pos-- == 0))
277			break;
278		++type;
279	} while (1);
280	return type;
281}
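/*
 * Worked example (illustrative, assuming the enum order matches the
 * initializer above): with HASH_RXQ_TCPV4, HASH_RXQ_UDPV4 and HASH_RXQ_IPV4
 * enabled in hash_types, pos == 1 yields HASH_RXQ_UDPV4; pos indexes enabled
 * types in ascending order, not raw bit positions.
 */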
282
283/**
284 * Filter out disabled hash RX queue types from ind_table_init[].
285 *
286 * @param priv
287 *   Pointer to private structure.
288 * @param[out] table
289 *   Output table.
290 *
291 * @return
292 *   Number of table entries.
293 */
294static unsigned int
295priv_make_ind_table_init(struct priv *priv,
296			 struct ind_table_init (*table)[IND_TABLE_INIT_N])
297{
298	uint64_t rss_hf;
299	unsigned int i;
300	unsigned int j;
301	unsigned int table_n = 0;
302	/* Mandatory to receive frames not handled by normal hash RX queues. */
303	unsigned int hash_types_sup = 1 << HASH_RXQ_ETH;
304
305	rss_hf = priv->rss_hf;
306	/* Process other protocols only if more than one queue. */
307	if (priv->rxqs_n > 1)
308		for (i = 0; (i != hash_rxq_init_n); ++i)
309			if (rss_hf & hash_rxq_init[i].dpdk_rss_hf)
310				hash_types_sup |= (1 << i);
311
312	/* Filter out entries whose protocols are not in the set. */
313	for (i = 0, j = 0; (i != IND_TABLE_INIT_N); ++i) {
314		unsigned int nb;
315		unsigned int h;
316
317		/* j is increased only if the table has valid protocols. */
318		assert(j <= i);
319		(*table)[j] = ind_table_init[i];
320		(*table)[j].hash_types &= hash_types_sup;
321		for (h = 0, nb = 0; (h != hash_rxq_init_n); ++h)
322			if (((*table)[j].hash_types >> h) & 0x1)
323				++nb;
324		(*table)[j].hash_types_n = nb;
325		if (nb) {
326			++table_n;
327			++j;
328		}
329	}
330	return table_n;
331}
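/*
 * Example (illustrative): with a single RX queue, hash_types_sup only
 * contains HASH_RXQ_ETH, the first ind_table_init[] entry is filtered out
 * entirely and the function returns 1 with (*table)[0] describing the L2
 * table.
 */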
332
333/**
334 * Initialize hash RX queues and indirection table.
335 *
336 * @param priv
337 *   Pointer to private structure.
338 *
339 * @return
340 *   0 on success, errno value on failure.
341 */
342int
343priv_create_hash_rxqs(struct priv *priv)
344{
345	struct ibv_exp_wq *wqs[priv->reta_idx_n];
346	struct ind_table_init ind_table_init[IND_TABLE_INIT_N];
347	unsigned int ind_tables_n =
348		priv_make_ind_table_init(priv, &ind_table_init);
349	unsigned int hash_rxqs_n = 0;
350	struct hash_rxq (*hash_rxqs)[] = NULL;
351	struct ibv_exp_rwq_ind_table *(*ind_tables)[] = NULL;
352	unsigned int i;
353	unsigned int j;
354	unsigned int k;
355	int err = 0;
356
357	assert(priv->ind_tables == NULL);
358	assert(priv->ind_tables_n == 0);
359	assert(priv->hash_rxqs == NULL);
360	assert(priv->hash_rxqs_n == 0);
361	assert(priv->pd != NULL);
362	assert(priv->ctx != NULL);
363	if (priv->rxqs_n == 0)
364		return EINVAL;
365	assert(priv->rxqs != NULL);
366	if (ind_tables_n == 0) {
367		ERROR("all hash RX queue types have been filtered out,"
368		      " indirection table cannot be created");
369		return EINVAL;
370	}
371	if (priv->rxqs_n & (priv->rxqs_n - 1)) {
372		INFO("%u RX queues are configured, consider rounding this"
373		     " number to the next power of two for better balancing",
374		     priv->rxqs_n);
375		DEBUG("indirection table extended to assume %u WQs",
376		      priv->reta_idx_n);
377	}
378	for (i = 0; (i != priv->reta_idx_n); ++i) {
379		struct rxq_ctrl *rxq_ctrl;
380
381		rxq_ctrl = container_of((*priv->rxqs)[(*priv->reta_idx)[i]],
382					struct rxq_ctrl, rxq);
383		wqs[i] = rxq_ctrl->wq;
384	}
385	/* Get number of hash RX queues to configure. */
386	for (i = 0, hash_rxqs_n = 0; (i != ind_tables_n); ++i)
387		hash_rxqs_n += ind_table_init[i].hash_types_n;
388	DEBUG("allocating %u hash RX queues for %u WQs, %u indirection tables",
389	      hash_rxqs_n, priv->rxqs_n, ind_tables_n);
390	/* Create indirection tables. */
391	ind_tables = rte_calloc(__func__, ind_tables_n,
392				sizeof((*ind_tables)[0]), 0);
393	if (ind_tables == NULL) {
394		err = ENOMEM;
395		ERROR("cannot allocate indirection tables container: %s",
396		      strerror(err));
397		goto error;
398	}
399	for (i = 0; (i != ind_tables_n); ++i) {
400		struct ibv_exp_rwq_ind_table_init_attr ind_init_attr = {
401			.pd = priv->pd,
402			.log_ind_tbl_size = 0, /* Set below. */
403			.ind_tbl = wqs,
404			.comp_mask = 0,
405		};
406		unsigned int ind_tbl_size = ind_table_init[i].max_size;
407		struct ibv_exp_rwq_ind_table *ind_table;
408
409		if (priv->reta_idx_n < ind_tbl_size)
410			ind_tbl_size = priv->reta_idx_n;
411		ind_init_attr.log_ind_tbl_size = log2above(ind_tbl_size);
412		errno = 0;
413		ind_table = ibv_exp_create_rwq_ind_table(priv->ctx,
414							 &ind_init_attr);
415		if (ind_table != NULL) {
416			(*ind_tables)[i] = ind_table;
417			continue;
418		}
419		/* Not clear whether errno is set. */
420		err = (errno ? errno : EINVAL);
421		ERROR("RX indirection table creation failed with error %d: %s",
422		      err, strerror(err));
423		goto error;
424	}
425	/* Allocate array that holds hash RX queues and related data. */
426	hash_rxqs = rte_calloc(__func__, hash_rxqs_n,
427			       sizeof((*hash_rxqs)[0]), 0);
428	if (hash_rxqs == NULL) {
429		err = ENOMEM;
430		ERROR("cannot allocate hash RX queues container: %s",
431		      strerror(err));
432		goto error;
433	}
434	for (i = 0, j = 0, k = 0;
435	     ((i != hash_rxqs_n) && (j != ind_tables_n));
436	     ++i) {
437		struct hash_rxq *hash_rxq = &(*hash_rxqs)[i];
438		enum hash_rxq_type type =
439			hash_rxq_type_from_pos(&ind_table_init[j], k);
440		struct rte_eth_rss_conf *priv_rss_conf =
441			(*priv->rss_conf)[type];
442		struct ibv_exp_rx_hash_conf hash_conf = {
443			.rx_hash_function = IBV_EXP_RX_HASH_FUNC_TOEPLITZ,
444			.rx_hash_key_len = (priv_rss_conf ?
445					    priv_rss_conf->rss_key_len :
446					    rss_hash_default_key_len),
447			.rx_hash_key = (priv_rss_conf ?
448					priv_rss_conf->rss_key :
449					rss_hash_default_key),
450			.rx_hash_fields_mask = hash_rxq_init[type].hash_fields,
451			.rwq_ind_tbl = (*ind_tables)[j],
452		};
453		struct ibv_exp_qp_init_attr qp_init_attr = {
454			.max_inl_recv = 0, /* Currently not supported. */
455			.qp_type = IBV_QPT_RAW_PACKET,
456			.comp_mask = (IBV_EXP_QP_INIT_ATTR_PD |
457				      IBV_EXP_QP_INIT_ATTR_RX_HASH),
458			.pd = priv->pd,
459			.rx_hash_conf = &hash_conf,
460			.port_num = priv->port,
461		};
462
463		DEBUG("using indirection table %u for hash RX queue %u type %d",
464		      j, i, type);
465		*hash_rxq = (struct hash_rxq){
466			.priv = priv,
467			.qp = ibv_exp_create_qp(priv->ctx, &qp_init_attr),
468			.type = type,
469		};
470		if (hash_rxq->qp == NULL) {
471			err = (errno ? errno : EINVAL);
472			ERROR("Hash RX QP creation failure: %s",
473			      strerror(err));
474			goto error;
475		}
476		if (++k < ind_table_init[j].hash_types_n)
477			continue;
478		/* Switch to the next indirection table and reset hash RX
479		 * queue type array index. */
480		++j;
481		k = 0;
482	}
483	priv->ind_tables = ind_tables;
484	priv->ind_tables_n = ind_tables_n;
485	priv->hash_rxqs = hash_rxqs;
486	priv->hash_rxqs_n = hash_rxqs_n;
487	assert(err == 0);
488	return 0;
489error:
490	if (hash_rxqs != NULL) {
491		for (i = 0; (i != hash_rxqs_n); ++i) {
492			struct ibv_qp *qp = (*hash_rxqs)[i].qp;
493
494			if (qp == NULL)
495				continue;
496			claim_zero(ibv_destroy_qp(qp));
497		}
498		rte_free(hash_rxqs);
499	}
500	if (ind_tables != NULL) {
501		for (j = 0; (j != ind_tables_n); ++j) {
502			struct ibv_exp_rwq_ind_table *ind_table =
503				(*ind_tables)[j];
504
505			if (ind_table == NULL)
506				continue;
507			claim_zero(ibv_exp_destroy_rwq_ind_table(ind_table));
508		}
509		rte_free(ind_tables);
510	}
511	return err;
512}
513
514/**
515 * Clean up hash RX queues and indirection table.
516 *
517 * @param priv
518 *   Pointer to private structure.
519 */
520void
521priv_destroy_hash_rxqs(struct priv *priv)
522{
523	unsigned int i;
524
525	DEBUG("destroying %u hash RX queues", priv->hash_rxqs_n);
526	if (priv->hash_rxqs_n == 0) {
527		assert(priv->hash_rxqs == NULL);
528		assert(priv->ind_tables == NULL);
529		return;
530	}
531	for (i = 0; (i != priv->hash_rxqs_n); ++i) {
532		struct hash_rxq *hash_rxq = &(*priv->hash_rxqs)[i];
533		unsigned int j, k;
534
535		assert(hash_rxq->priv == priv);
536		assert(hash_rxq->qp != NULL);
537		/* Also check that there are no remaining flows. */
538		for (j = 0; (j != RTE_DIM(hash_rxq->special_flow)); ++j)
539			for (k = 0;
540			     (k != RTE_DIM(hash_rxq->special_flow[j]));
541			     ++k)
542				assert(hash_rxq->special_flow[j][k] == NULL);
543		for (j = 0; (j != RTE_DIM(hash_rxq->mac_flow)); ++j)
544			for (k = 0; (k != RTE_DIM(hash_rxq->mac_flow[j])); ++k)
545				assert(hash_rxq->mac_flow[j][k] == NULL);
546		claim_zero(ibv_destroy_qp(hash_rxq->qp));
547	}
548	priv->hash_rxqs_n = 0;
549	rte_free(priv->hash_rxqs);
550	priv->hash_rxqs = NULL;
551	for (i = 0; (i != priv->ind_tables_n); ++i) {
552		struct ibv_exp_rwq_ind_table *ind_table =
553			(*priv->ind_tables)[i];
554
555		assert(ind_table != NULL);
556		claim_zero(ibv_exp_destroy_rwq_ind_table(ind_table));
557	}
558	priv->ind_tables_n = 0;
559	rte_free(priv->ind_tables);
560	priv->ind_tables = NULL;
561}
562
563/**
564 * Check whether a given flow type is allowed.
565 *
566 * @param priv
567 *   Pointer to private structure.
568 * @param type
569 *   Flow type to check.
570 *
571 * @return
572 *   Nonzero if the given flow type is allowed.
573 */
574int
575priv_allow_flow_type(struct priv *priv, enum hash_rxq_flow_type type)
576{
577	/* Only FLOW_TYPE_PROMISC is allowed when promiscuous mode
578	 * has been requested. */
579	if (priv->promisc_req)
580		return type == HASH_RXQ_FLOW_TYPE_PROMISC;
581	switch (type) {
582	case HASH_RXQ_FLOW_TYPE_PROMISC:
583		return !!priv->promisc_req;
584	case HASH_RXQ_FLOW_TYPE_ALLMULTI:
585		return !!priv->allmulti_req;
586	case HASH_RXQ_FLOW_TYPE_BROADCAST:
587	case HASH_RXQ_FLOW_TYPE_IPV6MULTI:
588		/* If allmulti is enabled, broadcast and ipv6multi
589		 * are unnecessary. */
590		return !priv->allmulti_req;
591	case HASH_RXQ_FLOW_TYPE_MAC:
592		return 1;
593	default:
594		/* Unsupported flow type is not allowed. */
595		return 0;
596	}
597	return 0;
598}
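/*
 * Example (illustrative): with allmulti requested and promiscuous mode not
 * requested, ALLMULTI and MAC flows are allowed while PROMISC, BROADCAST
 * and IPV6MULTI are not.
 */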
599
600/**
601 * Automatically enable/disable flows according to configuration.
602 *
603 * @param priv
604 *   Private structure.
605 *
606 * @return
607 *   0 on success, errno value on failure.
608 */
609int
610priv_rehash_flows(struct priv *priv)
611{
612	enum hash_rxq_flow_type i;
613
614	for (i = HASH_RXQ_FLOW_TYPE_PROMISC;
615			i != RTE_DIM((*priv->hash_rxqs)[0].special_flow);
616			++i)
617		if (!priv_allow_flow_type(priv, i)) {
618			priv_special_flow_disable(priv, i);
619		} else {
620			int ret = priv_special_flow_enable(priv, i);
621
622			if (ret)
623				return ret;
624		}
625	if (priv_allow_flow_type(priv, HASH_RXQ_FLOW_TYPE_MAC))
626		return priv_mac_addrs_enable(priv);
627	priv_mac_addrs_disable(priv);
628	return 0;
629}
630
631/**
632 * Allocate RX queue elements.
633 *
634 * @param rxq_ctrl
635 *   Pointer to RX queue structure.
636 * @param elts_n
637 *   Number of elements to allocate.
638 * @param[in] pool
639 *   If not NULL, fetch buffers from this array instead of allocating them
640 *   with rte_pktmbuf_alloc().
641 *
642 * @return
643 *   0 on success, errno value on failure.
644 */
645static int
646rxq_alloc_elts(struct rxq_ctrl *rxq_ctrl, unsigned int elts_n,
647	       struct rte_mbuf *(*pool)[])
648{
649	const unsigned int sges_n = 1 << rxq_ctrl->rxq.sges_n;
650	unsigned int i;
651	int ret = 0;
652
653	/* Iterate on segments. */
654	for (i = 0; (i != elts_n); ++i) {
655		struct rte_mbuf *buf;
656		volatile struct mlx5_wqe_data_seg *scat =
657			&(*rxq_ctrl->rxq.wqes)[i];
658
659		if (pool != NULL) {
660			buf = (*pool)[i];
661			assert(buf != NULL);
662			rte_pktmbuf_reset(buf);
663			rte_pktmbuf_refcnt_update(buf, 1);
664		} else
665			buf = rte_pktmbuf_alloc(rxq_ctrl->rxq.mp);
666		if (buf == NULL) {
667			assert(pool == NULL);
668			ERROR("%p: empty mbuf pool", (void *)rxq_ctrl);
669			ret = ENOMEM;
670			goto error;
671		}
672		/* Headroom is reserved by rte_pktmbuf_alloc(). */
673		assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
674		/* Buffer is supposed to be empty. */
675		assert(rte_pktmbuf_data_len(buf) == 0);
676		assert(rte_pktmbuf_pkt_len(buf) == 0);
677		assert(!buf->next);
678		/* Only the first segment keeps headroom. */
679		if (i % sges_n)
680			SET_DATA_OFF(buf, 0);
681		PORT(buf) = rxq_ctrl->rxq.port_id;
682		DATA_LEN(buf) = rte_pktmbuf_tailroom(buf);
683		PKT_LEN(buf) = DATA_LEN(buf);
684		NB_SEGS(buf) = 1;
685		/* scat->addr must be able to store a pointer. */
686		assert(sizeof(scat->addr) >= sizeof(uintptr_t));
687		*scat = (struct mlx5_wqe_data_seg){
688			.addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t)),
689			.byte_count = htonl(DATA_LEN(buf)),
690			.lkey = htonl(rxq_ctrl->mr->lkey),
691		};
692		(*rxq_ctrl->rxq.elts)[i] = buf;
693	}
694	DEBUG("%p: allocated and configured %u segments (max %u packets)",
695	      (void *)rxq_ctrl, elts_n, elts_n / (1 << rxq_ctrl->rxq.sges_n));
696	assert(ret == 0);
697	return 0;
698error:
699	assert(pool == NULL);
700	elts_n = i;
701	for (i = 0; (i != elts_n); ++i) {
702		if ((*rxq_ctrl->rxq.elts)[i] != NULL)
703			rte_pktmbuf_free_seg((*rxq_ctrl->rxq.elts)[i]);
704		(*rxq_ctrl->rxq.elts)[i] = NULL;
705	}
706	DEBUG("%p: failed, freed everything", (void *)rxq_ctrl);
707	assert(ret > 0);
708	return ret;
709}
710
711/**
712 * Free RX queue elements.
713 *
714 * @param rxq_ctrl
715 *   Pointer to RX queue structure.
716 */
717static void
718rxq_free_elts(struct rxq_ctrl *rxq_ctrl)
719{
720	unsigned int i;
721
722	DEBUG("%p: freeing WRs", (void *)rxq_ctrl);
723	if (rxq_ctrl->rxq.elts == NULL)
724		return;
725
726	for (i = 0; (i != (1u << rxq_ctrl->rxq.elts_n)); ++i) {
727		if ((*rxq_ctrl->rxq.elts)[i] != NULL)
728			rte_pktmbuf_free_seg((*rxq_ctrl->rxq.elts)[i]);
729		(*rxq_ctrl->rxq.elts)[i] = NULL;
730	}
731}
732
733/**
734 * Clean up an RX queue.
735 *
736 * Destroy objects, free allocated memory and reset the structure for reuse.
737 *
738 * @param rxq_ctrl
739 *   Pointer to RX queue structure.
740 */
741void
742rxq_cleanup(struct rxq_ctrl *rxq_ctrl)
743{
744	struct ibv_exp_release_intf_params params;
745
746	DEBUG("cleaning up %p", (void *)rxq_ctrl);
747	rxq_free_elts(rxq_ctrl);
748	if (rxq_ctrl->fdir_queue != NULL)
749		priv_fdir_queue_destroy(rxq_ctrl->priv, rxq_ctrl->fdir_queue);
750	if (rxq_ctrl->if_wq != NULL) {
751		assert(rxq_ctrl->priv != NULL);
752		assert(rxq_ctrl->priv->ctx != NULL);
753		assert(rxq_ctrl->wq != NULL);
754		params = (struct ibv_exp_release_intf_params){
755			.comp_mask = 0,
756		};
757		claim_zero(ibv_exp_release_intf(rxq_ctrl->priv->ctx,
758						rxq_ctrl->if_wq,
759						&params));
760	}
761	if (rxq_ctrl->if_cq != NULL) {
762		assert(rxq_ctrl->priv != NULL);
763		assert(rxq_ctrl->priv->ctx != NULL);
764		assert(rxq_ctrl->cq != NULL);
765		params = (struct ibv_exp_release_intf_params){
766			.comp_mask = 0,
767		};
768		claim_zero(ibv_exp_release_intf(rxq_ctrl->priv->ctx,
769						rxq_ctrl->if_cq,
770						&params));
771	}
772	if (rxq_ctrl->wq != NULL)
773		claim_zero(ibv_exp_destroy_wq(rxq_ctrl->wq));
774	if (rxq_ctrl->cq != NULL)
775		claim_zero(ibv_destroy_cq(rxq_ctrl->cq));
776	if (rxq_ctrl->rd != NULL) {
777		struct ibv_exp_destroy_res_domain_attr attr = {
778			.comp_mask = 0,
779		};
780
781		assert(rxq_ctrl->priv != NULL);
782		assert(rxq_ctrl->priv->ctx != NULL);
783		claim_zero(ibv_exp_destroy_res_domain(rxq_ctrl->priv->ctx,
784						      rxq_ctrl->rd,
785						      &attr));
786	}
787	if (rxq_ctrl->mr != NULL)
788		claim_zero(ibv_dereg_mr(rxq_ctrl->mr));
789	memset(rxq_ctrl, 0, sizeof(*rxq_ctrl));
790}
791
792/**
793 * Reconfigure RX queue buffers.
794 *
795 * rxq_rehash() does not allocate mbufs, as mbuf allocation, unless done
796 * from the right thread (such as a control thread), may corrupt the pool.
797 * In case of failure, the queue is left untouched.
798 *
799 * @param dev
800 *   Pointer to Ethernet device structure.
801 * @param rxq_ctrl
802 *   RX queue pointer.
803 *
804 * @return
805 *   0 on success, errno value on failure.
806 */
807int
808rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl)
809{
810	unsigned int elts_n = 1 << rxq_ctrl->rxq.elts_n;
811	unsigned int i;
812	struct ibv_exp_wq_attr mod;
813	int err;
814
815	DEBUG("%p: rehashing queue %p with %u SGE(s) per packet",
816	      (void *)dev, (void *)rxq_ctrl, 1 << rxq_ctrl->rxq.sges_n);
817	assert(!(elts_n % (1 << rxq_ctrl->rxq.sges_n)));
818	/* From now on, any failure will render the queue unusable.
819	 * Reinitialize WQ. */
820	mod = (struct ibv_exp_wq_attr){
821		.attr_mask = IBV_EXP_WQ_ATTR_STATE,
822		.wq_state = IBV_EXP_WQS_RESET,
823	};
824	err = ibv_exp_modify_wq(rxq_ctrl->wq, &mod);
825	if (err) {
826		ERROR("%p: cannot reset WQ: %s", (void *)dev, strerror(err));
827		assert(err > 0);
828		return err;
829	}
830	/* Snatch mbufs from original queue. */
831	claim_zero(rxq_alloc_elts(rxq_ctrl, elts_n, rxq_ctrl->rxq.elts));
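	/*
	 * rxq_alloc_elts() bumped each recycled mbuf reference count to 2;
	 * dropping one reference per buffer below brings it back to 1 without
	 * returning the buffers to the mempool.
	 */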
832	for (i = 0; i != elts_n; ++i) {
833		struct rte_mbuf *buf = (*rxq_ctrl->rxq.elts)[i];
834
835		assert(rte_mbuf_refcnt_read(buf) == 2);
836		rte_pktmbuf_free_seg(buf);
837	}
838	/* Change queue state to ready. */
839	mod = (struct ibv_exp_wq_attr){
840		.attr_mask = IBV_EXP_WQ_ATTR_STATE,
841		.wq_state = IBV_EXP_WQS_RDY,
842	};
843	err = ibv_exp_modify_wq(rxq_ctrl->wq, &mod);
844	if (err) {
845		ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
846		      (void *)dev, strerror(err));
847		goto error;
848	}
849	/* Update doorbell counter. */
850	rxq_ctrl->rxq.rq_ci = elts_n >> rxq_ctrl->rxq.sges_n;
851	rte_wmb();
852	*rxq_ctrl->rxq.rq_db = htonl(rxq_ctrl->rxq.rq_ci);
853error:
854	assert(err >= 0);
855	return err;
856}
857
858/**
859 * Initialize RX queue.
860 *
861 * @param tmpl
862 *   Pointer to RX queue control template.
863 *
864 * @return
865 *   0 on success, errno value on failure.
866 */
867static inline int
868rxq_setup(struct rxq_ctrl *tmpl)
869{
870	struct ibv_cq *ibcq = tmpl->cq;
871	struct mlx5_cq *cq = to_mxxx(cq, cq);
872	struct mlx5_rwq *rwq = container_of(tmpl->wq, struct mlx5_rwq, wq);
873	struct rte_mbuf *(*elts)[1 << tmpl->rxq.elts_n] =
874		rte_calloc_socket("RXQ", 1, sizeof(*elts), 0, tmpl->socket);
875
876	if (cq->cqe_sz != RTE_CACHE_LINE_SIZE) {
877		ERROR("Wrong MLX5_CQE_SIZE environment variable value: "
878		      "it should be set to %u", RTE_CACHE_LINE_SIZE);
879		return EINVAL;
880	}
881	if (elts == NULL)
882		return ENOMEM;
883	tmpl->rxq.rq_db = rwq->rq.db;
884	tmpl->rxq.cqe_n = log2above(ibcq->cqe);
885	tmpl->rxq.cq_ci = 0;
886	tmpl->rxq.rq_ci = 0;
887	tmpl->rxq.cq_db = cq->dbrec;
888	tmpl->rxq.wqes =
889		(volatile struct mlx5_wqe_data_seg (*)[])
890		(uintptr_t)rwq->rq.buff;
891	tmpl->rxq.cqes =
892		(volatile struct mlx5_cqe (*)[])
893		(uintptr_t)cq->active_buf->buf;
894	tmpl->rxq.elts = elts;
895	return 0;
896}
897
898/**
899 * Configure an RX queue.
900 *
901 * @param dev
902 *   Pointer to Ethernet device structure.
903 * @param rxq_ctrl
904 *   Pointer to RX queue structure.
905 * @param desc
906 *   Number of descriptors to configure in queue.
907 * @param socket
908 *   NUMA socket on which memory must be allocated.
909 * @param[in] conf
910 *   Thresholds parameters.
911 * @param mp
912 *   Memory pool for buffer allocations.
913 *
914 * @return
915 *   0 on success, errno value on failure.
916 */
917int
918rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
919	       uint16_t desc, unsigned int socket,
920	       const struct rte_eth_rxconf *conf, struct rte_mempool *mp)
921{
922	struct priv *priv = dev->data->dev_private;
923	struct rxq_ctrl tmpl = {
924		.priv = priv,
925		.socket = socket,
926		.rxq = {
927			.elts_n = log2above(desc),
928			.mp = mp,
929			.rss_hash = priv->rxqs_n > 1,
930		},
931	};
932	struct ibv_exp_wq_attr mod;
933	union {
934		struct ibv_exp_query_intf_params params;
935		struct ibv_exp_cq_init_attr cq;
936		struct ibv_exp_res_domain_init_attr rd;
937		struct ibv_exp_wq_init_attr wq;
938		struct ibv_exp_cq_attr cq_attr;
939	} attr;
940	enum ibv_exp_query_intf_status status;
941	unsigned int mb_len = rte_pktmbuf_data_room_size(mp);
942	unsigned int cqe_n = desc - 1;
943	struct rte_mbuf *(*elts)[desc] = NULL;
944	int ret = 0;
945
946	(void)conf; /* Thresholds configuration (ignored). */
947	/* Enable scattered packets support for this queue if necessary. */
948	assert(mb_len >= RTE_PKTMBUF_HEADROOM);
949	/* If mb_len is below the MRU, multi-segment support must be enabled. */
950	if (mb_len < (priv->mtu > dev->data->dev_conf.rxmode.max_rx_pkt_len ?
951		     dev->data->dev_conf.rxmode.max_rx_pkt_len :
952		     priv->mtu))
953		dev->data->dev_conf.rxmode.jumbo_frame = 1;
954	if ((dev->data->dev_conf.rxmode.jumbo_frame) &&
955	    (dev->data->dev_conf.rxmode.max_rx_pkt_len >
956	     (mb_len - RTE_PKTMBUF_HEADROOM))) {
957		unsigned int size =
958			RTE_PKTMBUF_HEADROOM +
959			dev->data->dev_conf.rxmode.max_rx_pkt_len;
960		unsigned int sges_n;
961
962		/*
963		 * Determine the number of SGEs needed for a full packet
964		 * and round it to the next power of two.
965		 */
966		sges_n = log2above((size / mb_len) + !!(size % mb_len));
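		/*
		 * Illustrative example (values are assumptions): with
		 * 2048-byte mbufs, a 128-byte RTE_PKTMBUF_HEADROOM and a
		 * 9000-byte max_rx_pkt_len, size is 9128, 5 SGEs are needed
		 * and log2above() rounds this up to 8 (sges_n == 3).
		 */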
967		tmpl.rxq.sges_n = sges_n;
968		/* Make sure rxq.sges_n did not overflow. */
969		size = mb_len * (1 << tmpl.rxq.sges_n);
970		size -= RTE_PKTMBUF_HEADROOM;
971		if (size < dev->data->dev_conf.rxmode.max_rx_pkt_len) {
972			ERROR("%p: too many SGEs (%u) needed to handle"
973			      " requested maximum packet size %u",
974			      (void *)dev,
975			      1 << sges_n,
976			      dev->data->dev_conf.rxmode.max_rx_pkt_len);
977			return EOVERFLOW;
978		}
979	}
980	DEBUG("%p: maximum number of segments per packet: %u",
981	      (void *)dev, 1 << tmpl.rxq.sges_n);
982	if (desc % (1 << tmpl.rxq.sges_n)) {
983		ERROR("%p: number of RX queue descriptors (%u) is not a"
984		      " multiple of SGEs per packet (%u)",
985		      (void *)dev,
986		      desc,
987		      1 << tmpl.rxq.sges_n);
988		return EINVAL;
989	}
990	/* Toggle RX checksum offload if hardware supports it. */
991	if (priv->hw_csum)
992		tmpl.rxq.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
993	if (priv->hw_csum_l2tun)
994		tmpl.rxq.csum_l2tun =
995			!!dev->data->dev_conf.rxmode.hw_ip_checksum;
996	/* Use the entire RX mempool as the memory region. */
997	tmpl.mr = mlx5_mp2mr(priv->pd, mp);
998	if (tmpl.mr == NULL) {
999		ret = EINVAL;
1000		ERROR("%p: MR creation failure: %s",
1001		      (void *)dev, strerror(ret));
1002		goto error;
1003	}
1004	attr.rd = (struct ibv_exp_res_domain_init_attr){
1005		.comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL |
1006			      IBV_EXP_RES_DOMAIN_MSG_MODEL),
1007		.thread_model = IBV_EXP_THREAD_SINGLE,
1008		.msg_model = IBV_EXP_MSG_HIGH_BW,
1009	};
1010	tmpl.rd = ibv_exp_create_res_domain(priv->ctx, &attr.rd);
1011	if (tmpl.rd == NULL) {
1012		ret = ENOMEM;
1013		ERROR("%p: RD creation failure: %s",
1014		      (void *)dev, strerror(ret));
1015		goto error;
1016	}
1017	attr.cq = (struct ibv_exp_cq_init_attr){
1018		.comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN,
1019		.res_domain = tmpl.rd,
1020	};
1021	if (priv->cqe_comp) {
1022		attr.cq.comp_mask |= IBV_EXP_CQ_INIT_ATTR_FLAGS;
1023		attr.cq.flags |= IBV_EXP_CQ_COMPRESSED_CQE;
1024		cqe_n = (desc * 2) - 1; /* Double the number of CQEs. */
1025	}
1026	tmpl.cq = ibv_exp_create_cq(priv->ctx, cqe_n, NULL, NULL, 0,
1027				    &attr.cq);
1028	if (tmpl.cq == NULL) {
1029		ret = ENOMEM;
1030		ERROR("%p: CQ creation failure: %s",
1031		      (void *)dev, strerror(ret));
1032		goto error;
1033	}
1034	DEBUG("priv->device_attr.max_qp_wr is %d",
1035	      priv->device_attr.max_qp_wr);
1036	DEBUG("priv->device_attr.max_sge is %d",
1037	      priv->device_attr.max_sge);
1038	/* Configure VLAN stripping. */
1039	tmpl.rxq.vlan_strip = (priv->hw_vlan_strip &&
1040			       !!dev->data->dev_conf.rxmode.hw_vlan_strip);
1041	attr.wq = (struct ibv_exp_wq_init_attr){
1042		.wq_context = NULL, /* Could be useful in the future. */
1043		.wq_type = IBV_EXP_WQT_RQ,
1044		/* Max number of outstanding WRs. */
1045		.max_recv_wr = desc >> tmpl.rxq.sges_n,
1046		/* Max number of scatter/gather elements in a WR. */
1047		.max_recv_sge = 1 << tmpl.rxq.sges_n,
1048		.pd = priv->pd,
1049		.cq = tmpl.cq,
1050		.comp_mask =
1051			IBV_EXP_CREATE_WQ_RES_DOMAIN |
1052			IBV_EXP_CREATE_WQ_VLAN_OFFLOADS |
1053			0,
1054		.res_domain = tmpl.rd,
1055		.vlan_offloads = (tmpl.rxq.vlan_strip ?
1056				  IBV_EXP_RECEIVE_WQ_CVLAN_STRIP :
1057				  0),
1058	};
1059	/* By default, FCS (CRC) is stripped by hardware. */
1060	if (dev->data->dev_conf.rxmode.hw_strip_crc) {
1061		tmpl.rxq.crc_present = 0;
1062	} else if (priv->hw_fcs_strip) {
1063		/* Ask HW/Verbs to leave CRC in place when supported. */
1064		attr.wq.flags |= IBV_EXP_CREATE_WQ_FLAG_SCATTER_FCS;
1065		attr.wq.comp_mask |= IBV_EXP_CREATE_WQ_FLAGS;
1066		tmpl.rxq.crc_present = 1;
1067	} else {
1068		WARN("%p: CRC stripping has been disabled but will still"
1069		     " be performed by hardware, make sure MLNX_OFED and"
1070		     " firmware are up to date",
1071		     (void *)dev);
1072		tmpl.rxq.crc_present = 0;
1073	}
1074	DEBUG("%p: CRC stripping is %s, %u bytes will be subtracted from"
1075	      " incoming frames to hide it",
1076	      (void *)dev,
1077	      tmpl.rxq.crc_present ? "disabled" : "enabled",
1078	      tmpl.rxq.crc_present << 2);
1079	if (!mlx5_getenv_int("MLX5_PMD_ENABLE_PADDING"))
1080		; /* Nothing else to do. */
1081	else if (priv->hw_padding) {
1082		INFO("%p: enabling packet padding on queue %p",
1083		     (void *)dev, (void *)rxq_ctrl);
1084		attr.wq.flags |= IBV_EXP_CREATE_WQ_FLAG_RX_END_PADDING;
1085		attr.wq.comp_mask |= IBV_EXP_CREATE_WQ_FLAGS;
1086	} else
1087		WARN("%p: packet padding has been requested but is not"
1088		     " supported, make sure MLNX_OFED and firmware are"
1089		     " up to date",
1090		     (void *)dev);
1091
1092	tmpl.wq = ibv_exp_create_wq(priv->ctx, &attr.wq);
1093	if (tmpl.wq == NULL) {
1094		ret = (errno ? errno : EINVAL);
1095		ERROR("%p: WQ creation failure: %s",
1096		      (void *)dev, strerror(ret));
1097		goto error;
1098	}
1099	/*
1100	 * Make sure number of WRs*SGEs match expectations since a queue
1101	 * cannot allocate more than "desc" buffers.
1102	 */
1103	if (((int)attr.wq.max_recv_wr != (desc >> tmpl.rxq.sges_n)) ||
1104	    ((int)attr.wq.max_recv_sge != (1 << tmpl.rxq.sges_n))) {
1105		ERROR("%p: requested %u*%u but got %u*%u WRs*SGEs",
1106		      (void *)dev,
1107		      (desc >> tmpl.rxq.sges_n), (1 << tmpl.rxq.sges_n),
1108		      attr.wq.max_recv_wr, attr.wq.max_recv_sge);
1109		ret = EINVAL;
1110		goto error;
1111	}
1112	/* Save port ID. */
1113	tmpl.rxq.port_id = dev->data->port_id;
1114	DEBUG("%p: RTE port ID: %u", (void *)rxq_ctrl, tmpl.rxq.port_id);
1115	attr.params = (struct ibv_exp_query_intf_params){
1116		.intf_scope = IBV_EXP_INTF_GLOBAL,
1117		.intf_version = 1,
1118		.intf = IBV_EXP_INTF_CQ,
1119		.obj = tmpl.cq,
1120	};
1121	tmpl.if_cq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
1122	if (tmpl.if_cq == NULL) {
1123		ERROR("%p: CQ interface family query failed with status %d",
1124		      (void *)dev, status);
		ret = EINVAL;
1125		goto error;
1126	}
1127	attr.params = (struct ibv_exp_query_intf_params){
1128		.intf_scope = IBV_EXP_INTF_GLOBAL,
1129		.intf = IBV_EXP_INTF_WQ,
1130		.obj = tmpl.wq,
1131	};
1132	tmpl.if_wq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
1133	if (tmpl.if_wq == NULL) {
1134		ERROR("%p: WQ interface family query failed with status %d",
1135		      (void *)dev, status);
1136		goto error;
		ret = EINVAL;
1137		goto error;
1138	/* Change queue state to ready. */
1139	mod = (struct ibv_exp_wq_attr){
1140		.attr_mask = IBV_EXP_WQ_ATTR_STATE,
1141		.wq_state = IBV_EXP_WQS_RDY,
1142	};
1143	ret = ibv_exp_modify_wq(tmpl.wq, &mod);
1144	if (ret) {
1145		ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
1146		      (void *)dev, strerror(ret));
1147		goto error;
1148	}
1149	ret = rxq_setup(&tmpl);
1150	if (ret) {
1151		ERROR("%p: cannot initialize RX queue structure: %s",
1152		      (void *)dev, strerror(ret));
1153		goto error;
1154	}
1155	/* Reuse buffers from original queue if possible. */
1156	if (rxq_ctrl->rxq.elts_n) {
1157		assert(1 << rxq_ctrl->rxq.elts_n == desc);
1158		assert(rxq_ctrl->rxq.elts != tmpl.rxq.elts);
1159		ret = rxq_alloc_elts(&tmpl, desc, rxq_ctrl->rxq.elts);
1160	} else
1161		ret = rxq_alloc_elts(&tmpl, desc, NULL);
1162	if (ret) {
1163		ERROR("%p: RXQ allocation failed: %s",
1164		      (void *)dev, strerror(ret));
1165		goto error;
1166	}
1167	/* Clean up rxq in case we're reinitializing it. */
1168	DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq_ctrl);
1169	rxq_cleanup(rxq_ctrl);
1170	/* Move mbuf pointers to dedicated storage area in RX queue. */
1171	elts = (void *)(rxq_ctrl + 1);
1172	rte_memcpy(elts, tmpl.rxq.elts, sizeof(*elts));
1173#ifndef NDEBUG
1174	memset(tmpl.rxq.elts, 0x55, sizeof(*elts));
1175#endif
1176	rte_free(tmpl.rxq.elts);
1177	tmpl.rxq.elts = elts;
1178	*rxq_ctrl = tmpl;
1179	/* Update doorbell counter. */
1180	rxq_ctrl->rxq.rq_ci = desc >> rxq_ctrl->rxq.sges_n;
1181	rte_wmb();
1182	*rxq_ctrl->rxq.rq_db = htonl(rxq_ctrl->rxq.rq_ci);
1183	DEBUG("%p: rxq updated with %p", (void *)rxq_ctrl, (void *)&tmpl);
1184	assert(ret == 0);
1185	return 0;
1186error:
1187	elts = tmpl.rxq.elts;
1188	rxq_cleanup(&tmpl);
1189	rte_free(elts);
1190	assert(ret > 0);
1191	return ret;
1192}
1193
1194/**
1195 * DPDK callback to configure an RX queue.
1196 *
1197 * @param dev
1198 *   Pointer to Ethernet device structure.
1199 * @param idx
1200 *   RX queue index.
1201 * @param desc
1202 *   Number of descriptors to configure in queue.
1203 * @param socket
1204 *   NUMA socket on which memory must be allocated.
1205 * @param[in] conf
1206 *   Thresholds parameters.
1207 * @param mp
1208 *   Memory pool for buffer allocations.
1209 *
1210 * @return
1211 *   0 on success, negative errno value on failure.
1212 */
1213int
1214mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
1215		    unsigned int socket, const struct rte_eth_rxconf *conf,
1216		    struct rte_mempool *mp)
1217{
1218	struct priv *priv = dev->data->dev_private;
1219	struct rxq *rxq = (*priv->rxqs)[idx];
1220	struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
1221	int ret;
1222
1223	if (mlx5_is_secondary())
1224		return -E_RTE_SECONDARY;
1225
1226	priv_lock(priv);
1227	if (!rte_is_power_of_2(desc)) {
1228		desc = 1 << log2above(desc);
1229		WARN("%p: increased number of descriptors in RX queue %u"
1230		     " to the next power of two (%d)",
1231		     (void *)dev, idx, desc);
1232	}
1233	DEBUG("%p: configuring queue %u for %u descriptors",
1234	      (void *)dev, idx, desc);
1235	if (idx >= priv->rxqs_n) {
1236		ERROR("%p: queue index out of range (%u >= %u)",
1237		      (void *)dev, idx, priv->rxqs_n);
1238		priv_unlock(priv);
1239		return -EOVERFLOW;
1240	}
1241	if (rxq != NULL) {
1242		DEBUG("%p: reusing already allocated queue index %u (%p)",
1243		      (void *)dev, idx, (void *)rxq);
1244		if (priv->started) {
1245			priv_unlock(priv);
1246			return -EEXIST;
1247		}
1248		(*priv->rxqs)[idx] = NULL;
1249		rxq_cleanup(rxq_ctrl);
1250	} else {
1251		rxq_ctrl = rte_calloc_socket("RXQ", 1, sizeof(*rxq_ctrl) +
1252					     desc * sizeof(struct rte_mbuf *),
1253					     0, socket);
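		/*
		 * The trailing desc * sizeof(struct rte_mbuf *) bytes hold the
		 * elts[] array; rxq_ctrl_setup() moves rxq.elts to this storage
		 * once the queue has been configured.
		 */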
1254		if (rxq_ctrl == NULL) {
1255			ERROR("%p: unable to allocate queue index %u",
1256			      (void *)dev, idx);
1257			priv_unlock(priv);
1258			return -ENOMEM;
1259		}
1260	}
1261	ret = rxq_ctrl_setup(dev, rxq_ctrl, desc, socket, conf, mp);
1262	if (ret)
1263		rte_free(rxq_ctrl);
1264	else {
1265		rxq_ctrl->rxq.stats.idx = idx;
1266		DEBUG("%p: adding RX queue %p to list",
1267		      (void *)dev, (void *)rxq_ctrl);
1268		(*priv->rxqs)[idx] = &rxq_ctrl->rxq;
1269		/* Update receive callback. */
1270		priv_select_rx_function(priv);
1271	}
1272	priv_unlock(priv);
1273	return -ret;
1274}
1275
1276/**
1277 * DPDK callback to release an RX queue.
1278 *
1279 * @param dpdk_rxq
1280 *   Generic RX queue pointer.
1281 */
1282void
1283mlx5_rx_queue_release(void *dpdk_rxq)
1284{
1285	struct rxq *rxq = (struct rxq *)dpdk_rxq;
1286	struct rxq_ctrl *rxq_ctrl;
1287	struct priv *priv;
1288	unsigned int i;
1289
1290	if (mlx5_is_secondary())
1291		return;
1292
1293	if (rxq == NULL)
1294		return;
1295	rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
1296	priv = rxq_ctrl->priv;
1297	priv_lock(priv);
1298	for (i = 0; (i != priv->rxqs_n); ++i)
1299		if ((*priv->rxqs)[i] == rxq) {
1300			DEBUG("%p: removing RX queue %p from list",
1301			      (void *)priv->dev, (void *)rxq_ctrl);
1302			(*priv->rxqs)[i] = NULL;
1303			break;
1304		}
1305	rxq_cleanup(rxq_ctrl);
1306	rte_free(rxq_ctrl);
1307	priv_unlock(priv);
1308}
1309
1310/**
1311 * DPDK callback for RX in secondary processes.
1312 *
1313 * This function configures all queues from primary process information
1314 * if necessary before reverting to the normal RX burst callback.
1315 *
1316 * @param dpdk_rxq
1317 *   Generic pointer to RX queue structure.
1318 * @param[out] pkts
1319 *   Array to store received packets.
1320 * @param pkts_n
1321 *   Maximum number of packets in array.
1322 *
1323 * @return
1324 *   Number of packets successfully received (<= pkts_n).
1325 */
1326uint16_t
1327mlx5_rx_burst_secondary_setup(void *dpdk_rxq, struct rte_mbuf **pkts,
1328			      uint16_t pkts_n)
1329{
1330	struct rxq *rxq = dpdk_rxq;
1331	struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
1332	struct priv *priv = mlx5_secondary_data_setup(rxq_ctrl->priv);
1333	struct priv *primary_priv;
1334	unsigned int index;
1335
1336	if (priv == NULL)
1337		return 0;
1338	primary_priv =
1339		mlx5_secondary_data[priv->dev->data->port_id].primary_priv;
1340	/* Look for queue index in both private structures. */
1341	for (index = 0; index != priv->rxqs_n; ++index)
1342		if (((*primary_priv->rxqs)[index] == rxq) ||
1343		    ((*priv->rxqs)[index] == rxq))
1344			break;
1345	if (index == priv->rxqs_n)
1346		return 0;
1347	rxq = (*priv->rxqs)[index];
1348	return priv->dev->rx_pkt_burst(rxq, pkts, pkts_n);
1349}
1350