/*-
 *   BSD LICENSE
 *
 *   Copyright 2015 6WIND S.A.
 *   Copyright 2015 Mellanox.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of 6WIND S.A. nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <stddef.h>
#include <assert.h>
#include <errno.h>
#include <string.h>
#include <stdint.h>

/* Verbs header. */
/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
#include <infiniband/verbs.h>
#include <infiniband/arch.h>
#include <infiniband/mlx5_hw.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-Wpedantic"
#endif

/* DPDK headers don't like -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
#include <rte_mbuf.h>
#include <rte_malloc.h>
#include <rte_ethdev.h>
#include <rte_common.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-Wpedantic"
#endif

#include "mlx5.h"
#include "mlx5_rxtx.h"
#include "mlx5_utils.h"
#include "mlx5_autoconf.h"
#include "mlx5_defs.h"

/* Initialization data for hash RX queues. */
const struct hash_rxq_init hash_rxq_init[] = {
	[HASH_RXQ_TCPV4] = {
		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
				IBV_EXP_RX_HASH_DST_IPV4 |
				IBV_EXP_RX_HASH_SRC_PORT_TCP |
				IBV_EXP_RX_HASH_DST_PORT_TCP),
		.dpdk_rss_hf = ETH_RSS_NONFRAG_IPV4_TCP,
		.flow_priority = 0,
		.flow_spec.tcp_udp = {
			.type = IBV_EXP_FLOW_SPEC_TCP,
			.size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
		},
		.underlayer = &hash_rxq_init[HASH_RXQ_IPV4],
	},
	[HASH_RXQ_UDPV4] = {
		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
				IBV_EXP_RX_HASH_DST_IPV4 |
				IBV_EXP_RX_HASH_SRC_PORT_UDP |
				IBV_EXP_RX_HASH_DST_PORT_UDP),
		.dpdk_rss_hf = ETH_RSS_NONFRAG_IPV4_UDP,
		.flow_priority = 0,
		.flow_spec.tcp_udp = {
			.type = IBV_EXP_FLOW_SPEC_UDP,
			.size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
		},
		.underlayer = &hash_rxq_init[HASH_RXQ_IPV4],
	},
	[HASH_RXQ_IPV4] = {
		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
				IBV_EXP_RX_HASH_DST_IPV4),
		.dpdk_rss_hf = (ETH_RSS_IPV4 |
				ETH_RSS_FRAG_IPV4),
		.flow_priority = 1,
		.flow_spec.ipv4 = {
			.type = IBV_EXP_FLOW_SPEC_IPV4,
			.size = sizeof(hash_rxq_init[0].flow_spec.ipv4),
		},
		.underlayer = &hash_rxq_init[HASH_RXQ_ETH],
	},
	[HASH_RXQ_TCPV6] = {
		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
				IBV_EXP_RX_HASH_DST_IPV6 |
				IBV_EXP_RX_HASH_SRC_PORT_TCP |
				IBV_EXP_RX_HASH_DST_PORT_TCP),
		.dpdk_rss_hf = ETH_RSS_NONFRAG_IPV6_TCP,
		.flow_priority = 0,
		.flow_spec.tcp_udp = {
			.type = IBV_EXP_FLOW_SPEC_TCP,
			.size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
		},
		.underlayer = &hash_rxq_init[HASH_RXQ_IPV6],
	},
	[HASH_RXQ_UDPV6] = {
		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
				IBV_EXP_RX_HASH_DST_IPV6 |
				IBV_EXP_RX_HASH_SRC_PORT_UDP |
				IBV_EXP_RX_HASH_DST_PORT_UDP),
		.dpdk_rss_hf = ETH_RSS_NONFRAG_IPV6_UDP,
		.flow_priority = 0,
		.flow_spec.tcp_udp = {
			.type = IBV_EXP_FLOW_SPEC_UDP,
			.size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
		},
		.underlayer = &hash_rxq_init[HASH_RXQ_IPV6],
	},
	[HASH_RXQ_IPV6] = {
		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
				IBV_EXP_RX_HASH_DST_IPV6),
		.dpdk_rss_hf = (ETH_RSS_IPV6 |
				ETH_RSS_FRAG_IPV6),
		.flow_priority = 1,
		.flow_spec.ipv6 = {
			.type = IBV_EXP_FLOW_SPEC_IPV6,
			.size = sizeof(hash_rxq_init[0].flow_spec.ipv6),
		},
		.underlayer = &hash_rxq_init[HASH_RXQ_ETH],
	},
	[HASH_RXQ_ETH] = {
		.hash_fields = 0,
		.dpdk_rss_hf = 0,
		.flow_priority = 2,
		.flow_spec.eth = {
			.type = IBV_EXP_FLOW_SPEC_ETH,
			.size = sizeof(hash_rxq_init[0].flow_spec.eth),
		},
		.underlayer = NULL,
	},
};

/* Number of entries in hash_rxq_init[]. */
const unsigned int hash_rxq_init_n = RTE_DIM(hash_rxq_init);

/* Initialization data for hash RX queue indirection tables. */
static const struct ind_table_init ind_table_init[] = {
	{
		.max_size = -1u, /* Superseded by HW limitations. */
		.hash_types =
			1 << HASH_RXQ_TCPV4 |
			1 << HASH_RXQ_UDPV4 |
			1 << HASH_RXQ_IPV4 |
			1 << HASH_RXQ_TCPV6 |
			1 << HASH_RXQ_UDPV6 |
			1 << HASH_RXQ_IPV6 |
			0,
		.hash_types_n = 6,
	},
	{
		.max_size = 1,
		.hash_types = 1 << HASH_RXQ_ETH,
		.hash_types_n = 1,
	},
};

#define IND_TABLE_INIT_N RTE_DIM(ind_table_init)

/* Default RSS hash key also used for ConnectX-3. */
uint8_t rss_hash_default_key[] = {
	0x2c, 0xc6, 0x81, 0xd1,
	0x5b, 0xdb, 0xf4, 0xf7,
	0xfc, 0xa2, 0x83, 0x19,
	0xdb, 0x1a, 0x3e, 0x94,
	0x6b, 0x9e, 0x38, 0xd9,
	0x2c, 0x9c, 0x03, 0xd1,
	0xad, 0x99, 0x44, 0xa7,
	0xd9, 0x56, 0x3d, 0x59,
	0x06, 0x3c, 0x25, 0xf3,
	0xfc, 0x1f, 0xdc, 0x2a,
};

/* Length of the default RSS hash key. */
const size_t rss_hash_default_key_len = sizeof(rss_hash_default_key);
/**
 * Populate flow steering rule for a given hash RX queue type using
 * information from hash_rxq_init[]. Nothing is written to flow_attr when
 * flow_attr_size is not large enough, but the required size is still returned.
 *
 * @param priv
 *   Pointer to private structure.
 * @param[out] flow_attr
 *   Pointer to flow attribute structure to fill. Note that the allocated
 *   area must be large enough to hold all flow specifications.
 * @param flow_attr_size
 *   Entire size of flow_attr and trailing room for flow specifications.
 * @param type
 *   Hash RX queue type to use for flow steering rule.
 *
 * @return
 *   Total size of the flow attribute buffer. No errors are defined.
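 *
 * A typical call pattern queries the required size first (no buffer is
 * touched when the given size is too small), then fills a buffer of at
 * least that size. Sketch only; buffer allocation is up to the caller:
 *
 *   size_t size = priv_flow_attr(priv, NULL, 0, type);
 *   struct ibv_exp_flow_attr *attr = rte_malloc(__func__, size, 0);
 *
 *   if (attr != NULL)
 *       priv_flow_attr(priv, attr, size, type);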
 */
size_t
priv_flow_attr(struct priv *priv, struct ibv_exp_flow_attr *flow_attr,
	       size_t flow_attr_size, enum hash_rxq_type type)
{
	size_t offset = sizeof(*flow_attr);
	const struct hash_rxq_init *init = &hash_rxq_init[type];

	assert(priv != NULL);
	assert((size_t)type < RTE_DIM(hash_rxq_init));
	do {
		offset += init->flow_spec.hdr.size;
		init = init->underlayer;
	} while (init != NULL);
	if (offset > flow_attr_size)
		return offset;
	flow_attr_size = offset;
	init = &hash_rxq_init[type];
	*flow_attr = (struct ibv_exp_flow_attr){
		.type = IBV_EXP_FLOW_ATTR_NORMAL,
		/* Priorities < 3 are reserved for flow director. */
		.priority = init->flow_priority + 3,
		.num_of_specs = 0,
		.port = priv->port,
		.flags = 0,
	};
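	/* Flow specifications are copied backward from the end of the
	 * buffer while walking the underlayer chain, so they end up
	 * ordered from the lowest layer (Ethernet) up to the requested
	 * hash RX queue type. */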
	do {
		offset -= init->flow_spec.hdr.size;
		memcpy((void *)((uintptr_t)flow_attr + offset),
		       &init->flow_spec,
		       init->flow_spec.hdr.size);
		++flow_attr->num_of_specs;
		init = init->underlayer;
	} while (init != NULL);
	return flow_attr_size;
}

/**
 * Convert hash type position in indirection table initializer to
 * hash RX queue type.
 *
 * @param table
 *   Indirection table initializer.
 * @param pos
 *   Hash type position.
 *
 * @return
 *   Hash RX queue type.
 */
static enum hash_rxq_type
hash_rxq_type_from_pos(const struct ind_table_init *table, unsigned int pos)
{
	enum hash_rxq_type type = HASH_RXQ_TCPV4;

	assert(pos < table->hash_types_n);
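	/* Scan enabled hash types and stop at the pos-th one. */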
	do {
		if ((table->hash_types & (1 << type)) && (pos-- == 0))
			break;
		++type;
	} while (1);
	return type;
}

/**
 * Filter out disabled hash RX queue types from ind_table_init[].
 *
 * @param priv
 *   Pointer to private structure.
 * @param[out] table
 *   Output table.
 *
 * @return
 *   Number of table entries.
 */
static unsigned int
priv_make_ind_table_init(struct priv *priv,
			 struct ind_table_init (*table)[IND_TABLE_INIT_N])
{
	uint64_t rss_hf;
	unsigned int i;
	unsigned int j;
	unsigned int table_n = 0;
	/* Mandatory to receive frames not handled by normal hash RX queues. */
	unsigned int hash_types_sup = 1 << HASH_RXQ_ETH;

	rss_hf = priv->rss_hf;
	/* Process other protocols only if more than one queue. */
	if (priv->rxqs_n > 1)
		for (i = 0; (i != hash_rxq_init_n); ++i)
			if (rss_hf & hash_rxq_init[i].dpdk_rss_hf)
				hash_types_sup |= (1 << i);

	/* Filter out entries whose protocols are not in the set. */
	for (i = 0, j = 0; (i != IND_TABLE_INIT_N); ++i) {
		unsigned int nb;
		unsigned int h;

		/* j is increased only if the table has valid protocols. */
		assert(j <= i);
		(*table)[j] = ind_table_init[i];
		(*table)[j].hash_types &= hash_types_sup;
		for (h = 0, nb = 0; (h != hash_rxq_init_n); ++h)
			if (((*table)[j].hash_types >> h) & 0x1)
				++nb;
		(*table)[j].hash_types_n = nb;
		if (nb) {
			++table_n;
			++j;
		}
	}
	return table_n;
}

/**
 * Initialize hash RX queues and indirection table.
 *
 * @param priv
 *   Pointer to private structure.
 *
 * @return
 *   0 on success, errno value on failure.
 */
int
priv_create_hash_rxqs(struct priv *priv)
{
	struct ibv_exp_wq *wqs[priv->reta_idx_n];
	struct ind_table_init ind_table_init[IND_TABLE_INIT_N];
	unsigned int ind_tables_n =
		priv_make_ind_table_init(priv, &ind_table_init);
	unsigned int hash_rxqs_n = 0;
	struct hash_rxq (*hash_rxqs)[] = NULL;
	struct ibv_exp_rwq_ind_table *(*ind_tables)[] = NULL;
	unsigned int i;
	unsigned int j;
	unsigned int k;
	int err = 0;

	assert(priv->ind_tables == NULL);
	assert(priv->ind_tables_n == 0);
	assert(priv->hash_rxqs == NULL);
	assert(priv->hash_rxqs_n == 0);
	assert(priv->pd != NULL);
	assert(priv->ctx != NULL);
	if (priv->rxqs_n == 0)
		return EINVAL;
	assert(priv->rxqs != NULL);
	if (ind_tables_n == 0) {
		ERROR("all hash RX queue types have been filtered out,"
		      " indirection table cannot be created");
		return EINVAL;
	}
	if (priv->rxqs_n & (priv->rxqs_n - 1)) {
		INFO("%u RX queues are configured, consider rounding this"
		     " number to the next power of two for better balancing",
		     priv->rxqs_n);
		DEBUG("indirection table extended to assume %u WQs",
		      priv->reta_idx_n);
	}
	for (i = 0; (i != priv->reta_idx_n); ++i) {
		struct rxq_ctrl *rxq_ctrl;

		rxq_ctrl = container_of((*priv->rxqs)[(*priv->reta_idx)[i]],
					struct rxq_ctrl, rxq);
		wqs[i] = rxq_ctrl->wq;
	}
	/* Get number of hash RX queues to configure. */
	for (i = 0, hash_rxqs_n = 0; (i != ind_tables_n); ++i)
		hash_rxqs_n += ind_table_init[i].hash_types_n;
	DEBUG("allocating %u hash RX queues for %u WQs, %u indirection tables",
	      hash_rxqs_n, priv->rxqs_n, ind_tables_n);
	/* Create indirection tables. */
	ind_tables = rte_calloc(__func__, ind_tables_n,
				sizeof((*ind_tables)[0]), 0);
	if (ind_tables == NULL) {
		err = ENOMEM;
		ERROR("cannot allocate indirection tables container: %s",
		      strerror(err));
		goto error;
	}
	for (i = 0; (i != ind_tables_n); ++i) {
		struct ibv_exp_rwq_ind_table_init_attr ind_init_attr = {
			.pd = priv->pd,
			.log_ind_tbl_size = 0, /* Set below. */
			.ind_tbl = wqs,
			.comp_mask = 0,
		};
		unsigned int ind_tbl_size = ind_table_init[i].max_size;
		struct ibv_exp_rwq_ind_table *ind_table;

		if (priv->reta_idx_n < ind_tbl_size)
			ind_tbl_size = priv->reta_idx_n;
		ind_init_attr.log_ind_tbl_size = log2above(ind_tbl_size);
		errno = 0;
		ind_table = ibv_exp_create_rwq_ind_table(priv->ctx,
							 &ind_init_attr);
		if (ind_table != NULL) {
			(*ind_tables)[i] = ind_table;
			continue;
		}
		/* Not clear whether errno is set. */
		err = (errno ? errno : EINVAL);
		ERROR("RX indirection table creation failed with error %d: %s",
		      err, strerror(err));
		goto error;
	}
	/* Allocate array that holds hash RX queues and related data. */
	hash_rxqs = rte_calloc(__func__, hash_rxqs_n,
			       sizeof((*hash_rxqs)[0]), 0);
	if (hash_rxqs == NULL) {
		err = ENOMEM;
		ERROR("cannot allocate hash RX queues container: %s",
		      strerror(err));
		goto error;
	}
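	/* Create one QP per hash RX queue: i walks all hash RX queues,
	 * j selects the current indirection table and k the hash type
	 * position within that table. */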
	for (i = 0, j = 0, k = 0;
	     ((i != hash_rxqs_n) && (j != ind_tables_n));
	     ++i) {
		struct hash_rxq *hash_rxq = &(*hash_rxqs)[i];
		enum hash_rxq_type type =
			hash_rxq_type_from_pos(&ind_table_init[j], k);
		struct rte_eth_rss_conf *priv_rss_conf =
			(*priv->rss_conf)[type];
		struct ibv_exp_rx_hash_conf hash_conf = {
			.rx_hash_function = IBV_EXP_RX_HASH_FUNC_TOEPLITZ,
			.rx_hash_key_len = (priv_rss_conf ?
					    priv_rss_conf->rss_key_len :
					    rss_hash_default_key_len),
			.rx_hash_key = (priv_rss_conf ?
					priv_rss_conf->rss_key :
					rss_hash_default_key),
			.rx_hash_fields_mask = hash_rxq_init[type].hash_fields,
			.rwq_ind_tbl = (*ind_tables)[j],
		};
		struct ibv_exp_qp_init_attr qp_init_attr = {
			.max_inl_recv = 0, /* Currently not supported. */
			.qp_type = IBV_QPT_RAW_PACKET,
			.comp_mask = (IBV_EXP_QP_INIT_ATTR_PD |
				      IBV_EXP_QP_INIT_ATTR_RX_HASH),
			.pd = priv->pd,
			.rx_hash_conf = &hash_conf,
			.port_num = priv->port,
		};

		DEBUG("using indirection table %u for hash RX queue %u type %d",
		      j, i, type);
		*hash_rxq = (struct hash_rxq){
			.priv = priv,
			.qp = ibv_exp_create_qp(priv->ctx, &qp_init_attr),
			.type = type,
		};
		if (hash_rxq->qp == NULL) {
			err = (errno ? errno : EINVAL);
			ERROR("Hash RX QP creation failure: %s",
			      strerror(err));
			goto error;
		}
		if (++k < ind_table_init[j].hash_types_n)
			continue;
		/* Switch to the next indirection table and reset hash RX
		 * queue type array index. */
		++j;
		k = 0;
	}
	priv->ind_tables = ind_tables;
	priv->ind_tables_n = ind_tables_n;
	priv->hash_rxqs = hash_rxqs;
	priv->hash_rxqs_n = hash_rxqs_n;
	assert(err == 0);
	return 0;
error:
	if (hash_rxqs != NULL) {
		for (i = 0; (i != hash_rxqs_n); ++i) {
			struct ibv_qp *qp = (*hash_rxqs)[i].qp;

			if (qp == NULL)
				continue;
			claim_zero(ibv_destroy_qp(qp));
		}
		rte_free(hash_rxqs);
	}
	if (ind_tables != NULL) {
		for (j = 0; (j != ind_tables_n); ++j) {
			struct ibv_exp_rwq_ind_table *ind_table =
				(*ind_tables)[j];

			if (ind_table == NULL)
				continue;
			claim_zero(ibv_exp_destroy_rwq_ind_table(ind_table));
		}
		rte_free(ind_tables);
	}
	return err;
}

/**
 * Clean up hash RX queues and indirection table.
 *
 * @param priv
 *   Pointer to private structure.
 */
void
priv_destroy_hash_rxqs(struct priv *priv)
{
	unsigned int i;

	DEBUG("destroying %u hash RX queues", priv->hash_rxqs_n);
	if (priv->hash_rxqs_n == 0) {
		assert(priv->hash_rxqs == NULL);
		assert(priv->ind_tables == NULL);
		return;
	}
	for (i = 0; (i != priv->hash_rxqs_n); ++i) {
		struct hash_rxq *hash_rxq = &(*priv->hash_rxqs)[i];
		unsigned int j, k;

		assert(hash_rxq->priv == priv);
		assert(hash_rxq->qp != NULL);
		/* Also check that there are no remaining flows. */
		for (j = 0; (j != RTE_DIM(hash_rxq->special_flow)); ++j)
			for (k = 0;
			     (k != RTE_DIM(hash_rxq->special_flow[j]));
			     ++k)
				assert(hash_rxq->special_flow[j][k] == NULL);
		for (j = 0; (j != RTE_DIM(hash_rxq->mac_flow)); ++j)
			for (k = 0; (k != RTE_DIM(hash_rxq->mac_flow[j])); ++k)
				assert(hash_rxq->mac_flow[j][k] == NULL);
		claim_zero(ibv_destroy_qp(hash_rxq->qp));
	}
	priv->hash_rxqs_n = 0;
	rte_free(priv->hash_rxqs);
	priv->hash_rxqs = NULL;
	for (i = 0; (i != priv->ind_tables_n); ++i) {
		struct ibv_exp_rwq_ind_table *ind_table =
			(*priv->ind_tables)[i];

		assert(ind_table != NULL);
		claim_zero(ibv_exp_destroy_rwq_ind_table(ind_table));
	}
	priv->ind_tables_n = 0;
	rte_free(priv->ind_tables);
	priv->ind_tables = NULL;
}

/**
 * Check whether a given flow type is allowed.
 *
 * @param priv
 *   Pointer to private structure.
 * @param type
 *   Flow type to check.
 *
 * @return
 *   Nonzero if the given flow type is allowed.
 */
int
priv_allow_flow_type(struct priv *priv, enum hash_rxq_flow_type type)
{
	/* Only FLOW_TYPE_PROMISC is allowed when promiscuous mode
	 * has been requested. */
	if (priv->promisc_req)
		return type == HASH_RXQ_FLOW_TYPE_PROMISC;
	switch (type) {
	case HASH_RXQ_FLOW_TYPE_PROMISC:
		return !!priv->promisc_req;
	case HASH_RXQ_FLOW_TYPE_ALLMULTI:
		return !!priv->allmulti_req;
	case HASH_RXQ_FLOW_TYPE_BROADCAST:
	case HASH_RXQ_FLOW_TYPE_IPV6MULTI:
		/* If allmulti is enabled, broadcast and ipv6multi
		 * are unnecessary. */
		return !priv->allmulti_req;
	case HASH_RXQ_FLOW_TYPE_MAC:
		return 1;
	default:
		/* Unsupported flow type is not allowed. */
		return 0;
	}
	return 0;
}

/**
 * Automatically enable/disable flows according to configuration.
 *
 * @param priv
 *   Private structure.
 *
 * @return
 *   0 on success, errno value on failure.
 */
int
priv_rehash_flows(struct priv *priv)
{
	size_t i;

	for (i = 0; i != RTE_DIM((*priv->hash_rxqs)[0].special_flow); ++i)
		if (!priv_allow_flow_type(priv, i)) {
			priv_special_flow_disable(priv, i);
		} else {
			int ret = priv_special_flow_enable(priv, i);

			if (ret)
				return ret;
		}
	if (priv_allow_flow_type(priv, HASH_RXQ_FLOW_TYPE_MAC))
		return priv_mac_addrs_enable(priv);
	priv_mac_addrs_disable(priv);
	return 0;
}

/**
 * Allocate RX queue elements.
 *
 * @param rxq_ctrl
 *   Pointer to RX queue structure.
 * @param elts_n
 *   Number of elements to allocate.
 * @param[in] pool
 *   If not NULL, fetch buffers from this array instead of allocating them
 *   with rte_pktmbuf_alloc().
 *
 * @return
 *   0 on success, errno value on failure.
 */
static int
rxq_alloc_elts(struct rxq_ctrl *rxq_ctrl, unsigned int elts_n,
	       struct rte_mbuf *(*pool)[])
{
	const unsigned int sges_n = 1 << rxq_ctrl->rxq.sges_n;
	unsigned int i;
	int ret = 0;

	/* Iterate on segments. */
	for (i = 0; (i != elts_n); ++i) {
		struct rte_mbuf *buf;
		volatile struct mlx5_wqe_data_seg *scat =
			&(*rxq_ctrl->rxq.wqes)[i];

		if (pool != NULL) {
			buf = (*pool)[i];
			assert(buf != NULL);
			rte_pktmbuf_reset(buf);
			rte_pktmbuf_refcnt_update(buf, 1);
		} else
			buf = rte_pktmbuf_alloc(rxq_ctrl->rxq.mp);
		if (buf == NULL) {
			assert(pool == NULL);
			ERROR("%p: empty mbuf pool", (void *)rxq_ctrl);
			ret = ENOMEM;
			goto error;
		}
		/* Headroom is reserved by rte_pktmbuf_alloc(). */
		assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
		/* Buffer is supposed to be empty. */
		assert(rte_pktmbuf_data_len(buf) == 0);
		assert(rte_pktmbuf_pkt_len(buf) == 0);
		assert(!buf->next);
		/* Only the first segment keeps headroom. */
		if (i % sges_n)
			SET_DATA_OFF(buf, 0);
		PORT(buf) = rxq_ctrl->rxq.port_id;
		DATA_LEN(buf) = rte_pktmbuf_tailroom(buf);
		PKT_LEN(buf) = DATA_LEN(buf);
		NB_SEGS(buf) = 1;
		/* scat->addr must be able to store a pointer. */
		assert(sizeof(scat->addr) >= sizeof(uintptr_t));
		*scat = (struct mlx5_wqe_data_seg){
			.addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t)),
			.byte_count = htonl(DATA_LEN(buf)),
			.lkey = htonl(rxq_ctrl->mr->lkey),
		};
		(*rxq_ctrl->rxq.elts)[i] = buf;
	}
	DEBUG("%p: allocated and configured %u segments (max %u packets)",
	      (void *)rxq_ctrl, elts_n, elts_n / (1 << rxq_ctrl->rxq.sges_n));
	assert(ret == 0);
	return 0;
error:
	assert(pool == NULL);
	elts_n = i;
	for (i = 0; (i != elts_n); ++i) {
		if ((*rxq_ctrl->rxq.elts)[i] != NULL)
			rte_pktmbuf_free_seg((*rxq_ctrl->rxq.elts)[i]);
		(*rxq_ctrl->rxq.elts)[i] = NULL;
	}
	DEBUG("%p: failed, freed everything", (void *)rxq_ctrl);
	assert(ret > 0);
	return ret;
}

/**
 * Free RX queue elements.
 *
 * @param rxq_ctrl
 *   Pointer to RX queue structure.
 */
static void
rxq_free_elts(struct rxq_ctrl *rxq_ctrl)
{
	unsigned int i;

	DEBUG("%p: freeing WRs", (void *)rxq_ctrl);
	if (rxq_ctrl->rxq.elts == NULL)
		return;

	for (i = 0; (i != (1u << rxq_ctrl->rxq.elts_n)); ++i) {
		if ((*rxq_ctrl->rxq.elts)[i] != NULL)
			rte_pktmbuf_free_seg((*rxq_ctrl->rxq.elts)[i]);
		(*rxq_ctrl->rxq.elts)[i] = NULL;
	}
}

/**
 * Clean up a RX queue.
 *
 * Destroy objects, free allocated memory and reset the structure for reuse.
 *
 * @param rxq_ctrl
 *   Pointer to RX queue structure.
 */
void
rxq_cleanup(struct rxq_ctrl *rxq_ctrl)
{
	struct ibv_exp_release_intf_params params;

	DEBUG("cleaning up %p", (void *)rxq_ctrl);
	rxq_free_elts(rxq_ctrl);
	if (rxq_ctrl->fdir_queue != NULL)
		priv_fdir_queue_destroy(rxq_ctrl->priv, rxq_ctrl->fdir_queue);
	if (rxq_ctrl->if_wq != NULL) {
		assert(rxq_ctrl->priv != NULL);
		assert(rxq_ctrl->priv->ctx != NULL);
		assert(rxq_ctrl->wq != NULL);
		params = (struct ibv_exp_release_intf_params){
			.comp_mask = 0,
		};
		claim_zero(ibv_exp_release_intf(rxq_ctrl->priv->ctx,
						rxq_ctrl->if_wq,
						&params));
	}
	if (rxq_ctrl->if_cq != NULL) {
		assert(rxq_ctrl->priv != NULL);
		assert(rxq_ctrl->priv->ctx != NULL);
		assert(rxq_ctrl->cq != NULL);
		params = (struct ibv_exp_release_intf_params){
			.comp_mask = 0,
		};
		claim_zero(ibv_exp_release_intf(rxq_ctrl->priv->ctx,
						rxq_ctrl->if_cq,
						&params));
	}
	if (rxq_ctrl->wq != NULL)
		claim_zero(ibv_exp_destroy_wq(rxq_ctrl->wq));
	if (rxq_ctrl->cq != NULL)
		claim_zero(ibv_destroy_cq(rxq_ctrl->cq));
	if (rxq_ctrl->rd != NULL) {
		struct ibv_exp_destroy_res_domain_attr attr = {
			.comp_mask = 0,
		};

		assert(rxq_ctrl->priv != NULL);
		assert(rxq_ctrl->priv->ctx != NULL);
		claim_zero(ibv_exp_destroy_res_domain(rxq_ctrl->priv->ctx,
						      rxq_ctrl->rd,
						      &attr));
	}
	if (rxq_ctrl->mr != NULL)
		claim_zero(ibv_dereg_mr(rxq_ctrl->mr));
	memset(rxq_ctrl, 0, sizeof(*rxq_ctrl));
}

/**
 * Reconfigure RX queue buffers.
 *
 * rxq_rehash() does not allocate mbufs; doing so from the wrong thread
 * (such as a control thread) may corrupt the pool.
 * In case of failure, the queue is left untouched.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param rxq_ctrl
 *   RX queue pointer.
 *
 * @return
 *   0 on success, errno value on failure.
 */
int
rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl)
{
	unsigned int elts_n = 1 << rxq_ctrl->rxq.elts_n;
	unsigned int i;
	struct ibv_exp_wq_attr mod;
	int err;

	DEBUG("%p: rehashing queue %p with %u SGE(s) per packet",
	      (void *)dev, (void *)rxq_ctrl, 1 << rxq_ctrl->rxq.sges_n);
	assert(!(elts_n % (1 << rxq_ctrl->rxq.sges_n)));
	/* From now on, any failure will render the queue unusable.
	 * Reinitialize WQ. */
	mod = (struct ibv_exp_wq_attr){
		.attr_mask = IBV_EXP_WQ_ATTR_STATE,
		.wq_state = IBV_EXP_WQS_RESET,
	};
	err = ibv_exp_modify_wq(rxq_ctrl->wq, &mod);
	if (err) {
		ERROR("%p: cannot reset WQ: %s", (void *)dev, strerror(err));
		assert(err > 0);
		return err;
	}
	/* Snatch mbufs from original queue. */
	claim_zero(rxq_alloc_elts(rxq_ctrl, elts_n, rxq_ctrl->rxq.elts));
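	/* rxq_alloc_elts() took an extra reference on each mbuf; release it
	 * here so the buffers end up owned by the ring entries only. */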
	for (i = 0; i != elts_n; ++i) {
		struct rte_mbuf *buf = (*rxq_ctrl->rxq.elts)[i];

		assert(rte_mbuf_refcnt_read(buf) == 2);
		rte_pktmbuf_free_seg(buf);
	}
	/* Change queue state to ready. */
	mod = (struct ibv_exp_wq_attr){
		.attr_mask = IBV_EXP_WQ_ATTR_STATE,
		.wq_state = IBV_EXP_WQS_RDY,
	};
	err = ibv_exp_modify_wq(rxq_ctrl->wq, &mod);
	if (err) {
		ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
		      (void *)dev, strerror(err));
		goto error;
	}
	/* Update doorbell counter. */
	rxq_ctrl->rxq.rq_ci = elts_n >> rxq_ctrl->rxq.sges_n;
	rte_wmb();
	*rxq_ctrl->rxq.rq_db = htonl(rxq_ctrl->rxq.rq_ci);
error:
	assert(err >= 0);
	return err;
}

/**
 * Initialize RX queue.
 *
 * @param tmpl
 *   Pointer to RX queue control template.
 *
 * @return
 *   0 on success, errno value on failure.
 */
static inline int
rxq_setup(struct rxq_ctrl *tmpl)
{
	struct ibv_cq *ibcq = tmpl->cq;
	struct ibv_mlx5_cq_info cq_info;
	struct mlx5_rwq *rwq = container_of(tmpl->wq, struct mlx5_rwq, wq);
	struct rte_mbuf *(*elts)[1 << tmpl->rxq.elts_n] =
		rte_calloc_socket("RXQ", 1, sizeof(*elts), 0, tmpl->socket);

	if (ibv_mlx5_exp_get_cq_info(ibcq, &cq_info)) {
		ERROR("Unable to query CQ info. Check your OFED.");
		return ENOTSUP;
	}
	if (cq_info.cqe_size != RTE_CACHE_LINE_SIZE) {
		ERROR("Wrong MLX5_CQE_SIZE environment variable value: "
		      "it should be set to %u", RTE_CACHE_LINE_SIZE);
		return EINVAL;
	}
	if (elts == NULL)
		return ENOMEM;
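	/* Expose WQ/CQ buffers and doorbell records directly to the
	 * datapath so RX bursts can bypass the Verbs library. */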
	tmpl->rxq.rq_db = rwq->rq.db;
	tmpl->rxq.cqe_n = log2above(cq_info.cqe_cnt);
	tmpl->rxq.cq_ci = 0;
	tmpl->rxq.rq_ci = 0;
	tmpl->rxq.cq_db = cq_info.dbrec;
	tmpl->rxq.wqes =
		(volatile struct mlx5_wqe_data_seg (*)[])
		(uintptr_t)rwq->rq.buff;
	tmpl->rxq.cqes =
		(volatile struct mlx5_cqe (*)[])
		(uintptr_t)cq_info.buf;
	tmpl->rxq.elts = elts;
	return 0;
}

/**
 * Configure a RX queue.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param rxq_ctrl
 *   Pointer to RX queue structure.
 * @param desc
 *   Number of descriptors to configure in queue.
 * @param socket
 *   NUMA socket on which memory must be allocated.
 * @param[in] conf
 *   Thresholds parameters.
 * @param mp
 *   Memory pool for buffer allocations.
 *
 * @return
 *   0 on success, errno value on failure.
 */
int
rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
	       uint16_t desc, unsigned int socket,
	       const struct rte_eth_rxconf *conf, struct rte_mempool *mp)
{
	struct priv *priv = dev->data->dev_private;
	struct rxq_ctrl tmpl = {
		.priv = priv,
		.socket = socket,
		.rxq = {
			.elts_n = log2above(desc),
			.mp = mp,
			.rss_hash = priv->rxqs_n > 1,
		},
	};
	struct ibv_exp_wq_attr mod;
	union {
		struct ibv_exp_query_intf_params params;
		struct ibv_exp_cq_init_attr cq;
		struct ibv_exp_res_domain_init_attr rd;
		struct ibv_exp_wq_init_attr wq;
		struct ibv_exp_cq_attr cq_attr;
	} attr;
	enum ibv_exp_query_intf_status status;
	unsigned int mb_len = rte_pktmbuf_data_room_size(mp);
	unsigned int cqe_n = desc - 1;
	struct rte_mbuf *(*elts)[desc] = NULL;
	int ret = 0;

	(void)conf; /* Thresholds configuration (ignored). */
	/* Enable scattered packets support for this queue if necessary. */
	assert(mb_len >= RTE_PKTMBUF_HEADROOM);
	/* If smaller than MRU, multi-segment support must be enabled. */
	if (mb_len < (priv->mtu > dev->data->dev_conf.rxmode.max_rx_pkt_len ?
		     dev->data->dev_conf.rxmode.max_rx_pkt_len :
		     priv->mtu))
		dev->data->dev_conf.rxmode.jumbo_frame = 1;
	if ((dev->data->dev_conf.rxmode.jumbo_frame) &&
	    (dev->data->dev_conf.rxmode.max_rx_pkt_len >
	     (mb_len - RTE_PKTMBUF_HEADROOM))) {
		unsigned int size =
			RTE_PKTMBUF_HEADROOM +
			dev->data->dev_conf.rxmode.max_rx_pkt_len;
		unsigned int sges_n;

		/*
		 * Determine the number of SGEs needed for a full packet
		 * and round it to the next power of two.
		 */
		sges_n = log2above((size / mb_len) + !!(size % mb_len));
		tmpl.rxq.sges_n = sges_n;
		/* Make sure rxq.sges_n did not overflow. */
		size = mb_len * (1 << tmpl.rxq.sges_n);
		size -= RTE_PKTMBUF_HEADROOM;
		if (size < dev->data->dev_conf.rxmode.max_rx_pkt_len) {
			ERROR("%p: too many SGEs (%u) needed to handle"
			      " requested maximum packet size %u",
			      (void *)dev,
			      1 << sges_n,
			      dev->data->dev_conf.rxmode.max_rx_pkt_len);
			return EOVERFLOW;
		}
	}
	DEBUG("%p: maximum number of segments per packet: %u",
	      (void *)dev, 1 << tmpl.rxq.sges_n);
	if (desc % (1 << tmpl.rxq.sges_n)) {
		ERROR("%p: number of RX queue descriptors (%u) is not a"
		      " multiple of SGEs per packet (%u)",
		      (void *)dev,
		      desc,
		      1 << tmpl.rxq.sges_n);
		return EINVAL;
	}
	/* Toggle RX checksum offload if hardware supports it. */
	if (priv->hw_csum)
		tmpl.rxq.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
	if (priv->hw_csum_l2tun)
		tmpl.rxq.csum_l2tun =
			!!dev->data->dev_conf.rxmode.hw_ip_checksum;
	/* Use the entire RX mempool as the memory region. */
	tmpl.mr = mlx5_mp2mr(priv->pd, mp);
	if (tmpl.mr == NULL) {
		ret = EINVAL;
		ERROR("%p: MR creation failure: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	attr.rd = (struct ibv_exp_res_domain_init_attr){
		.comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL |
			      IBV_EXP_RES_DOMAIN_MSG_MODEL),
		.thread_model = IBV_EXP_THREAD_SINGLE,
		.msg_model = IBV_EXP_MSG_HIGH_BW,
	};
	tmpl.rd = ibv_exp_create_res_domain(priv->ctx, &attr.rd);
	if (tmpl.rd == NULL) {
		ret = ENOMEM;
		ERROR("%p: RD creation failure: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	attr.cq = (struct ibv_exp_cq_init_attr){
		.comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN,
		.res_domain = tmpl.rd,
	};
	if (priv->cqe_comp) {
		attr.cq.comp_mask |= IBV_EXP_CQ_INIT_ATTR_FLAGS;
		attr.cq.flags |= IBV_EXP_CQ_COMPRESSED_CQE;
		cqe_n = (desc * 2) - 1; /* Double the number of CQEs. */
	}
	tmpl.cq = ibv_exp_create_cq(priv->ctx, cqe_n, NULL, NULL, 0,
				    &attr.cq);
	if (tmpl.cq == NULL) {
		ret = ENOMEM;
		ERROR("%p: CQ creation failure: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	DEBUG("priv->device_attr.max_qp_wr is %d",
	      priv->device_attr.max_qp_wr);
	DEBUG("priv->device_attr.max_sge is %d",
	      priv->device_attr.max_sge);
	/* Configure VLAN stripping. */
	tmpl.rxq.vlan_strip = (priv->hw_vlan_strip &&
			       !!dev->data->dev_conf.rxmode.hw_vlan_strip);
	attr.wq = (struct ibv_exp_wq_init_attr){
		.wq_context = NULL, /* Could be useful in the future. */
		.wq_type = IBV_EXP_WQT_RQ,
		/* Max number of outstanding WRs. */
		.max_recv_wr = desc >> tmpl.rxq.sges_n,
		/* Max number of scatter/gather elements in a WR. */
		.max_recv_sge = 1 << tmpl.rxq.sges_n,
		.pd = priv->pd,
		.cq = tmpl.cq,
		.comp_mask =
			IBV_EXP_CREATE_WQ_RES_DOMAIN |
			IBV_EXP_CREATE_WQ_VLAN_OFFLOADS |
			0,
		.res_domain = tmpl.rd,
		.vlan_offloads = (tmpl.rxq.vlan_strip ?
				  IBV_EXP_RECEIVE_WQ_CVLAN_STRIP :
				  0),
	};
	/* By default, FCS (CRC) is stripped by hardware. */
	if (dev->data->dev_conf.rxmode.hw_strip_crc) {
		tmpl.rxq.crc_present = 0;
	} else if (priv->hw_fcs_strip) {
		/* Ask HW/Verbs to leave CRC in place when supported. */
		attr.wq.flags |= IBV_EXP_CREATE_WQ_FLAG_SCATTER_FCS;
		attr.wq.comp_mask |= IBV_EXP_CREATE_WQ_FLAGS;
		tmpl.rxq.crc_present = 1;
	} else {
		WARN("%p: CRC stripping has been disabled but will still"
		     " be performed by hardware, make sure MLNX_OFED and"
		     " firmware are up to date",
		     (void *)dev);
		tmpl.rxq.crc_present = 0;
	}
	DEBUG("%p: CRC stripping is %s, %u bytes will be subtracted from"
	      " incoming frames to hide it",
	      (void *)dev,
	      tmpl.rxq.crc_present ? "disabled" : "enabled",
	      tmpl.rxq.crc_present << 2);
	if (!mlx5_getenv_int("MLX5_PMD_ENABLE_PADDING"))
		; /* Nothing else to do. */
	else if (priv->hw_padding) {
		INFO("%p: enabling packet padding on queue %p",
		     (void *)dev, (void *)rxq_ctrl);
		attr.wq.flags |= IBV_EXP_CREATE_WQ_FLAG_RX_END_PADDING;
		attr.wq.comp_mask |= IBV_EXP_CREATE_WQ_FLAGS;
	} else
		WARN("%p: packet padding has been requested but is not"
		     " supported, make sure MLNX_OFED and firmware are"
		     " up to date",
		     (void *)dev);

	tmpl.wq = ibv_exp_create_wq(priv->ctx, &attr.wq);
	if (tmpl.wq == NULL) {
		ret = (errno ? errno : EINVAL);
		ERROR("%p: WQ creation failure: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	/*
	 * Make sure number of WRs*SGEs match expectations since a queue
	 * cannot allocate more than "desc" buffers.
	 */
	if (((int)attr.wq.max_recv_wr != (desc >> tmpl.rxq.sges_n)) ||
	    ((int)attr.wq.max_recv_sge != (1 << tmpl.rxq.sges_n))) {
		ERROR("%p: requested %u*%u but got %u*%u WRs*SGEs",
		      (void *)dev,
		      (desc >> tmpl.rxq.sges_n), (1 << tmpl.rxq.sges_n),
		      attr.wq.max_recv_wr, attr.wq.max_recv_sge);
		ret = EINVAL;
		goto error;
	}
	/* Save port ID. */
	tmpl.rxq.port_id = dev->data->port_id;
	DEBUG("%p: RTE port ID: %u", (void *)rxq_ctrl, tmpl.rxq.port_id);
	attr.params = (struct ibv_exp_query_intf_params){
		.intf_scope = IBV_EXP_INTF_GLOBAL,
		.intf_version = 1,
		.intf = IBV_EXP_INTF_CQ,
		.obj = tmpl.cq,
	};
	tmpl.if_cq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
	if (tmpl.if_cq == NULL) {
		ret = EINVAL;
		ERROR("%p: CQ interface family query failed with status %d",
		      (void *)dev, status);
		goto error;
	}
	attr.params = (struct ibv_exp_query_intf_params){
		.intf_scope = IBV_EXP_INTF_GLOBAL,
		.intf = IBV_EXP_INTF_WQ,
		.obj = tmpl.wq,
	};
	tmpl.if_wq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
	if (tmpl.if_wq == NULL) {
		ret = EINVAL;
		ERROR("%p: WQ interface family query failed with status %d",
		      (void *)dev, status);
		goto error;
	}
	/* Change queue state to ready. */
	mod = (struct ibv_exp_wq_attr){
		.attr_mask = IBV_EXP_WQ_ATTR_STATE,
		.wq_state = IBV_EXP_WQS_RDY,
	};
	ret = ibv_exp_modify_wq(tmpl.wq, &mod);
	if (ret) {
		ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	ret = rxq_setup(&tmpl);
	if (ret) {
		ERROR("%p: cannot initialize RX queue structure: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	/* Reuse buffers from original queue if possible. */
	if (rxq_ctrl->rxq.elts_n) {
		assert(1 << rxq_ctrl->rxq.elts_n == desc);
		assert(rxq_ctrl->rxq.elts != tmpl.rxq.elts);
		ret = rxq_alloc_elts(&tmpl, desc, rxq_ctrl->rxq.elts);
	} else
		ret = rxq_alloc_elts(&tmpl, desc, NULL);
	if (ret) {
		ERROR("%p: RXQ allocation failed: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	/* Clean up rxq in case we're reinitializing it. */
	DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq_ctrl);
	rxq_cleanup(rxq_ctrl);
	/* Move mbuf pointers to dedicated storage area in RX queue. */
	elts = (void *)(rxq_ctrl + 1);
	rte_memcpy(elts, tmpl.rxq.elts, sizeof(*elts));
#ifndef NDEBUG
	memset(tmpl.rxq.elts, 0x55, sizeof(*elts));
#endif
	rte_free(tmpl.rxq.elts);
	tmpl.rxq.elts = elts;
	*rxq_ctrl = tmpl;
	/* Update doorbell counter. */
	rxq_ctrl->rxq.rq_ci = desc >> rxq_ctrl->rxq.sges_n;
	rte_wmb();
	*rxq_ctrl->rxq.rq_db = htonl(rxq_ctrl->rxq.rq_ci);
	DEBUG("%p: rxq updated with %p", (void *)rxq_ctrl, (void *)&tmpl);
	assert(ret == 0);
	return 0;
error:
	elts = tmpl.rxq.elts;
	rxq_cleanup(&tmpl);
	rte_free(elts);
	assert(ret > 0);
	return ret;
}

/**
 * DPDK callback to configure a RX queue.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param idx
 *   RX queue index.
 * @param desc
 *   Number of descriptors to configure in queue.
 * @param socket
 *   NUMA socket on which memory must be allocated.
 * @param[in] conf
 *   Thresholds parameters.
 * @param mp
 *   Memory pool for buffer allocations.
 *
 * @return
 *   0 on success, negative errno value on failure.
 */
int
mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
		    unsigned int socket, const struct rte_eth_rxconf *conf,
		    struct rte_mempool *mp)
{
	struct priv *priv = dev->data->dev_private;
	struct rxq *rxq = (*priv->rxqs)[idx];
	struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
	int ret;

	if (mlx5_is_secondary())
		return -E_RTE_SECONDARY;

	priv_lock(priv);
	if (!rte_is_power_of_2(desc)) {
		desc = 1 << log2above(desc);
		WARN("%p: increased number of descriptors in RX queue %u"
		     " to the next power of two (%d)",
		     (void *)dev, idx, desc);
	}
	DEBUG("%p: configuring queue %u for %u descriptors",
	      (void *)dev, idx, desc);
	if (idx >= priv->rxqs_n) {
		ERROR("%p: queue index out of range (%u >= %u)",
		      (void *)dev, idx, priv->rxqs_n);
		priv_unlock(priv);
		return -EOVERFLOW;
	}
	if (rxq != NULL) {
		DEBUG("%p: reusing already allocated queue index %u (%p)",
		      (void *)dev, idx, (void *)rxq);
		if (priv->started) {
			priv_unlock(priv);
			return -EEXIST;
		}
		(*priv->rxqs)[idx] = NULL;
		rxq_cleanup(rxq_ctrl);
		/* Resize if rxq size is changed. */
		if (rxq_ctrl->rxq.elts_n != log2above(desc)) {
			rxq_ctrl = rte_realloc(rxq_ctrl,
					       sizeof(*rxq_ctrl) +
					       desc * sizeof(struct rte_mbuf *),
					       RTE_CACHE_LINE_SIZE);
			if (!rxq_ctrl) {
				ERROR("%p: unable to reallocate queue index %u",
					(void *)dev, idx);
				priv_unlock(priv);
				return -ENOMEM;
			}
		}
	} else {
		rxq_ctrl = rte_calloc_socket("RXQ", 1, sizeof(*rxq_ctrl) +
					     desc * sizeof(struct rte_mbuf *),
					     0, socket);
		if (rxq_ctrl == NULL) {
			ERROR("%p: unable to allocate queue index %u",
			      (void *)dev, idx);
			priv_unlock(priv);
			return -ENOMEM;
		}
	}
	ret = rxq_ctrl_setup(dev, rxq_ctrl, desc, socket, conf, mp);
	if (ret)
		rte_free(rxq_ctrl);
	else {
		rxq_ctrl->rxq.stats.idx = idx;
		DEBUG("%p: adding RX queue %p to list",
		      (void *)dev, (void *)rxq_ctrl);
		(*priv->rxqs)[idx] = &rxq_ctrl->rxq;
		/* Update receive callback. */
		priv_select_rx_function(priv);
	}
	priv_unlock(priv);
	return -ret;
}

/**
 * DPDK callback to release a RX queue.
 *
 * @param dpdk_rxq
 *   Generic RX queue pointer.
 */
void
mlx5_rx_queue_release(void *dpdk_rxq)
{
	struct rxq *rxq = (struct rxq *)dpdk_rxq;
	struct rxq_ctrl *rxq_ctrl;
	struct priv *priv;
	unsigned int i;

	if (mlx5_is_secondary())
		return;

	if (rxq == NULL)
		return;
	rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
	priv = rxq_ctrl->priv;
	priv_lock(priv);
	for (i = 0; (i != priv->rxqs_n); ++i)
		if ((*priv->rxqs)[i] == rxq) {
			DEBUG("%p: removing RX queue %p from list",
			      (void *)priv->dev, (void *)rxq_ctrl);
			(*priv->rxqs)[i] = NULL;
			break;
		}
	rxq_cleanup(rxq_ctrl);
	rte_free(rxq_ctrl);
	priv_unlock(priv);
}

/**
 * DPDK callback for RX in secondary processes.
 *
 * This function configures all queues from primary process information
 * if necessary before reverting to the normal RX burst callback.
 *
 * @param dpdk_rxq
 *   Generic pointer to RX queue structure.
 * @param[out] pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 *
 * @return
 *   Number of packets successfully received (<= pkts_n).
 */
uint16_t
mlx5_rx_burst_secondary_setup(void *dpdk_rxq, struct rte_mbuf **pkts,
			      uint16_t pkts_n)
{
	struct rxq *rxq = dpdk_rxq;
	struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
	struct priv *priv = mlx5_secondary_data_setup(rxq_ctrl->priv);
	struct priv *primary_priv;
	unsigned int index;

	if (priv == NULL)
		return 0;
	primary_priv =
		mlx5_secondary_data[priv->dev->data->port_id].primary_priv;
	/* Look for queue index in both private structures. */
	for (index = 0; index != priv->rxqs_n; ++index)
		if (((*primary_priv->rxqs)[index] == rxq) ||
		    ((*priv->rxqs)[index] == rxq))
			break;
	if (index == priv->rxqs_n)
		return 0;
	rxq = (*priv->rxqs)[index];
	return priv->dev->rx_pkt_burst(rxq, pkts, pkts_n);
}