1fe750c24SBenoît Ganne/*
2fe750c24SBenoît Ganne *------------------------------------------------------------------
3fe750c24SBenoît Ganne * Copyright (c) 2018 Cisco and/or its affiliates.
4fe750c24SBenoît Ganne * Licensed under the Apache License, Version 2.0 (the "License");
5fe750c24SBenoît Ganne * you may not use this file except in compliance with the License.
6fe750c24SBenoît Ganne * You may obtain a copy of the License at:
7fe750c24SBenoît Ganne *
8fe750c24SBenoît Ganne *     http://www.apache.org/licenses/LICENSE-2.0
9fe750c24SBenoît Ganne *
10fe750c24SBenoît Ganne * Unless required by applicable law or agreed to in writing, software
11fe750c24SBenoît Ganne * distributed under the License is distributed on an "AS IS" BASIS,
12fe750c24SBenoît Ganne * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13fe750c24SBenoît Ganne * See the License for the specific language governing permissions and
14fe750c24SBenoît Ganne * limitations under the License.
15fe750c24SBenoît Ganne *------------------------------------------------------------------
16fe750c24SBenoît Ganne */
17fe750c24SBenoît Ganne
18fe750c24SBenoît Ganne#include <vlib/vlib.h>
19fe750c24SBenoît Ganne#include <vlib/unix/unix.h>
20fe750c24SBenoît Ganne#include <vlib/pci/pci.h>
21fe750c24SBenoît Ganne#include <vppinfra/ring.h>
22fe750c24SBenoît Ganne#include <vnet/ethernet/ethernet.h>
23fe750c24SBenoît Ganne#include <vnet/devices/devices.h>
24fe750c24SBenoît Ganne#include <rdma/rdma.h>
25fe750c24SBenoît Ganne
/* number of free+enqueue attempts per frame before dropping leftovers */
#define RDMA_TX_RETRIES 5

/* total number of data segments in the SQ ring
   (RDMA_MLX5_WQE_DS dseg per WQEBB) */
#define RDMA_TXQ_DV_DSEG_SZ(txq)        (RDMA_MLX5_WQE_DS * RDMA_TXQ_DV_SQ_SZ(txq))
/* number of WQEBBs required to hold (d) data segments, rounded up */
#define RDMA_TXQ_DV_DSEG2WQE(d)         (((d) + RDMA_MLX5_WQE_DS - 1) / RDMA_MLX5_WQE_DS)
30dc812d9aSBenoît Ganne
31dc812d9aSBenoît Ganne/*
32dc812d9aSBenoît Ganne * MLX5 direct verbs tx/free functions
33dc812d9aSBenoît Ganne */
34dc812d9aSBenoît Ganne
35dc812d9aSBenoît Gannestatic_always_inline void
36dc812d9aSBenoît Gannerdma_device_output_free_mlx5 (vlib_main_t * vm,
37dc812d9aSBenoît Ganne			      const vlib_node_runtime_t * node,
38dc812d9aSBenoît Ganne			      rdma_txq_t * txq)
39dc812d9aSBenoît Ganne{
40dc812d9aSBenoît Ganne  u16 idx = txq->dv_cq_idx;
41dc812d9aSBenoît Ganne  u32 cq_mask = pow2_mask (txq->dv_cq_log2sz);
42dc812d9aSBenoît Ganne  u32 sq_mask = pow2_mask (txq->dv_sq_log2sz);
43dc812d9aSBenoît Ganne  u32 mask = pow2_mask (txq->bufs_log2sz);
44dc812d9aSBenoît Ganne  u32 buf_sz = RDMA_TXQ_BUF_SZ (txq);
45dc812d9aSBenoît Ganne  u32 log2_cq_sz = txq->dv_cq_log2sz;
46dc812d9aSBenoît Ganne  struct mlx5_cqe64 *cqes = txq->dv_cq_cqes, *cur = cqes + (idx & cq_mask);
47dc812d9aSBenoît Ganne  u8 op_own, saved;
48dc812d9aSBenoît Ganne  const rdma_mlx5_wqe_t *wqe;
49dc812d9aSBenoît Ganne
50dc812d9aSBenoît Ganne  for (;;)
51dc812d9aSBenoît Ganne    {
52dc812d9aSBenoît Ganne      op_own = *(volatile u8 *) &cur->op_own;
53dc812d9aSBenoît Ganne      if (((idx >> log2_cq_sz) & MLX5_CQE_OWNER_MASK) !=
54dc812d9aSBenoît Ganne	  (op_own & MLX5_CQE_OWNER_MASK) || (op_own >> 4) == MLX5_CQE_INVALID)
55dc812d9aSBenoît Ganne	break;
56dc812d9aSBenoît Ganne      if (PREDICT_FALSE ((op_own >> 4)) != MLX5_CQE_REQ)
57dc812d9aSBenoît Ganne	vlib_error_count (vm, node->node_index, RDMA_TX_ERROR_COMPLETION, 1);
58dc812d9aSBenoît Ganne      idx++;
59dc812d9aSBenoît Ganne      cur = cqes + (idx & cq_mask);
60dc812d9aSBenoît Ganne    }
61dc812d9aSBenoît Ganne
62dc812d9aSBenoît Ganne  if (idx == txq->dv_cq_idx)
63dc812d9aSBenoît Ganne    return;			/* nothing to do */
64dc812d9aSBenoît Ganne
65dc812d9aSBenoît Ganne  cur = cqes + ((idx - 1) & cq_mask);
66dc812d9aSBenoît Ganne  saved = cur->op_own;
67dc812d9aSBenoît Ganne  (void) saved;
68dc812d9aSBenoît Ganne  cur->op_own = 0xf0;
69dc812d9aSBenoît Ganne  txq->dv_cq_idx = idx;
70dc812d9aSBenoît Ganne
71dc812d9aSBenoît Ganne  /* retrieve original WQE and get new tail counter */
72dc812d9aSBenoît Ganne  wqe = txq->dv_sq_wqes + (be16toh (cur->wqe_counter) & sq_mask);
73dc812d9aSBenoît Ganne  if (PREDICT_FALSE (wqe->ctrl.imm == RDMA_TXQ_DV_INVALID_ID))
74dc812d9aSBenoît Ganne    return;			/* can happen if CQE reports error for an intermediate WQE */
75dc812d9aSBenoît Ganne
76dc812d9aSBenoît Ganne  ASSERT (RDMA_TXQ_USED_SZ (txq->head, wqe->ctrl.imm) <= buf_sz &&
77dc812d9aSBenoît Ganne	  RDMA_TXQ_USED_SZ (wqe->ctrl.imm, txq->tail) < buf_sz);
78dc812d9aSBenoît Ganne
79dc812d9aSBenoît Ganne  /* free sent buffers and update txq head */
80dc812d9aSBenoît Ganne  vlib_buffer_free_from_ring (vm, txq->bufs, txq->head & mask, buf_sz,
81dc812d9aSBenoît Ganne			      RDMA_TXQ_USED_SZ (txq->head, wqe->ctrl.imm));
82dc812d9aSBenoît Ganne  txq->head = wqe->ctrl.imm;
83dc812d9aSBenoît Ganne
84dc812d9aSBenoît Ganne  /* ring doorbell */
85dc812d9aSBenoît Ganne  CLIB_MEMORY_STORE_BARRIER ();
86dc812d9aSBenoît Ganne  txq->dv_cq_dbrec[0] = htobe32 (idx);
87dc812d9aSBenoît Ganne}
88dc812d9aSBenoît Ganne
/*
 * Publish a batch of WQEs to the NIC. 'last' is the last WQE of the
 * batch; 'tail' is the new tail counter after the batch; txq->tail is
 * still the pre-batch value (first WQE position) at this point.
 */
static_always_inline void
rdma_device_output_tx_mlx5_doorbell (rdma_txq_t * txq, rdma_mlx5_wqe_t * last,
				     const u16 tail, u32 sq_mask)
{
  last->ctrl.imm = tail;	/* register item to free */
  last->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;	/* generate a CQE so we can free buffers */

  /* at least one WQE was built and the batch fits in the available space */
  ASSERT (tail != txq->tail &&
	  RDMA_TXQ_AVAIL_SZ (txq, txq->head, txq->tail) >=
	  RDMA_TXQ_USED_SZ (txq->tail, tail));

  /* WQE stores must be globally visible before the doorbell record
     exposes the new tail to the device */
  CLIB_MEMORY_STORE_BARRIER ();
  txq->dv_sq_dbrec[MLX5_SND_DBR] = htobe32 (tail);
  /* keep the doorbell register write ordered after the record update */
  CLIB_COMPILER_BARRIER ();
  /* ring the doorbell with the first 8 bytes of the batch's first WQE */
  txq->dv_sq_db[0] = *(u64 *) (txq->dv_sq_wqes + (txq->tail & sq_mask));
}
105dc812d9aSBenoît Ganne
106dc812d9aSBenoît Gannestatic_always_inline void
107dc812d9aSBenoît Gannerdma_mlx5_wqe_init (rdma_mlx5_wqe_t * wqe, const void *tmpl,
108dc812d9aSBenoît Ganne		    vlib_buffer_t * b, const u16 tail)
109dc812d9aSBenoît Ganne{
110dc812d9aSBenoît Ganne  u16 sz = b->current_length;
111aaa65a12SDamjan Marion  const void *cur = vlib_buffer_get_current (b);
112aaa65a12SDamjan Marion  uword addr = pointer_to_uword (cur);
113dc812d9aSBenoît Ganne
114dc812d9aSBenoît Ganne  clib_memcpy_fast (wqe, tmpl, RDMA_MLX5_WQE_SZ);
115dc812d9aSBenoît Ganne  /* speculatively copy at least MLX5_ETH_L2_INLINE_HEADER_SIZE (18-bytes) */
11672f4921cSBenoît Ganne  STATIC_ASSERT (STRUCT_SIZE_OF (struct mlx5_wqe_eth_seg, inline_hdr_start) +
11772f4921cSBenoît Ganne		 STRUCT_SIZE_OF (struct mlx5_wqe_eth_seg,
11872f4921cSBenoît Ganne				 inline_hdr) >=
11972f4921cSBenoît Ganne		 MLX5_ETH_L2_INLINE_HEADER_SIZE, "wrong size");
12072f4921cSBenoît Ganne  clib_memcpy_fast (wqe->eseg.inline_hdr_start, cur,
12172f4921cSBenoît Ganne		    MLX5_ETH_L2_INLINE_HEADER_SIZE);
122aaa65a12SDamjan Marion
123aaa65a12SDamjan Marion  wqe->wqe_index_lo = tail;
124aaa65a12SDamjan Marion  wqe->wqe_index_hi = tail >> 8;
125aaa65a12SDamjan Marion  if (PREDICT_TRUE (sz >= MLX5_ETH_L2_INLINE_HEADER_SIZE))
126aaa65a12SDamjan Marion    {
127aaa65a12SDamjan Marion      /* inline_hdr_sz is set to MLX5_ETH_L2_INLINE_HEADER_SIZE
128aaa65a12SDamjan Marion         in the template */
129aaa65a12SDamjan Marion      wqe->dseg.byte_count = htobe32 (sz - MLX5_ETH_L2_INLINE_HEADER_SIZE);
130aaa65a12SDamjan Marion      wqe->dseg.addr = htobe64 (addr + MLX5_ETH_L2_INLINE_HEADER_SIZE);
131aaa65a12SDamjan Marion    }
132aaa65a12SDamjan Marion  else
133aaa65a12SDamjan Marion    {
134aaa65a12SDamjan Marion      /* dseg.byte_count and desg.addr are set to 0 in the template */
135aaa65a12SDamjan Marion      wqe->eseg.inline_hdr_sz = htobe16 (sz);
136aaa65a12SDamjan Marion    }
137dc812d9aSBenoît Ganne}
138dc812d9aSBenoît Ganne
139dc812d9aSBenoît Ganne/*
140dc812d9aSBenoît Ganne * specific data path for chained buffers, supporting ring wrap-around
141dc812d9aSBenoît Ganne * contrary to the normal path - otherwise we may fail to enqueue chained
142dc812d9aSBenoît Ganne * buffers because we are close to the end of the ring while we still have
143dc812d9aSBenoît Ganne * plenty of descriptors available
144dc812d9aSBenoît Ganne */
/*
 * Slow-path enqueue supporting chained (multi-segment) buffers.
 * Entered from the fast path on the first chained buffer: 'n' of the
 * original 'n_left_from' packets remain, starting at WQE 'wqe' / ring
 * position 'tail'. Returns the number of packets enqueued.
 */
static_always_inline u32
rdma_device_output_tx_mlx5_chained (vlib_main_t * vm,
				    const vlib_node_runtime_t * node,
				    const rdma_device_t * rd,
				    rdma_txq_t * txq, u32 n_left_from, u32 n,
				    u32 * bi, vlib_buffer_t ** b,
				    rdma_mlx5_wqe_t * wqe, u16 tail)
{
  rdma_mlx5_wqe_t *last = wqe;
  u32 wqe_n = RDMA_TXQ_AVAIL_SZ (txq, txq->head, tail);
  u32 sq_mask = pow2_mask (txq->dv_sq_log2sz);
  u32 mask = pow2_mask (txq->bufs_log2sz);
  u32 dseg_mask = RDMA_TXQ_DV_DSEG_SZ (txq) - 1;
  /* all buffers share the memory region key taken from the template WQE */
  const u32 lkey = wqe[0].dseg.lkey;

  /* save indices of the packets already handled by the fast path */
  vlib_buffer_copy_indices (txq->bufs + (txq->tail & mask), bi,
			    n_left_from - n);

  while (n >= 1 && wqe_n >= 1)
    {
      u32 *bufs = txq->bufs + (tail & mask);
      rdma_mlx5_wqe_t *wqe = txq->dv_sq_wqes + (tail & sq_mask);

      /* setup the head WQE */
      rdma_mlx5_wqe_init (wqe, txq->dv_wqe_tmpl, b[0], tail);

      bufs[0] = bi[0];

      if (b[0]->flags & VLIB_BUFFER_NEXT_PRESENT)
	{
	  /*
	   * max number of available dseg:
	   *  - 4 dseg per WQEBB available
	   *  - max 32 dseg per WQE (5-bits length field in WQE ctrl)
	   */
#define RDMA_MLX5_WQE_DS_MAX    (1 << 5)
	  const u32 dseg_max =
	    clib_min (RDMA_MLX5_WQE_DS * (wqe_n - 1), RDMA_MLX5_WQE_DS_MAX);
	  vlib_buffer_t *chained_b = b[0];
	  u32 chained_n = 0;

	  /* there are exactly 4 dseg per WQEBB and we rely on that */
	  STATIC_ASSERT (RDMA_MLX5_WQE_DS *
			 sizeof (struct mlx5_wqe_data_seg) ==
			 MLX5_SEND_WQE_BB, "wrong size");

	  /*
	   * iterate over fragments, supporting ring wrap-around contrary to
	   * the normal path - otherwise we may fail to enqueue chained
	   * buffers because we are close to the end of the ring while we
	   * still have plenty of descriptors available
	   */
	  while (chained_n < dseg_max
		 && chained_b->flags & VLIB_BUFFER_NEXT_PRESENT)
	    {
	      /* locate the dseg slot for this fragment in the SQ ring */
	      struct mlx5_wqe_data_seg *dseg = (void *) txq->dv_sq_wqes;
	      dseg += ((tail + 1) * RDMA_MLX5_WQE_DS + chained_n) & dseg_mask;
	      if (((clib_address_t) dseg & (MLX5_SEND_WQE_BB - 1)) == 0)
		{
		  /*
		   * start of new WQEBB
		   * head/tail are shared between buffers and descriptor
		   * In order to maintain 1:1 correspondance between
		   * buffer index and descriptor index, we build
		   * 4-fragments chains and save the head
		   */
		  chained_b->flags &= ~(VLIB_BUFFER_NEXT_PRESENT |
					VLIB_BUFFER_TOTAL_LENGTH_VALID);
		  u32 idx = tail + 1 + RDMA_TXQ_DV_DSEG2WQE (chained_n);
		  idx &= mask;
		  txq->bufs[idx] = chained_b->next_buffer;
		}

	      chained_b = vlib_get_buffer (vm, chained_b->next_buffer);
	      dseg->byte_count = htobe32 (chained_b->current_length);
	      dseg->lkey = lkey;
	      dseg->addr = htobe64 (vlib_buffer_get_current_va (chained_b));

	      chained_n += 1;
	    }

	  if (chained_b->flags & VLIB_BUFFER_NEXT_PRESENT)
	    {
	      /*
	       * no descriptors left: drop the chain including 1st WQE
	       * skip the problematic packet and continue
	       */
	      vlib_buffer_free_from_ring (vm, txq->bufs, tail & mask,
					  RDMA_TXQ_BUF_SZ (txq), 1 +
					  RDMA_TXQ_DV_DSEG2WQE (chained_n));
	      vlib_error_count (vm, node->node_index,
				dseg_max == chained_n ?
				RDMA_TX_ERROR_SEGMENT_SIZE_EXCEEDED :
				RDMA_TX_ERROR_NO_FREE_SLOTS, 1);

	      /* fixup tail to overwrite wqe head with next packet */
	      tail -= 1;
	    }
	  else
	    {
	      /* update WQE descriptor with new dseg number */
	      ((u8 *) & wqe[0].ctrl.qpn_ds)[3] = RDMA_MLX5_WQE_DS + chained_n;

	      /* account for the extra WQEBBs consumed by the fragments */
	      tail += RDMA_TXQ_DV_DSEG2WQE (chained_n);
	      wqe_n -= RDMA_TXQ_DV_DSEG2WQE (chained_n);

	      last = wqe;
	    }
	}
      else
	{
	  /* not chained */
	  last = wqe;
	}

      tail += 1;
      bi += 1;
      b += 1;
      wqe_n -= 1;
      n -= 1;
    }

  if (n == n_left_from)
    return 0;			/* we fail to enqueue even a single packet */

  rdma_device_output_tx_mlx5_doorbell (txq, last, tail, sq_mask);
  return n_left_from - n;
}
273dc812d9aSBenoît Ganne
/*
 * mlx5 direct-verbs fast-path tx: one template-initialized WQE per
 * packet, single doorbell for the whole batch. Falls back to the
 * chained path as soon as a multi-segment buffer is seen.
 * Returns the number of packets enqueued.
 */
static_always_inline u32
rdma_device_output_tx_mlx5 (vlib_main_t * vm,
			    const vlib_node_runtime_t * node,
			    const rdma_device_t * rd, rdma_txq_t * txq,
			    const u32 n_left_from, u32 * bi,
			    vlib_buffer_t ** b)
{

  u32 sq_mask = pow2_mask (txq->dv_sq_log2sz);
  u32 mask = pow2_mask (txq->bufs_log2sz);
  rdma_mlx5_wqe_t *wqe;
  u32 n, n_wrap;
  u16 tail = txq->tail;

  ASSERT (RDMA_TXQ_BUF_SZ (txq) <= RDMA_TXQ_DV_SQ_SZ (txq));

  /* avoid wrap-around logic in core loop */
  n = clib_min (n_left_from, RDMA_TXQ_BUF_SZ (txq) - (tail & mask));
  n_wrap = n_left_from - n;

wrap_around:
  wqe = txq->dv_sq_wqes + (tail & sq_mask);

  while (n >= 8)
    {
      /* any chained buffer in the next 4 diverts the whole remainder of
         the batch to the chained slow path */
      u32 flags = b[0]->flags | b[1]->flags | b[2]->flags | b[3]->flags;
      if (PREDICT_FALSE (flags & VLIB_BUFFER_NEXT_PRESENT))
	return rdma_device_output_tx_mlx5_chained (vm, node, rd, txq,
						   n_left_from, n, bi, b, wqe,
						   tail);

      /* prefetch the next quad while building 4 WQEs */
      vlib_prefetch_buffer_header (b[4], LOAD);
      rdma_mlx5_wqe_init (wqe + 0, txq->dv_wqe_tmpl, b[0], tail + 0);

      vlib_prefetch_buffer_header (b[5], LOAD);
      rdma_mlx5_wqe_init (wqe + 1, txq->dv_wqe_tmpl, b[1], tail + 1);

      vlib_prefetch_buffer_header (b[6], LOAD);
      rdma_mlx5_wqe_init (wqe + 2, txq->dv_wqe_tmpl, b[2], tail + 2);

      vlib_prefetch_buffer_header (b[7], LOAD);
      rdma_mlx5_wqe_init (wqe + 3, txq->dv_wqe_tmpl, b[3], tail + 3);

      b += 4;
      tail += 4;
      wqe += 4;
      n -= 4;
    }

  while (n >= 1)
    {
      if (PREDICT_FALSE (b[0]->flags & VLIB_BUFFER_NEXT_PRESENT))
	return rdma_device_output_tx_mlx5_chained (vm, node, rd, txq,
						   n_left_from, n, bi, b, wqe,
						   tail);

      rdma_mlx5_wqe_init (wqe, txq->dv_wqe_tmpl, b[0], tail);

      b += 1;
      tail += 1;
      wqe += 1;
      n -= 1;
    }

  /* second pass for the portion that wraps past the end of the ring */
  if (n_wrap)
    {
      n = n_wrap;
      n_wrap = 0;
      goto wrap_around;
    }

  rdma_device_output_tx_mlx5_doorbell (txq, &wqe[-1], tail, sq_mask);
  return n_left_from;
}
348dc812d9aSBenoît Ganne
349dc812d9aSBenoît Ganne/*
350dc812d9aSBenoît Ganne * standard ibverb tx/free functions
351dc812d9aSBenoît Ganne */
352dc812d9aSBenoît Ganne
353fe750c24SBenoît Gannestatic_always_inline void
354dc812d9aSBenoît Gannerdma_device_output_free_ibverb (vlib_main_t * vm,
355dc812d9aSBenoît Ganne				const vlib_node_runtime_t * node,
356dc812d9aSBenoît Ganne				rdma_txq_t * txq)
357fe750c24SBenoît Ganne{
358fe750c24SBenoît Ganne  struct ibv_wc wc[VLIB_FRAME_SIZE];
359dc812d9aSBenoît Ganne  u32 mask = pow2_mask (txq->bufs_log2sz);
360dc812d9aSBenoît Ganne  u16 tail;
361e7e8bf37SBenoît Ganne  int n;
362fe750c24SBenoît Ganne
363dc812d9aSBenoît Ganne  n = ibv_poll_cq (txq->ibv_cq, VLIB_FRAME_SIZE, wc);
364e7e8bf37SBenoît Ganne  if (n <= 0)
365dc812d9aSBenoît Ganne    {
366dc812d9aSBenoît Ganne      if (PREDICT_FALSE (n < 0))
367dc812d9aSBenoît Ganne	vlib_error_count (vm, node->node_index, RDMA_TX_ERROR_COMPLETION, 1);
368dc812d9aSBenoît Ganne      return;
369dc812d9aSBenoît Ganne    }
370dc812d9aSBenoît Ganne
371dc812d9aSBenoît Ganne  while (PREDICT_FALSE (IBV_WC_SUCCESS != wc[n - 1].status))
372dc812d9aSBenoît Ganne    {
373dc812d9aSBenoît Ganne      vlib_error_count (vm, node->node_index, RDMA_TX_ERROR_COMPLETION, 1);
374dc812d9aSBenoît Ganne      n--;
375dc812d9aSBenoît Ganne      if (0 == n)
376dc812d9aSBenoît Ganne	return;
377dc812d9aSBenoît Ganne    }
378fe750c24SBenoît Ganne
379e7e8bf37SBenoît Ganne  tail = wc[n - 1].wr_id;
380dc812d9aSBenoît Ganne  vlib_buffer_free_from_ring (vm, txq->bufs, txq->head & mask,
381dc812d9aSBenoît Ganne			      RDMA_TXQ_BUF_SZ (txq),
382dc812d9aSBenoît Ganne			      RDMA_TXQ_USED_SZ (txq->head, tail));
383e7e8bf37SBenoît Ganne  txq->head = tail;
384fe750c24SBenoît Ganne}
385fe750c24SBenoît Ganne
386e7e8bf37SBenoît Gannestatic_always_inline u32
387dc812d9aSBenoît Gannerdma_device_output_tx_ibverb (vlib_main_t * vm,
388dc812d9aSBenoît Ganne			      const vlib_node_runtime_t * node,
389dc812d9aSBenoît Ganne			      const rdma_device_t * rd, rdma_txq_t * txq,
390dc812d9aSBenoît Ganne			      u32 n_left_from, u32 * bi, vlib_buffer_t ** b)
391fe750c24SBenoît Ganne{
392211ef2ebSBenoît Ganne  struct ibv_send_wr wr[VLIB_FRAME_SIZE], *w = wr;
393211ef2ebSBenoît Ganne  struct ibv_sge sge[VLIB_FRAME_SIZE], *s = sge;
394dc812d9aSBenoît Ganne  u32 n = n_left_from;
395fe750c24SBenoît Ganne
3962d25467dSBenoît Ganne  while (n >= 8)
397fe750c24SBenoît Ganne    {
3982d25467dSBenoît Ganne      vlib_prefetch_buffer_header (b[4], LOAD);
399211ef2ebSBenoît Ganne      s[0].addr = vlib_buffer_get_current_va (b[0]);
400211ef2ebSBenoît Ganne      s[0].length = b[0]->current_length;
401e7e8bf37SBenoît Ganne      s[0].lkey = rd->lkey;
402211ef2ebSBenoît Ganne
4032d25467dSBenoît Ganne      vlib_prefetch_buffer_header (b[5], LOAD);
404211ef2ebSBenoît Ganne      s[1].addr = vlib_buffer_get_current_va (b[1]);
405211ef2ebSBenoît Ganne      s[1].length = b[1]->current_length;
406e7e8bf37SBenoît Ganne      s[1].lkey = rd->lkey;
407211ef2ebSBenoît Ganne
4082d25467dSBenoît Ganne      vlib_prefetch_buffer_header (b[6], LOAD);
409dc195d68SBenoît Ganne      s[2].addr = vlib_buffer_get_current_va (b[2]);
410dc195d68SBenoît Ganne      s[2].length = b[2]->current_length;
411e7e8bf37SBenoît Ganne      s[2].lkey = rd->lkey;
412dc195d68SBenoît Ganne
4132d25467dSBenoît Ganne      vlib_prefetch_buffer_header (b[7], LOAD);
414dc195d68SBenoît Ganne      s[3].addr = vlib_buffer_get_current_va (b[3]);
415dc195d68SBenoît Ganne      s[3].length = b[3]->current_length;
416e7e8bf37SBenoît Ganne      s[3].lkey = rd->lkey;
417dc195d68SBenoît Ganne
41872f4921cSBenoît Ganne      clib_memset_u8 (&w[0], 0, sizeof (w[0]));
419dc195d68SBenoît Ganne      w[0].next = &w[0] + 1;
420211ef2ebSBenoît Ganne      w[0].sg_list = &s[0];
421211ef2ebSBenoît Ganne      w[0].num_sge = 1;
422211ef2ebSBenoît Ganne      w[0].opcode = IBV_WR_SEND;
423211ef2ebSBenoît Ganne
42472f4921cSBenoît Ganne      clib_memset_u8 (&w[1], 0, sizeof (w[1]));
425dc195d68SBenoît Ganne      w[1].next = &w[1] + 1;
426211ef2ebSBenoît Ganne      w[1].sg_list = &s[1];
427211ef2ebSBenoît Ganne      w[1].num_sge = 1;
428211ef2ebSBenoît Ganne      w[1].opcode = IBV_WR_SEND;
429211ef2ebSBenoît Ganne
43072f4921cSBenoît Ganne      clib_memset_u8 (&w[2], 0, sizeof (w[2]));
431dc195d68SBenoît Ganne      w[2].next = &w[2] + 1;
432dc195d68SBenoît Ganne      w[2].sg_list = &s[2];
433dc195d68SBenoît Ganne      w[2].num_sge = 1;
434dc195d68SBenoît Ganne      w[2].opcode = IBV_WR_SEND;
435dc195d68SBenoît Ganne
43672f4921cSBenoît Ganne      clib_memset_u8 (&w[3], 0, sizeof (w[3]));
437dc195d68SBenoît Ganne      w[3].next = &w[3] + 1;
438dc195d68SBenoît Ganne      w[3].sg_list = &s[3];
439dc195d68SBenoît Ganne      w[3].num_sge = 1;
440dc195d68SBenoît Ganne      w[3].opcode = IBV_WR_SEND;
441dc195d68SBenoît Ganne
442dc195d68SBenoît Ganne      s += 4;
443dc195d68SBenoît Ganne      w += 4;
444dc195d68SBenoît Ganne      b += 4;
445e7e8bf37SBenoît Ganne      n -= 4;
446211ef2ebSBenoît Ganne    }
447211ef2ebSBenoît Ganne
448e7e8bf37SBenoît Ganne  while (n >= 1)
449211ef2ebSBenoît Ganne    {
450211ef2ebSBenoît Ganne      s[0].addr = vlib_buffer_get_current_va (b[0]);
451211ef2ebSBenoît Ganne      s[0].length = b[0]->current_length;
452e7e8bf37SBenoît Ganne      s[0].lkey = rd->lkey;
453211ef2ebSBenoît Ganne
45472f4921cSBenoît Ganne      clib_memset_u8 (&w[0], 0, sizeof (w[0]));
455dc195d68SBenoît Ganne      w[0].next = &w[0] + 1;
456211ef2ebSBenoît Ganne      w[0].sg_list = &s[0];
457211ef2ebSBenoît Ganne      w[0].num_sge = 1;
458211ef2ebSBenoît Ganne      w[0].opcode = IBV_WR_SEND;
459211ef2ebSBenoît Ganne
460211ef2ebSBenoît Ganne      s += 1;
461211ef2ebSBenoît Ganne      w += 1;
462211ef2ebSBenoît Ganne      b += 1;
463e7e8bf37SBenoît Ganne      n -= 1;
464fe750c24SBenoît Ganne    }
465fe750c24SBenoît Ganne
466dc812d9aSBenoît Ganne  w[-1].wr_id = txq->tail;	/* register item to free */
467e7e8bf37SBenoît Ganne  w[-1].next = 0;		/* fix next pointer in WR linked-list */
468e7e8bf37SBenoît Ganne  w[-1].send_flags = IBV_SEND_SIGNALED;	/* generate a CQE so we can free buffers */
469211ef2ebSBenoît Ganne
470211ef2ebSBenoît Ganne  w = wr;
471dc812d9aSBenoît Ganne  if (PREDICT_FALSE (0 != ibv_post_send (txq->ibv_qp, w, &w)))
472dc812d9aSBenoît Ganne    {
473dc812d9aSBenoît Ganne      vlib_error_count (vm, node->node_index, RDMA_TX_ERROR_SUBMISSION,
474dc812d9aSBenoît Ganne			n_left_from - (w - wr));
475dc812d9aSBenoît Ganne      n_left_from = w - wr;
476dc812d9aSBenoît Ganne    }
477e7e8bf37SBenoît Ganne
478e7e8bf37SBenoît Ganne  return n_left_from;
479e7e8bf37SBenoît Ganne}
480e7e8bf37SBenoît Ganne
481dc812d9aSBenoît Ganne/*
482dc812d9aSBenoît Ganne * common tx/free functions
483dc812d9aSBenoît Ganne */
484dc812d9aSBenoît Ganne
485dc812d9aSBenoît Gannestatic_always_inline void
486dc812d9aSBenoît Gannerdma_device_output_free (vlib_main_t * vm, const vlib_node_runtime_t * node,
487dc812d9aSBenoît Ganne			 rdma_txq_t * txq, int is_mlx5dv)
488dc812d9aSBenoît Ganne{
489dc812d9aSBenoît Ganne  if (is_mlx5dv)
490dc812d9aSBenoît Ganne    rdma_device_output_free_mlx5 (vm, node, txq);
491dc812d9aSBenoît Ganne  else
492dc812d9aSBenoît Ganne    rdma_device_output_free_ibverb (vm, node, txq);
493dc812d9aSBenoît Ganne}
494dc812d9aSBenoît Ganne
/*
 * Attempt to enqueue up to n_left_from packets, bounded by the free
 * space in the tx ring. Returns the number of packets enqueued;
 * indices of enqueued buffers are saved in txq->bufs so the completion
 * handlers can free them later.
 */
static_always_inline u32
rdma_device_output_tx_try (vlib_main_t * vm, const vlib_node_runtime_t * node,
			   const rdma_device_t * rd, rdma_txq_t * txq,
			   u32 n_left_from, u32 * bi, int is_mlx5dv)
{
  vlib_buffer_t *b[VLIB_FRAME_SIZE];
  const u32 mask = pow2_mask (txq->bufs_log2sz);

  /* do not enqueue more packet than ring space */
  n_left_from = clib_min (n_left_from, RDMA_TXQ_AVAIL_SZ (txq, txq->head,
							  txq->tail));
  /* if ring is full, do nothing */
  if (PREDICT_FALSE (n_left_from == 0))
    return 0;

  vlib_get_buffers (vm, bi, b, n_left_from);

  /* n_left_from becomes the count of packets effectively posted */
  n_left_from = is_mlx5dv ?
    rdma_device_output_tx_mlx5 (vm, node, rd, txq, n_left_from, bi, b) :
    rdma_device_output_tx_ibverb (vm, node, rd, txq, n_left_from, bi, b);

  /* record posted buffer indices and advance the ring tail
     NOTE(review): the mlx5 chained path also writes txq->bufs and
     consumes extra ring slots per chain — verify the interaction of
     this copy/tail update with that path against rdma.h invariants */
  vlib_buffer_copy_indices_to_ring (txq->bufs, bi, txq->tail & mask,
				    RDMA_TXQ_BUF_SZ (txq), n_left_from);
  txq->tail += n_left_from;

  return n_left_from;
}
522dc812d9aSBenoît Ganne
523dc812d9aSBenoît Gannestatic_always_inline uword
524dc812d9aSBenoît Gannerdma_device_output_tx (vlib_main_t * vm, vlib_node_runtime_t * node,
525dc812d9aSBenoît Ganne		       vlib_frame_t * frame, rdma_device_t * rd,
526dc812d9aSBenoît Ganne		       int is_mlx5dv)
527e7e8bf37SBenoît Ganne{
528e7e8bf37SBenoît Ganne  u32 thread_index = vm->thread_index;
529e7e8bf37SBenoît Ganne  rdma_txq_t *txq =
530e7e8bf37SBenoît Ganne    vec_elt_at_index (rd->txqs, thread_index % vec_len (rd->txqs));
531e7e8bf37SBenoît Ganne  u32 *from;
532e7e8bf37SBenoît Ganne  u32 n_left_from;
533e7e8bf37SBenoît Ganne  int i;
534e7e8bf37SBenoît Ganne
535dc812d9aSBenoît Ganne  ASSERT (RDMA_TXQ_BUF_SZ (txq) >= VLIB_FRAME_SIZE);
536e7e8bf37SBenoît Ganne
537e7e8bf37SBenoît Ganne  from = vlib_frame_vector_args (frame);
538e7e8bf37SBenoît Ganne  n_left_from = frame->n_vectors;
539e7e8bf37SBenoît Ganne
540211ef2ebSBenoît Ganne  clib_spinlock_lock_if_init (&txq->lock);
541e7e8bf37SBenoît Ganne
542dc812d9aSBenoît Ganne  for (i = 0; i < RDMA_TX_RETRIES && n_left_from > 0; i++)
543211ef2ebSBenoît Ganne    {
544e7e8bf37SBenoît Ganne      u32 n_enq;
545dc812d9aSBenoît Ganne      rdma_device_output_free (vm, node, txq, is_mlx5dv);
546dc812d9aSBenoît Ganne      n_enq = rdma_device_output_tx_try (vm, node, rd, txq, n_left_from, from,
547dc812d9aSBenoît Ganne					 is_mlx5dv);
548dc812d9aSBenoît Ganne
549e7e8bf37SBenoît Ganne      n_left_from -= n_enq;
550e7e8bf37SBenoît Ganne      from += n_enq;
551211ef2ebSBenoît Ganne    }
552fe750c24SBenoît Ganne
553e7e8bf37SBenoît Ganne  clib_spinlock_unlock_if_init (&txq->lock);
554211ef2ebSBenoît Ganne
555e7e8bf37SBenoît Ganne  if (PREDICT_FALSE (n_left_from))
556211ef2ebSBenoît Ganne    {
557e7e8bf37SBenoît Ganne      vlib_buffer_free (vm, from, n_left_from);
558211ef2ebSBenoît Ganne      vlib_error_count (vm, node->node_index,
559e7e8bf37SBenoît Ganne			RDMA_TX_ERROR_NO_FREE_SLOTS, n_left_from);
560211ef2ebSBenoît Ganne    }
561fe750c24SBenoît Ganne
562e7e8bf37SBenoît Ganne  return frame->n_vectors - n_left_from;
563fe750c24SBenoît Ganne}
564fe750c24SBenoît Ganne
/*
 * Device-class tx function: resolve the device and dispatch with a
 * literal is_mlx5dv flag so the inliner specializes each datapath.
 */
VNET_DEVICE_CLASS_TX_FN (rdma_device_class) (vlib_main_t * vm,
					     vlib_node_runtime_t * node,
					     vlib_frame_t * frame)
{
  rdma_main_t *rm = &rdma_main;
  vnet_interface_output_runtime_t *ord = (void *) node->runtime_data;
  rdma_device_t *rd = pool_elt_at_index (rm->devices, ord->dev_instance);

  if (PREDICT_TRUE (rd->flags & RDMA_DEVICE_F_MLX5DV))
    return rdma_device_output_tx (vm, node, frame, rd, 1 /* is_mlx5dv */ );

  return rdma_device_output_tx (vm, node, frame, rd, 0 /* is_mlx5dv */ );
}
578dc812d9aSBenoît Ganne
579fe750c24SBenoît Ganne/*
580fe750c24SBenoît Ganne * fd.io coding-style-patch-verification: ON
581fe750c24SBenoît Ganne *
582fe750c24SBenoît Ganne * Local Variables:
583fe750c24SBenoît Ganne * eval: (c-set-style "gnu")
584fe750c24SBenoît Ganne * End:
585fe750c24SBenoît Ganne */
586