/*
 *------------------------------------------------------------------
 * Copyright (c) 2018 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *------------------------------------------------------------------
 */

#include <unistd.h>
#include <fcntl.h>
#include <net/if.h>
#include <linux/if_link.h>
#include <linux/if_ether.h>

#include <vppinfra/linux/sysfs.h>
#include <vlib/vlib.h>
#include <vlib/unix/unix.h>
#include <vlib/pci/pci.h>
#include <vnet/ethernet/ethernet.h>

#include <rdma/rdma.h>

/* Default RSS hash key (from DPDK MLX driver) */
static u8 rdma_rss_hash_key[] = {
  0x2c, 0xc6, 0x81, 0xd1,
  0x5b, 0xdb, 0xf4, 0xf7,
  0xfc, 0xa2, 0x83, 0x19,
  0xdb, 0x1a, 0x3e, 0x94,
  0x6b, 0x9e, 0x38, 0xd9,
  0x2c, 0x9c, 0x03, 0xd1,
  0xad, 0x99, 0x44, 0xa7,
  0xd9, 0x56, 0x3d, 0x59,
  0x06, 0x3c, 0x25, 0xf3,
  0xfc, 0x1f, 0xdc, 0x2a,
};

rdma_main_t rdma_main;

#define rdma_log__(lvl, dev, f, ...) \
  do { \
      vlib_log((lvl), rdma_main.log_class, "%s: " f, \
               &(dev)->name, ##__VA_ARGS__); \
  } while (0)

#define rdma_log(lvl, dev, f, ...) \
   rdma_log__((lvl), (dev), "%s (%d): " f, strerror(errno), errno, ##__VA_ARGS__)

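/* Install a verbs flow steering rule on port 1 matching the given
 * destination MAC address against the given mask */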
static struct ibv_flow *
rdma_rxq_init_flow (const rdma_device_t * rd, struct ibv_qp *qp,
		    const mac_address_t * mac, const mac_address_t * mask,
		    u32 flags)
{
  struct ibv_flow *flow;
  struct raw_eth_flow_attr
  {
    struct ibv_flow_attr attr;
    struct ibv_flow_spec_eth spec_eth;
  } __attribute__ ((packed)) fa;

  memset (&fa, 0, sizeof (fa));
  fa.attr.num_of_specs = 1;
  fa.attr.port = 1;
  fa.attr.flags = flags;
  fa.spec_eth.type = IBV_FLOW_SPEC_ETH;
  fa.spec_eth.size = sizeof (struct ibv_flow_spec_eth);

  memcpy (fa.spec_eth.val.dst_mac, mac, sizeof (fa.spec_eth.val.dst_mac));
  memcpy (fa.spec_eth.mask.dst_mac, mask, sizeof (fa.spec_eth.mask.dst_mac));

  flow = ibv_create_flow (qp, &fa.attr);
  if (!flow)
    rdma_log (VLIB_LOG_LEVEL_ERR, rd, "ibv_create_flow() failed");
  return flow;
}

static u32
rdma_rxq_destroy_flow (const rdma_device_t * rd, struct ibv_flow **flow)
{
  if (!*flow)
    return 0;

  if (ibv_destroy_flow (*flow))
    {
      rdma_log (VLIB_LOG_LEVEL_ERR, rd, "ibv_destroy_flow() failed");
      return ~0;
    }

  *flow = 0;
  return 0;
}

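/* Switch to promiscuous mode: replace the unicast and multicast flows with a
 * single flow matching any destination MAC (zero MAC, zero mask) */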
static u32
rdma_dev_set_promisc (rdma_device_t * rd)
{
  const mac_address_t all = {.bytes = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0} };
  int err;

  err = rdma_rxq_destroy_flow (rd, &rd->flow_mcast);
  if (err)
    return ~0;

  err = rdma_rxq_destroy_flow (rd, &rd->flow_ucast);
  if (err)
    return ~0;

  rd->flow_ucast = rdma_rxq_init_flow (rd, rd->rx_qp, &all, &all, 0);
  if (!rd->flow_ucast)
    return ~0;

  rd->flags |= RDMA_DEVICE_F_PROMISC;
  return 0;
}

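/* Switch to non-promiscuous (L3) mode: install one flow for our unicast MAC
 * and one for multicast destinations */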
static u32
rdma_dev_set_ucast (rdma_device_t * rd)
{
  const mac_address_t ucast = {.bytes = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}
  };
  const mac_address_t mcast = {.bytes = {0x1, 0x0, 0x0, 0x0, 0x0, 0x0} };
  int err;

  err = rdma_rxq_destroy_flow (rd, &rd->flow_mcast);
  if (err)
    return ~0;

  err = rdma_rxq_destroy_flow (rd, &rd->flow_ucast);
  if (err)
    return ~0;

  /* receive only packets with dst = our MAC */
  rd->flow_ucast = rdma_rxq_init_flow (rd, rd->rx_qp, &rd->hwaddr, &ucast, 0);
  if (!rd->flow_ucast)
    return ~0;

  /* receive multicast packets */
  rd->flow_mcast = rdma_rxq_init_flow (rd, rd->rx_qp, &mcast, &mcast,
				       IBV_FLOW_ATTR_FLAGS_DONT_TRAP
				       /* let others receive mcast packets too (e.g. Linux) */
    );
  if (!rd->flow_mcast)
    return ~0;

  rd->flags &= ~RDMA_DEVICE_F_PROMISC;
  return 0;
}

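/* MAC address change callback: refresh the unicast flow unless the device is
 * in promiscuous mode, rolling back the address on failure */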
static clib_error_t *
rdma_mac_change (vnet_hw_interface_t * hw, const u8 * old, const u8 * new)
{
  rdma_main_t *rm = &rdma_main;
  rdma_device_t *rd = vec_elt_at_index (rm->devices, hw->dev_instance);
  mac_address_from_bytes (&rd->hwaddr, new);
  if (!(rd->flags & RDMA_DEVICE_F_PROMISC) && rdma_dev_set_ucast (rd))
    {
      mac_address_from_bytes (&rd->hwaddr, old);
      return clib_error_return_unix (0, "MAC update failed");
    }
  return 0;
}

static u32
rdma_dev_change_mtu (rdma_device_t * rd)
{
  rdma_log__ (VLIB_LOG_LEVEL_ERR, rd, "MTU change not supported");
  return ~0;
}

static u32
rdma_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hw, u32 flags)
{
  rdma_main_t *rm = &rdma_main;
  rdma_device_t *rd = vec_elt_at_index (rm->devices, hw->dev_instance);

  switch (flags)
    {
    case ETHERNET_INTERFACE_FLAG_DEFAULT_L3:
      return rdma_dev_set_ucast (rd);
    case ETHERNET_INTERFACE_FLAG_ACCEPT_ALL:
      return rdma_dev_set_promisc (rd);
    case ETHERNET_INTERFACE_FLAG_MTU:
      return rdma_dev_change_mtu (rd);
    }

  rdma_log__ (VLIB_LOG_LEVEL_ERR, rd, "unknown flag %x requested", flags);
  return ~0;
}

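/* Query the IB port and reflect its state into the VPP link flags and link
 * speed (derived from active_width and active_speed, in kbps) */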
static void
rdma_update_state (vnet_main_t * vnm, rdma_device_t * rd, int port)
{
  struct ibv_port_attr attr;
  u32 width = 0;
  u32 speed = 0;

  if (ibv_query_port (rd->ctx, port, &attr))
    {
      vnet_hw_interface_set_link_speed (vnm, rd->hw_if_index, 0);
      vnet_hw_interface_set_flags (vnm, rd->hw_if_index, 0);
      return;
    }

  /* update state */
  switch (attr.state)
    {
    case IBV_PORT_ACTIVE:	/* fallthrough */
    case IBV_PORT_ACTIVE_DEFER:
      rd->flags |= RDMA_DEVICE_F_LINK_UP;
      vnet_hw_interface_set_flags (vnm, rd->hw_if_index,
				   VNET_HW_INTERFACE_FLAG_LINK_UP);
      break;
    default:
      rd->flags &= ~RDMA_DEVICE_F_LINK_UP;
      vnet_hw_interface_set_flags (vnm, rd->hw_if_index, 0);
      break;
    }

  /* update speed */
  switch (attr.active_width)
    {
    case 1:
      width = 1;
      break;
    case 2:
      width = 4;
      break;
    case 4:
      width = 8;
      break;
    case 8:
      width = 12;
      break;
    }
  switch (attr.active_speed)
    {
    case 1:
      speed = 2500000;
      break;
    case 2:
      speed = 5000000;
      break;
    case 4:			/* fallthrough */
    case 8:
      speed = 10000000;
      break;
    case 16:
      speed = 14000000;
      break;
    case 32:
      speed = 25000000;
      break;
    }
  vnet_hw_interface_set_link_speed (vnm, rd->hw_if_index, width * speed);
}

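/* Async event file handlers: translate RDMA async events (port state
 * changes, fatal device errors) into VPP interface state updates */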
static clib_error_t *
rdma_async_event_error_ready (clib_file_t * f)
{
  rdma_main_t *rm = &rdma_main;
  rdma_device_t *rd = vec_elt_at_index (rm->devices, f->private_data);
  return clib_error_return (0, "RDMA: %s: async event error", rd->name);
}

static clib_error_t *
rdma_async_event_read_ready (clib_file_t * f)
{
  vnet_main_t *vnm = vnet_get_main ();
  rdma_main_t *rm = &rdma_main;
  rdma_device_t *rd = vec_elt_at_index (rm->devices, f->private_data);
  int ret;
  struct ibv_async_event event;
  ret = ibv_get_async_event (rd->ctx, &event);
  if (ret < 0)
    return clib_error_return_unix (0, "ibv_get_async_event() failed");

  switch (event.event_type)
    {
    case IBV_EVENT_PORT_ACTIVE:
      rdma_update_state (vnm, rd, event.element.port_num);
      break;
    case IBV_EVENT_PORT_ERR:
      rdma_update_state (vnm, rd, event.element.port_num);
      break;
    case IBV_EVENT_DEVICE_FATAL:
      rd->flags &= ~RDMA_DEVICE_F_LINK_UP;
      vnet_hw_interface_set_flags (vnm, rd->hw_if_index, 0);
      vlib_log_emerg (rm->log_class, "%s: fatal error", rd->name);
      break;
    default:
      rdma_log__ (VLIB_LOG_LEVEL_ERR, rd, "unhandled RDMA async event %i",
		  event.event_type);
      break;
    }

  ibv_ack_async_event (&event);
  return 0;
}

static clib_error_t *
rdma_async_event_init (rdma_device_t * rd)
{
  clib_file_t t = { 0 };
  int ret;

  /* make RDMA async event fd non-blocking */
  ret = fcntl (rd->ctx->async_fd, F_GETFL);
  if (ret < 0)
    return clib_error_return_unix (0, "fcntl(F_GETFL) failed");

  ret = fcntl (rd->ctx->async_fd, F_SETFL, ret | O_NONBLOCK);
  if (ret < 0)
    return clib_error_return_unix (0, "fcntl(F_SETFL, O_NONBLOCK) failed");

  /* register RDMA async event fd */
  t.read_function = rdma_async_event_read_ready;
  t.file_descriptor = rd->ctx->async_fd;
  t.error_function = rdma_async_event_error_ready;
  t.private_data = rd->dev_instance;
  t.description = format (0, "%v async event", rd->name);

  rd->async_event_clib_file_index = clib_file_add (&file_main, &t);
  return 0;
}

static void
rdma_async_event_cleanup (rdma_device_t * rd)
{
  clib_file_del_by_index (&file_main, rd->async_event_clib_file_index);
}

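/* Register the device as a VPP ethernet interface, advertise DMAC filtering
 * support and start in L3 (non-promiscuous) mode */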
static clib_error_t *
rdma_register_interface (vnet_main_t * vnm, rdma_device_t * rd)
{
  clib_error_t *err =
    ethernet_register_interface (vnm, rdma_device_class.index,
				 rd->dev_instance, rd->hwaddr.bytes,
				 &rd->hw_if_index, rdma_flag_change);

  /* Indicate ability to support L3 DMAC filtering and
   * initialize interface to L3 non-promisc mode */
  vnet_hw_interface_t *hi = vnet_get_hw_interface (vnm, rd->hw_if_index);
  hi->flags |= VNET_HW_INTERFACE_FLAG_SUPPORTS_MAC_FILTER;
  ethernet_set_flags (vnm, rd->hw_if_index,
		      ETHERNET_INTERFACE_FLAG_DEFAULT_L3);
  return err;
}

static void
rdma_unregister_interface (vnet_main_t * vnm, rdma_device_t * rd)
{
  vnet_hw_interface_set_flags (vnm, rd->hw_if_index, 0);
  vnet_hw_interface_unassign_rx_thread (vnm, rd->hw_if_index, 0);
  ethernet_delete_interface (vnm, rd->hw_if_index);
}

static void
rdma_dev_cleanup (rdma_device_t * rd)
{
  rdma_main_t *rm = &rdma_main;
  rdma_rxq_t *rxq;
  rdma_txq_t *txq;

#define _(fn, arg) if (arg) \
  { \
    int rv; \
    if ((rv = fn (arg))) \
       rdma_log (VLIB_LOG_LEVEL_DEBUG, rd, #fn "() failed (rv = %d)", rv); \
  }

  _(ibv_destroy_flow, rd->flow_mcast);
  _(ibv_destroy_flow, rd->flow_ucast);
  _(ibv_dereg_mr, rd->mr);
  vec_foreach (txq, rd->txqs)
  {
    _(ibv_destroy_qp, txq->qp);
    _(ibv_destroy_cq, txq->cq);
  }
  vec_foreach (rxq, rd->rxqs)
  {
    _(ibv_destroy_wq, rxq->wq);
    _(ibv_destroy_cq, rxq->cq);
  }
  _(ibv_destroy_rwq_ind_table, rd->rx_rwq_ind_tbl);
  _(ibv_destroy_qp, rd->rx_qp);
  _(ibv_dealloc_pd, rd->pd);
  _(ibv_close_device, rd->ctx);
#undef _

  clib_error_free (rd->error);

  vec_free (rd->rxqs);
  vec_free (rd->txqs);
  vec_free (rd->name);
  vlib_pci_free_device_info (rd->pci);
  pool_put (rm->devices, rd);
}

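/* Create the RX completion queue and work queue for queue 'qid' and, in
 * direct-verbs mode, map the mlx5 CQ/WQ rings for use by the datapath */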
static clib_error_t *
rdma_rxq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc)
{
  rdma_rxq_t *rxq;
  struct ibv_wq_init_attr wqia;
  struct ibv_cq_init_attr_ex cqa = { };
  struct ibv_wq_attr wqa;
  struct ibv_cq_ex *cqex;

  vec_validate_aligned (rd->rxqs, qid, CLIB_CACHE_LINE_BYTES);
  rxq = vec_elt_at_index (rd->rxqs, qid);
  rxq->size = n_desc;
  vec_validate_aligned (rxq->bufs, n_desc - 1, CLIB_CACHE_LINE_BYTES);

  cqa.cqe = n_desc;
  if (rd->flags & RDMA_DEVICE_F_MLX5DV)
    {
      struct mlx5dv_cq_init_attr dvcq = { };
      dvcq.comp_mask = MLX5DV_CQ_INIT_ATTR_MASK_COMPRESSED_CQE;
      dvcq.cqe_comp_res_format = MLX5DV_CQE_RES_FORMAT_HASH;

      if ((cqex = mlx5dv_create_cq (rd->ctx, &cqa, &dvcq)) == 0)
	return clib_error_return_unix (0, "Create mlx5dv rx CQ Failed");
    }
  else
    {
      if ((cqex = ibv_create_cq_ex (rd->ctx, &cqa)) == 0)
	return clib_error_return_unix (0, "Create CQ Failed");
    }

  rxq->cq = ibv_cq_ex_to_cq (cqex);

  memset (&wqia, 0, sizeof (wqia));
  wqia.wq_type = IBV_WQT_RQ;
  wqia.max_wr = n_desc;
  wqia.max_sge = 1;
  wqia.pd = rd->pd;
  wqia.cq = rxq->cq;
  if ((rxq->wq = ibv_create_wq (rd->ctx, &wqia)) == 0)
    return clib_error_return_unix (0, "Create WQ Failed");

  memset (&wqa, 0, sizeof (wqa));
  wqa.attr_mask = IBV_WQ_ATTR_STATE;
  wqa.wq_state = IBV_WQS_RDY;
  if (ibv_modify_wq (rxq->wq, &wqa) != 0)
    return clib_error_return_unix (0, "Modify WQ (RDY) Failed");

  if (rd->flags & RDMA_DEVICE_F_MLX5DV)
    {
      struct mlx5dv_obj obj = { };
      struct mlx5dv_cq dv_cq;
      struct mlx5dv_rwq dv_rwq;
      u64 qw0;

      obj.cq.in = rxq->cq;
      obj.cq.out = &dv_cq;
      obj.rwq.in = rxq->wq;
      obj.rwq.out = &dv_rwq;

      if ((mlx5dv_init_obj (&obj, MLX5DV_OBJ_CQ | MLX5DV_OBJ_RWQ)))
	return clib_error_return_unix (0, "mlx5dv: failed to init rx obj");

      if (dv_cq.cqe_size != sizeof (mlx5dv_cqe_t))
	return clib_error_return_unix (0, "mlx5dv: incompatible rx CQE size");

      rxq->log2_cq_size = max_log2 (dv_cq.cqe_cnt);
      rxq->cqes = (mlx5dv_cqe_t *) dv_cq.buf;
      rxq->cq_db = (volatile u32 *) dv_cq.dbrec;
      rxq->cqn = dv_cq.cqn;

      rxq->wqes = (mlx5dv_rwq_t *) dv_rwq.buf;
      rxq->wq_db = (volatile u32 *) dv_rwq.dbrec;
      rxq->wq_stride = dv_rwq.stride;
      rxq->wqe_cnt = dv_rwq.wqe_cnt;

      qw0 = clib_host_to_net_u32 (vlib_buffer_get_default_data_size (vm));
      qw0 |= (u64) clib_host_to_net_u32 (rd->lkey) << 32;

      for (int i = 0; i < rxq->size; i++)
	rxq->wqes[i].dsz_and_lkey = qw0;

      for (int i = 0; i < (1 << rxq->log2_cq_size); i++)
	rxq->cqes[i].opcode_cqefmt_se_owner = 0xff;
    }

  return 0;
}

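/* Spread RX traffic across all RX work queues: build the RWQ indirection
 * table, create the RSS hash queue pair and install the default
 * (non-promiscuous) flows */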
static clib_error_t *
rdma_rxq_finalize (vlib_main_t * vm, rdma_device_t * rd)
{
  struct ibv_rwq_ind_table_init_attr rwqia;
  struct ibv_qp_init_attr_ex qpia;
  struct ibv_wq **ind_tbl;
  u32 i;

  ASSERT (is_pow2 (vec_len (rd->rxqs))
	  && "rxq number should be a power of 2");

  ind_tbl = vec_new (struct ibv_wq *, vec_len (rd->rxqs));
  vec_foreach_index (i, rd->rxqs)
    ind_tbl[i] = vec_elt_at_index (rd->rxqs, i)->wq;
  memset (&rwqia, 0, sizeof (rwqia));
  rwqia.log_ind_tbl_size = min_log2 (vec_len (ind_tbl));
  rwqia.ind_tbl = ind_tbl;
  if ((rd->rx_rwq_ind_tbl = ibv_create_rwq_ind_table (rd->ctx, &rwqia)) == 0)
    return clib_error_return_unix (0, "RWQ indirection table create failed");
  vec_free (ind_tbl);

  memset (&qpia, 0, sizeof (qpia));
  qpia.qp_type = IBV_QPT_RAW_PACKET;
  qpia.comp_mask =
    IBV_QP_INIT_ATTR_PD | IBV_QP_INIT_ATTR_IND_TABLE |
    IBV_QP_INIT_ATTR_RX_HASH;
  qpia.pd = rd->pd;
  qpia.rwq_ind_tbl = rd->rx_rwq_ind_tbl;
  STATIC_ASSERT_SIZEOF (rdma_rss_hash_key, 40);
  qpia.rx_hash_conf.rx_hash_key_len = sizeof (rdma_rss_hash_key);
  qpia.rx_hash_conf.rx_hash_key = rdma_rss_hash_key;
  qpia.rx_hash_conf.rx_hash_function = IBV_RX_HASH_FUNC_TOEPLITZ;
  qpia.rx_hash_conf.rx_hash_fields_mask =
    IBV_RX_HASH_SRC_IPV4 | IBV_RX_HASH_DST_IPV4;
  if ((rd->rx_qp = ibv_create_qp_ex (rd->ctx, &qpia)) == 0)
    return clib_error_return_unix (0, "Queue Pair create failed");

  if (rdma_dev_set_ucast (rd))
    return clib_error_return_unix (0, "Set unicast mode failed");

  return 0;
}

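/* Create the TX completion queue and queue pair for queue 'qid', move the QP
 * through INIT/RTR/RTS, and in direct-verbs mode map the mlx5 SQ/CQ rings
 * and prepare the WQE template used by the TX datapath */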
static clib_error_t *
rdma_txq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc)
{
  rdma_txq_t *txq;
  struct ibv_qp_init_attr qpia;
  struct ibv_qp_attr qpa;
  int qp_flags;

  vec_validate_aligned (rd->txqs, qid, CLIB_CACHE_LINE_BYTES);
  txq = vec_elt_at_index (rd->txqs, qid);
  ASSERT (is_pow2 (n_desc));
  txq->bufs_log2sz = min_log2 (n_desc);
  vec_validate_aligned (txq->bufs, n_desc - 1, CLIB_CACHE_LINE_BYTES);

  if ((txq->cq = ibv_create_cq (rd->ctx, n_desc, NULL, NULL, 0)) == 0)
    return clib_error_return_unix (0, "Create CQ Failed");

  memset (&qpia, 0, sizeof (qpia));
  qpia.send_cq = txq->cq;
  qpia.recv_cq = txq->cq;
  qpia.cap.max_send_wr = n_desc;
  qpia.cap.max_send_sge = 1;
  qpia.qp_type = IBV_QPT_RAW_PACKET;

  if ((txq->qp = ibv_create_qp (rd->pd, &qpia)) == 0)
    return clib_error_return_unix (0, "Queue Pair create failed");

  memset (&qpa, 0, sizeof (qpa));
  qp_flags = IBV_QP_STATE | IBV_QP_PORT;
  qpa.qp_state = IBV_QPS_INIT;
  qpa.port_num = 1;
  if (ibv_modify_qp (txq->qp, &qpa, qp_flags) != 0)
    return clib_error_return_unix (0, "Modify QP (init) Failed");

  memset (&qpa, 0, sizeof (qpa));
  qp_flags = IBV_QP_STATE;
  qpa.qp_state = IBV_QPS_RTR;
  if (ibv_modify_qp (txq->qp, &qpa, qp_flags) != 0)
    return clib_error_return_unix (0, "Modify QP (receive) Failed");

  memset (&qpa, 0, sizeof (qpa));
  qp_flags = IBV_QP_STATE;
  qpa.qp_state = IBV_QPS_RTS;
  if (ibv_modify_qp (txq->qp, &qpa, qp_flags) != 0)
    return clib_error_return_unix (0, "Modify QP (send) Failed");

  txq->ibv_cq = txq->cq;
  txq->ibv_qp = txq->qp;

  if (rd->flags & RDMA_DEVICE_F_MLX5DV)
    {
      rdma_mlx5_wqe_t *tmpl = (void *) txq->dv_wqe_tmpl;
      struct mlx5dv_cq dv_cq;
      struct mlx5dv_qp dv_qp;
      struct mlx5dv_obj obj = { };

      obj.cq.in = txq->cq;
      obj.cq.out = &dv_cq;
      obj.qp.in = txq->qp;
      obj.qp.out = &dv_qp;

      if (mlx5dv_init_obj (&obj, MLX5DV_OBJ_CQ | MLX5DV_OBJ_QP))
	return clib_error_return_unix (0, "DV init obj failed");

      if (RDMA_TXQ_BUF_SZ (txq) > dv_qp.sq.wqe_cnt
	  || !is_pow2 (dv_qp.sq.wqe_cnt)
	  || sizeof (rdma_mlx5_wqe_t) != dv_qp.sq.stride
	  || (uword) dv_qp.sq.buf % sizeof (rdma_mlx5_wqe_t))
	return clib_error_return (0, "Unsupported DV SQ parameters");

      if (RDMA_TXQ_BUF_SZ (txq) > dv_cq.cqe_cnt
	  || !is_pow2 (dv_cq.cqe_cnt)
	  || sizeof (struct mlx5_cqe64) != dv_cq.cqe_size
	  || (uword) dv_cq.buf % sizeof (struct mlx5_cqe64))
	return clib_error_return (0, "Unsupported DV CQ parameters");

      /* get SQ and doorbell addresses */
      txq->dv_sq_wqes = dv_qp.sq.buf;
      txq->dv_sq_dbrec = dv_qp.dbrec;
      txq->dv_sq_db = dv_qp.bf.reg;
      txq->dv_sq_log2sz = min_log2 (dv_qp.sq.wqe_cnt);

      /* get CQ and doorbell addresses */
      txq->dv_cq_cqes = dv_cq.buf;
      txq->dv_cq_dbrec = dv_cq.dbrec;
      txq->dv_cq_log2sz = min_log2 (dv_cq.cqe_cnt);

      /* init tx desc template */
      STATIC_ASSERT_SIZEOF (txq->dv_wqe_tmpl, sizeof (*tmpl));
      mlx5dv_set_ctrl_seg (&tmpl->ctrl, 0, MLX5_OPCODE_SEND, 0,
			   txq->qp->qp_num, 0, RDMA_MLX5_WQE_DS, 0,
			   RDMA_TXQ_DV_INVALID_ID);
      tmpl->eseg.inline_hdr_sz = htobe16 (MLX5_ETH_L2_INLINE_HEADER_SIZE);
      mlx5dv_set_data_seg (&tmpl->dseg, 0, rd->lkey, 0);
    }

  return 0;
}

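/* Per-device init: allocate the protection domain, register VPP buffer
 * memory as a memory region, then create the TX and RX queues (TX first,
 * see the warning below) */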
static clib_error_t *
rdma_dev_init (vlib_main_t * vm, rdma_device_t * rd, u32 rxq_size,
	       u32 txq_size, u32 rxq_num)
{
  clib_error_t *err;
  vlib_buffer_main_t *bm = vm->buffer_main;
  vlib_thread_main_t *tm = vlib_get_thread_main ();
  u32 i;

  if (rd->ctx == 0)
    return clib_error_return_unix (0, "Device Open Failed");

  if ((rd->pd = ibv_alloc_pd (rd->ctx)) == 0)
    return clib_error_return_unix (0, "PD Alloc Failed");

  if ((rd->mr = ibv_reg_mr (rd->pd, (void *) bm->buffer_mem_start,
			    bm->buffer_mem_size,
			    IBV_ACCESS_LOCAL_WRITE)) == 0)
    return clib_error_return_unix (0, "Register MR Failed");

  rd->lkey = rd->mr->lkey;	/* avoid indirection in datapath */

  ethernet_mac_address_generate (rd->hwaddr.bytes);
  /*
   * /!\ WARNING /!\ creation order is important
   * We *must* create TX queues *before* RX queues, otherwise we will receive
   * the broadcast packets we sent
   */
  for (i = 0; i < tm->n_vlib_mains; i++)
    if ((err = rdma_txq_init (vm, rd, i, txq_size)))
      return err;

  for (i = 0; i < rxq_num; i++)
    if ((err = rdma_rxq_init (vm, rd, i, rxq_size)))
      return err;
  if ((err = rdma_rxq_finalize (vm, rd)))
    return err;

  return 0;
}

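/* Translate a sysfs device link into a PCI address; returns 0 on failure */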
static uword
sysfs_path_to_pci_addr (char *path, vlib_pci_addr_t * addr)
{
  uword rv;
  unformat_input_t in;
  u8 *s;

  s = clib_sysfs_link_to_name (path);
  if (!s)
    return 0;

  unformat_init_string (&in, (char *) s, strlen ((char *) s));
  rv = unformat (&in, "%U", unformat_vlib_pci_addr, addr);
  unformat_free (&in);
  vec_free (s);
  return rv;
}

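/* Create an RDMA interface from the given Linux netdev (mlx5 only for now) */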
void
rdma_create_if (vlib_main_t * vm, rdma_create_if_args_t * args)
{
  vnet_main_t *vnm = vnet_get_main ();
  rdma_main_t *rm = &rdma_main;
  rdma_device_t *rd;
  vlib_pci_addr_t pci_addr;
  struct ibv_device **dev_list;
  int n_devs;
  u8 *s;
  u16 qid;
  int i;

  args->rxq_size = args->rxq_size ? args->rxq_size : 1024;
  args->txq_size = args->txq_size ? args->txq_size : 1024;
  args->rxq_num = args->rxq_num ? args->rxq_num : 1;

  if (!is_pow2 (args->rxq_num))
    {
      args->rv = VNET_API_ERROR_INVALID_VALUE;
      args->error =
	clib_error_return (0, "rx queue number must be a power of two");
      goto err0;
    }

  if (args->rxq_size < VLIB_FRAME_SIZE || args->txq_size < VLIB_FRAME_SIZE ||
      args->rxq_size > 65535 || args->txq_size > 65535 ||
      !is_pow2 (args->rxq_size) || !is_pow2 (args->txq_size))
    {
      args->rv = VNET_API_ERROR_INVALID_VALUE;
      args->error = clib_error_return (0, "queue size must be a power of two "
				       "between %i and 65535",
				       VLIB_FRAME_SIZE);
      goto err0;
    }

  dev_list = ibv_get_device_list (&n_devs);
  if (n_devs == 0)
    {
      args->error =
	clib_error_return_unix (0,
				"no RDMA devices available. Is the ib_uverbs module loaded?");
      goto err0;
    }

  /* get PCI address */
  s = format (0, "/sys/class/net/%s/device%c", args->ifname, 0);
  if (sysfs_path_to_pci_addr ((char *) s, &pci_addr) == 0)
    {
      args->error =
	clib_error_return (0, "cannot find PCI address for device %s",
			   args->ifname);
      goto err1;
    }

  pool_get_zero (rm->devices, rd);
  rd->dev_instance = rd - rm->devices;
  rd->per_interface_next_index = VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT;
  rd->linux_ifname = format (0, "%s", args->ifname);

  if (!args->name || 0 == args->name[0])
    rd->name = format (0, "%s/%d", args->ifname, rd->dev_instance);
  else
    rd->name = format (0, "%s", args->name);

  rd->pci = vlib_pci_get_device_info (vm, &pci_addr, &args->error);
  if (!rd->pci)
    goto err2;

  /* if we failed to parse NUMA node, default to 0 */
  if (-1 == rd->pci->numa_node)
    rd->pci->numa_node = 0;

  rd->pool = vlib_buffer_pool_get_default_for_numa (vm, rd->pci->numa_node);

  if (strncmp ((char *) rd->pci->driver_name, "mlx5_core", 9))
    {
      args->error =
	clib_error_return (0,
			   "invalid interface (only mlx5 supported for now)");
      goto err2;
    }

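  /* find and open the verbs device matching the netdev PCI address */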
  for (i = 0; i < n_devs; i++)
    {
      vlib_pci_addr_t addr;

      vec_reset_length (s);
      s = format (s, "%s/device%c", dev_list[i]->dev_path, 0);

      if (sysfs_path_to_pci_addr ((char *) s, &addr) == 0)
	continue;

      if (addr.as_u32 != rd->pci->addr.as_u32)
	continue;

      if ((rd->ctx = ibv_open_device (dev_list[i])))
	break;
    }

  if (args->mode != RDMA_MODE_IBV)
    {
      struct mlx5dv_context mlx5dv_attrs = { };

      if (mlx5dv_query_device (rd->ctx, &mlx5dv_attrs) == 0)
	{
	  if ((mlx5dv_attrs.flags & MLX5DV_CONTEXT_FLAGS_CQE_V1))
	    rd->flags |= RDMA_DEVICE_F_MLX5DV;
	}
      else
	{
	  if (args->mode == RDMA_MODE_DV)
	    {
	      args->error = clib_error_return (0, "Direct Verbs mode not "
					       "supported on this interface");
	      goto err2;
	    }
	}
    }

  if ((args->error = rdma_dev_init (vm, rd, args->rxq_size, args->txq_size,
				    args->rxq_num)))
    goto err2;

  if ((args->error = rdma_register_interface (vnm, rd)))
    goto err2;

  if ((args->error = rdma_async_event_init (rd)))
    goto err3;

  rdma_update_state (vnm, rd, 1);

  vnet_sw_interface_t *sw = vnet_get_hw_sw_interface (vnm, rd->hw_if_index);
  args->sw_if_index = rd->sw_if_index = sw->sw_if_index;
  /*
   * FIXME: add support for interrupt mode
   * vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, rd->hw_if_index);
   * hw->flags |= VNET_HW_INTERFACE_FLAG_SUPPORTS_INT_MODE;
   */
  vnet_hw_interface_set_input_node (vnm, rd->hw_if_index,
				    rdma_input_node.index);
  vec_foreach_index (qid, rd->rxqs)
    vnet_hw_interface_assign_rx_thread (vnm, rd->hw_if_index, qid, ~0);

  vec_free (s);
  return;

err3:
  rdma_unregister_interface (vnm, rd);
err2:
  rdma_dev_cleanup (rd);
err1:
  ibv_free_device_list (dev_list);
  vec_free (s);
  args->rv = VNET_API_ERROR_INVALID_INTERFACE;
err0:
  vlib_log_err (rm->log_class, "%U", format_clib_error, args->error);
}

void
rdma_delete_if (vlib_main_t * vm, rdma_device_t * rd)
{
  rdma_async_event_cleanup (rd);
  rdma_unregister_interface (vnet_get_main (), rd);
  rdma_dev_cleanup (rd);
}

static clib_error_t *
rdma_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags)
{
  vnet_hw_interface_t *hi = vnet_get_hw_interface (vnm, hw_if_index);
  rdma_main_t *rm = &rdma_main;
  rdma_device_t *rd = vec_elt_at_index (rm->devices, hi->dev_instance);
  uword is_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0;

  if (rd->flags & RDMA_DEVICE_F_ERROR)
    return clib_error_return (0, "device is in error state");

  if (is_up)
    {
      vnet_hw_interface_set_flags (vnm, rd->hw_if_index,
				   VNET_HW_INTERFACE_FLAG_LINK_UP);
      rd->flags |= RDMA_DEVICE_F_ADMIN_UP;
    }
  else
    {
      vnet_hw_interface_set_flags (vnm, rd->hw_if_index, 0);
      rd->flags &= ~RDMA_DEVICE_F_ADMIN_UP;
    }
  return 0;
}

static void
rdma_set_interface_next_node (vnet_main_t * vnm, u32 hw_if_index,
			      u32 node_index)
{
  rdma_main_t *rm = &rdma_main;
  vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index);
  rdma_device_t *rd = pool_elt_at_index (rm->devices, hw->dev_instance);
  rd->per_interface_next_index =
    ~0 ==
    node_index ? VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT :
    vlib_node_add_next (vlib_get_main (), rdma_input_node.index, node_index);
}

static char *rdma_tx_func_error_strings[] = {
#define _(n,s) s,
  foreach_rdma_tx_func_error
#undef _
};

/* *INDENT-OFF* */
VNET_DEVICE_CLASS (rdma_device_class) =
{
  .name = "RDMA interface",
  .format_device = format_rdma_device,
  .format_device_name = format_rdma_device_name,
  .admin_up_down_function = rdma_interface_admin_up_down,
  .rx_redirect_to_node = rdma_set_interface_next_node,
  .tx_function_n_errors = RDMA_TX_N_ERROR,
  .tx_function_error_strings = rdma_tx_func_error_strings,
  .mac_addr_change_function = rdma_mac_change,
};
/* *INDENT-ON* */

clib_error_t *
rdma_init (vlib_main_t * vm)
{
  rdma_main_t *rm = &rdma_main;
  vlib_thread_main_t *tm = vlib_get_thread_main ();

  rm->log_class = vlib_log_register_class ("rdma", 0);

  /* vlib_buffer_t template */
  vec_validate_aligned (rm->per_thread_data, tm->n_vlib_mains - 1,
			CLIB_CACHE_LINE_BYTES);

  for (int i = 0; i < tm->n_vlib_mains; i++)
    {
      rdma_per_thread_data_t *ptd = vec_elt_at_index (rm->per_thread_data, i);
      clib_memset (&ptd->buffer_template, 0, sizeof (vlib_buffer_t));
      ptd->buffer_template.flags = VLIB_BUFFER_TOTAL_LENGTH_VALID;
      ptd->buffer_template.ref_count = 1;
      vnet_buffer (&ptd->buffer_template)->sw_if_index[VLIB_TX] = (u32) ~ 0;
    }

  return 0;
}

/* *INDENT-OFF* */
VLIB_INIT_FUNCTION (rdma_init) =
{
  .runs_after = VLIB_INITS ("pci_bus_init"),
};
/* *INDENT-ON* */

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */