/*
 *------------------------------------------------------------------
 * Copyright (c) 2018 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *------------------------------------------------------------------
 */

#include <unistd.h>
#include <fcntl.h>
#include <net/if.h>
#include <linux/if_link.h>
#include <linux/if_ether.h>

#include <vppinfra/linux/sysfs.h>
#include <vlib/vlib.h>
#include <vlib/unix/unix.h>
#include <vlib/pci/pci.h>
#include <vnet/ethernet/ethernet.h>

#include <rdma/rdma.h>

/* Default RSS hash key (from DPDK MLX driver) */
static u8 rdma_rss_hash_key[] = {
  0x2c, 0xc6, 0x81, 0xd1,
  0x5b, 0xdb, 0xf4, 0xf7,
  0xfc, 0xa2, 0x83, 0x19,
  0xdb, 0x1a, 0x3e, 0x94,
  0x6b, 0x9e, 0x38, 0xd9,
  0x2c, 0x9c, 0x03, 0xd1,
  0xad, 0x99, 0x44, 0xa7,
  0xd9, 0x56, 0x3d, 0x59,
  0x06, 0x3c, 0x25, 0xf3,
  0xfc, 0x1f, 0xdc, 0x2a,
};

rdma_main_t rdma_main;

#define rdma_log__(lvl, dev, f, ...) \
  do { \
      vlib_log((lvl), rdma_main.log_class, "%s: " f, \
               &(dev)->name, ##__VA_ARGS__); \
  } while (0)

#define rdma_log(lvl, dev, f, ...) \
   rdma_log__((lvl), (dev), "%s (%d): " f, strerror(errno), errno, ##__VA_ARGS__)

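/*
 * Install a verbs flow steering rule on the RX queue pair: packets whose
 * destination MAC matches 'mac' under 'mask' are steered to 'qp'.
 */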
static struct ibv_flow *
rdma_rxq_init_flow (const rdma_device_t * rd, struct ibv_qp *qp,
		    const mac_address_t * mac, const mac_address_t * mask,
		    u32 flags)
{
  struct ibv_flow *flow;
  struct raw_eth_flow_attr
  {
    struct ibv_flow_attr attr;
    struct ibv_flow_spec_eth spec_eth;
  } __attribute__ ((packed)) fa;

  memset (&fa, 0, sizeof (fa));
  fa.attr.num_of_specs = 1;
  fa.attr.port = 1;
  fa.attr.flags = flags;
  fa.spec_eth.type = IBV_FLOW_SPEC_ETH;
  fa.spec_eth.size = sizeof (struct ibv_flow_spec_eth);

  memcpy (fa.spec_eth.val.dst_mac, mac, sizeof (fa.spec_eth.val.dst_mac));
  memcpy (fa.spec_eth.mask.dst_mac, mask, sizeof (fa.spec_eth.mask.dst_mac));

  flow = ibv_create_flow (qp, &fa.attr);
  if (!flow)
    rdma_log (VLIB_LOG_LEVEL_ERR, rd, "ibv_create_flow() failed");
  return flow;
}

static u32
rdma_rxq_destroy_flow (const rdma_device_t * rd, struct ibv_flow **flow)
{
  if (!*flow)
    return 0;

  if (ibv_destroy_flow (*flow))
    {
      rdma_log (VLIB_LOG_LEVEL_ERR, rd, "ibv_destroy_flow() failed");
      return ~0;
    }

  *flow = 0;
  return 0;
}

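/*
 * Promiscuous mode: drop the unicast and multicast rules and install a
 * single catch-all rule (an all-zero MAC under an all-zero mask matches
 * every destination).
 */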
static u32
rdma_dev_set_promisc (rdma_device_t * rd)
{
  const mac_address_t all = {.bytes = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0} };
  int err;

  err = rdma_rxq_destroy_flow (rd, &rd->flow_mcast);
  if (err)
    return ~0;

  err = rdma_rxq_destroy_flow (rd, &rd->flow_ucast);
  if (err)
    return ~0;

  rd->flow_ucast = rdma_rxq_init_flow (rd, rd->rx_qp, &all, &all, 0);
  if (!rd->flow_ucast)
    return ~0;

  rd->flags |= RDMA_DEVICE_F_PROMISC;
  return 0;
}

static u32
rdma_dev_set_ucast (rdma_device_t * rd)
{
  const mac_address_t ucast = {.bytes = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff} };
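  /* a MAC with only the group (multicast) bit set: used below as both value
   * and mask, it matches every multicast destination address */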
  const mac_address_t mcast = {.bytes = {0x1, 0x0, 0x0, 0x0, 0x0, 0x0} };
  int err;

  err = rdma_rxq_destroy_flow (rd, &rd->flow_mcast);
  if (err)
    return ~0;

  err = rdma_rxq_destroy_flow (rd, &rd->flow_ucast);
  if (err)
    return ~0;

  /* receive only packets with dst = our MAC */
  rd->flow_ucast = rdma_rxq_init_flow (rd, rd->rx_qp, &rd->hwaddr, &ucast, 0);
  if (!rd->flow_ucast)
    return ~0;

  /* receive multicast packets */
  rd->flow_mcast = rdma_rxq_init_flow (rd, rd->rx_qp, &mcast, &mcast,
				       IBV_FLOW_ATTR_FLAGS_DONT_TRAP
				       /* let others receive mcast packets too (e.g. Linux) */
    );
  if (!rd->flow_mcast)
    return ~0;

  rd->flags &= ~RDMA_DEVICE_F_PROMISC;
  return 0;
}

static clib_error_t *
rdma_mac_change (vnet_hw_interface_t * hw, const u8 * old, const u8 * new)
{
  rdma_main_t *rm = &rdma_main;
  rdma_device_t *rd = vec_elt_at_index (rm->devices, hw->dev_instance);
  mac_address_from_bytes (&rd->hwaddr, new);
  if (!(rd->flags & RDMA_DEVICE_F_PROMISC) && rdma_dev_set_ucast (rd))
    {
      mac_address_from_bytes (&rd->hwaddr, old);
      return clib_error_return_unix (0, "MAC update failed");
    }
  return 0;
}

static u32
rdma_dev_change_mtu (rdma_device_t * rd)
{
  rdma_log__ (VLIB_LOG_LEVEL_ERR, rd, "MTU change not supported");
  return ~0;
}

static u32
rdma_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hw, u32 flags)
{
  rdma_main_t *rm = &rdma_main;
  rdma_device_t *rd = vec_elt_at_index (rm->devices, hw->dev_instance);

  switch (flags)
    {
    case 0:
      return rdma_dev_set_ucast (rd);
    case ETHERNET_INTERFACE_FLAG_ACCEPT_ALL:
      return rdma_dev_set_promisc (rd);
    case ETHERNET_INTERFACE_FLAG_MTU:
      return rdma_dev_change_mtu (rd);
    }

  rdma_log__ (VLIB_LOG_LEVEL_ERR, rd, "unknown flag %x requested", flags);
  return ~0;
}

static void
rdma_update_state (vnet_main_t * vnm, rdma_device_t * rd, int port)
{
  struct ibv_port_attr attr;
  u32 width = 0;
  u32 speed = 0;

  if (ibv_query_port (rd->ctx, port, &attr))
    {
      vnet_hw_interface_set_link_speed (vnm, rd->hw_if_index, 0);
      vnet_hw_interface_set_flags (vnm, rd->hw_if_index, 0);
      return;
    }

  /* update state */
  switch (attr.state)
    {
    case IBV_PORT_ACTIVE:	/* fallthrough */
    case IBV_PORT_ACTIVE_DEFER:
      rd->flags |= RDMA_DEVICE_F_LINK_UP;
      vnet_hw_interface_set_flags (vnm, rd->hw_if_index,
				   VNET_HW_INTERFACE_FLAG_LINK_UP);
      break;
    default:
      rd->flags &= ~RDMA_DEVICE_F_LINK_UP;
      vnet_hw_interface_set_flags (vnm, rd->hw_if_index, 0);
      break;
    }

  /* update speed */
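  /* ibv_query_port() reports IB-style encodings: active_width 1/2/4/8
   * stands for 1x/4x/8x/12x lanes, active_speed 1/2/4/8/16/32 for
   * 2.5/5/10/10/14/25 Gbps per lane (expressed below in kbps). The link
   * speed reported to VPP is lanes x per-lane speed. */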
  switch (attr.active_width)
    {
    case 1:
      width = 1;
      break;
    case 2:
      width = 4;
      break;
    case 4:
      width = 8;
      break;
    case 8:
      width = 12;
      break;
    }
  switch (attr.active_speed)
    {
    case 1:
      speed = 2500000;
      break;
    case 2:
      speed = 5000000;
      break;
    case 4:			/* fallthrough */
    case 8:
      speed = 10000000;
      break;
    case 16:
      speed = 14000000;
      break;
    case 32:
      speed = 25000000;
      break;
    }
  vnet_hw_interface_set_link_speed (vnm, rd->hw_if_index, width * speed);
}

static clib_error_t *
rdma_async_event_error_ready (clib_file_t * f)
{
  rdma_main_t *rm = &rdma_main;
  rdma_device_t *rd = vec_elt_at_index (rm->devices, f->private_data);
  return clib_error_return (0, "RDMA: %s: async event error", rd->name);
}

static clib_error_t *
rdma_async_event_read_ready (clib_file_t * f)
{
  vnet_main_t *vnm = vnet_get_main ();
  rdma_main_t *rm = &rdma_main;
  rdma_device_t *rd = vec_elt_at_index (rm->devices, f->private_data);
  int ret;
  struct ibv_async_event event;
  ret = ibv_get_async_event (rd->ctx, &event);
  if (ret < 0)
    return clib_error_return_unix (0, "ibv_get_async_event() failed");

  switch (event.event_type)
    {
    case IBV_EVENT_PORT_ACTIVE:
      rdma_update_state (vnm, rd, event.element.port_num);
      break;
    case IBV_EVENT_PORT_ERR:
      rdma_update_state (vnm, rd, event.element.port_num);
      break;
    case IBV_EVENT_DEVICE_FATAL:
      rd->flags &= ~RDMA_DEVICE_F_LINK_UP;
      vnet_hw_interface_set_flags (vnm, rd->hw_if_index, 0);
      vlib_log_emerg (rm->log_class, "%s: fatal error", rd->name);
      break;
    default:
      rdma_log__ (VLIB_LOG_LEVEL_ERR, rd, "unhandled RDMA async event %i",
		  event.event_type);
      break;
    }

  ibv_ack_async_event (&event);
  return 0;
}

static clib_error_t *
rdma_async_event_init (rdma_device_t * rd)
{
  clib_file_t t = { 0 };
  int ret;

  /* make RDMA async event fd non-blocking */
  ret = fcntl (rd->ctx->async_fd, F_GETFL);
  if (ret < 0)
    return clib_error_return_unix (0, "fcntl(F_GETFL) failed");

  ret = fcntl (rd->ctx->async_fd, F_SETFL, ret | O_NONBLOCK);
  if (ret < 0)
    return clib_error_return_unix (0, "fcntl(F_SETFL, O_NONBLOCK) failed");

  /* register RDMA async event fd */
  t.read_function = rdma_async_event_read_ready;
  t.file_descriptor = rd->ctx->async_fd;
  t.error_function = rdma_async_event_error_ready;
  t.private_data = rd->dev_instance;
  t.description = format (0, "%s async event", rd->name);

  rd->async_event_clib_file_index = clib_file_add (&file_main, &t);
  return 0;
}

static void
rdma_async_event_cleanup (rdma_device_t * rd)
{
  clib_file_del_by_index (&file_main, rd->async_event_clib_file_index);
}

static clib_error_t *
rdma_register_interface (vnet_main_t * vnm, rdma_device_t * rd)
{
  return ethernet_register_interface (vnm, rdma_device_class.index,
				      rd->dev_instance, rd->hwaddr.bytes,
				      &rd->hw_if_index, rdma_flag_change);
}

static void
rdma_unregister_interface (vnet_main_t * vnm, rdma_device_t * rd)
{
  vnet_hw_interface_set_flags (vnm, rd->hw_if_index, 0);
  vnet_hw_interface_unassign_rx_thread (vnm, rd->hw_if_index, 0);
  ethernet_delete_interface (vnm, rd->hw_if_index);
}

static void
rdma_dev_cleanup (rdma_device_t * rd)
{
  rdma_main_t *rm = &rdma_main;
  rdma_rxq_t *rxq;
  rdma_txq_t *txq;

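  /* destroy verbs objects in dependency order: flow rules before the RX QP,
   * QPs and WQs before their CQs, everything before the PD, and the PD
   * before the device context */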
#define _(fn, arg) if (arg) \
  { \
    int rv; \
    if ((rv = fn (arg))) \
       rdma_log (VLIB_LOG_LEVEL_DEBUG, rd, #fn "() failed (rv = %d)", rv); \
  }

  _(ibv_destroy_flow, rd->flow_mcast);
  _(ibv_destroy_flow, rd->flow_ucast);
  _(ibv_dereg_mr, rd->mr);
  vec_foreach (txq, rd->txqs)
  {
    _(ibv_destroy_qp, txq->qp);
    _(ibv_destroy_cq, txq->cq);
  }
  vec_foreach (rxq, rd->rxqs)
  {
    _(ibv_destroy_wq, rxq->wq);
    _(ibv_destroy_cq, rxq->cq);
  }
  _(ibv_destroy_rwq_ind_table, rd->rx_rwq_ind_tbl);
  _(ibv_destroy_qp, rd->rx_qp);
  _(ibv_dealloc_pd, rd->pd);
  _(ibv_close_device, rd->ctx);
#undef _

  clib_error_free (rd->error);

  vec_free (rd->rxqs);
  vec_free (rd->txqs);
  vec_free (rd->name);
  vlib_pci_free_device_info (rd->pci);
  pool_put (rm->devices, rd);
}

static clib_error_t *
rdma_rxq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc)
{
  rdma_rxq_t *rxq;
  struct ibv_wq_init_attr wqia;
  struct ibv_wq_attr wqa;

  vec_validate_aligned (rd->rxqs, qid, CLIB_CACHE_LINE_BYTES);
  rxq = vec_elt_at_index (rd->rxqs, qid);
  rxq->size = n_desc;
  vec_validate_aligned (rxq->bufs, n_desc - 1, CLIB_CACHE_LINE_BYTES);

  if ((rxq->cq = ibv_create_cq (rd->ctx, n_desc, NULL, NULL, 0)) == 0)
    return clib_error_return_unix (0, "Create CQ Failed");

  memset (&wqia, 0, sizeof (wqia));
  wqia.wq_type = IBV_WQT_RQ;
  wqia.max_wr = n_desc;
  wqia.max_sge = 1;
  wqia.pd = rd->pd;
  wqia.cq = rxq->cq;
  if ((rxq->wq = ibv_create_wq (rd->ctx, &wqia)) == 0)
    return clib_error_return_unix (0, "Create WQ Failed");

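  /* a freshly created WQ is in RESET state; move it to RDY so it can
   * receive */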
  memset (&wqa, 0, sizeof (wqa));
  wqa.attr_mask = IBV_WQ_ATTR_STATE;
  wqa.wq_state = IBV_WQS_RDY;
  if (ibv_modify_wq (rxq->wq, &wqa) != 0)
    return clib_error_return_unix (0, "Modify WQ (RDY) Failed");

  return 0;
}

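/*
 * Once all receive WQs are created, spread the RX load across them: build
 * an indirection table over the WQs and create a hash QP that RSS-hashes
 * incoming packets (Toeplitz over IPv4 src/dst addresses) into that table.
 */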
static clib_error_t *
rdma_rxq_finalize (vlib_main_t * vm, rdma_device_t * rd)
{
  struct ibv_rwq_ind_table_init_attr rwqia;
  struct ibv_qp_init_attr_ex qpia;
  struct ibv_wq **ind_tbl;
  u32 i;

  ASSERT (is_pow2 (vec_len (rd->rxqs))
	  && "rxq number should be a power of 2");

  ind_tbl = vec_new (struct ibv_wq *, vec_len (rd->rxqs));
  vec_foreach_index (i, rd->rxqs)
    ind_tbl[i] = vec_elt_at_index (rd->rxqs, i)->wq;
  memset (&rwqia, 0, sizeof (rwqia));
  rwqia.log_ind_tbl_size = min_log2 (vec_len (ind_tbl));
  rwqia.ind_tbl = ind_tbl;
  if ((rd->rx_rwq_ind_tbl = ibv_create_rwq_ind_table (rd->ctx, &rwqia)) == 0)
    return clib_error_return_unix (0, "RWQ indirection table create failed");
  vec_free (ind_tbl);

  memset (&qpia, 0, sizeof (qpia));
  qpia.qp_type = IBV_QPT_RAW_PACKET;
  qpia.comp_mask =
    IBV_QP_INIT_ATTR_PD | IBV_QP_INIT_ATTR_IND_TABLE |
    IBV_QP_INIT_ATTR_RX_HASH;
  qpia.pd = rd->pd;
  qpia.rwq_ind_tbl = rd->rx_rwq_ind_tbl;
  STATIC_ASSERT_SIZEOF (rdma_rss_hash_key, 40);
  qpia.rx_hash_conf.rx_hash_key_len = sizeof (rdma_rss_hash_key);
  qpia.rx_hash_conf.rx_hash_key = rdma_rss_hash_key;
  qpia.rx_hash_conf.rx_hash_function = IBV_RX_HASH_FUNC_TOEPLITZ;
  qpia.rx_hash_conf.rx_hash_fields_mask =
    IBV_RX_HASH_SRC_IPV4 | IBV_RX_HASH_DST_IPV4;
  if ((rd->rx_qp = ibv_create_qp_ex (rd->ctx, &qpia)) == 0)
    return clib_error_return_unix (0, "Queue Pair create failed");

  if (rdma_dev_set_ucast (rd))
    return clib_error_return_unix (0, "Set unicast mode failed");

  return 0;
}

static clib_error_t *
rdma_txq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc)
{
  rdma_txq_t *txq;
  struct ibv_qp_init_attr qpia;
  struct ibv_qp_attr qpa;
  int qp_flags;

  vec_validate_aligned (rd->txqs, qid, CLIB_CACHE_LINE_BYTES);
  txq = vec_elt_at_index (rd->txqs, qid);
  txq->size = n_desc;
  vec_validate_aligned (txq->bufs, n_desc - 1, CLIB_CACHE_LINE_BYTES);

  if ((txq->cq = ibv_create_cq (rd->ctx, n_desc, NULL, NULL, 0)) == 0)
    return clib_error_return_unix (0, "Create CQ Failed");

  memset (&qpia, 0, sizeof (qpia));
  qpia.send_cq = txq->cq;
  qpia.recv_cq = txq->cq;
  qpia.cap.max_send_wr = n_desc;
  qpia.cap.max_send_sge = 1;
  qpia.qp_type = IBV_QPT_RAW_PACKET;

  if ((txq->qp = ibv_create_qp (rd->pd, &qpia)) == 0)
    return clib_error_return_unix (0, "Queue Pair create failed");

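  /* a new QP starts in RESET state; walk it through INIT -> RTR -> RTS
   * before it can transmit */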
  memset (&qpa, 0, sizeof (qpa));
  qp_flags = IBV_QP_STATE | IBV_QP_PORT;
  qpa.qp_state = IBV_QPS_INIT;
  qpa.port_num = 1;
  if (ibv_modify_qp (txq->qp, &qpa, qp_flags) != 0)
    return clib_error_return_unix (0, "Modify QP (init) Failed");

  memset (&qpa, 0, sizeof (qpa));
  qp_flags = IBV_QP_STATE;
  qpa.qp_state = IBV_QPS_RTR;
  if (ibv_modify_qp (txq->qp, &qpa, qp_flags) != 0)
    return clib_error_return_unix (0, "Modify QP (receive) Failed");

  memset (&qpa, 0, sizeof (qpa));
  qp_flags = IBV_QP_STATE;
  qpa.qp_state = IBV_QPS_RTS;
  if (ibv_modify_qp (txq->qp, &qpa, qp_flags) != 0)
    return clib_error_return_unix (0, "Modify QP (send) Failed");
  return 0;
}

static clib_error_t *
rdma_dev_init (vlib_main_t * vm, rdma_device_t * rd, u32 rxq_size,
	       u32 txq_size, u32 rxq_num)
{
  clib_error_t *err;
  vlib_buffer_main_t *bm = vm->buffer_main;
  vlib_thread_main_t *tm = vlib_get_thread_main ();
  u32 i;

  if (rd->ctx == 0)
    return clib_error_return_unix (0, "Device Open Failed");

  if ((rd->pd = ibv_alloc_pd (rd->ctx)) == 0)
    return clib_error_return_unix (0, "PD Alloc Failed");

  ethernet_mac_address_generate (rd->hwaddr.bytes);

  for (i = 0; i < rxq_num; i++)
    if ((err = rdma_rxq_init (vm, rd, i, rxq_size)))
      return err;
  if ((err = rdma_rxq_finalize (vm, rd)))
    return err;

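  /* one TX queue per VPP thread (main + workers), so each thread can
   * transmit on its own queue */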
  for (i = 0; i < tm->n_vlib_mains; i++)
    if ((err = rdma_txq_init (vm, rd, i, txq_size)))
      return err;

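  /* register all VPP buffer memory as a single memory region, so any vlib
   * buffer address can be used in work requests with this lkey */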
  if ((rd->mr = ibv_reg_mr (rd->pd, (void *) bm->buffer_mem_start,
			    bm->buffer_mem_size,
			    IBV_ACCESS_LOCAL_WRITE)) == 0)
    return clib_error_return_unix (0, "Register MR Failed");
  rd->lkey = rd->mr->lkey;	/* avoid indirection in datapath */

  return 0;
}

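/*
 * Resolve a sysfs device link (e.g. /sys/class/net/<ifname>/device) to the
 * underlying PCI address; returns 0 on failure.
 */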
static uword
sysfs_path_to_pci_addr (char *path, vlib_pci_addr_t * addr)
{
  uword rv;
  unformat_input_t in;
  u8 *s;

  s = clib_sysfs_link_to_name (path);
  if (!s)
    return 0;
  unformat_init_string (&in, (char *) s, strlen ((char *) s));
  rv = unformat (&in, "%U", unformat_vlib_pci_addr, addr);
  unformat_free (&in);
  vec_free (s);
  return rv;
}

void
rdma_create_if (vlib_main_t * vm, rdma_create_if_args_t * args)
{
  vnet_main_t *vnm = vnet_get_main ();
  rdma_main_t *rm = &rdma_main;
  rdma_device_t *rd;
  vlib_pci_addr_t pci_addr;
  struct ibv_device **dev_list;
  int n_devs;
  u8 *s;
  u16 qid;
  int i;

  args->rxq_size = args->rxq_size ? args->rxq_size : 2 * VLIB_FRAME_SIZE;
  args->txq_size = args->txq_size ? args->txq_size : 2 * VLIB_FRAME_SIZE;
  args->rxq_num = args->rxq_num ? args->rxq_num : 1;

  if (!is_pow2 (args->rxq_num))
    {
      args->rv = VNET_API_ERROR_INVALID_VALUE;
      args->error =
	clib_error_return (0, "rx queue number must be a power of two");
      goto err0;
    }

  if (args->rxq_size < VLIB_FRAME_SIZE || args->txq_size < VLIB_FRAME_SIZE ||
      !is_pow2 (args->rxq_size) || !is_pow2 (args->txq_size))
    {
      args->rv = VNET_API_ERROR_INVALID_VALUE;
      args->error =
	clib_error_return (0, "queue size must be a power of two >= %i",
			   VLIB_FRAME_SIZE);
      goto err0;
    }

  dev_list = ibv_get_device_list (&n_devs);
  if (!dev_list || n_devs == 0)
    {
      args->error =
	clib_error_return_unix (0,
				"no RDMA devices available. Is the ib_uverbs module loaded?");
      goto err0;
    }

  /* get PCI address */
  s = format (0, "/sys/class/net/%s/device%c", args->ifname, 0);
  if (sysfs_path_to_pci_addr ((char *) s, &pci_addr) == 0)
    {
      args->error =
	clib_error_return (0, "cannot find PCI address for device %s",
			   args->ifname);
      goto err1;
    }

  pool_get_zero (rm->devices, rd);
  rd->dev_instance = rd - rm->devices;
  rd->per_interface_next_index = VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT;
  rd->name = format (0, "%s", args->name);
  rd->linux_ifname = format (0, "%s", args->ifname);

  rd->pci = vlib_pci_get_device_info (vm, &pci_addr, &args->error);
  if (!rd->pci)
    goto err2;
  rd->pool = vlib_buffer_pool_get_default_for_numa (vm, rd->pci->numa_node);

  if (strncmp ((char *) rd->pci->driver_name, "mlx5_core", 9))
    {
      args->error =
	clib_error_return (0,
			   "invalid interface (only mlx5 supported for now)");
      goto err2;
    }

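  /* walk the verbs device list and open the one whose PCI address matches
   * the requested netdev */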
  for (i = 0; i < n_devs; i++)
    {
      vlib_pci_addr_t addr;

      vec_reset_length (s);
      s = format (s, "%s/device%c", dev_list[i]->dev_path, 0);

      if (sysfs_path_to_pci_addr ((char *) s, &addr) == 0)
	continue;

      if (addr.as_u32 != rd->pci->addr.as_u32)
	continue;

      if ((rd->ctx = ibv_open_device (dev_list[i])))
	break;
    }

  if ((args->error =
       rdma_dev_init (vm, rd, args->rxq_size, args->txq_size, args->rxq_num)))
    goto err2;

  if ((args->error = rdma_register_interface (vnm, rd)))
    goto err2;

  if ((args->error = rdma_async_event_init (rd)))
    goto err3;

  rdma_update_state (vnm, rd, 1);

  vnet_sw_interface_t *sw = vnet_get_hw_sw_interface (vnm, rd->hw_if_index);
  args->sw_if_index = rd->sw_if_index = sw->sw_if_index;
  /*
   * FIXME: add support for interrupt mode
   * vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, rd->hw_if_index);
   * hw->flags |= VNET_HW_INTERFACE_FLAG_SUPPORTS_INT_MODE;
   */
  vnet_hw_interface_set_input_node (vnm, rd->hw_if_index,
				    rdma_input_node.index);
  vec_foreach_index (qid, rd->rxqs)
    vnet_hw_interface_assign_rx_thread (vnm, rd->hw_if_index, qid, ~0);

  /* the device is open, the list itself is no longer needed */
  ibv_free_device_list (dev_list);
  vec_free (s);
  return;

err3:
  rdma_unregister_interface (vnm, rd);
err2:
  rdma_dev_cleanup (rd);
err1:
  ibv_free_device_list (dev_list);
  vec_free (s);
  args->rv = VNET_API_ERROR_INVALID_INTERFACE;
err0:
  vlib_log_err (rm->log_class, "%U", format_clib_error, args->error);
}

void
rdma_delete_if (vlib_main_t * vm, rdma_device_t * rd)
{
  rdma_async_event_cleanup (rd);
  rdma_unregister_interface (vnet_get_main (), rd);
  rdma_dev_cleanup (rd);
}

static clib_error_t *
rdma_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags)
{
  vnet_hw_interface_t *hi = vnet_get_hw_interface (vnm, hw_if_index);
  rdma_main_t *rm = &rdma_main;
  rdma_device_t *rd = vec_elt_at_index (rm->devices, hi->dev_instance);
  uword is_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0;

  if (rd->flags & RDMA_DEVICE_F_ERROR)
    return clib_error_return (0, "device is in error state");

  if (is_up)
    {
      vnet_hw_interface_set_flags (vnm, rd->hw_if_index,
				   VNET_HW_INTERFACE_FLAG_LINK_UP);
      rd->flags |= RDMA_DEVICE_F_ADMIN_UP;
    }
  else
    {
      vnet_hw_interface_set_flags (vnm, rd->hw_if_index, 0);
      rd->flags &= ~RDMA_DEVICE_F_ADMIN_UP;
    }
  return 0;
}

static void
rdma_set_interface_next_node (vnet_main_t * vnm, u32 hw_if_index,
			      u32 node_index)
{
  rdma_main_t *rm = &rdma_main;
  vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index);
  rdma_device_t *rd = pool_elt_at_index (rm->devices, hw->dev_instance);

  /* Shut off redirection */
  if (node_index == ~0)
    {
      rd->per_interface_next_index = node_index;
      return;
    }

  rd->per_interface_next_index =
    vlib_node_add_next (vlib_get_main (), rdma_input_node.index, node_index);
}

static char *rdma_tx_func_error_strings[] = {
#define _(n,s) s,
  foreach_rdma_tx_func_error
#undef _
};

/* *INDENT-OFF* */
VNET_DEVICE_CLASS (rdma_device_class) =
{
  .name = "RDMA interface",
  .format_device = format_rdma_device,
  .format_device_name = format_rdma_device_name,
  .admin_up_down_function = rdma_interface_admin_up_down,
  .rx_redirect_to_node = rdma_set_interface_next_node,
  .tx_function_n_errors = RDMA_TX_N_ERROR,
  .tx_function_error_strings = rdma_tx_func_error_strings,
  .mac_addr_change_function = rdma_mac_change,
};
/* *INDENT-ON* */

clib_error_t *
rdma_init (vlib_main_t * vm)
{
  rdma_main_t *rm = &rdma_main;

  rm->log_class = vlib_log_register_class ("rdma", 0);

  return 0;
}

VLIB_INIT_FUNCTION (rdma_init);

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */