/*
 *------------------------------------------------------------------
 * Copyright (c) 2018 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *------------------------------------------------------------------
 */
17fe750c24SBenoît Ganne
18fe750c24SBenoît Ganne#include <unistd.h>
19fe750c24SBenoît Ganne#include <fcntl.h>
20fe750c24SBenoît Ganne#include <net/if.h>
21fe750c24SBenoît Ganne#include <linux/if_link.h>
22fe750c24SBenoît Ganne#include <linux/if_ether.h>
23fe750c24SBenoît Ganne
24fe750c24SBenoît Ganne#include <vppinfra/linux/sysfs.h>
25fe750c24SBenoît Ganne#include <vlib/vlib.h>
26fe750c24SBenoît Ganne#include <vlib/unix/unix.h>
27fe750c24SBenoît Ganne#include <vlib/pci/pci.h>
28fe750c24SBenoît Ganne#include <vnet/ethernet/ethernet.h>
29fe750c24SBenoît Ganne
30fe750c24SBenoît Ganne#include <rdma/rdma.h>
31fe750c24SBenoît Ganne
325763e47bSBenoît Ganne/* Default RSS hash key (from DPDK MLX driver) */
335763e47bSBenoît Gannestatic u8 rdma_rss_hash_key[] = {
345763e47bSBenoît Ganne  0x2c, 0xc6, 0x81, 0xd1,
355763e47bSBenoît Ganne  0x5b, 0xdb, 0xf4, 0xf7,
365763e47bSBenoît Ganne  0xfc, 0xa2, 0x83, 0x19,
375763e47bSBenoît Ganne  0xdb, 0x1a, 0x3e, 0x94,
385763e47bSBenoît Ganne  0x6b, 0x9e, 0x38, 0xd9,
395763e47bSBenoît Ganne  0x2c, 0x9c, 0x03, 0xd1,
405763e47bSBenoît Ganne  0xad, 0x99, 0x44, 0xa7,
415763e47bSBenoît Ganne  0xd9, 0x56, 0x3d, 0x59,
425763e47bSBenoît Ganne  0x06, 0x3c, 0x25, 0xf3,
435763e47bSBenoît Ganne  0xfc, 0x1f, 0xdc, 0x2a,
445763e47bSBenoît Ganne};
455763e47bSBenoît Ganne
46fe750c24SBenoît Gannerdma_main_t rdma_main;
47fe750c24SBenoît Ganne
/*
 * Log a message for device `dev` at level `lvl`, prefixed with the device
 * name.
 *
 * (dev)->name is a vector (it is printed with %v and released with
 * vec_free() elsewhere in this file), so it is passed by value and formatted
 * with %v. Passing the address of the pointer to %s printed garbage.
 */
#define rdma_log__(lvl, dev, f, ...) \
  do { \
      vlib_log((lvl), rdma_main.log_class, "%v: " f, \
               (dev)->name, ##__VA_ARGS__); \
  } while (0)

/*
 * Same as rdma_log__(), but the message is additionally prefixed with
 * strerror(errno) and the numeric errno value.
 */
#define rdma_log(lvl, dev, f, ...) \
   rdma_log__((lvl), (dev), "%s (%d): " f, strerror(errno), errno, ##__VA_ARGS__)
56f2d5cdbfSBenoît Ganne
57f2d5cdbfSBenoît Gannestatic struct ibv_flow *
58f2d5cdbfSBenoît Gannerdma_rxq_init_flow (const rdma_device_t * rd, struct ibv_qp *qp,
59f2d5cdbfSBenoît Ganne		    const mac_address_t * mac, const mac_address_t * mask,
60f2d5cdbfSBenoît Ganne		    u32 flags)
61f2d5cdbfSBenoît Ganne{
62f2d5cdbfSBenoît Ganne  struct ibv_flow *flow;
63f2d5cdbfSBenoît Ganne  struct raw_eth_flow_attr
64f2d5cdbfSBenoît Ganne  {
65f2d5cdbfSBenoît Ganne    struct ibv_flow_attr attr;
66f2d5cdbfSBenoît Ganne    struct ibv_flow_spec_eth spec_eth;
67f2d5cdbfSBenoît Ganne  } __attribute__ ((packed)) fa;
68f2d5cdbfSBenoît Ganne
69f2d5cdbfSBenoît Ganne  memset (&fa, 0, sizeof (fa));
70f2d5cdbfSBenoît Ganne  fa.attr.num_of_specs = 1;
71f2d5cdbfSBenoît Ganne  fa.attr.port = 1;
72f2d5cdbfSBenoît Ganne  fa.attr.flags = flags;
73f2d5cdbfSBenoît Ganne  fa.spec_eth.type = IBV_FLOW_SPEC_ETH;
74f2d5cdbfSBenoît Ganne  fa.spec_eth.size = sizeof (struct ibv_flow_spec_eth);
75f2d5cdbfSBenoît Ganne
76f2d5cdbfSBenoît Ganne  memcpy (fa.spec_eth.val.dst_mac, mac, sizeof (fa.spec_eth.val.dst_mac));
77f2d5cdbfSBenoît Ganne  memcpy (fa.spec_eth.mask.dst_mac, mask, sizeof (fa.spec_eth.mask.dst_mac));
78f2d5cdbfSBenoît Ganne
79f2d5cdbfSBenoît Ganne  flow = ibv_create_flow (qp, &fa.attr);
80f2d5cdbfSBenoît Ganne  if (!flow)
81f2d5cdbfSBenoît Ganne    rdma_log (VLIB_LOG_LEVEL_ERR, rd, "ibv_create_flow() failed");
82f2d5cdbfSBenoît Ganne  return flow;
83f2d5cdbfSBenoît Ganne}
84f2d5cdbfSBenoît Ganne
85f2d5cdbfSBenoît Gannestatic u32
86f2d5cdbfSBenoît Gannerdma_rxq_destroy_flow (const rdma_device_t * rd, struct ibv_flow **flow)
87f2d5cdbfSBenoît Ganne{
88f2d5cdbfSBenoît Ganne  if (!*flow)
89f2d5cdbfSBenoît Ganne    return 0;
90f2d5cdbfSBenoît Ganne
91f2d5cdbfSBenoît Ganne  if (ibv_destroy_flow (*flow))
92f2d5cdbfSBenoît Ganne    {
93f2d5cdbfSBenoît Ganne      rdma_log (VLIB_LOG_LEVEL_ERR, rd, "ibv_destroy_flow() failed");
94f2d5cdbfSBenoît Ganne      return ~0;
95f2d5cdbfSBenoît Ganne    }
96f2d5cdbfSBenoît Ganne
97f2d5cdbfSBenoît Ganne  *flow = 0;
98f2d5cdbfSBenoît Ganne  return 0;
99f2d5cdbfSBenoît Ganne}
100f2d5cdbfSBenoît Ganne
101f2d5cdbfSBenoît Gannestatic u32
102f2d5cdbfSBenoît Gannerdma_dev_set_promisc (rdma_device_t * rd)
103f2d5cdbfSBenoît Ganne{
104f2d5cdbfSBenoît Ganne  const mac_address_t all = {.bytes = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0} };
105f2d5cdbfSBenoît Ganne  int err;
106f2d5cdbfSBenoît Ganne
107f2d5cdbfSBenoît Ganne  err = rdma_rxq_destroy_flow (rd, &rd->flow_mcast);
108f2d5cdbfSBenoît Ganne  if (err)
109f2d5cdbfSBenoît Ganne    return ~0;
110f2d5cdbfSBenoît Ganne
111f2d5cdbfSBenoît Ganne  err = rdma_rxq_destroy_flow (rd, &rd->flow_ucast);
112f2d5cdbfSBenoît Ganne  if (err)
113f2d5cdbfSBenoît Ganne    return ~0;
114f2d5cdbfSBenoît Ganne
115f2d5cdbfSBenoît Ganne  rd->flow_ucast = rdma_rxq_init_flow (rd, rd->rx_qp, &all, &all, 0);
116f2d5cdbfSBenoît Ganne  if (!rd->flow_ucast)
117f2d5cdbfSBenoît Ganne    return ~0;
118f2d5cdbfSBenoît Ganne
119f2d5cdbfSBenoît Ganne  rd->flags |= RDMA_DEVICE_F_PROMISC;
120f2d5cdbfSBenoît Ganne  return 0;
121f2d5cdbfSBenoît Ganne}
122f2d5cdbfSBenoît Ganne
123f2d5cdbfSBenoît Gannestatic u32
124f2d5cdbfSBenoît Gannerdma_dev_set_ucast (rdma_device_t * rd)
125f2d5cdbfSBenoît Ganne{
126f2d5cdbfSBenoît Ganne  const mac_address_t ucast = {.bytes = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}
127f2d5cdbfSBenoît Ganne  };
128f2d5cdbfSBenoît Ganne  const mac_address_t mcast = {.bytes = {0x1, 0x0, 0x0, 0x0, 0x0, 0x0} };
129f2d5cdbfSBenoît Ganne  int err;
130f2d5cdbfSBenoît Ganne
131f2d5cdbfSBenoît Ganne  err = rdma_rxq_destroy_flow (rd, &rd->flow_mcast);
132f2d5cdbfSBenoît Ganne  if (err)
133f2d5cdbfSBenoît Ganne    return ~0;
134f2d5cdbfSBenoît Ganne
135f2d5cdbfSBenoît Ganne  err = rdma_rxq_destroy_flow (rd, &rd->flow_ucast);
136f2d5cdbfSBenoît Ganne  if (err)
137f2d5cdbfSBenoît Ganne    return ~0;
138f2d5cdbfSBenoît Ganne
139f2d5cdbfSBenoît Ganne  /* receive only packets with src = our MAC */
140f2d5cdbfSBenoît Ganne  rd->flow_ucast = rdma_rxq_init_flow (rd, rd->rx_qp, &rd->hwaddr, &ucast, 0);
141f2d5cdbfSBenoît Ganne  if (!rd->flow_ucast)
142f2d5cdbfSBenoît Ganne    return ~0;
143f2d5cdbfSBenoît Ganne
144f2d5cdbfSBenoît Ganne  /* receive multicast packets */
145f2d5cdbfSBenoît Ganne  rd->flow_mcast = rdma_rxq_init_flow (rd, rd->rx_qp, &mcast, &mcast,
146f2d5cdbfSBenoît Ganne				       IBV_FLOW_ATTR_FLAGS_DONT_TRAP
147f2d5cdbfSBenoît Ganne				       /* let others receive mcast packet too (eg. Linux) */
148f2d5cdbfSBenoît Ganne    );
149f2d5cdbfSBenoît Ganne  if (!rd->flow_mcast)
150f2d5cdbfSBenoît Ganne    return ~0;
151f2d5cdbfSBenoît Ganne
152f2d5cdbfSBenoît Ganne  rd->flags &= ~RDMA_DEVICE_F_PROMISC;
153f2d5cdbfSBenoît Ganne  return 0;
154f2d5cdbfSBenoît Ganne}
155f2d5cdbfSBenoît Ganne
1560dcafcc5SBenoît Gannestatic clib_error_t *
1570dcafcc5SBenoît Gannerdma_mac_change (vnet_hw_interface_t * hw, const u8 * old, const u8 * new)
1580dcafcc5SBenoît Ganne{
1590dcafcc5SBenoît Ganne  rdma_main_t *rm = &rdma_main;
1600dcafcc5SBenoît Ganne  rdma_device_t *rd = vec_elt_at_index (rm->devices, hw->dev_instance);
1610dcafcc5SBenoît Ganne  mac_address_from_bytes (&rd->hwaddr, new);
1620dcafcc5SBenoît Ganne  if (!(rd->flags & RDMA_DEVICE_F_PROMISC) && rdma_dev_set_ucast (rd))
1630dcafcc5SBenoît Ganne    {
1640dcafcc5SBenoît Ganne      mac_address_from_bytes (&rd->hwaddr, old);
1650dcafcc5SBenoît Ganne      return clib_error_return_unix (0, "MAC update failed");
1660dcafcc5SBenoît Ganne    }
1670dcafcc5SBenoît Ganne  return 0;
1680dcafcc5SBenoît Ganne}
1690dcafcc5SBenoît Ganne
170f2d5cdbfSBenoît Gannestatic u32
171f2d5cdbfSBenoît Gannerdma_dev_change_mtu (rdma_device_t * rd)
172f2d5cdbfSBenoît Ganne{
173f2d5cdbfSBenoît Ganne  rdma_log__ (VLIB_LOG_LEVEL_ERR, rd, "MTU change not supported");
174f2d5cdbfSBenoît Ganne  return ~0;
175f2d5cdbfSBenoît Ganne}
176fe750c24SBenoît Ganne
177fe750c24SBenoît Gannestatic u32
178fe750c24SBenoît Gannerdma_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hw, u32 flags)
179fe750c24SBenoît Ganne{
180fe750c24SBenoît Ganne  rdma_main_t *rm = &rdma_main;
181f2d5cdbfSBenoît Ganne  rdma_device_t *rd = vec_elt_at_index (rm->devices, hw->dev_instance);
182f2d5cdbfSBenoît Ganne
183f2d5cdbfSBenoît Ganne  switch (flags)
184f2d5cdbfSBenoît Ganne    {
186f2d5cdbfSBenoît Ganne      return rdma_dev_set_ucast (rd);
187f2d5cdbfSBenoît Ganne    case ETHERNET_INTERFACE_FLAG_ACCEPT_ALL:
188f2d5cdbfSBenoît Ganne      return rdma_dev_set_promisc (rd);
189f2d5cdbfSBenoît Ganne    case ETHERNET_INTERFACE_FLAG_MTU:
190f2d5cdbfSBenoît Ganne      return rdma_dev_change_mtu (rd);
191f2d5cdbfSBenoît Ganne    }
192f2d5cdbfSBenoît Ganne
193f2d5cdbfSBenoît Ganne  rdma_log__ (VLIB_LOG_LEVEL_ERR, rd, "unknown flag %x requested", flags);
194f2d5cdbfSBenoît Ganne  return ~0;
195fe750c24SBenoît Ganne}
196fe750c24SBenoît Ganne
197fe750c24SBenoît Gannestatic void
198fe750c24SBenoît Gannerdma_update_state (vnet_main_t * vnm, rdma_device_t * rd, int port)
199fe750c24SBenoît Ganne{
200fe750c24SBenoît Ganne  struct ibv_port_attr attr;
201fe750c24SBenoît Ganne  u32 width = 0;
202fe750c24SBenoît Ganne  u32 speed = 0;
203fe750c24SBenoît Ganne
204fe750c24SBenoît Ganne  if (ibv_query_port (rd->ctx, port, &attr))
205fe750c24SBenoît Ganne    {
206fe750c24SBenoît Ganne      vnet_hw_interface_set_link_speed (vnm, rd->hw_if_index, 0);
207fe750c24SBenoît Ganne      vnet_hw_interface_set_flags (vnm, rd->hw_if_index, 0);
208fe750c24SBenoît Ganne      return;
209fe750c24SBenoît Ganne    }
210fe750c24SBenoît Ganne
211fe750c24SBenoît Ganne  /* update state */
212fe750c24SBenoît Ganne  switch (attr.state)
213fe750c24SBenoît Ganne    {
214fe750c24SBenoît Ganne    case IBV_PORT_ACTIVE:	/* fallthrough */
215fe750c24SBenoît Ganne    case IBV_PORT_ACTIVE_DEFER:
216fe750c24SBenoît Ganne      rd->flags |= RDMA_DEVICE_F_LINK_UP;
217fe750c24SBenoît Ganne      vnet_hw_interface_set_flags (vnm, rd->hw_if_index,
218fe750c24SBenoît Ganne				   VNET_HW_INTERFACE_FLAG_LINK_UP);
219fe750c24SBenoît Ganne      break;
220fe750c24SBenoît Ganne    default:
221fe750c24SBenoît Ganne      rd->flags &= ~RDMA_DEVICE_F_LINK_UP;
222fe750c24SBenoît Ganne      vnet_hw_interface_set_flags (vnm, rd->hw_if_index, 0);
223fe750c24SBenoît Ganne      break;
224fe750c24SBenoît Ganne    }
225fe750c24SBenoît Ganne
226fe750c24SBenoît Ganne  /* update speed */
227fe750c24SBenoît Ganne  switch (attr.active_width)
228fe750c24SBenoît Ganne    {
229fe750c24SBenoît Ganne    case 1:
230fe750c24SBenoît Ganne      width = 1;
231fe750c24SBenoît Ganne      break;
232fe750c24SBenoît Ganne    case 2:
233fe750c24SBenoît Ganne      width = 4;
234fe750c24SBenoît Ganne      break;
235fe750c24SBenoît Ganne    case 4:
236fe750c24SBenoît Ganne      width = 8;
237fe750c24SBenoît Ganne      break;
238fe750c24SBenoît Ganne    case 8:
239fe750c24SBenoît Ganne      width = 12;
240fe750c24SBenoît Ganne      break;
241fe750c24SBenoît Ganne    }
242fe750c24SBenoît Ganne  switch (attr.active_speed)
243fe750c24SBenoît Ganne    {
244fe750c24SBenoît Ganne    case 1:
245fe750c24SBenoît Ganne      speed = 2500000;
246fe750c24SBenoît Ganne      break;
247fe750c24SBenoît Ganne    case 2:
248fe750c24SBenoît Ganne      speed = 5000000;
249fe750c24SBenoît Ganne      break;
250fe750c24SBenoît Ganne    case 4:			/* fallthrough */
251fe750c24SBenoît Ganne    case 8:
252fe750c24SBenoît Ganne      speed = 10000000;
253fe750c24SBenoît Ganne      break;
254fe750c24SBenoît Ganne    case 16:
255fe750c24SBenoît Ganne      speed = 14000000;
256fe750c24SBenoît Ganne      break;
257fe750c24SBenoît Ganne    case 32:
258fe750c24SBenoît Ganne      speed = 25000000;
259fe750c24SBenoît Ganne      break;
260fe750c24SBenoît Ganne    }
261fe750c24SBenoît Ganne  vnet_hw_interface_set_link_speed (vnm, rd->hw_if_index, width * speed);
262fe750c24SBenoît Ganne}
263fe750c24SBenoît Ganne
264fe750c24SBenoît Gannestatic clib_error_t *
265fe750c24SBenoît Gannerdma_async_event_error_ready (clib_file_t * f)
266fe750c24SBenoît Ganne{
267fe750c24SBenoît Ganne  rdma_main_t *rm = &rdma_main;
268fe750c24SBenoît Ganne  rdma_device_t *rd = vec_elt_at_index (rm->devices, f->private_data);
269e7e8bf37SBenoît Ganne  return clib_error_return (0, "RDMA: %s: async event error", rd->name);
270fe750c24SBenoît Ganne}
271fe750c24SBenoît Ganne
272fe750c24SBenoît Gannestatic clib_error_t *
273fe750c24SBenoît Gannerdma_async_event_read_ready (clib_file_t * f)
274fe750c24SBenoît Ganne{
275fe750c24SBenoît Ganne  vnet_main_t *vnm = vnet_get_main ();
276fe750c24SBenoît Ganne  rdma_main_t *rm = &rdma_main;
277fe750c24SBenoît Ganne  rdma_device_t *rd = vec_elt_at_index (rm->devices, f->private_data);
278fe750c24SBenoît Ganne  int ret;
279fe750c24SBenoît Ganne  struct ibv_async_event event;
280fe750c24SBenoît Ganne  ret = ibv_get_async_event (rd->ctx, &event);
281fe750c24SBenoît Ganne  if (ret < 0)
282f2d5cdbfSBenoît Ganne    return clib_error_return_unix (0, "ibv_get_async_event() failed");
283fe750c24SBenoît Ganne
284fe750c24SBenoît Ganne  switch (event.event_type)
285fe750c24SBenoît Ganne    {
286fe750c24SBenoît Ganne    case IBV_EVENT_PORT_ACTIVE:
287fe750c24SBenoît Ganne      rdma_update_state (vnm, rd, event.element.port_num);
288fe750c24SBenoît Ganne      break;
289fe750c24SBenoît Ganne    case IBV_EVENT_PORT_ERR:
290fe750c24SBenoît Ganne      rdma_update_state (vnm, rd, event.element.port_num);
291fe750c24SBenoît Ganne      break;
292fe750c24SBenoît Ganne    case IBV_EVENT_DEVICE_FATAL:
293fe750c24SBenoît Ganne      rd->flags &= ~RDMA_DEVICE_F_LINK_UP;
294fe750c24SBenoît Ganne      vnet_hw_interface_set_flags (vnm, rd->hw_if_index, 0);
295e7e8bf37SBenoît Ganne      vlib_log_emerg (rm->log_class, "%s: fatal error", rd->name);
296fe750c24SBenoît Ganne      break;
297fe750c24SBenoît Ganne    default:
298f2d5cdbfSBenoît Ganne      rdma_log__ (VLIB_LOG_LEVEL_ERR, rd, "unhandeld RDMA async event %i",
299f2d5cdbfSBenoît Ganne		  event.event_type);
300fe750c24SBenoît Ganne      break;
301fe750c24SBenoît Ganne    }
302fe750c24SBenoît Ganne
303fe750c24SBenoît Ganne  ibv_ack_async_event (&event);
304fe750c24SBenoît Ganne  return 0;
305fe750c24SBenoît Ganne}
306fe750c24SBenoît Ganne
307fe750c24SBenoît Gannestatic clib_error_t *
308fe750c24SBenoît Gannerdma_async_event_init (rdma_device_t * rd)
309fe750c24SBenoît Ganne{
310fe750c24SBenoît Ganne  clib_file_t t = { 0 };
311fe750c24SBenoît Ganne  int ret;
312fe750c24SBenoît Ganne
313fe750c24SBenoît Ganne  /* make RDMA async event fd non-blocking */
314fe750c24SBenoît Ganne  ret = fcntl (rd->ctx->async_fd, F_GETFL);
315fe750c24SBenoît Ganne  if (ret < 0)
316f2d5cdbfSBenoît Ganne    return clib_error_return_unix (0, "fcntl(F_GETFL) failed");
317f2d5cdbfSBenoît Ganne
318fe750c24SBenoît Ganne  ret = fcntl (rd->ctx->async_fd, F_SETFL, ret | O_NONBLOCK);
319fe750c24SBenoît Ganne  if (ret < 0)
320f2d5cdbfSBenoît Ganne    return clib_error_return_unix (0, "fcntl(F_SETFL, O_NONBLOCK) failed");
321fe750c24SBenoît Ganne
322fe750c24SBenoît Ganne  /* register RDMA async event fd */
323fe750c24SBenoît Ganne  t.read_function = rdma_async_event_read_ready;
324fe750c24SBenoît Ganne  t.file_descriptor = rd->ctx->async_fd;
325fe750c24SBenoît Ganne  t.error_function = rdma_async_event_error_ready;
326fe750c24SBenoît Ganne  t.private_data = rd->dev_instance;
3277ff07354SBenoît Ganne  t.description = format (0, "%v async event", rd->name);
328fe750c24SBenoît Ganne
329fe750c24SBenoît Ganne  rd->async_event_clib_file_index = clib_file_add (&file_main, &t);
330fe750c24SBenoît Ganne  return 0;
331fe750c24SBenoît Ganne}
332fe750c24SBenoît Ganne
333fe750c24SBenoît Gannestatic void
334fe750c24SBenoît Gannerdma_async_event_cleanup (rdma_device_t * rd)
335fe750c24SBenoît Ganne{
336fe750c24SBenoît Ganne  clib_file_del_by_index (&file_main, rd->async_event_clib_file_index);
337fe750c24SBenoît Ganne}
338fe750c24SBenoît Ganne
339fe750c24SBenoît Gannestatic clib_error_t *
340fe750c24SBenoît Gannerdma_register_interface (vnet_main_t * vnm, rdma_device_t * rd)
341fe750c24SBenoît Ganne{
3424a302ee7SJohn Lo  clib_error_t *err =
3434a302ee7SJohn Lo    ethernet_register_interface (vnm, rdma_device_class.index,
3444a302ee7SJohn Lo				 rd->dev_instance, rd->hwaddr.bytes,
3454a302ee7SJohn Lo				 &rd->hw_if_index, rdma_flag_change);
3464a302ee7SJohn Lo
3474a302ee7SJohn Lo  /* Indicate ability to support L3 DMAC filtering and
3484a302ee7SJohn Lo   * initialize interface to L3 non-promisc mode */
3494a302ee7SJohn Lo  vnet_hw_interface_t *hi = vnet_get_hw_interface (vnm, rd->hw_if_index);
3514a302ee7SJohn Lo  ethernet_set_flags (vnm, rd->hw_if_index,
3534a302ee7SJohn Lo  return err;
354fe750c24SBenoît Ganne}
355fe750c24SBenoît Ganne
356fe750c24SBenoît Gannestatic void
357fe750c24SBenoît Gannerdma_unregister_interface (vnet_main_t * vnm, rdma_device_t * rd)
358fe750c24SBenoît Ganne{
359fe750c24SBenoît Ganne  vnet_hw_interface_set_flags (vnm, rd->hw_if_index, 0);
360fe750c24SBenoît Ganne  vnet_hw_interface_unassign_rx_thread (vnm, rd->hw_if_index, 0);
361fe750c24SBenoît Ganne  ethernet_delete_interface (vnm, rd->hw_if_index);
362fe750c24SBenoît Ganne}
363fe750c24SBenoît Ganne
364fe750c24SBenoît Gannestatic void
365fe750c24SBenoît Gannerdma_dev_cleanup (rdma_device_t * rd)
366fe750c24SBenoît Ganne{
367fe750c24SBenoît Ganne  rdma_main_t *rm = &rdma_main;
368fe750c24SBenoît Ganne  rdma_rxq_t *rxq;
369fe750c24SBenoît Ganne  rdma_txq_t *txq;
370fe750c24SBenoît Ganne
371fe750c24SBenoît Ganne#define _(fn, arg) if (arg) \
372fe750c24SBenoît Ganne  { \
373fe750c24SBenoît Ganne    int rv; \
374fe750c24SBenoît Ganne    if ((rv = fn (arg))) \
375f2d5cdbfSBenoît Ganne       rdma_log (VLIB_LOG_LEVEL_DEBUG, rd, #fn "() failed (rv = %d)", rv); \
376fe750c24SBenoît Ganne  }
377fe750c24SBenoît Ganne
378fe750c24SBenoît Ganne  _(ibv_destroy_flow, rd->flow_mcast);
379fe750c24SBenoît Ganne  _(ibv_destroy_flow, rd->flow_ucast);
380fe750c24SBenoît Ganne  _(ibv_dereg_mr, rd->mr);
381fe750c24SBenoît Ganne  vec_foreach (txq, rd->txqs)
382fe750c24SBenoît Ganne  {
383fe750c24SBenoît Ganne    _(ibv_destroy_qp, txq->qp);
384fe750c24SBenoît Ganne    _(ibv_destroy_cq, txq->cq);
385fe750c24SBenoît Ganne  }
386fe750c24SBenoît Ganne  vec_foreach (rxq, rd->rxqs)
387fe750c24SBenoît Ganne  {
3885763e47bSBenoît Ganne    _(ibv_destroy_wq, rxq->wq);
389fe750c24SBenoît Ganne    _(ibv_destroy_cq, rxq->cq);
390fe750c24SBenoît Ganne  }
3915763e47bSBenoît Ganne  _(ibv_destroy_rwq_ind_table, rd->rx_rwq_ind_tbl);
3925763e47bSBenoît Ganne  _(ibv_destroy_qp, rd->rx_qp);
393fe750c24SBenoît Ganne  _(ibv_dealloc_pd, rd->pd);
394fe750c24SBenoît Ganne  _(ibv_close_device, rd->ctx);
395fe750c24SBenoît Ganne#undef _
396fe750c24SBenoît Ganne
397fe750c24SBenoît Ganne  clib_error_free (rd->error);
398fe750c24SBenoît Ganne
399fe750c24SBenoît Ganne  vec_free (rd->rxqs);
400fe750c24SBenoît Ganne  vec_free (rd->txqs);
40196d4e533SDamjan Marion  vec_free (rd->name);
402e7e8bf37SBenoît Ganne  vlib_pci_free_device_info (rd->pci);
403fe750c24SBenoît Ganne  pool_put (rm->devices, rd);
404fe750c24SBenoît Ganne}
405fe750c24SBenoît Ganne
406fe750c24SBenoît Gannestatic clib_error_t *
407fe750c24SBenoît Gannerdma_rxq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc)
408fe750c24SBenoît Ganne{
409fe750c24SBenoît Ganne  rdma_rxq_t *rxq;
4105763e47bSBenoît Ganne  struct ibv_wq_init_attr wqia;
411dd648aacSDamjan Marion  struct ibv_cq_init_attr_ex cqa = { };
4125763e47bSBenoît Ganne  struct ibv_wq_attr wqa;
413dd648aacSDamjan Marion  struct ibv_cq_ex *cqex;
414fe750c24SBenoît Ganne
415fe750c24SBenoît Ganne  vec_validate_aligned (rd->rxqs, qid, CLIB_CACHE_LINE_BYTES);
416fe750c24SBenoît Ganne  rxq = vec_elt_at_index (rd->rxqs, qid);
417fe750c24SBenoît Ganne  rxq->size = n_desc;
418e7e8bf37SBenoît Ganne  vec_validate_aligned (rxq->bufs, n_desc - 1, CLIB_CACHE_LINE_BYTES);
419fe750c24SBenoît Ganne
420dd648aacSDamjan Marion  cqa.cqe = n_desc;
421dd648aacSDamjan Marion  if (rd->flags & RDMA_DEVICE_F_MLX5DV)
422dd648aacSDamjan Marion    {
423dd648aacSDamjan Marion      struct mlx5dv_cq_init_attr dvcq = { };
424dd648aacSDamjan Marion      dvcq.comp_mask = MLX5DV_CQ_INIT_ATTR_MASK_COMPRESSED_CQE;
425dd648aacSDamjan Marion      dvcq.cqe_comp_res_format = MLX5DV_CQE_RES_FORMAT_HASH;
426dd648aacSDamjan Marion
427dd648aacSDamjan Marion      if ((cqex = mlx5dv_create_cq (rd->ctx, &cqa, &dvcq)) == 0)
428dd648aacSDamjan Marion	return clib_error_return_unix (0, "Create mlx5dv rx CQ Failed");
429dd648aacSDamjan Marion    }
430dd648aacSDamjan Marion  else
431dd648aacSDamjan Marion    {
432dd648aacSDamjan Marion      if ((cqex = ibv_create_cq_ex (rd->ctx, &cqa)) == 0)
433dd648aacSDamjan Marion	return clib_error_return_unix (0, "Create CQ Failed");
434dd648aacSDamjan Marion    }
435dd648aacSDamjan Marion
436dd648aacSDamjan Marion  rxq->cq = ibv_cq_ex_to_cq (cqex);
437fe750c24SBenoît Ganne
4385763e47bSBenoît Ganne  memset (&wqia, 0, sizeof (wqia));
4395763e47bSBenoît Ganne  wqia.wq_type = IBV_WQT_RQ;
4405763e47bSBenoît Ganne  wqia.max_wr = n_desc;
4415763e47bSBenoît Ganne  wqia.max_sge = 1;
4425763e47bSBenoît Ganne  wqia.pd = rd->pd;
4435763e47bSBenoît Ganne  wqia.cq = rxq->cq;
4445763e47bSBenoît Ganne  if ((rxq->wq = ibv_create_wq (rd->ctx, &wqia)) == 0)
4455763e47bSBenoît Ganne    return clib_error_return_unix (0, "Create WQ Failed");
4465763e47bSBenoît Ganne
4475763e47bSBenoît Ganne  memset (&wqa, 0, sizeof (wqa));
4485763e47bSBenoît Ganne  wqa.attr_mask = IBV_WQ_ATTR_STATE;
4495763e47bSBenoît Ganne  wqa.wq_state = IBV_WQS_RDY;
4505763e47bSBenoît Ganne  if (ibv_modify_wq (rxq->wq, &wqa) != 0)
4515763e47bSBenoît Ganne    return clib_error_return_unix (0, "Modify WQ (RDY) Failed");
452fe750c24SBenoît Ganne
453dd648aacSDamjan Marion  if (rd->flags & RDMA_DEVICE_F_MLX5DV)
454dd648aacSDamjan Marion    {
455dd648aacSDamjan Marion      struct mlx5dv_obj obj = { };
456dd648aacSDamjan Marion      struct mlx5dv_cq dv_cq;
457dd648aacSDamjan Marion      struct mlx5dv_rwq dv_rwq;
458dd648aacSDamjan Marion      u64 qw0;
459dd648aacSDamjan Marion
460dd648aacSDamjan Marion      obj.cq.in = rxq->cq;
461dd648aacSDamjan Marion      obj.cq.out = &dv_cq;
462dd648aacSDamjan Marion      obj.rwq.in = rxq->wq;
463dd648aacSDamjan Marion      obj.rwq.out = &dv_rwq;
464dd648aacSDamjan Marion
465dd648aacSDamjan Marion      if ((mlx5dv_init_obj (&obj, MLX5DV_OBJ_CQ | MLX5DV_OBJ_RWQ)))
466dd648aacSDamjan Marion	return clib_error_return_unix (0, "mlx5dv: failed to init rx obj");
467dd648aacSDamjan Marion
468dd648aacSDamjan Marion      if (dv_cq.cqe_size != sizeof (mlx5dv_cqe_t))
469dd648aacSDamjan Marion	return clib_error_return_unix (0, "mlx5dv: incompatible rx CQE size");
470dd648aacSDamjan Marion
471dd648aacSDamjan Marion      rxq->log2_cq_size = max_log2 (dv_cq.cqe_cnt);
472dd648aacSDamjan Marion      rxq->cqes = (mlx5dv_cqe_t *) dv_cq.buf;
473dd648aacSDamjan Marion      rxq->cq_db = (volatile u32 *) dv_cq.dbrec;
474dd648aacSDamjan Marion      rxq->cqn = dv_cq.cqn;
475dd648aacSDamjan Marion
476dd648aacSDamjan Marion      rxq->wqes = (mlx5dv_rwq_t *) dv_rwq.buf;
477dd648aacSDamjan Marion      rxq->wq_db = (volatile u32 *) dv_rwq.dbrec;
478dd648aacSDamjan Marion      rxq->wq_stride = dv_rwq.stride;
479dd648aacSDamjan Marion      rxq->wqe_cnt = dv_rwq.wqe_cnt;
480dd648aacSDamjan Marion
481dd648aacSDamjan Marion      qw0 = clib_host_to_net_u32 (vlib_buffer_get_default_data_size (vm));
482dd648aacSDamjan Marion      qw0 |= (u64) clib_host_to_net_u32 (rd->lkey) << 32;
483dd648aacSDamjan Marion
484dd648aacSDamjan Marion      for (int i = 0; i < rxq->size; i++)
485dd648aacSDamjan Marion	rxq->wqes[i].dsz_and_lkey = qw0;
486dd648aacSDamjan Marion
487dd648aacSDamjan Marion      for (int i = 0; i < (1 << rxq->log2_cq_size); i++)
488dd648aacSDamjan Marion	rxq->cqes[i].opcode_cqefmt_se_owner = 0xff;
489dd648aacSDamjan Marion    }
490dd648aacSDamjan Marion
4915763e47bSBenoît Ganne  return 0;
4925763e47bSBenoît Ganne}
493fe750c24SBenoît Ganne
4945763e47bSBenoît Gannestatic clib_error_t *
4955763e47bSBenoît Gannerdma_rxq_finalize (vlib_main_t * vm, rdma_device_t * rd)
4965763e47bSBenoît Ganne{
4975763e47bSBenoît Ganne  struct ibv_rwq_ind_table_init_attr rwqia;
4985763e47bSBenoît Ganne  struct ibv_qp_init_attr_ex qpia;
4995763e47bSBenoît Ganne  struct ibv_wq **ind_tbl;
5005763e47bSBenoît Ganne  u32 i;
501fe750c24SBenoît Ganne
5025763e47bSBenoît Ganne  ASSERT (is_pow2 (vec_len (rd->rxqs))
5035763e47bSBenoît Ganne	  && "rxq number should be a power of 2");
504fe750c24SBenoît Ganne
5055763e47bSBenoît Ganne  ind_tbl = vec_new (struct ibv_wq *, vec_len (rd->rxqs));
5065763e47bSBenoît Ganne  vec_foreach_index (i, rd->rxqs)
5075763e47bSBenoît Ganne    ind_tbl[i] = vec_elt_at_index (rd->rxqs, i)->wq;
5085763e47bSBenoît Ganne  memset (&rwqia, 0, sizeof (rwqia));
5095763e47bSBenoît Ganne  rwqia.log_ind_tbl_size = min_log2 (vec_len (ind_tbl));
5105763e47bSBenoît Ganne  rwqia.ind_tbl = ind_tbl;
5115763e47bSBenoît Ganne  if ((rd->rx_rwq_ind_tbl = ibv_create_rwq_ind_table (rd->ctx, &rwqia)) == 0)
5125763e47bSBenoît Ganne    return clib_error_return_unix (0, "RWQ indirection table create failed");
5135763e47bSBenoît Ganne  vec_free (ind_tbl);
5145763e47bSBenoît Ganne
5155763e47bSBenoît Ganne  memset (&qpia, 0, sizeof (qpia));
5165763e47bSBenoît Ganne  qpia.qp_type = IBV_QPT_RAW_PACKET;
5175763e47bSBenoît Ganne  qpia.comp_mask =
5185763e47bSBenoît Ganne    IBV_QP_INIT_ATTR_PD | IBV_QP_INIT_ATTR_IND_TABLE |
5195763e47bSBenoît Ganne    IBV_QP_INIT_ATTR_RX_HASH;
5205763e47bSBenoît Ganne  qpia.pd = rd->pd;
5215763e47bSBenoît Ganne  qpia.rwq_ind_tbl = rd->rx_rwq_ind_tbl;
5225763e47bSBenoît Ganne  STATIC_ASSERT_SIZEOF (rdma_rss_hash_key, 40);
5235763e47bSBenoît Ganne  qpia.rx_hash_conf.rx_hash_key_len = sizeof (rdma_rss_hash_key);
5245763e47bSBenoît Ganne  qpia.rx_hash_conf.rx_hash_key = rdma_rss_hash_key;
5255763e47bSBenoît Ganne  qpia.rx_hash_conf.rx_hash_function = IBV_RX_HASH_FUNC_TOEPLITZ;
5265763e47bSBenoît Ganne  qpia.rx_hash_conf.rx_hash_fields_mask =
5275763e47bSBenoît Ganne    IBV_RX_HASH_SRC_IPV4 | IBV_RX_HASH_DST_IPV4;
5285763e47bSBenoît Ganne  if ((rd->rx_qp = ibv_create_qp_ex (rd->ctx, &qpia)) == 0)
5295763e47bSBenoît Ganne    return clib_error_return_unix (0, "Queue Pair create failed");
5305763e47bSBenoît Ganne
531f2d5cdbfSBenoît Ganne  if (rdma_dev_set_ucast (rd))
532f2d5cdbfSBenoît Ganne    return clib_error_return_unix (0, "Set unicast mode failed");
533f2d5cdbfSBenoît Ganne
534f2d5cdbfSBenoît Ganne  return 0;
535fe750c24SBenoît Ganne}
536fe750c24SBenoît Ganne
537fe750c24SBenoît Gannestatic clib_error_t *
538fe750c24SBenoît Gannerdma_txq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc)
539fe750c24SBenoît Ganne{
540fe750c24SBenoît Ganne  rdma_txq_t *txq;
541fe750c24SBenoît Ganne  struct ibv_qp_init_attr qpia;
542fe750c24SBenoît Ganne  struct ibv_qp_attr qpa;
543fe750c24SBenoît Ganne  int qp_flags;
544fe750c24SBenoît Ganne
545fe750c24SBenoît Ganne  vec_validate_aligned (rd->txqs, qid, CLIB_CACHE_LINE_BYTES);
546fe750c24SBenoît Ganne  txq = vec_elt_at_index (rd->txqs, qid);
547dc812d9aSBenoît Ganne  ASSERT (is_pow2 (n_desc));
548dc812d9aSBenoît Ganne  txq->bufs_log2sz = min_log2 (n_desc);
549e7e8bf37SBenoît Ganne  vec_validate_aligned (txq->bufs, n_desc - 1, CLIB_CACHE_LINE_BYTES);
550fe750c24SBenoît Ganne
551fe750c24SBenoît Ganne  if ((txq->cq = ibv_create_cq (rd->ctx, n_desc, NULL, NULL, 0)) == 0)
552fe750c24SBenoît Ganne    return clib_error_return_unix (0, "Create CQ Failed");
553fe750c24SBenoît Ganne
554fe750c24SBenoît Ganne  memset (&qpia, 0, sizeof (qpia));
555fe750c24SBenoît Ganne  qpia.send_cq = txq->cq;
556fe750c24SBenoît Ganne  qpia.recv_cq = txq->cq;
557fe750c24SBenoît Ganne  qpia.cap.max_send_wr = n_desc;
558fe750c24SBenoît Ganne  qpia.cap.max_send_sge = 1;
559dc195d68SBenoît Ganne  qpia.qp_type = IBV_QPT_RAW_PACKET;
560fe750c24SBenoît Ganne
561fe750c24SBenoît Ganne  if ((txq->qp = ibv_create_qp (rd->pd, &qpia)) == 0)
562fe750c24SBenoît Ganne    return clib_error_return_unix (0, "Queue Pair create failed");
563fe750c24SBenoît Ganne
564fe750c24SBenoît Ganne  memset (&qpa, 0, sizeof (qpa));
565fe750c24SBenoît Ganne  qp_flags = IBV_QP_STATE | IBV_QP_PORT;
566fe750c24SBenoît Ganne  qpa.qp_state = IBV_QPS_INIT;
567fe750c24SBenoît Ganne  qpa.port_num = 1;
568fe750c24SBenoît Ganne  if (ibv_modify_qp (txq->qp, &qpa, qp_flags) != 0)
569fe750c24SBenoît Ganne    return clib_error_return_unix (0, "Modify QP (init) Failed");
570fe750c24SBenoît Ganne
571fe750c24SBenoît Ganne  memset (&qpa, 0, sizeof (qpa));
572fe750c24SBenoît Ganne  qp_flags = IBV_QP_STATE;
573fe750c24SBenoît Ganne  qpa.qp_state = IBV_QPS_RTR;
574fe750c24SBenoît Ganne  if (ibv_modify_qp (txq->qp, &qpa, qp_flags) != 0)
575fe750c24SBenoît Ganne    return clib_error_return_unix (0, "Modify QP (receive) Failed");
576fe750c24SBenoît Ganne
577fe750c24SBenoît Ganne  memset (&qpa, 0, sizeof (qpa));
578fe750c24SBenoît Ganne  qp_flags = IBV_QP_STATE;
579fe750c24SBenoît Ganne  qpa.qp_state = IBV_QPS_RTS;
580fe750c24SBenoît Ganne  if (ibv_modify_qp (txq->qp, &qpa, qp_flags) != 0)
581fe750c24SBenoît Ganne    return clib_error_return_unix (0, "Modify QP (send) Failed");
582dc812d9aSBenoît Ganne
583dc812d9aSBenoît Ganne  txq->ibv_cq = txq->cq;
584dc812d9aSBenoît Ganne  txq->ibv_qp = txq->qp;
585dc812d9aSBenoît Ganne
586dc812d9aSBenoît Ganne  if (rd->flags & RDMA_DEVICE_F_MLX5DV)
587dc812d9aSBenoît Ganne    {
588dc812d9aSBenoît Ganne      rdma_mlx5_wqe_t *tmpl = (void *) txq->dv_wqe_tmpl;
589dc812d9aSBenoît Ganne      struct mlx5dv_cq dv_cq;
590dc812d9aSBenoît Ganne      struct mlx5dv_qp dv_qp;
591dc812d9aSBenoît Ganne      struct mlx5dv_obj obj = { };
592dc812d9aSBenoît Ganne
593dc812d9aSBenoît Ganne      obj.cq.in = txq->cq;
594dc812d9aSBenoît Ganne      obj.cq.out = &dv_cq;
595dc812d9aSBenoît Ganne      obj.qp.in = txq->qp;
596dc812d9aSBenoît Ganne      obj.qp.out = &dv_qp;
597dc812d9aSBenoît Ganne
598dc812d9aSBenoît Ganne      if (mlx5dv_init_obj (&obj, MLX5DV_OBJ_CQ | MLX5DV_OBJ_QP))
599dc812d9aSBenoît Ganne	return clib_error_return_unix (0, "DV init obj failed");
600dc812d9aSBenoît Ganne
601dc812d9aSBenoît Ganne      if (RDMA_TXQ_BUF_SZ (txq) > dv_qp.sq.wqe_cnt
602dc812d9aSBenoît Ganne	  || !is_pow2 (dv_qp.sq.wqe_cnt)
603dc812d9aSBenoît Ganne	  || sizeof (rdma_mlx5_wqe_t) != dv_qp.sq.stride
604dc812d9aSBenoît Ganne	  || (uword) dv_qp.sq.buf % sizeof (rdma_mlx5_wqe_t))
605dc812d9aSBenoît Ganne	return clib_error_return (0, "Unsupported DV SQ parameters");
606dc812d9aSBenoît Ganne
607dc812d9aSBenoît Ganne      if (RDMA_TXQ_BUF_SZ (txq) > dv_cq.cqe_cnt
608dc812d9aSBenoît Ganne	  || !is_pow2 (dv_cq.cqe_cnt)
609dc812d9aSBenoît Ganne	  || sizeof (struct mlx5_cqe64) != dv_cq.cqe_size
610dc812d9aSBenoît Ganne	  || (uword) dv_cq.buf % sizeof (struct mlx5_cqe64))
611dc812d9aSBenoît Ganne	return clib_error_return (0, "Unsupported DV CQ parameters");
612dc812d9aSBenoît Ganne
613dc812d9aSBenoît Ganne      /* get SQ and doorbell addresses */
614dc812d9aSBenoît Ganne      txq->dv_sq_wqes = dv_qp.sq.buf;
615dc812d9aSBenoît Ganne      txq->dv_sq_dbrec = dv_qp.dbrec;
616dc812d9aSBenoît Ganne      txq->dv_sq_db = dv_qp.bf.reg;
617dc812d9aSBenoît Ganne      txq->dv_sq_log2sz = min_log2 (dv_qp.sq.wqe_cnt);
618dc812d9aSBenoît Ganne
619dc812d9aSBenoît Ganne      /* get CQ and doorbell addresses */
620dc812d9aSBenoît Ganne      txq->dv_cq_cqes = dv_cq.buf;
621dc812d9aSBenoît Ganne      txq->dv_cq_dbrec = dv_cq.dbrec;
622dc812d9aSBenoît Ganne      txq->dv_cq_log2sz = min_log2 (dv_cq.cqe_cnt);
623dc812d9aSBenoît Ganne
624dc812d9aSBenoît Ganne      /* init tx desc template */
625dc812d9aSBenoît Ganne      STATIC_ASSERT_SIZEOF (txq->dv_wqe_tmpl, sizeof (*tmpl));
626dc812d9aSBenoît Ganne      mlx5dv_set_ctrl_seg (&tmpl->ctrl, 0, MLX5_OPCODE_SEND, 0,
627dc812d9aSBenoît Ganne			   txq->qp->qp_num, 0, RDMA_MLX5_WQE_DS, 0,
628dc812d9aSBenoît Ganne			   RDMA_TXQ_DV_INVALID_ID);
629aaa65a12SDamjan Marion      tmpl->eseg.inline_hdr_sz = htobe16 (MLX5_ETH_L2_INLINE_HEADER_SIZE);
630dc812d9aSBenoît Ganne      mlx5dv_set_data_seg (&tmpl->dseg, 0, rd->lkey, 0);
631dc812d9aSBenoît Ganne    }
632dc812d9aSBenoît Ganne
633fe750c24SBenoît Ganne  return 0;
634fe750c24SBenoît Ganne}
635fe750c24SBenoît Ganne
636fe750c24SBenoît Gannestatic clib_error_t *
6375763e47bSBenoît Gannerdma_dev_init (vlib_main_t * vm, rdma_device_t * rd, u32 rxq_size,
6385763e47bSBenoît Ganne	       u32 txq_size, u32 rxq_num)
639fe750c24SBenoît Ganne{
640fe750c24SBenoît Ganne  clib_error_t *err;
641fe750c24SBenoît Ganne  vlib_buffer_main_t *bm = vm->buffer_main;
642fe750c24SBenoît Ganne  vlib_thread_main_t *tm = vlib_get_thread_main ();
6435763e47bSBenoît Ganne  u32 i;
644fe750c24SBenoît Ganne
645fe750c24SBenoît Ganne  if (rd->ctx == 0)
646fe750c24SBenoît Ganne    return clib_error_return_unix (0, "Device Open Failed");
647fe750c24SBenoît Ganne
648fe750c24SBenoît Ganne  if ((rd->pd = ibv_alloc_pd (rd->ctx)) == 0)
649fe750c24SBenoît Ganne    return clib_error_return_unix (0, "PD Alloc Failed");
650fe750c24SBenoît Ganne
651dc812d9aSBenoît Ganne  if ((rd->mr = ibv_reg_mr (rd->pd, (void *) bm->buffer_mem_start,
652dc812d9aSBenoît Ganne			    bm->buffer_mem_size,
653dc812d9aSBenoît Ganne			    IBV_ACCESS_LOCAL_WRITE)) == 0)
654dc812d9aSBenoît Ganne    return clib_error_return_unix (0, "Register MR Failed");
655dc812d9aSBenoît Ganne
656dc812d9aSBenoît Ganne  rd->lkey = rd->mr->lkey;	/* avoid indirection in datapath */
657dc812d9aSBenoît Ganne
6585763e47bSBenoît Ganne  ethernet_mac_address_generate (rd->hwaddr.bytes);
6595763e47bSBenoît Ganne
660dd648aacSDamjan Marion  if ((rd->mr = ibv_reg_mr (rd->pd, (void *) bm->buffer_mem_start,
661dd648aacSDamjan Marion			    bm->buffer_mem_size,
662dd648aacSDamjan Marion			    IBV_ACCESS_LOCAL_WRITE)) == 0)
663dd648aacSDamjan Marion    return clib_error_return_unix (0, "Register MR Failed");
664dd648aacSDamjan Marion  rd->lkey = rd->mr->lkey;	/* avoid indirection in datapath */
665dd648aacSDamjan Marion
666df213385SBenoît Ganne  /*
667df213385SBenoît Ganne   * /!\ WARNING /!\ creation order is important
668df213385SBenoît Ganne   * We *must* create TX queues *before* RX queues, otherwise we will receive
669df213385SBenoît Ganne   * the broacast packets we sent
670df213385SBenoît Ganne   */
671df213385SBenoît Ganne  for (i = 0; i < tm->n_vlib_mains; i++)
672df213385SBenoît Ganne    if ((err = rdma_txq_init (vm, rd, i, txq_size)))
673df213385SBenoît Ganne      return err;
674df213385SBenoît Ganne
6755763e47bSBenoît Ganne  for (i = 0; i < rxq_num; i++)
6765763e47bSBenoît Ganne    if ((err = rdma_rxq_init (vm, rd, i, rxq_size)))
6775763e47bSBenoît Ganne      return err;
6785763e47bSBenoît Ganne  if ((err = rdma_rxq_finalize (vm, rd)))
679fe750c24SBenoît Ganne    return err;
680fe750c24SBenoît Ganne
681fe750c24SBenoît Ganne  return 0;
682fe750c24SBenoît Ganne}
683fe750c24SBenoît Ganne
684fe750c24SBenoît Gannestatic uword
685fe750c24SBenoît Gannesysfs_path_to_pci_addr (char *path, vlib_pci_addr_t * addr)
686fe750c24SBenoît Ganne{
687fe750c24SBenoît Ganne  uword rv;
688fe750c24SBenoît Ganne  unformat_input_t