1/*
2 * Copyright (c) 2015 Cisco and/or its affiliates.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at:
6 *
7 *     http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15#include <vnet/vnet.h>
16#include <vppinfra/vec.h>
17#include <vppinfra/error.h>
18#include <vppinfra/format.h>
19#include <vppinfra/bitmap.h>
20#include <vppinfra/linux/sysfs.h>
21#include <vlib/unix/unix.h>
22#include <vlib/log.h>
23
24#include <vnet/ethernet/ethernet.h>
25#include <dpdk/buffer.h>
26#include <dpdk/device/dpdk.h>
27#include <dpdk/cryptodev/cryptodev.h>
28#include <vlib/pci/pci.h>
29#include <vlib/vmbus/vmbus.h>
30
31#include <rte_ring.h>
32
33#include <stdio.h>
34#include <stdlib.h>
35#include <unistd.h>
36#include <sys/stat.h>
37#include <sys/mount.h>
38#include <string.h>
39#include <fcntl.h>
40#include <dirent.h>
41
42#include <dpdk/device/dpdk_priv.h>
43
/* Frame-length ceiling; dpdk_lib_init() uses it to cap the MTU programmed
 * into a port when multi-segment buffers are disabled. */
#define ETHER_MAX_LEN   1518  /**< Maximum frame len, including CRC. */

/* Global DPDK plugin state; holds, among other things, the vector of
 * devices indexed by the hardware interface's dev_instance. */
dpdk_main_t dpdk_main;
/* NOTE(review): presumably the parsed startup configuration reached through
 * dpdk_main.conf -- confirm where the pointer is wired up. */
dpdk_config_main_t dpdk_config_main;

/* NOTE(review): not referenced in the visible part of this file; presumably
 * a compile-time switch for link-state event logging -- confirm. */
#define LINK_STATE_ELOGS	0
50
51/* Port configuration, mildly modified Intel app values */
52
53static dpdk_port_type_t
54port_type_from_speed_capa (struct rte_eth_dev_info *dev_info)
55{
56
57  if (dev_info->speed_capa & ETH_LINK_SPEED_100G)
58    return VNET_DPDK_PORT_TYPE_ETH_100G;
59  else if (dev_info->speed_capa & ETH_LINK_SPEED_56G)
60    return VNET_DPDK_PORT_TYPE_ETH_56G;
61  else if (dev_info->speed_capa & ETH_LINK_SPEED_50G)
62    return VNET_DPDK_PORT_TYPE_ETH_50G;
63  else if (dev_info->speed_capa & ETH_LINK_SPEED_40G)
64    return VNET_DPDK_PORT_TYPE_ETH_40G;
65  else if (dev_info->speed_capa & ETH_LINK_SPEED_25G)
66    return VNET_DPDK_PORT_TYPE_ETH_25G;
67  else if (dev_info->speed_capa & ETH_LINK_SPEED_20G)
68    return VNET_DPDK_PORT_TYPE_ETH_20G;
69  else if (dev_info->speed_capa & ETH_LINK_SPEED_10G)
70    return VNET_DPDK_PORT_TYPE_ETH_10G;
71  else if (dev_info->speed_capa & ETH_LINK_SPEED_5G)
72    return VNET_DPDK_PORT_TYPE_ETH_5G;
73  else if (dev_info->speed_capa & ETH_LINK_SPEED_2_5G)
74    return VNET_DPDK_PORT_TYPE_ETH_2_5G;
75  else if (dev_info->speed_capa & ETH_LINK_SPEED_1G)
76    return VNET_DPDK_PORT_TYPE_ETH_1G;
77
78  return VNET_DPDK_PORT_TYPE_UNKNOWN;
79}
80
81static dpdk_port_type_t
82port_type_from_link_speed (u32 link_speed)
83{
84  switch (link_speed)
85    {
86    case ETH_SPEED_NUM_1G:
87      return VNET_DPDK_PORT_TYPE_ETH_1G;
88    case ETH_SPEED_NUM_2_5G:
89      return VNET_DPDK_PORT_TYPE_ETH_2_5G;
90    case ETH_SPEED_NUM_5G:
91      return VNET_DPDK_PORT_TYPE_ETH_5G;
92    case ETH_SPEED_NUM_10G:
93      return VNET_DPDK_PORT_TYPE_ETH_10G;
94    case ETH_SPEED_NUM_20G:
95      return VNET_DPDK_PORT_TYPE_ETH_20G;
96    case ETH_SPEED_NUM_25G:
97      return VNET_DPDK_PORT_TYPE_ETH_25G;
98    case ETH_SPEED_NUM_40G:
99      return VNET_DPDK_PORT_TYPE_ETH_40G;
100    case ETH_SPEED_NUM_50G:
101      return VNET_DPDK_PORT_TYPE_ETH_50G;
102    case ETH_SPEED_NUM_56G:
103      return VNET_DPDK_PORT_TYPE_ETH_56G;
104    case ETH_SPEED_NUM_100G:
105      return VNET_DPDK_PORT_TYPE_ETH_100G;
106    default:
107      return VNET_DPDK_PORT_TYPE_UNKNOWN;
108    }
109}
110
111static u32
112dpdk_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hi, u32 flags)
113{
114  dpdk_main_t *dm = &dpdk_main;
115  dpdk_device_t *xd = vec_elt_at_index (dm->devices, hi->dev_instance);
116  u32 old = (xd->flags & DPDK_DEVICE_FLAG_PROMISC) != 0;
117
118  switch (flags)
119    {
120    case ETHERNET_INTERFACE_FLAG_DEFAULT_L3:
121      /* set to L3/non-promisc mode */
122      xd->flags &= ~DPDK_DEVICE_FLAG_PROMISC;
123      break;
124    case ETHERNET_INTERFACE_FLAG_ACCEPT_ALL:
125      xd->flags |= DPDK_DEVICE_FLAG_PROMISC;
126      break;
127    case ETHERNET_INTERFACE_FLAG_MTU:
128      xd->port_conf.rxmode.max_rx_pkt_len = hi->max_packet_bytes;
129      dpdk_device_setup (xd);
130      return 0;
131    default:
132      return ~0;
133    }
134
135  if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP)
136    {
137      if (xd->flags & DPDK_DEVICE_FLAG_PROMISC)
138	rte_eth_promiscuous_enable (xd->port_id);
139      else
140	rte_eth_promiscuous_disable (xd->port_id);
141    }
142
143  return old;
144}
145
146static void
147dpdk_device_lock_init (dpdk_device_t * xd)
148{
149  int q;
150  vec_validate (xd->lockp, xd->tx_q_used - 1);
151  for (q = 0; q < xd->tx_q_used; q++)
152    {
153      xd->lockp[q] = clib_mem_alloc_aligned (CLIB_CACHE_LINE_BYTES,
154					     CLIB_CACHE_LINE_BYTES);
155      clib_memset ((void *) xd->lockp[q], 0, CLIB_CACHE_LINE_BYTES);
156    }
157}
158
159static int
160dpdk_port_crc_strip_enabled (dpdk_device_t * xd)
161{
162  return !(xd->port_conf.rxmode.offloads & DEV_RX_OFFLOAD_KEEP_CRC);
163}
164
165/* The function check_l3cache helps check if Level 3 cache exists or not on current CPUs
166  return value 1: exist.
167  return value 0: not exist.
168*/
169static int
170check_l3cache ()
171{
172
173  struct dirent *dp;
174  clib_error_t *err;
175  const char *sys_cache_dir = "/sys/devices/system/cpu/cpu0/cache";
176  DIR *dir_cache = opendir (sys_cache_dir);
177
178  if (dir_cache == NULL)
179    return -1;
180
181  while ((dp = readdir (dir_cache)) != NULL)
182    {
183      if (dp->d_type == DT_DIR)
184	{
185	  u8 *p = NULL;
186	  int level_cache = -1;
187
188	  p = format (p, "%s/%s/%s%c", sys_cache_dir, dp->d_name, "level", 0);
189	  if ((err = clib_sysfs_read ((char *) p, "%d", &level_cache)))
190	    clib_error_free (err);
191
192	  if (level_cache == 3)
193	    {
194	      closedir (dir_cache);
195	      return 1;
196	    }
197	}
198    }
199
200  if (dir_cache != NULL)
201    closedir (dir_cache);
202
203  return 0;
204}
205
206static void
207dpdk_enable_l4_csum_offload (dpdk_device_t * xd)
208{
209  xd->port_conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_CKSUM;
210  xd->port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM;
211  xd->flags |= DPDK_DEVICE_FLAG_TX_OFFLOAD |
212    DPDK_DEVICE_FLAG_INTEL_PHDR_CKSUM;
213}
214
215static clib_error_t *
216dpdk_lib_init (dpdk_main_t * dm)
217{
218  u32 nports;
219  u32 mtu, max_rx_frame;
220  int i;
221  clib_error_t *error;
222  vlib_main_t *vm = vlib_get_main ();
223  vlib_thread_main_t *tm = vlib_get_thread_main ();
224  vnet_device_main_t *vdm = &vnet_device_main;
225  vnet_sw_interface_t *sw;
226  vnet_hw_interface_t *hi;
227  dpdk_device_t *xd;
228  vlib_pci_addr_t last_pci_addr;
229  u32 last_pci_addr_port = 0;
230  u8 af_packet_instance_num = 0;
231  last_pci_addr.as_u32 = ~0;
232
233  nports = rte_eth_dev_count_avail ();
234
235  if (nports < 1)
236    {
237      dpdk_log_notice ("DPDK drivers found no Ethernet devices...");
238    }
239
240  if (CLIB_DEBUG > 0)
241    dpdk_log_notice ("DPDK drivers found %d ports...", nports);
242
243  if (dm->conf->enable_tcp_udp_checksum)
244    dm->buffer_flags_template &= ~(VNET_BUFFER_F_L4_CHECKSUM_CORRECT
245				   | VNET_BUFFER_F_L4_CHECKSUM_COMPUTED);
246
247  /* vlib_buffer_t template */
248  vec_validate_aligned (dm->per_thread_data, tm->n_vlib_mains - 1,
249			CLIB_CACHE_LINE_BYTES);
250  for (i = 0; i < tm->n_vlib_mains; i++)
251    {
252      dpdk_per_thread_data_t *ptd = vec_elt_at_index (dm->per_thread_data, i);
253      clib_memset (&ptd->buffer_template, 0, sizeof (vlib_buffer_t));
254      ptd->buffer_template.flags = dm->buffer_flags_template;
255      vnet_buffer (&ptd->buffer_template)->sw_if_index[VLIB_TX] = (u32) ~ 0;
256    }
257
258  /* *INDENT-OFF* */
259  RTE_ETH_FOREACH_DEV(i)
260    {
261      u8 addr[6];
262      int vlan_off;
263      struct rte_eth_dev_info dev_info;
264      struct rte_pci_device *pci_dev;
265      dpdk_portid_t next_port_id;
266      dpdk_device_config_t *devconf = 0;
267      vlib_pci_addr_t pci_addr;
268      uword *p = 0;
269
270      if (!rte_eth_dev_is_valid_port(i))
271	continue;
272
273      rte_eth_dev_info_get (i, &dev_info);
274
275      if (dev_info.device == 0)
276	{
277	  dpdk_log_notice ("DPDK bug: missing device info. Skipping %s device",
278			dev_info.driver_name);
279	  continue;
280	}
281
282      pci_dev = dpdk_get_pci_device (&dev_info);
283
284      if (pci_dev)
285	{
286	  pci_addr.domain = pci_dev->addr.domain;
287	  pci_addr.bus = pci_dev->addr.bus;
288	  pci_addr.slot = pci_dev->addr.devid;
289	  pci_addr.function = pci_dev->addr.function;
290	  p = hash_get (dm->conf->device_config_index_by_pci_addr,
291			pci_addr.as_u32);
292	}
293
294
295      /* Create vnet interface */
296      vec_add2_aligned (dm->devices, xd, 1, CLIB_CACHE_LINE_BYTES);
297      xd->nb_rx_desc = DPDK_NB_RX_DESC_DEFAULT;
298      xd->nb_tx_desc = DPDK_NB_TX_DESC_DEFAULT;
299      xd->cpu_socket = (i8) rte_eth_dev_socket_id (i);
300
301      if (p)
302	{
303	  devconf = pool_elt_at_index (dm->conf->dev_confs, p[0]);
304	  xd->name = devconf->name;
305	}
306      else
307	devconf = &dm->conf->default_devconf;
308
309      /* Handle representor devices that share the same PCI ID */
310      if (dev_info.switch_info.domain_id != RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID)
311        {
312          if (dev_info.switch_info.port_id != (uint16_t)-1)
313            xd->interface_name_suffix = format (0, "%d", dev_info.switch_info.port_id);
314        }
315      /* Handle interface naming for devices with multiple ports sharing same PCI ID */
316      else if (pci_dev &&
317	  ((next_port_id = rte_eth_find_next (i + 1)) != RTE_MAX_ETHPORTS))
318	{
319	  struct rte_eth_dev_info di = { 0 };
320	  struct rte_pci_device *next_pci_dev;
321	  rte_eth_dev_info_get (next_port_id, &di);
322	  next_pci_dev = di.device ? RTE_DEV_TO_PCI (di.device) : 0;
323	  if (next_pci_dev &&
324	      pci_addr.as_u32 != last_pci_addr.as_u32 &&
325	      memcmp (&pci_dev->addr, &next_pci_dev->addr,
326		      sizeof (struct rte_pci_addr)) == 0)
327	    {
328	      xd->interface_name_suffix = format (0, "0");
329	      last_pci_addr.as_u32 = pci_addr.as_u32;
330	      last_pci_addr_port = i;
331	    }
332	  else if (pci_addr.as_u32 == last_pci_addr.as_u32)
333	    {
334	      xd->interface_name_suffix =
335		format (0, "%u", i - last_pci_addr_port);
336	    }
337	  else
338	    {
339	      last_pci_addr.as_u32 = ~0;
340	    }
341	}
342      else
343	last_pci_addr.as_u32 = ~0;
344
345      clib_memcpy (&xd->tx_conf, &dev_info.default_txconf,
346		   sizeof (struct rte_eth_txconf));
347
348      if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM)
349	{
350	  xd->port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_IPV4_CKSUM;
351	  xd->flags |= DPDK_DEVICE_FLAG_RX_IP4_CKSUM;
352	}
353
354      if (dm->conf->enable_tcp_udp_checksum)
355	{
356	  if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM)
357	    xd->port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_UDP_CKSUM;
358	  if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)
359	    xd->port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_TCP_CKSUM;
360	}
361
362      if (dm->conf->no_multi_seg)
363	{
364	  xd->port_conf.txmode.offloads &= ~DEV_TX_OFFLOAD_MULTI_SEGS;
365	  xd->port_conf.rxmode.offloads &= ~DEV_RX_OFFLOAD_JUMBO_FRAME;
366	  xd->port_conf.rxmode.offloads &= ~DEV_RX_OFFLOAD_SCATTER;
367	}
368      else
369	{
370	  xd->port_conf.txmode.offloads |= DEV_TX_OFFLOAD_MULTI_SEGS;
371	  xd->port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_JUMBO_FRAME;
372	  xd->port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_SCATTER;
373	  xd->flags |= DPDK_DEVICE_FLAG_MAYBE_MULTISEG;
374	}
375
376      xd->tx_q_used = clib_min (dev_info.max_tx_queues, tm->n_vlib_mains);
377
378      if (devconf->num_tx_queues > 0
379	  && devconf->num_tx_queues < xd->tx_q_used)
380	xd->tx_q_used = clib_min (xd->tx_q_used, devconf->num_tx_queues);
381
382      if (devconf->num_rx_queues > 1
383	  && dev_info.max_rx_queues >= devconf->num_rx_queues)
384	{
385	  xd->rx_q_used = devconf->num_rx_queues;
386	  xd->port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
387	  if (devconf->rss_fn == 0)
388	    xd->port_conf.rx_adv_conf.rss_conf.rss_hf =
389	      ETH_RSS_IP | ETH_RSS_UDP | ETH_RSS_TCP;
390	  else
391	    {
392	      u64 unsupported_bits;
393	      xd->port_conf.rx_adv_conf.rss_conf.rss_hf = devconf->rss_fn;
394	      unsupported_bits = xd->port_conf.rx_adv_conf.rss_conf.rss_hf;
395	      unsupported_bits &= ~dev_info.flow_type_rss_offloads;
396	      if (unsupported_bits)
397		dpdk_log_warn ("Unsupported RSS hash functions: %U",
398			       format_dpdk_rss_hf_name, unsupported_bits);
399	    }
400	  xd->port_conf.rx_adv_conf.rss_conf.rss_hf &=
401	    dev_info.flow_type_rss_offloads;
402	}
403      else
404	xd->rx_q_used = 1;
405
406      xd->flags |= DPDK_DEVICE_FLAG_PMD;
407
408      /* workaround for drivers not setting driver_name */
409      if ((!dev_info.driver_name) && (pci_dev))
410	dev_info.driver_name = pci_dev->driver->driver.name;
411
412      ASSERT (dev_info.driver_name);
413
414      if (!xd->pmd)
415	{
416
417
418#define _(s,f) else if (dev_info.driver_name &&                 \
419                        !strcmp(dev_info.driver_name, s))       \
420                 xd->pmd = VNET_DPDK_PMD_##f;
421	  if (0)
422	    ;
423	  foreach_dpdk_pmd
424#undef _
425	    else
426	    xd->pmd = VNET_DPDK_PMD_UNKNOWN;
427
428	  xd->port_type = VNET_DPDK_PORT_TYPE_UNKNOWN;
429	  xd->nb_rx_desc = DPDK_NB_RX_DESC_DEFAULT;
430	  xd->nb_tx_desc = DPDK_NB_TX_DESC_DEFAULT;
431
432	  switch (xd->pmd)
433	    {
434	      /* Drivers with valid speed_capa set */
435	    case VNET_DPDK_PMD_E1000EM:
436	    case VNET_DPDK_PMD_IGB:
437	    case VNET_DPDK_PMD_IXGBE:
438	    case VNET_DPDK_PMD_I40E:
439	    case VNET_DPDK_PMD_ICE:
440	      xd->port_type = port_type_from_speed_capa (&dev_info);
441	      xd->supported_flow_actions = VNET_FLOW_ACTION_MARK |
442		VNET_FLOW_ACTION_REDIRECT_TO_NODE |
443		VNET_FLOW_ACTION_REDIRECT_TO_QUEUE |
444		VNET_FLOW_ACTION_BUFFER_ADVANCE |
445		VNET_FLOW_ACTION_COUNT | VNET_FLOW_ACTION_DROP |
446		VNET_FLOW_ACTION_RSS;
447
448	      if (dm->conf->no_tx_checksum_offload == 0)
449		{
450	          xd->port_conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_CKSUM;
451	          xd->port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM;
452		  xd->flags |=
453		    DPDK_DEVICE_FLAG_TX_OFFLOAD |
454		    DPDK_DEVICE_FLAG_INTEL_PHDR_CKSUM;
455		}
456
457
458	      break;
459	    case VNET_DPDK_PMD_CXGBE:
460	    case VNET_DPDK_PMD_MLX4:
461	    case VNET_DPDK_PMD_MLX5:
462	    case VNET_DPDK_PMD_QEDE:
463	    case VNET_DPDK_PMD_BNXT:
464	      xd->port_type = port_type_from_speed_capa (&dev_info);
465	      break;
466
467	      /* SR-IOV VFs */
468	    case VNET_DPDK_PMD_IGBVF:
469	    case VNET_DPDK_PMD_IXGBEVF:
470	    case VNET_DPDK_PMD_I40EVF:
471	      xd->port_type = VNET_DPDK_PORT_TYPE_ETH_VF;
472	      if (dm->conf->no_tx_checksum_offload == 0)
473		{
474	          xd->port_conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_CKSUM;
475	          xd->port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM;
476		  xd->flags |=
477		    DPDK_DEVICE_FLAG_TX_OFFLOAD |
478		    DPDK_DEVICE_FLAG_INTEL_PHDR_CKSUM;
479		}
480	      break;
481
482	      /* iAVF */
483	    case VNET_DPDK_PMD_IAVF:
484        xd->port_type = VNET_DPDK_PORT_TYPE_ETH_VF;
485	      xd->supported_flow_actions = VNET_FLOW_ACTION_MARK |
486		VNET_FLOW_ACTION_REDIRECT_TO_NODE |
487		VNET_FLOW_ACTION_REDIRECT_TO_QUEUE |
488		VNET_FLOW_ACTION_BUFFER_ADVANCE |
489		VNET_FLOW_ACTION_COUNT | VNET_FLOW_ACTION_DROP;
490
491	      if (dm->conf->no_tx_checksum_offload == 0)
492		{
493                  xd->port_conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_CKSUM;
494                  xd->port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM;
495		  xd->flags |=
496		    DPDK_DEVICE_FLAG_TX_OFFLOAD |
497		    DPDK_DEVICE_FLAG_INTEL_PHDR_CKSUM;
498		}
499              break;
500
501	    case VNET_DPDK_PMD_THUNDERX:
502	      xd->port_type = VNET_DPDK_PORT_TYPE_ETH_VF;
503
504	      if (dm->conf->no_tx_checksum_offload == 0)
505		{
506	          xd->port_conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_CKSUM;
507	          xd->port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM;
508		  xd->flags |= DPDK_DEVICE_FLAG_TX_OFFLOAD;
509		}
510	      break;
511
512	    case VNET_DPDK_PMD_ENA:
513	      xd->port_type = VNET_DPDK_PORT_TYPE_ETH_VF;
514	      xd->port_conf.rxmode.offloads &= ~DEV_RX_OFFLOAD_SCATTER;
515	      break;
516
517	    case VNET_DPDK_PMD_DPAA2:
518	      xd->port_type = VNET_DPDK_PORT_TYPE_ETH_10G;
519	      break;
520
521	      /* Cisco VIC */
522	    case VNET_DPDK_PMD_ENIC:
523                {
524                  struct rte_eth_link l;
525                  rte_eth_link_get_nowait (i, &l);
526                  xd->port_type = port_type_from_link_speed (l.link_speed);
527                  if (dm->conf->enable_tcp_udp_checksum)
528                    dpdk_enable_l4_csum_offload (xd);
529                }
530	      break;
531
532	      /* Intel Red Rock Canyon */
533	    case VNET_DPDK_PMD_FM10K:
534	      xd->port_type = VNET_DPDK_PORT_TYPE_ETH_SWITCH;
535	      break;
536
537	      /* virtio */
538	    case VNET_DPDK_PMD_VIRTIO:
539	      xd->port_conf.rxmode.mq_mode = ETH_MQ_RX_NONE;
540	      xd->port_type = VNET_DPDK_PORT_TYPE_ETH_1G;
541	      xd->nb_rx_desc = DPDK_NB_RX_DESC_VIRTIO;
542	      xd->nb_tx_desc = DPDK_NB_TX_DESC_VIRTIO;
543	      break;
544
545	      /* vmxnet3 */
546	    case VNET_DPDK_PMD_VMXNET3:
547	      xd->port_type = VNET_DPDK_PORT_TYPE_ETH_1G;
548	      xd->port_conf.txmode.offloads |= DEV_TX_OFFLOAD_MULTI_SEGS;
549	      break;
550
551	    case VNET_DPDK_PMD_AF_PACKET:
552	      xd->port_type = VNET_DPDK_PORT_TYPE_AF_PACKET;
553	      xd->af_packet_instance_num = af_packet_instance_num++;
554	      break;
555
556	    case VNET_DPDK_PMD_VIRTIO_USER:
557	      xd->port_type = VNET_DPDK_PORT_TYPE_VIRTIO_USER;
558	      break;
559
560	    case VNET_DPDK_PMD_VHOST_ETHER:
561	      xd->port_type = VNET_DPDK_PORT_TYPE_VHOST_ETHER;
562	      break;
563
564	    case VNET_DPDK_PMD_LIOVF_ETHER:
565	      xd->port_type = VNET_DPDK_PORT_TYPE_ETH_VF;
566	      break;
567
568	    case VNET_DPDK_PMD_FAILSAFE:
569	      xd->port_type = VNET_DPDK_PORT_TYPE_FAILSAFE;
570	      xd->port_conf.intr_conf.lsc = 1;
571	      break;
572
573	    case VNET_DPDK_PMD_NETVSC:
574                {
575                  struct rte_eth_link l;
576                  rte_eth_link_get_nowait (i, &l);
577		  xd->port_type = VNET_DPDK_PORT_TYPE_ETH_VF;
578                }
579	      break;
580
581	    default:
582	      xd->port_type = VNET_DPDK_PORT_TYPE_UNKNOWN;
583	    }
584
585	  if (devconf->num_rx_desc)
586	    xd->nb_rx_desc = devconf->num_rx_desc;
587          else {
588
589            /* If num_rx_desc is not specified by VPP user, the current CPU is working
590            with 2M page and has no L3 cache, default num_rx_desc is changed to 512
591            from original 1024 to help reduce TLB misses.
592            */
593            if ((clib_mem_get_default_hugepage_size () == 2 << 20)
594              && check_l3cache() == 0)
595              xd->nb_rx_desc = 512;
596          }
597
598	  if (devconf->num_tx_desc)
599	    xd->nb_tx_desc = devconf->num_tx_desc;
600          else {
601
602            /* If num_tx_desc is not specified by VPP user, the current CPU is working
603            with 2M page and has no L3 cache, default num_tx_desc is changed to 512
604            from original 1024 to help reduce TLB misses.
605            */
606            if ((clib_mem_get_default_hugepage_size () == 2 << 20)
607              && check_l3cache() == 0)
608              xd->nb_tx_desc = 512;
609	  }
610       }
611
612      if (xd->pmd == VNET_DPDK_PMD_AF_PACKET)
613	{
614	  f64 now = vlib_time_now (vm);
615	  u32 rnd;
616	  rnd = (u32) (now * 1e6);
617	  rnd = random_u32 (&rnd);
618	  clib_memcpy (addr + 2, &rnd, sizeof (rnd));
619	  addr[0] = 2;
620	  addr[1] = 0xfe;
621	}
622      else
623	rte_eth_macaddr_get (i, (void *) addr);
624
625      if (xd->tx_q_used < tm->n_vlib_mains)
626	dpdk_device_lock_init (xd);
627
628      xd->port_id = i;
629      xd->device_index = xd - dm->devices;
630      xd->per_interface_next_index = ~0;
631
632      /* assign interface to input thread */
633      int q;
634
635
636      error = ethernet_register_interface
637	(dm->vnet_main, dpdk_device_class.index, xd->device_index,
638	 /* ethernet address */ addr,
639	 &xd->hw_if_index, dpdk_flag_change);
640      if (error)
641	return error;
642
643      /*
644       * Ensure default mtu is not > the mtu read from the hardware.
645       * Otherwise rte_eth_dev_configure() will fail and the port will
646       * not be available.
647       * Calculate max_frame_size and mtu supported by NIC
648       */
649      if (ETHERNET_MAX_PACKET_BYTES > dev_info.max_rx_pktlen)
650	{
651	  /*
652	   * This device does not support the platforms's max frame
653	   * size. Use it's advertised mru instead.
654	   */
655	  max_rx_frame = dev_info.max_rx_pktlen;
656	  mtu = dev_info.max_rx_pktlen - sizeof (ethernet_header_t);
657	}
658      else
659	{
660	  /* VPP treats MTU and max_rx_pktlen both equal to
661	   * ETHERNET_MAX_PACKET_BYTES, if dev_info.max_rx_pktlen >=
662	   * ETHERNET_MAX_PACKET_BYTES + sizeof(ethernet_header_t)
663	   */
664	  if (dev_info.max_rx_pktlen >= (ETHERNET_MAX_PACKET_BYTES +
665					 sizeof (ethernet_header_t)))
666	    {
667	      mtu = ETHERNET_MAX_PACKET_BYTES;
668	      max_rx_frame = ETHERNET_MAX_PACKET_BYTES;
669
670	      /*
671	       * Some platforms do not account for Ethernet FCS (4 bytes) in
672	       * MTU calculations. To interop with them increase mru but only
673	       * if the device's settings can support it.
674	       */
675	      if (dpdk_port_crc_strip_enabled (xd) &&
676		  (dev_info.max_rx_pktlen >= (ETHERNET_MAX_PACKET_BYTES +
677					      sizeof (ethernet_header_t) +
678					      4)))
679		{
680		  max_rx_frame += 4;
681		}
682	    }
683	  else
684	    {
685	      max_rx_frame = ETHERNET_MAX_PACKET_BYTES;
686	      mtu = ETHERNET_MAX_PACKET_BYTES - sizeof (ethernet_header_t);
687
688	      if (dpdk_port_crc_strip_enabled (xd) &&
689		  (dev_info.max_rx_pktlen >= (ETHERNET_MAX_PACKET_BYTES + 4)))
690		{
691		  max_rx_frame += 4;
692		}
693	    }
694	}
695
696      if (xd->pmd == VNET_DPDK_PMD_FAILSAFE)
697	{
698	  /* failsafe device numerables are reported with active device only,
699	   * need to query the mtu for current device setup to overwrite
700	   * reported value.
701	   */
702	  uint16_t dev_mtu;
703	  if (!rte_eth_dev_get_mtu (i, &dev_mtu))
704	    {
705	      mtu = dev_mtu;
706	      max_rx_frame = mtu + sizeof (ethernet_header_t);
707
708	      if (dpdk_port_crc_strip_enabled (xd))
709		{
710		  max_rx_frame += 4;
711		}
712	    }
713	}
714
715      /*Set port rxmode config */
716      xd->port_conf.rxmode.max_rx_pkt_len = max_rx_frame;
717
718      sw = vnet_get_hw_sw_interface (dm->vnet_main, xd->hw_if_index);
719      xd->sw_if_index = sw->sw_if_index;
720      vnet_hw_interface_set_input_node (dm->vnet_main, xd->hw_if_index,
721					dpdk_input_node.index);
722
723      if (devconf->workers)
724	{
725	  int i;
726	  q = 0;
727	  clib_bitmap_foreach (i, devconf->workers, ({
728	    vnet_hw_interface_assign_rx_thread (dm->vnet_main, xd->hw_if_index, q++,
729					     vdm->first_worker_thread_index + i);
730	  }));
731	}
732      else
733	for (q = 0; q < xd->rx_q_used; q++)
734	  {
735	    vnet_hw_interface_assign_rx_thread (dm->vnet_main, xd->hw_if_index, q,	/* any */
736						~1);
737	  }
738
739      /*Get vnet hardware interface */
740      hi = vnet_get_hw_interface (dm->vnet_main, xd->hw_if_index);
741
742      /*Override default max_packet_bytes and max_supported_bytes set in
743       * ethernet_register_interface() above*/
744      if (hi)
745	{
746	  hi->max_packet_bytes = mtu;
747	  hi->max_supported_packet_bytes = max_rx_frame;
748	  hi->numa_node = xd->cpu_socket;
749
750	  /* Indicate ability to support L3 DMAC filtering and
751	   * initialize interface to L3 non-promisc mode */
752	  hi->flags |= VNET_HW_INTERFACE_FLAG_SUPPORTS_MAC_FILTER;
753	  ethernet_set_flags (dm->vnet_main, xd->hw_if_index,
754			     ETHERNET_INTERFACE_FLAG_DEFAULT_L3);
755	}
756
757      if (dm->conf->no_tx_checksum_offload == 0)
758	if (xd->flags & DPDK_DEVICE_FLAG_TX_OFFLOAD && hi != NULL)
759	  hi->flags |= VNET_HW_INTERFACE_FLAG_SUPPORTS_TX_L4_CKSUM_OFFLOAD;
760
761      if (devconf->tso == DPDK_DEVICE_TSO_ON && hi != NULL)
762	{
763	  /*tcp_udp checksum must be enabled*/
764	  if ((dm->conf->enable_tcp_udp_checksum) &&
765	      (hi->flags & VNET_HW_INTERFACE_FLAG_SUPPORTS_TX_L4_CKSUM_OFFLOAD))
766	    {
767		hi->flags |= VNET_HW_INTERFACE_FLAG_SUPPORTS_GSO;
768		xd->port_conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_TSO |
769		  DEV_TX_OFFLOAD_UDP_TSO;
770	    }
771	  else
772	    clib_warning ("%s: TCP/UDP checksum offload must be enabled",
773	      hi->name);
774	}
775
776      dpdk_device_setup (xd);
777
778      if (vec_len (xd->errors))
779	dpdk_log_err ("setup failed for device %U. Errors:\n  %U",
780		      format_dpdk_device_name, i,
781		      format_dpdk_device_errors, xd);
782
783      /*
784       * A note on Cisco VIC (PMD_ENIC) and VLAN:
785       *
786       * With Cisco VIC vNIC, every ingress packet is tagged. On a
787       * trunk vNIC (C series "standalone" server), packets on no VLAN
788       * are tagged with vlan 0. On an access vNIC (standalone or B
789       * series "blade" server), packets on the default/native VLAN
790       * are tagged with that vNIC's VLAN. VPP expects these packets
791       * to be untagged, and previously enabled VLAN strip on VIC by
792       * default. But it also broke vlan sub-interfaces.
793       *
794       * The VIC adapter has "untag default vlan" ingress VLAN rewrite
795       * mode, which removes tags from these packets. VPP now includes
796       * a local patch for the enic driver to use this untag mode, so
797       * enabling vlan stripping is no longer needed. In future, the
798       * driver + dpdk will have an API to set the mode after
799       * rte_eal_init. Then, this note and local patch will be
800       * removed.
801       */
802
803      /*
804       * VLAN stripping: default to VLAN strip disabled, unless specified
805       * otherwise in the startup config.
806       */
807
808      vlan_off = rte_eth_dev_get_vlan_offload (xd->port_id);
809      if (devconf->vlan_strip_offload == DPDK_DEVICE_VLAN_STRIP_ON)
810	{
811	  vlan_off |= ETH_VLAN_STRIP_OFFLOAD;
812	  if (rte_eth_dev_set_vlan_offload (xd->port_id, vlan_off) >= 0)
813	    dpdk_log_info ("VLAN strip enabled for interface\n");
814	  else
815	    dpdk_log_warn ("VLAN strip cannot be supported by interface\n");
816	  xd->port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_VLAN_STRIP;
817	}
818      else
819	{
820	  if (vlan_off & ETH_VLAN_STRIP_OFFLOAD)
821	    {
822	      vlan_off &= ~ETH_VLAN_STRIP_OFFLOAD;
823	      if (rte_eth_dev_set_vlan_offload (xd->port_id, vlan_off) >= 0)
824		dpdk_log_warn ("set VLAN offload failed\n");
825	    }
826	  xd->port_conf.rxmode.offloads &= ~DEV_RX_OFFLOAD_VLAN_STRIP;
827	}
828
829      if (hi)
830	hi->max_packet_bytes = xd->port_conf.rxmode.max_rx_pkt_len
831	  - sizeof (ethernet_header_t);
832      else
833	dpdk_log_warn ("hi NULL");
834
835      if (dm->conf->no_multi_seg)
836	mtu = mtu > ETHER_MAX_LEN ? ETHER_MAX_LEN : mtu;
837
838      rte_eth_dev_set_mtu (xd->port_id, mtu);
839    }
840  /* *INDENT-ON* */
841
842  return 0;
843}
844
845static void
846dpdk_bind_devices_to_uio (dpdk_config_main_t * conf)
847{
848  vlib_main_t *vm = vlib_get_main ();
849  clib_error_t *error;
850  u8 *pci_addr = 0;
851  int num_whitelisted = vec_len (conf->dev_confs);
852  vlib_pci_device_info_t *d = 0;
853  vlib_pci_addr_t *addr = 0, *addrs;
854  int i;
855
856  addrs = vlib_pci_get_all_dev_addrs ();
857  /* *INDENT-OFF* */
858  vec_foreach (addr, addrs)
859    {
860    dpdk_device_config_t * devconf = 0;
861    vec_reset_length (pci_addr);
862    pci_addr = format (pci_addr, "%U%c", format_vlib_pci_addr, addr, 0);
863    if (d)
864    {
865      vlib_pci_free_device_info (d);
866      d = 0;
867      }
868    d = vlib_pci_get_device_info (vm, addr, &error);
869    if (error)
870    {
871      clib_error_report (error);
872      continue;
873    }
874
875    if (d->device_class != PCI_CLASS_NETWORK_ETHERNET && d->device_class != PCI_CLASS_PROCESSOR_CO)
876      continue;
877
878    if (num_whitelisted)
879      {
880	uword * p = hash_get (conf->device_config_index_by_pci_addr, addr->as_u32);
881
882	if (!p)
883          {
884          skipped:
885            continue;
886          }
887
888	devconf = pool_elt_at_index (conf->dev_confs, p[0]);
889      }
890
891    /* Enforce Device blacklist by vendor and device */
892    for (i = 0; i < vec_len (conf->blacklist_by_pci_vendor_and_device); i++)
893      {
894        u16 vendor, device;
895        vendor = (u16)(conf->blacklist_by_pci_vendor_and_device[i] >> 16);
896        device = (u16)(conf->blacklist_by_pci_vendor_and_device[i] & 0xFFFF);
897        if (d->vendor_id == vendor && d->device_id == device)
898          {
899            /*
900             * Expected case: device isn't whitelisted,
901             * so blacklist it...
902             */
903            if (devconf == 0)
904              {
905                /* Device is blacklisted */
906                pool_get (conf->dev_confs, devconf);
907                hash_set (conf->device_config_index_by_pci_addr, addr->as_u32,
908                          devconf - conf->dev_confs);
909                devconf->pci_addr.as_u32 = addr->as_u32;
910                devconf->is_blacklisted = 1;
911                goto skipped;
912              }
913            else /* explicitly whitelisted, ignore the device blacklist  */
914              break;
915          }
916      }
917
918    /* virtio */
919    if (d->vendor_id == 0x1af4 &&
920            (d->device_id == VIRTIO_PCI_LEGACY_DEVICEID_NET ||
921             d->device_id == VIRTIO_PCI_MODERN_DEVICEID_NET))
922      ;
923    /* vmxnet3 */
924    else if (d->vendor_id == 0x15ad && d->device_id == 0x07b0)
925      {
926	/*
927	 * For vmxnet3 PCI, unless it is explicitly specified in the whitelist,
928	 * the default is to put it in the blacklist.
929	 */
930	if (devconf == 0)
931	  {
932	    pool_get (conf->dev_confs, devconf);
933	    hash_set (conf->device_config_index_by_pci_addr, addr->as_u32,
934		      devconf - conf->dev_confs);
935	    devconf->pci_addr.as_u32 = addr->as_u32;
936	    devconf->is_blacklisted = 1;
937	  }
938      }
939    /* all Intel network devices */
940    else if (d->vendor_id == 0x8086 && d->device_class == PCI_CLASS_NETWORK_ETHERNET)
941      ;
942    /* all Intel QAT devices VFs */
943    else if (d->vendor_id == 0x8086 && d->device_class == PCI_CLASS_PROCESSOR_CO &&
944        (d->device_id == 0x0443 || d->device_id == 0x18a1 || d->device_id == 0x19e3 ||
945        d->device_id == 0x37c9 || d->device_id == 0x6f55))
946      ;
947    /* Cisco VIC */
948    else if (d->vendor_id == 0x1137 &&
949        (d->device_id == 0x0043 || d->device_id == 0x0071))
950      ;
951    /* Chelsio T4/T5 */
952    else if (d->vendor_id == 0x1425 && (d->device_id & 0xe000) == 0x4000)
953      ;
954    /* Amazon Elastic Network Adapter */
955    else if (d->vendor_id == 0x1d0f && d->device_id >= 0xec20 && d->device_id <= 0xec21)
956      ;
957    /* Cavium Network Adapter */
958    else if (d->vendor_id == 0x177d && d->device_id == 0x9712)
959      ;
960    /* Cavium FastlinQ QL41000 Series */
961    else if (d->vendor_id == 0x1077 && d->device_id >= 0x8070 && d->device_id <= 0x8090)
962      ;
963    /* Mellanox CX3, CX3VF */
964    else if (d->vendor_id == 0x15b3 && d->device_id >= 0x1003 && d->device_id <= 0x1004)
965      {
966        continue;
967      }
968    /* Mellanox CX4, CX4VF, CX4LX, CX4LXVF, CX5, CX5VF, CX5EX, CX5EXVF */
969    else if (d->vendor_id == 0x15b3 && d->device_id >= 0x1013 && d->device_id <= 0x101a)
970      {
971        continue;
972      }
973    /* Mellanox CX6, CX6VF, CX6DX, CX6DXVF */
974    else if (d->vendor_id == 0x15b3 && d->device_id >= 0x101b && d->device_id <= 0x101e)
975      {
976        continue;
977      }
978    /* Broadcom NetXtreme S, and E series only */
979    else if (d->vendor_id == 0x14e4 &&
980	((d->device_id >= 0x16c0 &&
981		d->device_id != 0x16c6 && d->device_id != 0x16c7 &&
982		d->device_id != 0x16dd && d->device_id != 0x16f7 &&
983		d->device_id != 0x16fd && d->device_id != 0x16fe &&
984		d->device_id != 0x170d && d->device_id != 0x170c &&
985		d->device_id != 0x170e && d->device_id != 0x1712 &&
986		d->device_id != 0x1713) ||
987	(d->device_id == 0x1604 || d->device_id == 0x1605 ||
988	 d->device_id == 0x1614 || d->device_id == 0x1606 ||
989	 d->device_id == 0x1609 || d->device_id == 0x1614)))
990      ;
991    else
992      {
993        dpdk_log_warn ("Unsupported PCI device 0x%04x:0x%04x found "
994		      "at PCI address %s\n", (u16) d->vendor_id, (u16) d->device_id,
995		      pci_addr);
996        continue;
997      }
998
999    error = vlib_pci_bind_to_uio (vm, addr, (char *) conf->uio_driver_name);
1000
1001    if (error)
1002      {
1003	if (devconf == 0)
1004	  {
1005	    pool_get (conf->dev_confs, devconf);
1006	    hash_set (conf->device_config_index_by_pci_addr, addr->as_u32,
1007		      devconf - conf->dev_confs);
1008	    devconf->pci_addr.as_u32 = addr->as_u32;
1009	  }
1010	devconf->is_blacklisted = 1;
1011	clib_error_report (error);
1012      }
1013  }
1014  /* *INDENT-ON* */
1015  vec_free (pci_addr);
1016  vlib_pci_free_device_info (d);
1017}
1018
1019static void
1020dpdk_bind_vmbus_devices_to_uio (dpdk_config_main_t * conf)
1021{
1022  clib_error_t *error;
1023  vlib_vmbus_addr_t *addrs, *addr = 0;
1024
1025  addrs = vlib_vmbus_get_all_dev_addrs ();
1026
1027  /* *INDENT-OFF* */
1028  vec_foreach (addr, addrs)
1029    {
1030      error = vlib_vmbus_bind_to_uio (addr);
1031
1032      if (error)
1033	{
1034	  clib_error_report (error);
1035	}
1036    }
1037  /* *INDENT-ON* */
1038}
1039
1040static clib_error_t *
1041dpdk_device_config (dpdk_config_main_t * conf, vlib_pci_addr_t pci_addr,
1042		    unformat_input_t * input, u8 is_default)
1043{
1044  clib_error_t *error = 0;
1045  uword *p;
1046  dpdk_device_config_t *devconf;
1047  unformat_input_t sub_input;
1048
1049  if (is_default)
1050    {
1051      devconf = &conf->default_devconf;
1052    }
1053  else
1054    {
1055      p = hash_get (conf->device_config_index_by_pci_addr, pci_addr.as_u32);
1056
1057      if (!p)
1058	{
1059	  pool_get (conf->dev_confs, devconf);
1060	  hash_set (conf->device_config_index_by_pci_addr, pci_addr.as_u32,
1061		    devconf - conf->dev_confs);
1062	}
1063      else
1064	return clib_error_return (0,
1065				  "duplicate configuration for PCI address %U",
1066				  format_vlib_pci_addr, &pci_addr);
1067    }
1068
1069  devconf->pci_addr.as_u32 = pci_addr.as_u32;
1070  devconf->tso = DPDK_DEVICE_TSO_DEFAULT;
1071
1072  if (!input)
1073    return 0;
1074
1075  unformat_skip_white_space (input);
1076  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
1077    {
1078      if (unformat (input, "num-rx-queues %u", &devconf->num_rx_queues))
1079	;
1080      else if (unformat (input, "num-tx-queues %u", &devconf->num_tx_queues))
1081	;
1082      else if (unformat (input, "num-rx-desc %u", &devconf->num_rx_desc))
1083	;
1084      else if (unformat (input, "num-tx-desc %u", &devconf->num_tx_desc))
1085	;
1086      else if (unformat (input, "name %s", &devconf->name))
1087	;
1088      else if (unformat (input, "workers %U", unformat_bitmap_list,
1089			 &devconf->workers))
1090	;
1091      else
1092	if (unformat
1093	    (input, "rss %U", unformat_vlib_cli_sub_input, &sub_input))
1094	{
1095	  error = unformat_rss_fn (&sub_input, &devconf->rss_fn);
1096	  if (error)
1097	    break;
1098	}
1099      else if (unformat (input, "vlan-strip-offload off"))
1100	devconf->vlan_strip_offload = DPDK_DEVICE_VLAN_STRIP_OFF;
1101      else if (unformat (input, "vlan-strip-offload on"))
1102	devconf->vlan_strip_offload = DPDK_DEVICE_VLAN_STRIP_ON;
1103      else if (unformat (input, "tso on"))
1104	{
1105	  devconf->tso = DPDK_DEVICE_TSO_ON;
1106	}
1107      else if (unformat (input, "tso off"))
1108	{
1109	  devconf->tso = DPDK_DEVICE_TSO_OFF;
1110	}
1111      else if (unformat (input, "devargs %s", &devconf->devargs))
1112	;
1113      else
1114	{
1115	  error = clib_error_return (0, "unknown input `%U'",
1116				     format_unformat_error, input);
1117	  break;
1118	}
1119    }
1120
1121  if (error)
1122    return error;
1123
1124  if (devconf->workers && devconf->num_rx_queues == 0)
1125    devconf->num_rx_queues = clib_bitmap_count_set_bits (devconf->workers);
1126  else if (devconf->workers &&
1127	   clib_bitmap_count_set_bits (devconf->workers) !=
1128	   devconf->num_rx_queues)
1129    error =
1130      clib_error_return (0,
1131			 "%U: number of worker threads must be "
1132			 "equal to number of rx queues", format_vlib_pci_addr,
1133			 &pci_addr);
1134
1135  return error;
1136}
1137
1138static clib_error_t *
1139dpdk_log_read_ready (clib_file_t * uf)
1140{
1141  unformat_input_t input;
1142  u8 *line, *s = 0;
1143  int n, n_try;
1144
1145  n = n_try = 4096;
1146  while (n == n_try)
1147    {
1148      uword len = vec_len (s);
1149      vec_resize (s, len + n_try);
1150
1151      n = read (uf->file_descriptor, s + len, n_try);
1152      if (n < 0 && errno != EAGAIN)
1153	return clib_error_return_unix (0, "read");
1154      _vec_len (s) = len + (n < 0 ? 0 : n);
1155    }
1156
1157  unformat_init_vector (&input, s);
1158
1159  while (unformat_user (&input, unformat_line, &line))
1160    {
1161      dpdk_log_notice ("%v", line);
1162      vec_free (line);
1163    }
1164
1165  unformat_free (&input);
1166  return 0;
1167}
1168
1169static clib_error_t *
1170dpdk_config (vlib_main_t * vm, unformat_input_t * input)
1171{
1172  clib_error_t *error = 0;
1173  dpdk_config_main_t *conf = &dpdk_config_main;
1174  vlib_thread_main_t *tm = vlib_get_thread_main ();
1175  dpdk_device_config_t *devconf;
1176  vlib_pci_addr_t pci_addr;
1177  unformat_input_t sub_input;
1178  uword default_hugepage_sz, x;
1179  u8 *s, *tmp = 0;
1180  int ret, i;
1181  int num_whitelisted = 0;
1182  u8 no_pci = 0;
1183  u8 no_vmbus = 0;
1184  u8 file_prefix = 0;
1185  u8 *socket_mem = 0;
1186  u8 *huge_dir_path = 0;
1187  u32 vendor, device;
1188
1189  huge_dir_path =
1190    format (0, "%s/hugepages%c", vlib_unix_get_runtime_dir (), 0);
1191
1192  conf->device_config_index_by_pci_addr = hash_create (0, sizeof (uword));
1193
1194  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
1195    {
1196      /* Prime the pump */
1197      if (unformat (input, "no-hugetlb"))
1198	{
1199	  vec_add1 (conf->eal_init_args, (u8 *) "--no-huge");
1200	}
1201
1202      else if (unformat (input, "enable-tcp-udp-checksum"))
1203	conf->enable_tcp_udp_checksum = 1;
1204
1205      else if (unformat (input, "no-tx-checksum-offload"))
1206	conf->no_tx_checksum_offload = 1;
1207
1208      else if (unformat (input, "decimal-interface-names"))
1209	conf->interface_name_format_decimal = 1;
1210
1211      else if (unformat (input, "no-multi-seg"))
1212	conf->no_multi_seg = 1;
1213
1214      else if (unformat (input, "dev default %U", unformat_vlib_cli_sub_input,
1215			 &sub_input))
1216	{
1217	  error =
1218	    dpdk_device_config (conf, (vlib_pci_addr_t) (u32) ~ 1, &sub_input,
1219				1);
1220
1221	  if (error)
1222	    return error;
1223	}
1224      else
1225	if (unformat
1226	    (input, "dev %U %U", unformat_vlib_pci_addr, &pci_addr,
1227	     unformat_vlib_cli_sub_input, &sub_input))
1228	{
1229	  error = dpdk_device_config (conf, pci_addr, &sub_input, 0);
1230
1231	  if (error)
1232	    return error;
1233
1234	  num_whitelisted++;
1235	}
1236      else if (unformat (input, "dev %U", unformat_vlib_pci_addr, &pci_addr))
1237	{
1238	  error = dpdk_device_config (conf, pci_addr, 0, 0);
1239
1240	  if (error)
1241	    return error;
1242
1243	  num_whitelisted++;
1244	}
1245      else if (unformat (input, "num-mem-channels %d", &conf->nchannels))
1246	conf->nchannels_set_manually = 0;
1247      else if (unformat (input, "num-crypto-mbufs %d",
1248			 &conf->num_crypto_mbufs))
1249	;
1250      else if (unformat (input, "uio-driver %s", &conf->uio_driver_name))
1251	;
1252      else if (unformat (input, "socket-mem %s", &socket_mem))
1253	;
1254      else if (unformat (input, "no-pci"))
1255	{
1256	  no_pci = 1;
1257	  tmp = format (0, "--no-pci%c", 0);
1258	  vec_add1 (conf->eal_init_args, tmp);
1259	}
1260      else if (unformat (input, "blacklist %x:%x", &vendor, &device))
1261	{
1262	  u32 blacklist_entry;
1263	  if (vendor > 0xFFFF)
1264	    return clib_error_return (0, "blacklist PCI vendor out of range");
1265	  if (device > 0xFFFF)
1266	    return clib_error_return (0, "blacklist PCI device out of range");
1267	  blacklist_entry = (vendor << 16) | (device & 0xffff);
1268	  vec_add1 (conf->blacklist_by_pci_vendor_and_device,
1269		    blacklist_entry);
1270	}
1271      else if (unformat (input, "no-vmbus"))
1272	{
1273	  no_vmbus = 1;
1274	  tmp = format (0, "--no-vmbus%c", 0);
1275	  vec_add1 (conf->eal_init_args, tmp);
1276	}
1277
1278#define _(a)                                    \
1279      else if (unformat(input, #a))             \
1280        {                                       \
1281          tmp = format (0, "--%s%c", #a, 0);    \
1282          vec_add1 (conf->eal_init_args, tmp);    \
1283        }
1284      foreach_eal_double_hyphen_predicate_arg
1285#undef _
1286#define _(a)                                          \
1287	else if (unformat(input, #a " %s", &s))	      \
1288	  {					      \
1289            if (!strncmp(#a, "file-prefix", 11)) \
1290              file_prefix = 1;                        \
1291	    tmp = format (0, "--%s%c", #a, 0);	      \
1292	    vec_add1 (conf->eal_init_args, tmp);      \
1293	    vec_add1 (s, 0);			      \
1294            if (!strncmp(#a, "vdev", 4))              \
1295              if (strstr((char*)s, "af_packet"))      \
1296                clib_warning ("af_packet obsoleted. Use CLI 'create host-interface'."); \
1297	    vec_add1 (conf->eal_init_args, s);	      \
1298	  }
1299	foreach_eal_double_hyphen_arg
1300#undef _
1301#define _(a,b)						\
1302	  else if (unformat(input, #a " %s", &s))	\
1303	    {						\
1304	      tmp = format (0, "-%s%c", #b, 0);		\
1305	      vec_add1 (conf->eal_init_args, tmp);	\
1306	      vec_add1 (s, 0);				\
1307	      vec_add1 (conf->eal_init_args, s);	\
1308	    }
1309	foreach_eal_single_hyphen_arg
1310#undef _
1311#define _(a,b)						\
1312	    else if (unformat(input, #a " %s", &s))	\
1313	      {						\
1314		tmp = format (0, "-%s%c", #b, 0);	\
1315		vec_add1 (conf->eal_init_args, tmp);	\
1316		vec_add1 (s, 0);			\
1317		vec_add1 (conf->eal_init_args, s);	\
1318		conf->a##_set_manually = 1;		\
1319	      }
1320	foreach_eal_single_hyphen_mandatory_arg
1321#undef _
1322	else if (unformat (input, "default"))
1323	;
1324
1325      else if (unformat_skip_white_space (input))
1326	;
1327      else
1328	{
1329	  error = clib_error_return (0, "unknown input `%U'",
1330				     format_unformat_error, input);
1331	  goto done;
1332	}
1333    }
1334
1335  if (!conf->uio_driver_name)
1336    conf->uio_driver_name = format (0, "auto%c", 0);
1337
1338  default_hugepage_sz = clib_mem_get_default_hugepage_size ();
1339
1340  /* *INDENT-OFF* */
1341  clib_bitmap_foreach (x, tm->cpu_socket_bitmap, (
1342    {
1343      clib_error_t *e;
1344      uword n_pages;
1345      /* preallocate at least 16MB of hugepages per socket,
1346	 if more is needed it is up to consumer to preallocate more */
1347      n_pages = round_pow2 ((uword) 16 << 20, default_hugepage_sz);
1348      n_pages /= default_hugepage_sz;
1349
1350      if ((e = clib_sysfs_prealloc_hugepages(x, 0, n_pages)))
1351	clib_error_report (e);
1352  }));
1353  /* *INDENT-ON* */
1354
1355  if (!file_prefix)
1356    {
1357      tmp = format (0, "--file-prefix%c", 0);
1358      vec_add1 (conf->eal_init_args, tmp);
1359      tmp = format (0, "vpp%c", 0);
1360      vec_add1 (conf->eal_init_args, tmp);
1361    }
1362
1363  if (error)
1364    return error;
1365
1366  /* I'll bet that -c and -n must be the first and second args... */
1367  if (!conf->coremask_set_manually)
1368    {
1369      vlib_thread_registration_t *tr;
1370      uword *coremask = 0;
1371      int i;
1372
1373      /* main thread core */
1374      coremask = clib_bitmap_set (coremask, tm->main_lcore, 1);
1375
1376      for (i = 0; i < vec_len (tm->registrations); i++)
1377	{
1378	  tr = tm->registrations[i];
1379	  coremask = clib_bitmap_or (coremask, tr->coremask);
1380	}
1381
1382      vec_insert (conf->eal_init_args, 2, 1);
1383      conf->eal_init_args[1] = (u8 *) "-c";
1384      tmp = format (0, "%U%c", format_bitmap_hex, coremask, 0);
1385      conf->eal_init_args[2] = tmp;
1386      clib_bitmap_free (coremask);
1387    }
1388
1389  if (!conf->nchannels_set_manually)
1390    {
1391      vec_insert (conf->eal_init_args, 2, 3);
1392      conf->eal_init_args[3] = (u8 *) "-n";
1393      tmp = format (0, "%d", conf->nchannels);
1394      vec_terminate_c_string (tmp);
1395      conf->eal_init_args[4] = tmp;
1396    }
1397
1398  if (no_pci == 0 && geteuid () == 0)
1399    dpdk_bind_devices_to_uio (conf);
1400
1401  if (no_vmbus == 0 && geteuid () == 0)
1402    dpdk_bind_vmbus_devices_to_uio (conf);
1403
1404#define _(x) \
1405    if (devconf->x == 0 && conf->default_devconf.x > 0) \
1406      devconf->x = conf->default_devconf.x ;
1407
1408  /* *INDENT-OFF* */
1409  pool_foreach (devconf, conf->dev_confs, ({
1410
1411    /* default per-device config items */
1412    foreach_dpdk_device_config_item
1413
1414    /* copy vlan_strip config from default device */
1415	if (devconf->vlan_strip_offload == 0 &&
1416		conf->default_devconf.vlan_strip_offload > 0)
1417		devconf->vlan_strip_offload =
1418			conf->default_devconf.vlan_strip_offload;
1419
1420	/* copy tso config from default device */
1421	_(tso)
1422
1423	/* copy tso config from default device */
1424	_(devargs)
1425
1426    /* add DPDK EAL whitelist/blacklist entry */
1427    if (num_whitelisted > 0 && devconf->is_blacklisted == 0)
1428    {
1429	  tmp = format (0, "-w%c", 0);
1430	  vec_add1 (conf->eal_init_args, tmp);
1431	  if (devconf->devargs)
1432	  {
1433	    tmp = format (0, "%U,%s", format_vlib_pci_addr, &devconf->pci_addr, devconf->devargs, 0);
1434	  }
1435	  else
1436	  {
1437	    tmp = format (0, "%U%c", format_vlib_pci_addr, &devconf->pci_addr, 0);
1438	  }
1439	  vec_add1 (conf->eal_init_args, tmp);
1440    }
1441    else if (num_whitelisted == 0 && devconf->is_blacklisted != 0)
1442    {
1443	  tmp = format (0, "-b%c", 0);
1444	  vec_add1 (conf->eal_init_args, tmp);
1445	  tmp = format (0, "%U%c", format_vlib_pci_addr, &devconf->pci_addr, 0);
1446	  vec_add1 (conf->eal_init_args, tmp);
1447    }
1448  }));
1449  /* *INDENT-ON* */
1450
1451#undef _
1452
1453  /* set master-lcore */
1454  tmp = format (0, "--master-lcore%c", 0);
1455  vec_add1 (conf->eal_init_args, tmp);
1456  tmp = format (0, "%u%c", tm->main_lcore, 0);
1457  vec_add1 (conf->eal_init_args, tmp);
1458
1459
1460  if (socket_mem)
1461    clib_warning ("socket-mem argument is deprecated");
1462
1463  /* NULL terminate the "argv" vector, in case of stupidity */
1464  vec_add1 (conf->eal_init_args, 0);
1465  _vec_len (conf->eal_init_args) -= 1;
1466
1467  /* Set up DPDK eal and packet mbuf pool early. */
1468
1469  int log_fds[2] = { 0 };
1470  if (pipe (log_fds) == 0)
1471    {
1472      if (fcntl (log_fds[1], F_SETFL, O_NONBLOCK) == 0)
1473	{
1474	  FILE *f = fdopen (log_fds[1], "a");
1475	  if (f && rte_openlog_stream (f) == 0)
1476	    {
1477	      clib_file_t t = { 0 };
1478	      t.read_function = dpdk_log_read_ready;
1479	      t.file_descriptor = log_fds[0];
1480	      t.description = format (0, "DPDK logging pipe");
1481	      clib_file_add (&file_main, &t);
1482	    }
1483	}
1484      else
1485	{
1486	  close (log_fds[0]);
1487	  close (log_fds[1]);
1488	}
1489    }
1490
1491  vm = vlib_get_main ();
1492
1493  /* make copy of args as rte_eal_init tends to mess up with arg array */
1494  for (i = 1; i < vec_len (conf->eal_init_args); i++)
1495    conf->eal_init_args_str = format (conf->eal_init_args_str, "%s ",
1496				      conf->eal_init_args[i]);
1497
1498  vec_terminate_c_string (conf->eal_init_args_str);
1499
1500  dpdk_log_warn ("EAL init args: %s", conf->eal_init_args_str);
1501  ret = rte_eal_init (vec_len (conf->eal_init_args),
1502		      (char **) conf->eal_init_args);
1503
1504  /* lazy umount hugepages */
1505  umount2 ((char *) huge_dir_path, MNT_DETACH);
1506  rmdir ((char *) huge_dir_path);
1507  vec_free (huge_dir_path);
1508
1509  if (ret < 0)
1510    return clib_error_return (0, "rte_eal_init returned %d", ret);
1511
1512  /* main thread 1st */
1513  if ((error = dpdk_buffer_pools_create (vm)))
1514    return error;
1515
1516done:
1517  return error;
1518}
1519
/* Register dpdk_config as the config function for the "dpdk" section. */
VLIB_CONFIG_FUNCTION (dpdk_config, "dpdk");
1521
1522void
1523dpdk_update_link_state (dpdk_device_t * xd, f64 now)
1524{
1525  vnet_main_t *vnm = vnet_get_main ();
1526  struct rte_eth_link prev_link = xd->link;
1527  u32 hw_flags = 0;
1528  u8 hw_flags_chg = 0;
1529
1530  /* only update link state for PMD interfaces */
1531  if ((xd->flags & DPDK_DEVICE_FLAG_PMD) == 0)
1532    return;
1533
1534  xd->time_last_link_update = now ? now : xd->time_last_link_update;
1535  clib_memset (&xd->link, 0, sizeof (xd->link));
1536  rte_eth_link_get_nowait (xd->port_id, &xd->link);
1537
1538  if (LINK_STATE_ELOGS)
1539    {
1540      vlib_main_t *vm = vlib_get_main ();
1541      ELOG_TYPE_DECLARE (e) =
1542      {
1543      .format =
1544	  "update-link-state: sw_if_index %d, admin_up %d,"
1545	  "old link_state %d new link_state %d",.format_args = "i4i1i1i1",};
1546
1547      struct
1548      {
1549	u32 sw_if_index;
1550	u8 admin_up;
1551	u8 old_link_state;
1552	u8 new_link_state;
1553      } *ed;
1554      ed = ELOG_DATA (&vm->elog_main, e);
1555      ed->sw_if_index = xd->sw_if_index;
1556      ed->admin_up = (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) != 0;
1557      ed->old_link_state = (u8)
1558	vnet_hw_interface_is_link_up (vnm, xd->hw_if_index);
1559      ed->new_link_state = (u8) xd->link.link_status;
1560    }
1561
1562  if ((xd->link.link_duplex != prev_link.link_duplex))
1563    {
1564      hw_flags_chg = 1;
1565      switch (xd->link.link_duplex)
1566	{
1567	case ETH_LINK_HALF_DUPLEX:
1568	  hw_flags |= VNET_HW_INTERFACE_FLAG_HALF_DUPLEX;
1569	  break;
1570	case ETH_LINK_FULL_DUPLEX:
1571	  hw_flags |= VNET_HW_INTERFACE_FLAG_FULL_DUPLEX;
1572	  break;
1573	default:
1574	  break;
1575	}
1576    }
1577  if (xd->link.link_speed != prev_link.link_speed)
1578    vnet_hw_interface_set_link_speed (vnm, xd->hw_if_index,
1579				      xd->link.link_speed * 1000);
1580
1581  if (xd->link.link_status != prev_link.link_status)
1582    {
1583      hw_flags_chg = 1;
1584
1585      if (xd->link.link_status)
1586	hw_flags |= VNET_HW_INTERFACE_FLAG_LINK_UP;
1587    }
1588
1589  if (hw_flags_chg)
1590    {
1591      if (LINK_STATE_ELOGS)
1592	{
1593	  vlib_main_t *vm = vlib_get_main ();
1594
1595	  ELOG_TYPE_DECLARE (e) =
1596	  {
1597	  .format =
1598	      "update-link-state: sw_if_index %d, new flags %d",.format_args
1599	      = "i4i4",};
1600
1601	  struct
1602	  {
1603	    u32 sw_if_index;
1604	    u32 flags;
1605	  } *ed;
1606	  ed = ELOG_DATA (&vm->elog_main, e);
1607	  ed->sw_if_index = xd->sw_if_index;
1608	  ed->flags = hw_flags;
1609	}
1610      vnet_hw_interface_set_flags (vnm, xd->hw_if_index, hw_flags);
1611    }
1612}
1613
1614static uword
1615dpdk_process (vlib_main_t * vm, vlib_node_runtime_t * rt, vlib_frame_t * f)
1616{
1617  clib_error_t *error;
1618  dpdk_main_t *dm = &dpdk_main;
1619  dpdk_device_t *xd;
1620  vlib_thread_main_t *tm = vlib_get_thread_main ();
1621
1622  error = dpdk_lib_init (dm);
1623
1624  if (error)
1625    clib_error_report (error);
1626
1627  error = dpdk_cryptodev_init (vm);
1628  if (error)
1629    clib_error_report (error);
1630
1631  tm->worker_thread_release = 1;
1632
1633  f64 now = vlib_time_now (vm);
1634  vec_foreach (xd, dm->devices)
1635  {
1636    dpdk_update_link_state (xd, now);
1637  }
1638
1639  while (1)
1640    {
1641      /*
1642       * check each time through the loop in case intervals are changed
1643       */
1644      f64 min_wait = dm->link_state_poll_interval < dm->stat_poll_interval ?
1645	dm->link_state_poll_interval : dm->stat_poll_interval;
1646
1647      vlib_process_wait_for_event_or_clock (vm, min_wait);
1648
1649      if (dm->admin_up_down_in_progress)
1650	/* skip the poll if an admin up down is in progress (on any interface) */
1651	continue;
1652
1653      vec_foreach (xd, dm->devices)
1654      {
1655	f64 now = vlib_time_now (vm);
1656	if ((now - xd->time_last_stats_update) >= dm->stat_poll_interval)
1657	  dpdk_update_counters (xd, now);
1658	if ((now - xd->time_last_link_update) >= dm->link_state_poll_interval)
1659	  dpdk_update_link_state (xd, now);
1660
1661      }
1662    }
1663
1664  return 0;
1665}
1666
/*
 * Node registration: "dpdk-process" is a process-type node whose body is
 * dpdk_process ().  process_log2_n_stack_bytes = 17 presumably requests a
 * 2^17 byte stack - confirm against the vlib node documentation.
 */
/* *INDENT-OFF* */
VLIB_REGISTER_NODE (dpdk_process_node,static) = {
    .function = dpdk_process,
    .type = VLIB_NODE_TYPE_PROCESS,
    .name = "dpdk-process",
    .process_log2_n_stack_bytes = 17,
};
/* *INDENT-ON* */
1675
1676static clib_error_t *
1677dpdk_init (vlib_main_t * vm)
1678{
1679  dpdk_main_t *dm = &dpdk_main;
1680  clib_error_t *error = 0;
1681
1682  /* verify that structs are cacheline aligned */
1683  STATIC_ASSERT (offsetof (dpdk_device_t, cacheline0) == 0,
1684		 "Cache line marker must be 1st element in dpdk_device_t");
1685  STATIC_ASSERT (offsetof (dpdk_device_t, cacheline1) ==
1686		 CLIB_CACHE_LINE_BYTES,
1687		 "Data in cache line 0 is bigger than cache line size");
1688  STATIC_ASSERT (offsetof (frame_queue_trace_t, cacheline0) == 0,
1689		 "Cache line marker must be 1st element in frame_queue_trace_t");
1690  STATIC_ASSERT (RTE_CACHE_LINE_SIZE == 1 << CLIB_LOG2_CACHE_LINE_BYTES,
1691		 "DPDK RTE CACHE LINE SIZE does not match with 1<<CLIB_LOG2_CACHE_LINE_BYTES");
1692
1693  dpdk_cli_reference ();
1694
1695  dm->vlib_main = vm;
1696  dm->vnet_main = vnet_get_main ();
1697  dm->conf = &dpdk_config_main;
1698
1699  dm->conf->nchannels = 4;
1700  vec_add1 (dm->conf->eal_init_args, (u8 *) "vnet");
1701  vec_add1 (dm->conf->eal_init_args, (u8 *) "--in-memory");
1702
1703  /* Default vlib_buffer_t flags, DISABLES tcp/udp checksumming... */
1704  dm->buffer_flags_template = (VLIB_BUFFER_TOTAL_LENGTH_VALID |
1705			       VLIB_BUFFER_EXT_HDR_VALID |
1706			       VNET_BUFFER_F_L4_CHECKSUM_COMPUTED |
1707			       VNET_BUFFER_F_L4_CHECKSUM_CORRECT);
1708
1709  dm->stat_poll_interval = DPDK_STATS_POLL_INTERVAL;
1710  dm->link_state_poll_interval = DPDK_LINK_POLL_INTERVAL;
1711
1712  dm->log_default = vlib_log_register_class ("dpdk", 0);
1713
1714  return error;
1715}
1716
/* Register dpdk_init as a vlib init function. */
VLIB_INIT_FUNCTION (dpdk_init);
1718
1719/*
1720 * fd.io coding-style-patch-verification: ON
1721 *
1722 * Local Variables:
1723 * eval: (c-set-style "gnu")
1724 * End:
1725 */
1726