vhost_user_input.c revision 7331005c
/*
 *------------------------------------------------------------------
 * vhost-user-input
 *
 * Copyright (c) 2014-2018 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *------------------------------------------------------------------
 */

#include <fcntl.h>		/* for open */
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/uio.h>		/* for iovec */
#include <netinet/in.h>
#include <sys/vfs.h>

#include <linux/if_arp.h>
#include <linux/if_tun.h>

#include <vlib/vlib.h>
#include <vlib/unix/unix.h>

#include <vnet/ip/ip.h>

#include <vnet/ethernet/ethernet.h>
#include <vnet/devices/devices.h>
#include <vnet/feature/feature.h>

#include <vnet/devices/virtio/virtio.h>
#include <vnet/devices/virtio/vhost_user.h>
#include <vnet/devices/virtio/vhost_user_inline.h>

/*
 * When an RX queue is down but active, received packets
 * must be discarded. This value controls up to how many
 * packets will be discarded during each round.
 */
#define VHOST_USER_DOWN_DISCARD_COUNT 256

/*
 * When the number of available buffers gets under this threshold,
 * RX node will start discarding packets.
 */
#define VHOST_USER_RX_BUFFER_STARVATION 32

/*
 * On the receive side, the host should free descriptors as soon
 * as possible in order to avoid TX drop in the VM.
 * This value controls the number of copy operations that are stacked
 * before copy is done for all and descriptors are given back to
 * the guest.
 * The value 64 was obtained by testing (48 and 128 were not as good).
 */
#define VHOST_USER_RX_COPY_THRESHOLD 64

extern vlib_node_registration_t vhost_user_input_node;

#define foreach_vhost_user_input_func_error      \
  _(NO_ERROR, "no error")  \
  _(NO_BUFFER, "no available buffer")  \
  _(MMAP_FAIL, "mmap failure")  \
  _(INDIRECT_OVERFLOW, "indirect descriptor overflows table")  \
  _(UNDERSIZED_FRAME, "undersized ethernet frame received (< 14 bytes)") \
  _(FULL_RX_QUEUE, "full rx queue (possible driver tx drop)")

typedef enum
{
#define _(f,s) VHOST_USER_INPUT_FUNC_ERROR_##f,
  foreach_vhost_user_input_func_error
#undef _
    VHOST_USER_INPUT_FUNC_N_ERROR,
} vhost_user_input_func_error_t;

static __clib_unused char *vhost_user_input_func_error_strings[] = {
#define _(n,s) s,
  foreach_vhost_user_input_func_error
#undef _
};

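/*
 * Record per-packet trace data: which descriptor the packet came from,
 * whether the chain was simple, chained or indirect, and a copy of the
 * virtio-net header when it can be mapped.
 */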
static_always_inline void
vhost_user_rx_trace (vhost_trace_t * t,
		     vhost_user_intf_t * vui, u16 qid,
		     vlib_buffer_t * b, vhost_user_vring_t * txvq,
		     u16 last_avail_idx)
{
  vhost_user_main_t *vum = &vhost_user_main;
  u32 desc_current = txvq->avail->ring[last_avail_idx & txvq->qsz_mask];
  vring_desc_t *hdr_desc = 0;
  virtio_net_hdr_mrg_rxbuf_t *hdr;
  u32 hint = 0;

  clib_memset (t, 0, sizeof (*t));
  t->device_index = vui - vum->vhost_user_interfaces;
  t->qid = qid;

  hdr_desc = &txvq->desc[desc_current];
  if (txvq->desc[desc_current].flags & VIRTQ_DESC_F_INDIRECT)
    {
      t->virtio_ring_flags |= 1 << VIRTIO_TRACE_F_INDIRECT;
      /* Header is the first here */
      hdr_desc = map_guest_mem (vui, txvq->desc[desc_current].addr, &hint);
    }
  if (txvq->desc[desc_current].flags & VIRTQ_DESC_F_NEXT)
    {
      t->virtio_ring_flags |= 1 << VIRTIO_TRACE_F_SIMPLE_CHAINED;
    }
  if (!(txvq->desc[desc_current].flags & VIRTQ_DESC_F_NEXT) &&
      !(txvq->desc[desc_current].flags & VIRTQ_DESC_F_INDIRECT))
    {
      t->virtio_ring_flags |= 1 << VIRTIO_TRACE_F_SINGLE_DESC;
    }

  t->first_desc_len = hdr_desc ? hdr_desc->len : 0;

  if (!hdr_desc || !(hdr = map_guest_mem (vui, hdr_desc->addr, &hint)))
    {
      t->virtio_ring_flags |= 1 << VIRTIO_TRACE_F_MAP_ERROR;
    }
  else
    {
      u32 len = vui->virtio_net_hdr_sz;
      memcpy (&t->hdr, hdr, len > hdr_desc->len ? hdr_desc->len : len);
    }
}

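/*
 * Execute the copy orders accumulated in cpy[]. Guest source addresses
 * are mapped two entries ahead and prefetched so the memcpy of the
 * current pair overlaps with the mapping of the next one.
 * Returns 0 on success, 1 if any guest address could not be mapped.
 */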
static_always_inline u32
vhost_user_input_copy (vhost_user_intf_t * vui, vhost_copy_t * cpy,
		       u16 copy_len, u32 * map_hint)
{
  void *src0, *src1, *src2, *src3;
  if (PREDICT_TRUE (copy_len >= 4))
    {
      if (PREDICT_FALSE (!(src2 = map_guest_mem (vui, cpy[0].src, map_hint))))
	return 1;
      if (PREDICT_FALSE (!(src3 = map_guest_mem (vui, cpy[1].src, map_hint))))
	return 1;

      while (PREDICT_TRUE (copy_len >= 4))
	{
	  src0 = src2;
	  src1 = src3;

	  if (PREDICT_FALSE
	      (!(src2 = map_guest_mem (vui, cpy[2].src, map_hint))))
	    return 1;
	  if (PREDICT_FALSE
	      (!(src3 = map_guest_mem (vui, cpy[3].src, map_hint))))
	    return 1;

	  CLIB_PREFETCH (src2, 64, LOAD);
	  CLIB_PREFETCH (src3, 64, LOAD);

	  clib_memcpy_fast ((void *) cpy[0].dst, src0, cpy[0].len);
	  clib_memcpy_fast ((void *) cpy[1].dst, src1, cpy[1].len);
	  copy_len -= 2;
	  cpy += 2;
	}
    }
  while (copy_len)
    {
      if (PREDICT_FALSE (!(src0 = map_guest_mem (vui, cpy->src, map_hint))))
	return 1;
      clib_memcpy_fast ((void *) cpy->dst, src0, cpy->len);
      copy_len -= 1;
      cpy += 1;
    }
  return 0;
}

/**
 * Try to discard packets from the tx ring (VPP RX path).
 * Returns the number of discarded packets.
 */
static_always_inline u32
vhost_user_rx_discard_packet (vlib_main_t * vm,
			      vhost_user_intf_t * vui,
			      vhost_user_vring_t * txvq, u32 discard_max)
{
  /*
   * On the RX side, each packet corresponds to one descriptor
   * (it is the same whether it is a simple, chained, or indirect descriptor).
   * Therefore, discarding a packet is like discarding a descriptor.
   */
  u32 discarded_packets = 0;
  u32 avail_idx = txvq->avail->idx;
  u16 mask = txvq->qsz_mask;
  u16 last_avail_idx = txvq->last_avail_idx;
  u16 last_used_idx = txvq->last_used_idx;
  while (discarded_packets != discard_max)
    {
      if (avail_idx == last_avail_idx)
	goto out;

      u16 desc_chain_head = txvq->avail->ring[last_avail_idx & mask];
      last_avail_idx++;
      txvq->used->ring[last_used_idx & mask].id = desc_chain_head;
      txvq->used->ring[last_used_idx & mask].len = 0;
      vhost_user_log_dirty_ring (vui, txvq, ring[last_used_idx & mask]);
      last_used_idx++;
      discarded_packets++;
    }

out:
  txvq->last_avail_idx = last_avail_idx;
  txvq->last_used_idx = last_used_idx;
  CLIB_MEMORY_STORE_BARRIER ();
  txvq->used->idx = txvq->last_used_idx;
  vhost_user_log_dirty_ring (vui, txvq, idx);
  return discarded_packets;
}

/*
 * In case of overflow, we need to rewind the array of allocated buffers.
 */
static_always_inline void
vhost_user_input_rewind_buffers (vlib_main_t * vm,
				 vhost_cpu_t * cpu, vlib_buffer_t * b_head)
{
  u32 bi_current = cpu->rx_buffers[cpu->rx_buffers_len];
  vlib_buffer_t *b_current = vlib_get_buffer (vm, bi_current);
  b_current->current_length = 0;
  b_current->flags = 0;
  while (b_current != b_head)
    {
      cpu->rx_buffers_len++;
      bi_current = cpu->rx_buffers[cpu->rx_buffers_len];
      b_current = vlib_get_buffer (vm, bi_current);
      b_current->current_length = 0;
      b_current->flags = 0;
    }
  cpu->rx_buffers_len++;
}

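/*
 * Use the virtio-net header supplied by the guest to fill in checksum
 * offload offsets and GSO metadata on the head vlib buffer.
 */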
static_always_inline void
vhost_user_handle_rx_offload (vlib_buffer_t * b0, u8 * b0_data,
			      virtio_net_hdr_t * hdr)
{
  u8 l4_hdr_sz = 0;

  if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)
    {
      u8 l4_proto = 0;
      ethernet_header_t *eh = (ethernet_header_t *) b0_data;
      u16 ethertype = clib_net_to_host_u16 (eh->type);
      u16 l2hdr_sz = sizeof (ethernet_header_t);

      if (ethernet_frame_is_tagged (ethertype))
	{
	  ethernet_vlan_header_t *vlan = (ethernet_vlan_header_t *) (eh + 1);

	  ethertype = clib_net_to_host_u16 (vlan->type);
	  l2hdr_sz += sizeof (*vlan);
	  if (ethertype == ETHERNET_TYPE_VLAN)
	    {
	      vlan++;
	      ethertype = clib_net_to_host_u16 (vlan->type);
	      l2hdr_sz += sizeof (*vlan);
	    }
	}
      vnet_buffer (b0)->l2_hdr_offset = 0;
      vnet_buffer (b0)->l3_hdr_offset = l2hdr_sz;
      vnet_buffer (b0)->l4_hdr_offset = hdr->csum_start;
      b0->flags |= (VNET_BUFFER_F_L2_HDR_OFFSET_VALID |
		    VNET_BUFFER_F_L3_HDR_OFFSET_VALID |
		    VNET_BUFFER_F_L4_HDR_OFFSET_VALID |
		    VNET_BUFFER_F_OFFLOAD_IP_CKSUM);

      if (PREDICT_TRUE (ethertype == ETHERNET_TYPE_IP4))
	{
	  ip4_header_t *ip4 = (ip4_header_t *) (b0_data + l2hdr_sz);
	  l4_proto = ip4->protocol;
	  b0->flags |= VNET_BUFFER_F_IS_IP4;
	}
      else if (PREDICT_TRUE (ethertype == ETHERNET_TYPE_IP6))
	{
	  ip6_header_t *ip6 = (ip6_header_t *) (b0_data + l2hdr_sz);
	  l4_proto = ip6->protocol;
	  b0->flags |= VNET_BUFFER_F_IS_IP6;
	}

      if (l4_proto == IP_PROTOCOL_TCP)
	{
	  tcp_header_t *tcp = (tcp_header_t *)
	    (b0_data + vnet_buffer (b0)->l4_hdr_offset);
	  l4_hdr_sz = tcp_header_bytes (tcp);
	  tcp->checksum = 0;
	  b0->flags |= VNET_BUFFER_F_OFFLOAD_TCP_CKSUM;
	}
      else if (l4_proto == IP_PROTOCOL_UDP)
	{
	  udp_header_t *udp =
	    (udp_header_t *) (b0_data + vnet_buffer (b0)->l4_hdr_offset);
	  l4_hdr_sz = sizeof (*udp);
	  udp->checksum = 0;
	  b0->flags |= VNET_BUFFER_F_OFFLOAD_UDP_CKSUM;
	}
    }

  if (hdr->gso_type == VIRTIO_NET_HDR_GSO_UDP)
    {
      vnet_buffer2 (b0)->gso_size = hdr->gso_size;
      vnet_buffer2 (b0)->gso_l4_hdr_sz = l4_hdr_sz;
      b0->flags |= VNET_BUFFER_F_GSO;
    }
  else if (hdr->gso_type == VIRTIO_NET_HDR_GSO_TCPV4)
    {
      vnet_buffer2 (b0)->gso_size = hdr->gso_size;
      vnet_buffer2 (b0)->gso_l4_hdr_sz = l4_hdr_sz;
      b0->flags |= (VNET_BUFFER_F_GSO | VNET_BUFFER_F_IS_IP4);
    }
  else if (hdr->gso_type == VIRTIO_NET_HDR_GSO_TCPV6)
    {
      vnet_buffer2 (b0)->gso_size = hdr->gso_size;
      vnet_buffer2 (b0)->gso_l4_hdr_sz = l4_hdr_sz;
      b0->flags |= (VNET_BUFFER_F_GSO | VNET_BUFFER_F_IS_IP6);
    }
}

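/*
 * Per-queue receive function. From VPP's point of view this is RX, but the
 * descriptors are consumed from the guest's TX vring (VHOST_VRING_IDX_TX).
 * Descriptor chains are copied into vlib buffer chains; the memory copies
 * are queued as copy orders and executed in batches so that descriptors can
 * be returned to the guest as early as possible.
 */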
static_always_inline u32
vhost_user_if_input (vlib_main_t * vm,
		     vhost_user_main_t * vum,
		     vhost_user_intf_t * vui,
		     u16 qid, vlib_node_runtime_t * node,
		     vnet_hw_interface_rx_mode mode, u8 enable_csum)
{
  vhost_user_vring_t *txvq = &vui->vrings[VHOST_VRING_IDX_TX (qid)];
  vnet_feature_main_t *fm = &feature_main;
  u16 n_rx_packets = 0;
  u32 n_rx_bytes = 0;
  u16 n_left;
  u32 n_left_to_next, *to_next;
  u32 next_index = VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT;
  u32 n_trace = vlib_get_trace_count (vm, node);
  u32 buffer_data_size = vlib_buffer_get_default_data_size (vm);
  u32 map_hint = 0;
  vhost_cpu_t *cpu = &vum->cpus[vm->thread_index];
  u16 copy_len = 0;
  u8 feature_arc_idx = fm->device_input_feature_arc_index;
  u32 current_config_index = ~(u32) 0;
  u16 mask = txvq->qsz_mask;

  /* The descriptor table is not ready yet */
  if (PREDICT_FALSE (txvq->avail == 0))
    goto done;

  {
    /* do we have pending interrupts ? */
    vhost_user_vring_t *rxvq = &vui->vrings[VHOST_VRING_IDX_RX (qid)];
    f64 now = vlib_time_now (vm);

    if ((txvq->n_since_last_int) && (txvq->int_deadline < now))
      vhost_user_send_call (vm, txvq);

    if ((rxvq->n_since_last_int) && (rxvq->int_deadline < now))
      vhost_user_send_call (vm, rxvq);
  }

  /*
   * Adaptive mode is optimized to reduce interrupts.
   * If the scheduler switches the input node to polling due
   * to a burst of traffic, we tell the driver not to interrupt.
   * When the traffic subsides, the scheduler switches the node back to
   * interrupt mode, and we must tell the driver we want interrupts again.
   */
  if (PREDICT_FALSE (mode == VNET_HW_INTERFACE_RX_MODE_ADAPTIVE))
    {
      if ((node->flags &
	   VLIB_NODE_FLAG_SWITCH_FROM_POLLING_TO_INTERRUPT_MODE) ||
	  !(node->flags &
	    VLIB_NODE_FLAG_SWITCH_FROM_INTERRUPT_TO_POLLING_MODE))
	/* Tell driver we want notification */
	txvq->used->flags = 0;
      else
	/* Tell driver we don't want notification */
	txvq->used->flags = VRING_USED_F_NO_NOTIFY;
    }

  if (PREDICT_FALSE (txvq->avail->flags & 0xFFFE))
    goto done;

  n_left = (u16) (txvq->avail->idx - txvq->last_avail_idx);

  /* nothing to do */
  if (PREDICT_FALSE (n_left == 0))
    goto done;

  if (PREDICT_FALSE (!vui->admin_up || !(txvq->enabled)))
    {
      /*
       * Discard input packet if interface is admin down or vring is not
       * enabled.
       * "For example, for a networking device, in the disabled state
       * client must not supply any new RX packets, but must process
       * and discard any TX packets."
       */
      vhost_user_rx_discard_packet (vm, vui, txvq,
				    VHOST_USER_DOWN_DISCARD_COUNT);
      goto done;
    }

  if (PREDICT_FALSE (n_left == (mask + 1)))
    {
      /*
       * Informational error logging when VPP is not
       * receiving packets fast enough.
       */
      vlib_error_count (vm, node->node_index,
			VHOST_USER_INPUT_FUNC_ERROR_FULL_RX_QUEUE, 1);
    }

  if (n_left > VLIB_FRAME_SIZE)
    n_left = VLIB_FRAME_SIZE;

  /*
   * For small packets (<2kB), we will not need more than one vlib buffer
   * per packet. In case packets are bigger, we will just yield at some point
   * in the loop and come back later. This is not an issue as for big packets
   * the processing cost really comes from the memory copy.
   * The assumption is that big packets will fit in 40 buffers.
   */
  if (PREDICT_FALSE (cpu->rx_buffers_len < n_left + 1 ||
		     cpu->rx_buffers_len < 40))
    {
      u32 curr_len = cpu->rx_buffers_len;
      cpu->rx_buffers_len +=
	vlib_buffer_alloc (vm, cpu->rx_buffers + curr_len,
			   VHOST_USER_RX_BUFFERS_N - curr_len);

      if (PREDICT_FALSE
	  (cpu->rx_buffers_len < VHOST_USER_RX_BUFFER_STARVATION))
	{
	  /* In case of buffer starvation, discard some packets from the queue
	   * and log the event.
	   * We keep doing best effort for the remaining packets. */
	  u32 flush = (n_left + 1 > cpu->rx_buffers_len) ?
	    n_left + 1 - cpu->rx_buffers_len : 1;
	  flush = vhost_user_rx_discard_packet (vm, vui, txvq, flush);

	  n_left -= flush;
	  vlib_increment_simple_counter (vnet_main.
					 interface_main.sw_if_counters +
					 VNET_INTERFACE_COUNTER_DROP,
					 vm->thread_index, vui->sw_if_index,
					 flush);

	  vlib_error_count (vm, vhost_user_input_node.index,
			    VHOST_USER_INPUT_FUNC_ERROR_NO_BUFFER, flush);
	}
    }

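  /* If input features are enabled on this interface, look up the first
   * feature node and config index so buffers can be redirected there. */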
  if (PREDICT_FALSE (vnet_have_features (feature_arc_idx, vui->sw_if_index)))
    {
      vnet_feature_config_main_t *cm;
      cm = &fm->feature_config_mains[feature_arc_idx];
      current_config_index = vec_elt (cm->config_index_by_sw_if_index,
				      vui->sw_if_index);
      vnet_get_config_data (&cm->config_main, &current_config_index,
			    &next_index, 0);
    }

  u16 last_avail_idx = txvq->last_avail_idx;
  u16 last_used_idx = txvq->last_used_idx;

  vlib_get_new_next_frame (vm, node, next_index, to_next, n_left_to_next);

  if (next_index == VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT)
    {
      /* give some hints to ethernet-input */
      vlib_next_frame_t *nf;
      vlib_frame_t *f;
      ethernet_input_frame_t *ef;
      nf = vlib_node_runtime_get_next_frame (vm, node, next_index);
      f = vlib_get_frame (vm, nf->frame);
      f->flags = ETH_INPUT_FRAME_F_SINGLE_SW_IF_IDX;

      ef = vlib_frame_scalar_args (f);
      ef->sw_if_index = vui->sw_if_index;
      ef->hw_if_index = vui->hw_if_index;
      vlib_frame_no_append (f);
    }

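  /*
   * Main loop: for each available descriptor chain, pop a free vlib buffer,
   * preset the used ring entry, queue the data copies and hand the buffer
   * to the next node.
   */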
  while (n_left > 0)
    {
      vlib_buffer_t *b_head, *b_current;
      u32 bi_current;
      u16 desc_current;
      u32 desc_data_offset;
      vring_desc_t *desc_table = txvq->desc;

      if (PREDICT_FALSE (cpu->rx_buffers_len <= 1))
	{
	  /* Not enough rx_buffers
	   * Note: We yield on 1 so we don't need to do an additional
	   * check for the next buffer prefetch.
	   */
	  n_left = 0;
	  break;
	}

      desc_current = txvq->avail->ring[last_avail_idx & mask];
      cpu->rx_buffers_len--;
      bi_current = cpu->rx_buffers[cpu->rx_buffers_len];
      b_head = b_current = vlib_get_buffer (vm, bi_current);
      to_next[0] = bi_current;	//We do that now so we can forget about bi_current
      to_next++;
      n_left_to_next--;

      vlib_prefetch_buffer_with_index
	(vm, cpu->rx_buffers[cpu->rx_buffers_len - 1], LOAD);

      /* Just preset the used descriptor id and length for later */
      txvq->used->ring[last_used_idx & mask].id = desc_current;
      txvq->used->ring[last_used_idx & mask].len = 0;
      vhost_user_log_dirty_ring (vui, txvq, ring[last_used_idx & mask]);

      /* The buffer should already be initialized */
      b_head->total_length_not_including_first_buffer = 0;
      b_head->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID;

      if (PREDICT_FALSE (n_trace))
	{
	  vlib_trace_buffer (vm, node, next_index, b_head,
			     /* follow_chain */ 0);
	  vhost_trace_t *t0 =
	    vlib_add_trace (vm, node, b_head, sizeof (t0[0]));
	  vhost_user_rx_trace (t0, vui, qid, b_head, txvq, last_avail_idx);
	  n_trace--;
	  vlib_set_trace_count (vm, node, n_trace);
	}

      /* This depends on the setup but is very consistent,
       * so the CPU branch predictor should do a pretty good job
       * of optimizing the decision. */
      u8 indirect = 0;
      if (txvq->desc[desc_current].flags & VIRTQ_DESC_F_INDIRECT)
	{
	  desc_table = map_guest_mem (vui, txvq->desc[desc_current].addr,
				      &map_hint);
	  desc_current = 0;
	  indirect = 1;
	  if (PREDICT_FALSE (desc_table == 0))
	    {
	      vlib_error_count (vm, node->node_index,
				VHOST_USER_INPUT_FUNC_ERROR_MMAP_FAIL, 1);
	      goto out;
	    }
	}

      if (PREDICT_TRUE (vui->is_any_layout) ||
	  (!(desc_table[desc_current].flags & VIRTQ_DESC_F_NEXT)))
	{
	  /* ANYLAYOUT or single buffer */
	  desc_data_offset = vui->virtio_net_hdr_sz;
	}
      else
	{
	  /* CSR case without ANYLAYOUT, skip 1st buffer */
	  desc_data_offset = desc_table[desc_current].len;
	}

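      /* When checksum/GSO offload is negotiated, map the virtio-net header
       * (and the first payload bytes) so offload metadata can be set on the
       * head buffer before the data copies are executed. */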
      if (enable_csum)
	{
	  virtio_net_hdr_mrg_rxbuf_t *hdr;
	  u8 *b_data;
	  u16 current = desc_current;
	  u32 data_offset = desc_data_offset;

	  if ((data_offset == desc_table[current].len) &&
	      (desc_table[current].flags & VIRTQ_DESC_F_NEXT))
	    {
	      current = desc_table[current].next;
	      data_offset = 0;
	    }
	  hdr = map_guest_mem (vui, desc_table[current].addr, &map_hint);
	  if (PREDICT_FALSE (hdr == 0))
	    {
	      vlib_error_count (vm, node->node_index,
				VHOST_USER_INPUT_FUNC_ERROR_MMAP_FAIL, 1);
	      goto out;
	    }
	  b_data = (u8 *) hdr + data_offset;
	  if (indirect)
	    {
	      hdr = map_guest_mem (vui, desc_table[desc_current].addr,
				   &map_hint);
	      if (PREDICT_FALSE (hdr == 0))
		{
		  vlib_error_count (vm, node->node_index,
				    VHOST_USER_INPUT_FUNC_ERROR_MMAP_FAIL, 1);
		  goto out;
		}
	    }
	  vhost_user_handle_rx_offload (b_head, b_data, &hdr->hdr);
	}

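      /* Walk the descriptor chain: queue one deferred copy order per
       * contiguous segment, chaining a new vlib buffer whenever the
       * current one fills up. */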
      while (1)
	{
	  /* Get more input if necessary. Or end of packet. */
	  if (desc_data_offset == desc_table[desc_current].len)
	    {
	      if (PREDICT_FALSE (desc_table[desc_current].flags &
				 VIRTQ_DESC_F_NEXT))
		{
		  desc_current = desc_table[desc_current].next;
		  desc_data_offset = 0;
		}
	      else
		{
		  goto out;
		}
	    }

	  /* Get more output if necessary. Or end of packet. */
	  if (PREDICT_FALSE (b_current->current_length == buffer_data_size))
	    {
	      if (PREDICT_FALSE (cpu->rx_buffers_len == 0))
		{
		  /* Cancel speculation */
		  to_next--;
		  n_left_to_next++;

		  /*
		   * No buffers are left: just rewind the used buffers
		   * and stop.
		   * Note: scheduled copies are not cancelled. This is
		   * not an issue as they would still be valid. Useless,
		   * but valid.
		   */
		  vhost_user_input_rewind_buffers (vm, cpu, b_head);
		  n_left = 0;
		  goto stop;
		}

	      /* Get next output */
	      cpu->rx_buffers_len--;
	      u32 bi_next = cpu->rx_buffers[cpu->rx_buffers_len];
	      b_current->next_buffer = bi_next;
	      b_current->flags |= VLIB_BUFFER_NEXT_PRESENT;
	      bi_current = bi_next;
	      b_current = vlib_get_buffer (vm, bi_current);
	    }

	  /* Prepare a copy order executed later for the data */
	  ASSERT (copy_len < VHOST_USER_COPY_ARRAY_N);
	  vhost_copy_t *cpy = &cpu->copy[copy_len];
	  copy_len++;
	  u32 desc_data_l = desc_table[desc_current].len - desc_data_offset;
	  cpy->len = buffer_data_size - b_current->current_length;
	  cpy->len = (cpy->len > desc_data_l) ? desc_data_l : cpy->len;
	  cpy->dst = (uword) (vlib_buffer_get_current (b_current) +
			      b_current->current_length);
	  cpy->src = desc_table[desc_current].addr + desc_data_offset;

	  desc_data_offset += cpy->len;

	  b_current->current_length += cpy->len;
	  b_head->total_length_not_including_first_buffer += cpy->len;
	}

    out:

      n_rx_bytes += b_head->total_length_not_including_first_buffer;
      n_rx_packets++;

      b_head->total_length_not_including_first_buffer -=
	b_head->current_length;

      /* consume the descriptor and return it as used */
      last_avail_idx++;
      last_used_idx++;

      VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b_head);

      vnet_buffer (b_head)->sw_if_index[VLIB_RX] = vui->sw_if_index;
      vnet_buffer (b_head)->sw_if_index[VLIB_TX] = (u32) ~ 0;
      b_head->error = 0;

      if (current_config_index != ~(u32) 0)
	{
	  b_head->current_config_index = current_config_index;
	  vnet_buffer (b_head)->feature_arc_index = feature_arc_idx;
	}

      n_left--;

      /*
       * Although separating memory copies from virtio ring parsing
       * is beneficial, we still perform the pending copies from time
       * to time in order to return descriptors and free some space
       * in the ring.
       */
      if (PREDICT_FALSE (copy_len >= VHOST_USER_RX_COPY_THRESHOLD))
	{
	  if (PREDICT_FALSE (vhost_user_input_copy (vui, cpu->copy,
						    copy_len, &map_hint)))
	    {
	      vlib_error_count (vm, node->node_index,
				VHOST_USER_INPUT_FUNC_ERROR_MMAP_FAIL, 1);
	    }
	  copy_len = 0;

	  /* give buffers back to driver */
	  CLIB_MEMORY_STORE_BARRIER ();
	  txvq->used->idx = last_used_idx;
	  vhost_user_log_dirty_ring (vui, txvq, idx);
	}
    }
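  /*
   * End of frame: hand the buffers to the next node, execute any pending
   * copies, publish the used index and signal the guest if interrupts are
   * enabled for this vring.
   */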
stop:
  vlib_put_next_frame (vm, node, next_index, n_left_to_next);

  txvq->last_used_idx = last_used_idx;
  txvq->last_avail_idx = last_avail_idx;

  /* Do the memory copies */
  if (PREDICT_FALSE (vhost_user_input_copy (vui, cpu->copy, copy_len,
					    &map_hint)))
    {
      vlib_error_count (vm, node->node_index,
			VHOST_USER_INPUT_FUNC_ERROR_MMAP_FAIL, 1);
    }

  /* give buffers back to driver */
  CLIB_MEMORY_STORE_BARRIER ();
  txvq->used->idx = txvq->last_used_idx;
  vhost_user_log_dirty_ring (vui, txvq, idx);

  /* interrupt (call) handling */
  if ((txvq->callfd_idx != ~0) &&
      !(txvq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
    {
      txvq->n_since_last_int += n_rx_packets;

      if (txvq->n_since_last_int > vum->coalesce_frames)
	vhost_user_send_call (vm, txvq);
    }

  /* increase rx counters */
  vlib_increment_combined_counter
    (vnet_main.interface_main.combined_sw_if_counters
     + VNET_INTERFACE_COUNTER_RX, vm->thread_index, vui->sw_if_index,
     n_rx_packets, n_rx_bytes);

  vnet_device_increment_rx_packets (vm->thread_index, n_rx_packets);

done:
  return n_rx_packets;
}

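/*
 * Input node dispatch function: poll every device/queue assigned to this
 * thread (or, when not polling, only those with a pending interrupt).
 * vhost_user_if_input is called with a constant enable_csum argument so
 * the compiler can specialize the checksum-offload path per interface.
 */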
VLIB_NODE_FN (vhost_user_input_node) (vlib_main_t * vm,
				      vlib_node_runtime_t * node,
				      vlib_frame_t * frame)
{
  vhost_user_main_t *vum = &vhost_user_main;
  uword n_rx_packets = 0;
  vhost_user_intf_t *vui;
  vnet_device_input_runtime_t *rt =
    (vnet_device_input_runtime_t *) node->runtime_data;
  vnet_device_and_queue_t *dq;

  vec_foreach (dq, rt->devices_and_queues)
  {
    if ((node->state == VLIB_NODE_STATE_POLLING) ||
	clib_atomic_swap_acq_n (&dq->interrupt_pending, 0))
      {
	vui =
	  pool_elt_at_index (vum->vhost_user_interfaces, dq->dev_instance);
	if (vui->features & (1ULL << FEAT_VIRTIO_NET_F_CSUM))
	  n_rx_packets +=
	    vhost_user_if_input (vm, vum, vui, dq->queue_id, node, dq->mode,
				 1);
	else
	  n_rx_packets +=
	    vhost_user_if_input (vm, vum, vui, dq->queue_id, node, dq->mode,
				 0);
      }
  }

  return n_rx_packets;
}

/* *INDENT-OFF* */
VLIB_REGISTER_NODE (vhost_user_input_node) = {
  .type = VLIB_NODE_TYPE_INPUT,
  .name = "vhost-user-input",
  .sibling_of = "device-input",
  .flags = VLIB_NODE_FLAG_TRACE_SUPPORTED,

  /* Will be enabled if/when hardware is detected. */
  .state = VLIB_NODE_STATE_DISABLED,

  .format_buffer = format_ethernet_header_with_length,
  .format_trace = format_vhost_trace,

  .n_errors = VHOST_USER_INPUT_FUNC_N_ERROR,
  .error_strings = vhost_user_input_func_error_strings,
};
/* *INDENT-ON* */

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */