node.c revision adff8bfb
1/*
2 *------------------------------------------------------------------
3 * Copyright (c) 2016 Cisco and/or its affiliates.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 *     http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 *------------------------------------------------------------------
16 */
17
18#define _GNU_SOURCE
19
20#include <stdint.h>
21#include <net/if.h>
22#include <sys/ioctl.h>
23#include <sys/socket.h>
24#include <sys/stat.h>
25#include <sys/types.h>
26#include <sys/uio.h>            /* for iovec */
27#include <netinet/in.h>
28
29#include <vlib/vlib.h>
30#include <vlib/unix/unix.h>
31#include <vnet/ethernet/ethernet.h>
32#include <vnet/vnet.h>
33
34#include <vnet/ip/ip.h>
35
36#if DPDK == 1
37#include <vnet/devices/dpdk/dpdk.h>
38#endif
39
40#include <turbotap/turbotap.h>
41
42vlib_node_registration_t turbotap_rx_node;
43
/* Next-node indices reachable from the turbotap-rx node (see the
 * .next_nodes table in the node registration below). */
enum {
  TURBOTAP_RX_NEXT_IP4_INPUT,
  TURBOTAP_RX_NEXT_IP6_INPUT,
  TURBOTAP_RX_NEXT_ETHERNET_INPUT,
  TURBOTAP_RX_NEXT_DROP,
  TURBOTAP_RX_N_NEXT,
};
51
/* Per-packet trace record: only the software interface index the
 * packet was received on. */
typedef struct {
  u16 sw_if_index;
} turbotap_rx_trace_t;
55
56u8 * format_turbotap_rx_trace (u8 * s, va_list * va)
57{
58  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *);
59  CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *);
60  vnet_main_t * vnm = vnet_get_main();
61  turbotap_rx_trace_t * t = va_arg (*va, turbotap_rx_trace_t *);
62  s = format (s, "%U", format_vnet_sw_if_index_name,
63                vnm, t->sw_if_index);
64  return s;
65}
66
67always_inline void
68buffer_add_to_chain(vlib_main_t *vm, u32 bi, u32 first_bi, u32 prev_bi)
69{
70  vlib_buffer_t * b = vlib_get_buffer (vm, bi);
71  vlib_buffer_t * first_b = vlib_get_buffer (vm, first_bi);
72  vlib_buffer_t * prev_b = vlib_get_buffer (vm, prev_bi);
73
74  /* update first buffer */
75  first_b->total_length_not_including_first_buffer +=  b->current_length;
76
77  /* update previous buffer */
78  prev_b->next_buffer = bi;
79  prev_b->flags |= VLIB_BUFFER_NEXT_PRESENT;
80
81  /* update current buffer */
82  b->next_buffer = 0;
83
84#if DPDK > 0
85  struct rte_mbuf * mbuf = rte_mbuf_from_vlib_buffer(b);
86  struct rte_mbuf * first_mbuf = rte_mbuf_from_vlib_buffer(first_b);
87  struct rte_mbuf * prev_mbuf = rte_mbuf_from_vlib_buffer(prev_b);
88  first_mbuf->nb_segs++;
89  prev_mbuf->next = mbuf;
90  mbuf->data_len = b->current_length;
91  mbuf->data_off = RTE_PKTMBUF_HEADROOM + b->current_data;
92  mbuf->next = 0;
93#endif
94}
95
96static uword
97turbotap_rx_iface(vlib_main_t * vm,
98           vlib_node_runtime_t * node,
99           turbotap_interface_t * ti)
100{
101  turbotap_main_t * tr = &turbotap_main;
102  const uword buffer_size = vlib_buffer_free_list_buffer_size ( vm,
103                                  VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
104  u32 n_trace = vlib_get_trace_count (vm, node);
105  u8 set_trace = 0;
106  vnet_main_t *vnm;
107  vnet_sw_interface_t * si;
108  u8 admin_down;
109  uword len = 0;
110  u32 next_index =  TURBOTAP_RX_NEXT_ETHERNET_INPUT;
111  u32 *to_next;
112
113  vnm = vnet_get_main();
114  si = vnet_get_sw_interface (vnm, ti->sw_if_index);
115  admin_down = !(si->flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP);
116
117  if (ti->per_interface_next_index != ~0)
118     next_index = ti->per_interface_next_index;
119
120  /* Buffer Allocation */
121  u32 desired_allocation = ti->rx_ready * ti->mtu_buffers + 32;
122  if (PREDICT_TRUE(vec_len(tr->rx_buffers) < ti->rx_ready * ti->mtu_buffers))
123    {
124      len = vec_len(tr->rx_buffers);
125      vec_validate(tr->rx_buffers, desired_allocation - 1);
126      vec_validate(tr->unused_buffer_list, desired_allocation - 1);
127      _vec_len(tr->unused_buffer_list) = 0;
128      _vec_len(tr->rx_buffers) = len +
129          vlib_buffer_alloc(vm, &tr->rx_buffers[len], desired_allocation - len);
130      if (PREDICT_FALSE(vec_len(tr->rx_buffers) < ti->rx_ready * ti->mtu_buffers))
131        {
132          vlib_node_increment_counter(vm, turbotap_rx_node.index, TURBOTAP_ERROR_BUFFER_ALLOCATION, 1);
133        }
134    }
135
136  /* Filling msgs */
137  u32 i = 0;
138  len = vec_len(tr->rx_buffers);
139  while (i < ti->rx_ready && len > ti->mtu_buffers)
140    {
141      u32 j = 0;
142      vec_validate(ti->rx_msg[i].msg_hdr.msg_iov, ti->mtu_buffers - 1);
143      while (j < ti->mtu_buffers)
144        {
145          vlib_buffer_t *b = vlib_get_buffer(vm, tr->rx_buffers[len - 1]);
146          ti->rx_msg[i].msg_hdr.msg_iov[j].iov_base = b->data;
147          ti->rx_msg[i].msg_hdr.msg_iov[j].iov_len = buffer_size;
148          len--;
149          j++;
150        }
151
152      ti->rx_msg[i].msg_hdr.msg_iovlen = ti->mtu_buffers;
153      ti->rx_msg[i].msg_hdr.msg_flags = MSG_DONTWAIT;
154      ti->rx_msg[i].msg_hdr.msg_name = NULL;
155      ti->rx_msg[i].msg_hdr.msg_namelen = 0;
156      ti->rx_msg[i].msg_hdr.msg_control = NULL;
157      ti->rx_msg[i].msg_hdr.msg_controllen = 0;
158      ti->rx_msg[i].msg_len = 0;
159      i++;
160    }
161
162  /*
163   * Be careful here
164   *
165   * Experiments show that we need to set the time according
166   * to the number of msgs receive from kernel even if the call
167   * is NON-BLOCKING. If timeout is so small, then recvmmsg
168   * call gets as many packets as it can in that time period.
169   */
170  struct timespec timeout = {.tv_sec = 0, .tv_nsec = 500000};
171  int num_rx_msgs = recvmmsg(ti->sock_fd, ti->rx_msg, i, MSG_DONTWAIT, &timeout);
172  if (num_rx_msgs <= 0) {
173    if (errno != EAGAIN) {
174      vlib_node_increment_counter(vm, turbotap_rx_node.index,
175                                  TURBOTAP_ERROR_READ, 1);
176    }
177    return 0;
178  }
179
180  u32 next = next_index;
181  u32 n_left_to_next = 0;
182
183  i = 0;
184  len = vec_len(tr->rx_buffers);
185  vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next);
186
187  while (i != num_rx_msgs && n_left_to_next)
188    {
189      vlib_buffer_t *b0, *first_b0;
190      u32 bi0 = 0, first_bi0 = 0, prev_bi0, j = 0;
191      u32 bytes_to_put = 0, bytes_already_put = 0;
192      u32 remain_len = ti->rx_msg[i].msg_len;
193
194      while (remain_len && len)
195        {
196          /* grab free buffer */
197          prev_bi0 = bi0;
198          bi0 = tr->rx_buffers[len - 1];
199          b0 = vlib_get_buffer(vm, bi0);
200
201	  bytes_to_put = remain_len > buffer_size ? buffer_size : remain_len;
202          b0->current_length = bytes_to_put;
203
204          if (bytes_already_put == 0)
205            {
206#if DPDK > 0
207              struct rte_mbuf * mb = rte_mbuf_from_vlib_buffer(b0);
208              rte_pktmbuf_data_len (mb) = b0->current_length;
209              rte_pktmbuf_pkt_len (mb) = b0->current_length;
210#endif
211              b0->total_length_not_including_first_buffer = 0;
212              b0->flags = VLIB_BUFFER_TOTAL_LENGTH_VALID;
213              vnet_buffer(b0)->sw_if_index[VLIB_RX] = ti->sw_if_index;
214              vnet_buffer(b0)->sw_if_index[VLIB_TX] = (u32)~0;
215              first_bi0 = bi0;
216              first_b0 = vlib_get_buffer(vm, first_bi0);
217            }
218          else
219            buffer_add_to_chain(vm, bi0, first_bi0, prev_bi0);
220
221
222          bytes_already_put += bytes_to_put;
223          remain_len -= bytes_to_put;
224          j++;
225          len--;
226        }
227
228    /* record unused buffers */
229    while (j < ti->mtu_buffers)
230      {
231        u32 vec_len_unused = vec_len(tr->unused_buffer_list);
232        tr->unused_buffer_list[vec_len_unused] = tr->rx_buffers[len - 1];
233        len--;
234        j++;
235        _vec_len(tr->unused_buffer_list) = vec_len_unused + 1;
236      }
237
238    /* trace */
239    VLIB_BUFFER_TRACE_TRAJECTORY_INIT(first_b0);
240
241    first_b0->error = node->errors[TURBOTAP_ERROR_NONE];
242
243    /* Interface counters and tracing. */
244    if (PREDICT_TRUE(!admin_down))
245      {
246        vlib_increment_combined_counter (
247           vnet_main.interface_main.combined_sw_if_counters
248           + VNET_INTERFACE_COUNTER_RX,
249           os_get_cpu_number(), ti->sw_if_index,
250           1, ti->rx_msg[i].msg_len);
251
252        if (PREDICT_FALSE(n_trace > 0))
253          {
254            vlib_trace_buffer (vm, node, next_index,
255                             first_b0, /* follow_chain */ 1);
256            n_trace--;
257            set_trace = 1;
258            turbotap_rx_trace_t *t0 = vlib_add_trace (vm, node, first_b0, sizeof (*t0));
259            t0->sw_if_index = si->sw_if_index;
260          }
261      } else {
262        next = TURBOTAP_RX_NEXT_DROP;
263      }
264
265    /* next packet */
266    to_next[0] = first_bi0;
267    n_left_to_next -= 1;
268    to_next +=1;
269
270    /* enque and take next packet */
271    vlib_validate_buffer_enqueue_x1(vm, node, next_index , to_next,
272                            n_left_to_next, first_bi0, next);
273
274    i++;
275  }
276
277  _vec_len(tr->rx_buffers) = len;
278  vlib_put_next_frame(vm, node, next_index, n_left_to_next);
279
280  /* put unused buffers back */
281  while (vec_len(tr->unused_buffer_list) > 0)
282    {
283      u32 vec_len_unused = vec_len(tr->unused_buffer_list);
284      u32 vec_len_rx = vec_len(tr->rx_buffers);
285      tr->rx_buffers[vec_len_rx] = tr->unused_buffer_list[vec_len_unused - 1];
286      _vec_len(tr->unused_buffer_list) -= 1;
287      _vec_len(tr->rx_buffers) += 1;
288    }
289
290  if (ti->rx_ready - i > 0 )
291    {
292      ti->rx_ready -= i;
293      if (ti->rx_ready < i)
294	ti->rx_ready = i;
295    }
296  else if (ti->rx_ready + i > MAX_RECV)
297        ti->rx_ready = MAX_RECV;
298  else
299        ti->rx_ready += i;
300
301  if (set_trace)
302    vlib_set_trace_count (vm, node, n_trace);
303  return i;
304}
305
306static uword
307turbotap_rx (vlib_main_t * vm,
308           vlib_node_runtime_t * node,
309           vlib_frame_t * frame)
310{
311  turbotap_main_t * tr = &turbotap_main;
312  static u32 * ready_interface_indices;
313  turbotap_interface_t * ti;
314  int i;
315  u32 total_count = 0;
316
317  vec_reset_length (ready_interface_indices);
318  clib_bitmap_foreach (i, tr->pending_read_bitmap,
319  ({
320    vec_add1 (ready_interface_indices, i);
321  }));
322
323  if (vec_len (ready_interface_indices) == 0)
324    return 0;
325
326  for (i = 0; i < vec_len(ready_interface_indices); i++)
327    {
328      tr->pending_read_bitmap =
329        clib_bitmap_set (tr->pending_read_bitmap,
330                         ready_interface_indices[i], 0);
331
332      ti = vec_elt_at_index (tr->turbotap_interfaces, ready_interface_indices[i]);
333      total_count += turbotap_rx_iface(vm, node, ti);
334    }
335  return total_count; //This might return more than 256.
336}
337
/* Human-readable error-counter strings, expanded from the
 * foreach_turbotap_error macro list (defined in turbotap.h). */
static char * turbotap_rx_error_strings[] = {
#define _(sym,string) string,
  foreach_turbotap_error
#undef _
};
343
/*
 * turbotap-rx node registration.  An interrupt-state input node: it is
 * scheduled when input is pending rather than polled every main-loop
 * iteration.
 */
VLIB_REGISTER_NODE (turbotap_rx_node) = {
  .function = turbotap_rx,
  .name = "turbotap-rx",
  .type = VLIB_NODE_TYPE_INPUT,
  .state = VLIB_NODE_STATE_INTERRUPT,
  .vector_size = 4,  /* bytes per frame element (u32 buffer index) */
  .n_errors = TURBOTAP_N_ERROR,
  .error_strings = turbotap_rx_error_strings,
  .format_trace = format_turbotap_rx_trace,

  .n_next_nodes = TURBOTAP_RX_N_NEXT,
  .next_nodes = {
    [TURBOTAP_RX_NEXT_IP4_INPUT] = "ip4-input-no-checksum",
    [TURBOTAP_RX_NEXT_IP6_INPUT] = "ip6-input",
    [TURBOTAP_RX_NEXT_DROP] = "error-drop",
    [TURBOTAP_RX_NEXT_ETHERNET_INPUT] = "ethernet-input",
  },
};
362
363