1/*
2 * Copyright (c) 2017 Cisco and/or its affiliates.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at:
6 *
7 *     http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16/**
17 * @file
18 * @brief IPv4 Full Reassembly.
19 *
20 * This file contains the source code for IPv4 full reassembly.
21 */
22
23#include <vppinfra/vec.h>
24#include <vnet/vnet.h>
25#include <vnet/ip/ip.h>
26#include <vppinfra/fifo.h>
27#include <vppinfra/bihash_16_8.h>
28#include <vnet/ip/reass/ip4_full_reass.h>
29#include <stddef.h>
30
31#define MSEC_PER_SEC 1000
32#define IP4_REASS_TIMEOUT_DEFAULT_MS 100
33#define IP4_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS 10000	// 10 seconds default
34#define IP4_REASS_MAX_REASSEMBLIES_DEFAULT 1024
35#define IP4_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT 3
36#define IP4_REASS_HT_LOAD_FACTOR (0.75)
37
#define IP4_REASS_DEBUG_BUFFERS 0
#if IP4_REASS_DEBUG_BUFFERS
/* Debug-only helper: prints the buffer index and every chained buffer
 * index of the chain starting at bi to stdout.  Compiled in only when
 * IP4_REASS_DEBUG_BUFFERS is non-zero; otherwise expands to nothing. */
#define IP4_REASS_DEBUG_BUFFER(bi, what)             \
  do                                                 \
    {                                                \
      u32 _bi = bi;                                  \
      printf (#what "buffer %u", _bi);               \
      vlib_buffer_t *_b = vlib_get_buffer (vm, _bi); \
      while (_b->flags & VLIB_BUFFER_NEXT_PRESENT)   \
        {                                            \
          _bi = _b->next_buffer;                     \
          printf ("[%u]", _bi);                      \
          _b = vlib_get_buffer (vm, _bi);            \
        }                                            \
      printf ("\n");                                 \
      fflush (stdout);                               \
    }                                                \
  while (0)
#else
#define IP4_REASS_DEBUG_BUFFER(...)
#endif
59
// internal return codes of the reassembly logic
typedef enum
{
  IP4_REASS_RC_OK,
  // number of fragments exceeded configured maximum reassembly length
  IP4_REASS_RC_TOO_MANY_FRAGMENTS,
  // an internal invariant was violated
  IP4_REASS_RC_INTERNAL_ERROR,
  // ran out of buffers (e.g. while linearizing the chain)
  IP4_REASS_RC_NO_BUF,
  // finished reassembly must be handed off to another thread for sendout
  IP4_REASS_RC_HANDOFF,
} ip4_full_reass_rc_t;
68
// 16-byte key identifying one reassembly in the bihash
typedef struct
{
  union
  {
    struct
    {
      // context id - presumably fib index or sw_if_index; TODO confirm
      // against the code which fills this key in
      u32 xx_id;
      ip4_address_t src;
      ip4_address_t dst;
      u16 frag_id;
      u8 proto;
      u8 unused;
    };
    // flat view used when copying the key into bihash kv pairs
    u64 as_u64[2];
  };
} ip4_full_reass_key_t;
85
// bihash value: where the reassembly context lives
typedef union
{
  struct
  {
    // index of the context in the owning thread's pool
    u32 reass_index;
    // thread whose per-thread pool holds the context
    u32 memory_owner_thread_index;
  };
  u64 as_u64;
} ip4_full_reass_val_t;
95
// convenience overlay of (key, value) onto the bihash kv type
typedef union
{
  struct
  {
    ip4_full_reass_key_t k;
    ip4_full_reass_val_t v;
  };
  clib_bihash_kv_16_8_t kv;
} ip4_full_reass_kv_t;
105
106always_inline u32
107ip4_full_reass_buffer_get_data_offset (vlib_buffer_t * b)
108{
109  vnet_buffer_opaque_t *vnb = vnet_buffer (b);
110  return vnb->ip.reass.range_first - vnb->ip.reass.fragment_first;
111}
112
113always_inline u16
114ip4_full_reass_buffer_get_data_len (vlib_buffer_t * b)
115{
116  vnet_buffer_opaque_t *vnb = vnet_buffer (b);
117  return clib_min (vnb->ip.reass.range_last, vnb->ip.reass.fragment_last) -
118    (vnb->ip.reass.fragment_first +
119     ip4_full_reass_buffer_get_data_offset (b)) + 1;
120}
121
// state of one in-progress reassembly
typedef struct
{
  // hash table key
  ip4_full_reass_key_t key;
  // time when last packet was received
  f64 last_heard;
  // internal id of this reassembly
  u64 id;
  // buffer index of first buffer in this reassembly context
  u32 first_bi;
  // last octet of packet, ~0 until fragment without more_fragments arrives
  u32 last_packet_octet;
  // length of data collected so far
  u32 data_len;
  // trace operation counter
  u32 trace_op_counter;
  // next index - used by non-feature node
  u32 next_index;
  // error next index - used by custom apps (~0 if not used)
  u32 error_next_index;
  // minimum fragment length for this reassembly - used to estimate MTU
  u16 min_fragment_length;
  // number of fragments in this reassembly
  u32 fragments_n;
  // thread owning memory for this context (whose pool contains this ctx)
  u32 memory_owner_thread_index;
  // thread which received fragment with offset 0 and which sends out the
  // completed reassembly
  u32 sendout_thread_index;
} ip4_full_reass_t;
152
// per-worker-thread reassembly state
typedef struct
{
  // pool of reassembly contexts owned by this thread
  ip4_full_reass_t *pool;
  // number of contexts currently in use
  u32 reass_n;
  // source of per-thread-unique reassembly ids
  u32 id_counter;
  // protects pool and counters above
  clib_spinlock_t lock;
} ip4_full_reass_per_thread_t;
160
// global state of the IPv4 full reassembly feature
typedef struct
{
  // IPv4 config
  u32 timeout_ms;
  // timeout in seconds (derived from timeout_ms)
  f64 timeout;
  u32 expire_walk_interval_ms;
  // maximum number of fragments in one reassembly
  u32 max_reass_len;
  // maximum number of reassemblies
  u32 max_reass_n;

  // IPv4 runtime
  clib_bihash_16_8_t hash;
  // per-thread data
  ip4_full_reass_per_thread_t *per_thread_data;

  // convenience
  vlib_main_t *vlib_main;

  // node index of ip4-drop node
  u32 ip4_drop_idx;
  u32 ip4_full_reass_expire_node_idx;

  /** Worker handoff */
  u32 fq_index;
  u32 fq_feature_index;

  // reference count for enabling/disabling feature - per interface
  u32 *feature_use_refcount_per_intf;
} ip4_full_reass_main_t;
191
extern ip4_full_reass_main_t ip4_full_reass_main;

#ifndef CLIB_MARCH_VARIANT
// single global instance; defined only in the default march variant so
// multi-arch builds get exactly one definition
ip4_full_reass_main_t ip4_full_reass_main;
#endif /* CLIB_MARCH_VARIANT */
197
// next-node indices used by the reassembly graph nodes
typedef enum
{
  // feed (reassembled) packet back to ip4 input
  IP4_FULL_REASS_NEXT_INPUT,
  IP4_FULL_REASS_NEXT_DROP,
  // hand fragment off to the thread owning the reassembly
  IP4_FULL_REASS_NEXT_HANDOFF,
  IP4_FULL_REASS_N_NEXT,
} ip4_full_reass_next_t;
205
// operations recorded in packet traces
typedef enum
{
  // new fragment range inserted into the chain
  RANGE_NEW,
  // existing range trimmed due to partial overlap
  RANGE_SHRINK,
  // existing range fully discarded
  RANGE_DISCARD,
  // incoming fragment ignored as fully overlapping an existing range
  RANGE_OVERLAP,
  // reassembly completed
  FINALIZE,
  // fragment handed off to another thread
  HANDOFF,
} ip4_full_reass_trace_operation_e;
215
// snapshot of a single fragment range for tracing
typedef struct
{
  // first and last octet covered by this range
  u16 range_first;
  u16 range_last;
  // buffer index heading the range
  u32 range_bi;
  // offset of valid data within the fragment (after overlap trimming)
  i32 data_offset;
  // number of payload bytes contributed by the range
  u32 data_len;
  // buffer index of first buffer of the reassembly
  u32 first_bi;
} ip4_full_reass_range_trace_t;
225
// per-packet trace record for the reassembly nodes
typedef struct
{
  ip4_full_reass_trace_operation_e action;
  u32 reass_id;
  ip4_full_reass_range_trace_t trace_range;
  // number of octets trimmed on a RANGE_SHRINK
  u32 size_diff;
  // sequence number of this operation within the reassembly
  u32 op_id;
  u32 thread_id;
  // destination thread of a HANDOFF
  u32 thread_id_to;
  u32 fragment_first;
  u32 fragment_last;
  u32 total_data_len;
  // true when traced on a thread other than the one which saw the packet
  bool is_after_handoff;
  // copy of the ip header, shown for post-handoff traces
  ip4_header_t ip4_header;
} ip4_full_reass_trace_t;
241
242extern vlib_node_registration_t ip4_full_reass_node;
243extern vlib_node_registration_t ip4_full_reass_node_feature;
244
245static void
246ip4_full_reass_trace_details (vlib_main_t * vm, u32 bi,
247			      ip4_full_reass_range_trace_t * trace)
248{
249  vlib_buffer_t *b = vlib_get_buffer (vm, bi);
250  vnet_buffer_opaque_t *vnb = vnet_buffer (b);
251  trace->range_first = vnb->ip.reass.range_first;
252  trace->range_last = vnb->ip.reass.range_last;
253  trace->data_offset = ip4_full_reass_buffer_get_data_offset (b);
254  trace->data_len = ip4_full_reass_buffer_get_data_len (b);
255  trace->range_bi = bi;
256}
257
258static u8 *
259format_ip4_full_reass_range_trace (u8 * s, va_list * args)
260{
261  ip4_full_reass_range_trace_t *trace =
262    va_arg (*args, ip4_full_reass_range_trace_t *);
263  s =
264    format (s, "range: [%u, %u], off %d, len %u, bi %u", trace->range_first,
265	    trace->range_last, trace->data_offset, trace->data_len,
266	    trace->range_bi);
267  return s;
268}
269
static u8 *
format_ip4_full_reass_trace (u8 * s, va_list * args)
{
  /* Format one reassembly trace record for packet-trace output. */
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  ip4_full_reass_trace_t *t = va_arg (*args, ip4_full_reass_trace_t *);
  u32 indent = 0;
  if (~0 != t->reass_id)
    {
      if (t->is_after_handoff)
	{
	  /* show the ip header captured before the handoff */
	  s =
	    format (s, "%U\n", format_ip4_header, &t->ip4_header,
		    sizeof (t->ip4_header));
	  indent = 2;
	}
      s =
	format (s, "%Ureass id: %u, op id: %u, ", format_white_space, indent,
		t->reass_id, t->op_id);
      /* indent continuation lines to line up under the header above */
      indent = format_get_indent (s);
      s =
	format (s,
		"first bi: %u, data len: %u, ip/fragment[%u, %u]",
		t->trace_range.first_bi, t->total_data_len, t->fragment_first,
		t->fragment_last);
    }
  switch (t->action)
    {
    case RANGE_SHRINK:
      s = format (s, "\n%Ushrink %U by %u", format_white_space, indent,
		  format_ip4_full_reass_range_trace, &t->trace_range,
		  t->size_diff);
      break;
    case RANGE_DISCARD:
      s = format (s, "\n%Udiscard %U", format_white_space, indent,
		  format_ip4_full_reass_range_trace, &t->trace_range);
      break;
    case RANGE_NEW:
      s = format (s, "\n%Unew %U", format_white_space, indent,
		  format_ip4_full_reass_range_trace, &t->trace_range);
      break;
    case RANGE_OVERLAP:
      s = format (s, "\n%Uoverlapping/ignored %U", format_white_space, indent,
		  format_ip4_full_reass_range_trace, &t->trace_range);
      break;
    case FINALIZE:
      s = format (s, "\n%Ufinalize reassembly", format_white_space, indent);
      break;
    case HANDOFF:
      s =
	format (s, "handoff from thread #%u to thread #%u", t->thread_id,
		t->thread_id_to);
      break;
    }
  return s;
}
326
/* Add a trace record for buffer bi describing the given reassembly
 * operation.  reass may be NULL (e.g. for handoff before a context
 * exists); in that case sentinel/zero values are recorded. */
static void
ip4_full_reass_add_trace (vlib_main_t * vm, vlib_node_runtime_t * node,
			  ip4_full_reass_main_t * rm,
			  ip4_full_reass_t * reass, u32 bi,
			  ip4_full_reass_trace_operation_e action,
			  u32 size_diff, u32 thread_id_to)
{
  vlib_buffer_t *b = vlib_get_buffer (vm, bi);
  vnet_buffer_opaque_t *vnb = vnet_buffer (b);
  bool is_after_handoff = false;
  /* buffer was traced on a different thread => it was handed off */
  if (vlib_buffer_get_trace_thread (b) != vm->thread_index)
    {
      is_after_handoff = true;
    }
  ip4_full_reass_trace_t *t = vlib_add_trace (vm, node, b, sizeof (t[0]));
  t->is_after_handoff = is_after_handoff;
  if (t->is_after_handoff)
    {
      /* capture (at most) the start of the ip header for display */
      clib_memcpy (&t->ip4_header, vlib_buffer_get_current (b),
		   clib_min (sizeof (t->ip4_header), b->current_length));
    }
  if (reass)
    {
      t->reass_id = reass->id;
      t->op_id = reass->trace_op_counter;
      t->trace_range.first_bi = reass->first_bi;
      t->total_data_len = reass->data_len;
      ++reass->trace_op_counter;
    }
  else
    {
      /* no context - record sentinel values */
      t->reass_id = ~0;
      t->op_id = 0;
      t->trace_range.first_bi = 0;
      t->total_data_len = 0;
    }
  t->action = action;
  ip4_full_reass_trace_details (vm, bi, &t->trace_range);
  t->size_diff = size_diff;
  t->thread_id = vm->thread_index;
  t->thread_id_to = thread_id_to;
  t->fragment_first = vnb->ip.reass.fragment_first;
  t->fragment_last = vnb->ip.reass.fragment_last;
  /* debug-only dump of the trace to stdout, normally compiled out */
#if 0
  static u8 *s = NULL;
  s = format (s, "%U", format_ip4_full_reass_trace, NULL, NULL, t);
  printf ("%.*s\n", vec_len (s), s);
  fflush (stdout);
  vec_reset_length (s);
#endif
}
378
379always_inline void
380ip4_full_reass_free_ctx (ip4_full_reass_per_thread_t * rt,
381			 ip4_full_reass_t * reass)
382{
383  pool_put (rt->pool, reass);
384  --rt->reass_n;
385}
386
387always_inline void
388ip4_full_reass_free (ip4_full_reass_main_t * rm,
389		     ip4_full_reass_per_thread_t * rt,
390		     ip4_full_reass_t * reass)
391{
392  clib_bihash_kv_16_8_t kv;
393  kv.key[0] = reass->key.as_u64[0];
394  kv.key[1] = reass->key.as_u64[1];
395  clib_bihash_add_del_16_8 (&rm->hash, &kv, 0);
396  return ip4_full_reass_free_ctx (rt, reass);
397}
398
/* Collect every buffer held by the reassembly (walking both the range
 * chain and each range's buffer chain) and either free them or, when a
 * custom error_next_index is set, enqueue them to that node. */
always_inline void
ip4_full_reass_drop_all (vlib_main_t * vm, vlib_node_runtime_t * node,
			 ip4_full_reass_main_t * rm, ip4_full_reass_t * reass)
{
  u32 range_bi = reass->first_bi;
  vlib_buffer_t *range_b;
  vnet_buffer_opaque_t *range_vnb;
  u32 *to_free = NULL;
  /* outer loop: walk the chain of ranges */
  while (~0 != range_bi)
    {
      range_b = vlib_get_buffer (vm, range_bi);
      range_vnb = vnet_buffer (range_b);
      u32 bi = range_bi;
      /* inner loop: walk the buffer chain of this range, unlinking
       * buffers so they can be freed/enqueued individually */
      while (~0 != bi)
	{
	  vec_add1 (to_free, bi);
	  vlib_buffer_t *b = vlib_get_buffer (vm, bi);
	  if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
	    {
	      bi = b->next_buffer;
	      b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
	    }
	  else
	    {
	      bi = ~0;
	    }
	}
      range_bi = range_vnb->ip.reass.next_range_bi;
    }
  /* send to next_error_index */
  if (~0 != reass->error_next_index)
    {
      u32 n_left_to_next, *to_next, next_index;

      next_index = reass->error_next_index;
      u32 bi = ~0;

      /* enqueue collected buffers frame by frame */
      while (vec_len (to_free) > 0)
	{
	  vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

	  while (vec_len (to_free) > 0 && n_left_to_next > 0)
	    {
	      bi = vec_pop (to_free);

	      if (~0 != bi)
		{
		  to_next[0] = bi;
		  to_next += 1;
		  n_left_to_next -= 1;
		}
	    }
	  vlib_put_next_frame (vm, node, next_index, n_left_to_next);
	}
    }
  else
    {
      vlib_buffer_free (vm, to_free, vec_len (to_free));
    }
  /* NOTE(review): to_free vector does not appear to be vec_free'd here -
   * presumably freed/reused by the caller or leaked; verify upstream */
}
459
460always_inline void
461ip4_full_reass_init (ip4_full_reass_t * reass)
462{
463  reass->first_bi = ~0;
464  reass->last_packet_octet = ~0;
465  reass->data_len = 0;
466  reass->next_index = ~0;
467  reass->error_next_index = ~0;
468}
469
/* Look up the reassembly for key kv, or create a new one.  Returns NULL
 * and sets *do_handoff when the context is owned by another thread, and
 * NULL without handoff when the per-thread context limit is reached or
 * the hash insert fails. */
always_inline ip4_full_reass_t *
ip4_full_reass_find_or_create (vlib_main_t * vm, vlib_node_runtime_t * node,
			       ip4_full_reass_main_t * rm,
			       ip4_full_reass_per_thread_t * rt,
			       ip4_full_reass_kv_t * kv, u8 * do_handoff)
{
  ip4_full_reass_t *reass;
  f64 now;

again:

  reass = NULL;
  now = vlib_time_now (vm);
  if (!clib_bihash_search_16_8 (&rm->hash, &kv->kv, &kv->kv))
    {
      /* found - but only the memory owner thread may touch it */
      if (vm->thread_index != kv->v.memory_owner_thread_index)
	{
	  *do_handoff = 1;
	  return NULL;
	}
      reass =
	pool_elt_at_index (rm->per_thread_data
			   [kv->v.memory_owner_thread_index].pool,
			   kv->v.reass_index);

      /* expired - drop collected fragments and start over */
      if (now > reass->last_heard + rm->timeout)
	{
	  ip4_full_reass_drop_all (vm, node, rm, reass);
	  ip4_full_reass_free (rm, rt, reass);
	  reass = NULL;
	}
    }

  if (reass)
    {
      reass->last_heard = now;
      return reass;
    }

  /* not found (or expired) - allocate a new context if allowed */
  if (rt->reass_n >= rm->max_reass_n)
    {
      reass = NULL;
      return reass;
    }
  else
    {
      pool_get (rt->pool, reass);
      clib_memset (reass, 0, sizeof (*reass));
      /* id is unique across threads: thread index in the high digits */
      reass->id = ((u64) vm->thread_index * 1000000000) + rt->id_counter;
      reass->memory_owner_thread_index = vm->thread_index;
      ++rt->id_counter;
      ip4_full_reass_init (reass);
      ++rt->reass_n;
    }

  reass->key.as_u64[0] = kv->kv.key[0];
  reass->key.as_u64[1] = kv->kv.key[1];
  kv->v.reass_index = (reass - rt->pool);
  kv->v.memory_owner_thread_index = vm->thread_index;
  reass->last_heard = now;

  /* is_add == 2: add but fail if the key already exists */
  int rv = clib_bihash_add_del_16_8 (&rm->hash, &kv->kv, 2);
  if (rv)
    {
      ip4_full_reass_free_ctx (rt, reass);
      reass = NULL;
      // if other worker created a context already work with the other copy
      if (-2 == rv)
	goto again;
    }

  return reass;
}
543
/* All fragments have arrived: trim per-fragment headers/overlaps, link
 * the buffers into one chain, patch the ip header of the first buffer
 * (length, cleared fragment bits, checksum) and emit the reassembled
 * packet via *bi0/*next0.  Frees the reassembly context on success. */
always_inline ip4_full_reass_rc_t
ip4_full_reass_finalize (vlib_main_t * vm, vlib_node_runtime_t * node,
			 ip4_full_reass_main_t * rm,
			 ip4_full_reass_per_thread_t * rt,
			 ip4_full_reass_t * reass, u32 * bi0,
			 u32 * next0, u32 * error0, bool is_custom_app)
{
  vlib_buffer_t *first_b = vlib_get_buffer (vm, reass->first_bi);
  vlib_buffer_t *last_b = NULL;
  u32 sub_chain_bi = reass->first_bi;
  u32 total_length = 0;
  u32 buf_cnt = 0;
  /* walk the chain of ranges (sub-chains), gluing kept data together */
  do
    {
      u32 tmp_bi = sub_chain_bi;
      vlib_buffer_t *tmp = vlib_get_buffer (vm, tmp_bi);
      ip4_header_t *ip = vlib_buffer_get_current (tmp);
      vnet_buffer_opaque_t *vnb = vnet_buffer (tmp);
      /* sanity check on range/fragment invariants */
      if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
	  !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
	{
	  return IP4_REASS_RC_INTERNAL_ERROR;
	}

      u32 data_len = ip4_full_reass_buffer_get_data_len (tmp);
      /* drop the per-fragment ip header plus any overlap-trimmed bytes */
      u32 trim_front =
	ip4_header_bytes (ip) + ip4_full_reass_buffer_get_data_offset (tmp);
      u32 trim_end =
	vlib_buffer_length_in_chain (vm, tmp) - trim_front - data_len;
      if (tmp_bi == reass->first_bi)
	{
	  /* first buffer - keep ip4 header */
	  if (0 != ip4_full_reass_buffer_get_data_offset (tmp))
	    {
	      return IP4_REASS_RC_INTERNAL_ERROR;
	    }
	  trim_front = 0;
	  trim_end = vlib_buffer_length_in_chain (vm, tmp) - data_len -
	    ip4_header_bytes (ip);
	  if (!(vlib_buffer_length_in_chain (vm, tmp) - trim_end > 0))
	    {
	      return IP4_REASS_RC_INTERNAL_ERROR;
	    }
	}
      u32 keep_data =
	vlib_buffer_length_in_chain (vm, tmp) - trim_front - trim_end;
      /* walk this range's buffer chain, trimming and relinking */
      while (1)
	{
	  ++buf_cnt;
	  if (trim_front)
	    {
	      if (trim_front > tmp->current_length)
		{
		  /* drop whole buffer */
		  u32 to_be_freed_bi = tmp_bi;
		  trim_front -= tmp->current_length;
		  if (!(tmp->flags & VLIB_BUFFER_NEXT_PRESENT))
		    {
		      return IP4_REASS_RC_INTERNAL_ERROR;
		    }
		  tmp->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
		  tmp_bi = tmp->next_buffer;
		  tmp->next_buffer = 0;
		  tmp = vlib_get_buffer (vm, tmp_bi);
		  vlib_buffer_free_one (vm, to_be_freed_bi);
		  continue;
		}
	      else
		{
		  /* trim fits within this buffer - advance past it */
		  vlib_buffer_advance (tmp, trim_front);
		  trim_front = 0;
		}
	    }
	  if (keep_data)
	    {
	      /* link this buffer onto the reassembled chain */
	      if (last_b)
		{
		  last_b->flags |= VLIB_BUFFER_NEXT_PRESENT;
		  last_b->next_buffer = tmp_bi;
		}
	      last_b = tmp;
	      if (keep_data <= tmp->current_length)
		{
		  /* last buffer of kept data - truncate the tail */
		  tmp->current_length = keep_data;
		  keep_data = 0;
		}
	      else
		{
		  keep_data -= tmp->current_length;
		  if (!(tmp->flags & VLIB_BUFFER_NEXT_PRESENT))
		    {
		      return IP4_REASS_RC_INTERNAL_ERROR;
		    }
		}
	      total_length += tmp->current_length;
	      if (tmp->flags & VLIB_BUFFER_NEXT_PRESENT)
		{
		  tmp_bi = tmp->next_buffer;
		  tmp = vlib_get_buffer (vm, tmp->next_buffer);
		}
	      else
		{
		  break;
		}
	    }
	  else
	    {
	      /* no kept data remains - free the trailing buffers */
	      u32 to_be_freed_bi = tmp_bi;
	      if (reass->first_bi == tmp_bi)
		{
		  return IP4_REASS_RC_INTERNAL_ERROR;
		}
	      if (tmp->flags & VLIB_BUFFER_NEXT_PRESENT)
		{
		  tmp->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
		  tmp_bi = tmp->next_buffer;
		  tmp->next_buffer = 0;
		  tmp = vlib_get_buffer (vm, tmp_bi);
		  vlib_buffer_free_one (vm, to_be_freed_bi);
		}
	      else
		{
		  tmp->next_buffer = 0;
		  vlib_buffer_free_one (vm, to_be_freed_bi);
		  break;
		}
	    }
	}
      sub_chain_bi =
	vnet_buffer (vlib_get_buffer (vm, sub_chain_bi))->ip.
	reass.next_range_bi;
    }
  while (~0 != sub_chain_bi);

  if (!last_b)
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  last_b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;

  if (total_length < first_b->current_length)
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  /* total_length counts bytes past the first buffer from here on */
  total_length -= first_b->current_length;
  first_b->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID;
  first_b->total_length_not_including_first_buffer = total_length;
  /* patch the ip header: no fragment bits, full length, new checksum */
  ip4_header_t *ip = vlib_buffer_get_current (first_b);
  ip->flags_and_fragment_offset = 0;
  ip->length = clib_host_to_net_u16 (first_b->current_length + total_length);
  ip->checksum = ip4_header_checksum (ip);
  if (!vlib_buffer_chain_linearize (vm, first_b))
    {
      return IP4_REASS_RC_NO_BUF;
    }
  // reset to reconstruct the mbuf linking
  first_b->flags &= ~VLIB_BUFFER_EXT_HDR_VALID;
  if (PREDICT_FALSE (first_b->flags & VLIB_BUFFER_IS_TRACED))
    {
      ip4_full_reass_add_trace (vm, node, rm, reass, reass->first_bi,
				FINALIZE, 0, ~0);
#if 0
      // following code does a hexdump of packet fragments to stdout ...
      do
	{
	  u32 bi = reass->first_bi;
	  u8 *s = NULL;
	  while (~0 != bi)
	    {
	      vlib_buffer_t *b = vlib_get_buffer (vm, bi);
	      s = format (s, "%u: %U\n", bi, format_hexdump,
			  vlib_buffer_get_current (b), b->current_length);
	      if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
		{
		  bi = b->next_buffer;
		}
	      else
		{
		  break;
		}
	    }
	  printf ("%.*s\n", vec_len (s), s);
	  fflush (stdout);
	  vec_free (s);
	}
      while (0);
#endif
    }
  *bi0 = reass->first_bi;
  if (!is_custom_app)
    {
      *next0 = IP4_FULL_REASS_NEXT_INPUT;
    }
  else
    {
      /* custom apps supply their own next node */
      *next0 = reass->next_index;
    }
  vnet_buffer (first_b)->ip.reass.estimated_mtu = reass->min_fragment_length;
  *error0 = IP4_ERROR_NONE;
  ip4_full_reass_free (rm, rt, reass);
  reass = NULL;
  return IP4_REASS_RC_OK;
}
747
/* Insert buffer new_next_bi as a new fragment range after prev_range_bi
 * (or at the head when prev_range_bi is ~0) and account its data length
 * into the reassembly total. */
always_inline ip4_full_reass_rc_t
ip4_full_reass_insert_range_in_chain (vlib_main_t * vm,
				      ip4_full_reass_main_t * rm,
				      ip4_full_reass_per_thread_t * rt,
				      ip4_full_reass_t * reass,
				      u32 prev_range_bi, u32 new_next_bi)
{
  vlib_buffer_t *new_next_b = vlib_get_buffer (vm, new_next_bi);
  vnet_buffer_opaque_t *new_next_vnb = vnet_buffer (new_next_b);
  if (~0 != prev_range_bi)
    {
      /* splice after the given predecessor */
      vlib_buffer_t *prev_b = vlib_get_buffer (vm, prev_range_bi);
      vnet_buffer_opaque_t *prev_vnb = vnet_buffer (prev_b);
      new_next_vnb->ip.reass.next_range_bi = prev_vnb->ip.reass.next_range_bi;
      prev_vnb->ip.reass.next_range_bi = new_next_bi;
    }
  else
    {
      /* insert at head of the range chain */
      if (~0 != reass->first_bi)
	{
	  new_next_vnb->ip.reass.next_range_bi = reass->first_bi;
	}
      reass->first_bi = new_next_bi;
    }
  vnet_buffer_opaque_t *vnb = vnet_buffer (new_next_b);
  /* sanity check on range/fragment invariants */
  if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
      !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  reass->data_len += ip4_full_reass_buffer_get_data_len (new_next_b);
  return IP4_REASS_RC_OK;
}
781
/* Unlink the range headed by discard_bi from the chain (prev_range_bi is
 * its predecessor or ~0 for head), subtract its data length from the
 * reassembly total and free all buffers of the range. */
always_inline ip4_full_reass_rc_t
ip4_full_reass_remove_range_from_chain (vlib_main_t * vm,
					vlib_node_runtime_t * node,
					ip4_full_reass_main_t * rm,
					ip4_full_reass_t * reass,
					u32 prev_range_bi, u32 discard_bi)
{
  vlib_buffer_t *discard_b = vlib_get_buffer (vm, discard_bi);
  vnet_buffer_opaque_t *discard_vnb = vnet_buffer (discard_b);
  if (~0 != prev_range_bi)
    {
      vlib_buffer_t *prev_b = vlib_get_buffer (vm, prev_range_bi);
      vnet_buffer_opaque_t *prev_vnb = vnet_buffer (prev_b);
      /* predecessor must actually point at the range being removed */
      if (!(prev_vnb->ip.reass.next_range_bi == discard_bi))
	{
	  return IP4_REASS_RC_INTERNAL_ERROR;
	}
      prev_vnb->ip.reass.next_range_bi = discard_vnb->ip.reass.next_range_bi;
    }
  else
    {
      /* removing the head of the chain */
      reass->first_bi = discard_vnb->ip.reass.next_range_bi;
    }
  vnet_buffer_opaque_t *vnb = vnet_buffer (discard_b);
  /* sanity check on range/fragment invariants */
  if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
      !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  reass->data_len -= ip4_full_reass_buffer_get_data_len (discard_b);
  /* free the whole buffer chain of the discarded range */
  while (1)
    {
      u32 to_be_freed_bi = discard_bi;
      if (PREDICT_FALSE (discard_b->flags & VLIB_BUFFER_IS_TRACED))
	{
	  ip4_full_reass_add_trace (vm, node, rm, reass, discard_bi,
				    RANGE_DISCARD, 0, ~0);
	}
      if (discard_b->flags & VLIB_BUFFER_NEXT_PRESENT)
	{
	  discard_b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
	  discard_bi = discard_b->next_buffer;
	  discard_b->next_buffer = 0;
	  discard_b = vlib_get_buffer (vm, discard_bi);
	  vlib_buffer_free_one (vm, to_be_freed_bi);
	}
      else
	{
	  discard_b->next_buffer = 0;
	  vlib_buffer_free_one (vm, to_be_freed_bi);
	  break;
	}
    }
  return IP4_REASS_RC_OK;
}
837
/* Process one fragment against the reassembly: insert it into the
 * sorted range chain resolving overlaps, and finalize when all octets
 * up to last_packet_octet have been collected.  On HANDOFF the caller
 * must forward to *handoff_thread_idx.  *bi0 is set to ~0 when the
 * buffer was consumed into the reassembly. */
always_inline ip4_full_reass_rc_t
ip4_full_reass_update (vlib_main_t * vm, vlib_node_runtime_t * node,
		       ip4_full_reass_main_t * rm,
		       ip4_full_reass_per_thread_t * rt,
		       ip4_full_reass_t * reass, u32 * bi0, u32 * next0,
		       u32 * error0, bool is_custom_app,
		       u32 * handoff_thread_idx)
{
  vlib_buffer_t *fb = vlib_get_buffer (vm, *bi0);
  vnet_buffer_opaque_t *fvnb = vnet_buffer (fb);
  if (is_custom_app)
    {
      // store (error_)next_index before it's overwritten
      reass->next_index = fvnb->ip.reass.next_index;
      reass->error_next_index = fvnb->ip.reass.error_next_index;
    }
  ip4_full_reass_rc_t rc = IP4_REASS_RC_OK;
  int consumed = 0;
  ip4_header_t *fip = vlib_buffer_get_current (fb);
  const u32 fragment_first = ip4_get_fragment_offset_bytes (fip);
  const u32 fragment_length =
    clib_net_to_host_u16 (fip->length) - ip4_header_bytes (fip);
  const u32 fragment_last = fragment_first + fragment_length - 1;
  fvnb->ip.reass.fragment_first = fragment_first;
  fvnb->ip.reass.fragment_last = fragment_last;
  int more_fragments = ip4_get_fragment_more (fip);
  u32 candidate_range_bi = reass->first_bi;
  u32 prev_range_bi = ~0;
  /* the fragment initially covers its full extent; may be trimmed below */
  fvnb->ip.reass.range_first = fragment_first;
  fvnb->ip.reass.range_last = fragment_last;
  fvnb->ip.reass.next_range_bi = ~0;
  if (!more_fragments)
    {
      /* last fragment tells us the total payload size */
      reass->last_packet_octet = fragment_last;
    }
  if (~0 == reass->first_bi)
    {
      // starting a new reassembly
      rc =
	ip4_full_reass_insert_range_in_chain (vm, rm, rt, reass,
					      prev_range_bi, *bi0);
      if (IP4_REASS_RC_OK != rc)
	{
	  return rc;
	}
      if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
	{
	  ip4_full_reass_add_trace (vm, node, rm, reass, *bi0, RANGE_NEW, 0,
				    ~0);
	}
      *bi0 = ~0;
      reass->min_fragment_length = clib_net_to_host_u16 (fip->length);
      reass->fragments_n = 1;
      return IP4_REASS_RC_OK;
    }
  /* track the smallest fragment size seen - used as MTU estimate */
  reass->min_fragment_length =
    clib_min (clib_net_to_host_u16 (fip->length),
	      fvnb->ip.reass.estimated_mtu);
  /* walk the sorted range chain looking for the insertion point */
  while (~0 != candidate_range_bi)
    {
      vlib_buffer_t *candidate_b = vlib_get_buffer (vm, candidate_range_bi);
      vnet_buffer_opaque_t *candidate_vnb = vnet_buffer (candidate_b);
      if (fragment_first > candidate_vnb->ip.reass.range_last)
	{
	  // this fragments starts after candidate range
	  prev_range_bi = candidate_range_bi;
	  candidate_range_bi = candidate_vnb->ip.reass.next_range_bi;
	  if (candidate_vnb->ip.reass.range_last < fragment_last &&
	      ~0 == candidate_range_bi)
	    {
	      // special case - this fragment falls beyond all known ranges
	      rc =
		ip4_full_reass_insert_range_in_chain (vm, rm, rt, reass,
						      prev_range_bi, *bi0);
	      if (IP4_REASS_RC_OK != rc)
		{
		  return rc;
		}
	      consumed = 1;
	      break;
	    }
	  continue;
	}
      if (fragment_last < candidate_vnb->ip.reass.range_first)
	{
	  // this fragment ends before candidate range without any overlap
	  rc =
	    ip4_full_reass_insert_range_in_chain (vm, rm, rt, reass,
						  prev_range_bi, *bi0);
	  if (IP4_REASS_RC_OK != rc)
	    {
	      return rc;
	    }
	  consumed = 1;
	}
      else
	{
	  if (fragment_first >= candidate_vnb->ip.reass.range_first &&
	      fragment_last <= candidate_vnb->ip.reass.range_last)
	    {
	      // this fragment is a (sub)part of existing range, ignore it
	      if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
		{
		  ip4_full_reass_add_trace (vm, node, rm, reass, *bi0,
					    RANGE_OVERLAP, 0, ~0);
		}
	      break;
	    }
	  int discard_candidate = 0;
	  if (fragment_first < candidate_vnb->ip.reass.range_first)
	    {
	      /* partial overlap at the front of the candidate */
	      u32 overlap =
		fragment_last - candidate_vnb->ip.reass.range_first + 1;
	      if (overlap < ip4_full_reass_buffer_get_data_len (candidate_b))
		{
		  /* shrink the candidate and insert the new range before it */
		  candidate_vnb->ip.reass.range_first += overlap;
		  if (reass->data_len < overlap)
		    {
		      return IP4_REASS_RC_INTERNAL_ERROR;
		    }
		  reass->data_len -= overlap;
		  if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
		    {
		      ip4_full_reass_add_trace (vm, node, rm, reass,
						candidate_range_bi,
						RANGE_SHRINK, 0, ~0);
		    }
		  rc =
		    ip4_full_reass_insert_range_in_chain (vm, rm, rt, reass,
							  prev_range_bi,
							  *bi0);
		  if (IP4_REASS_RC_OK != rc)
		    {
		      return rc;
		    }
		  consumed = 1;
		}
	      else
		{
		  /* new fragment swallows the candidate entirely */
		  discard_candidate = 1;
		}
	    }
	  else if (fragment_last > candidate_vnb->ip.reass.range_last)
	    {
	      /* partial overlap at the back of the candidate - trim the
	       * front of the new fragment instead */
	      u32 overlap =
		candidate_vnb->ip.reass.range_last - fragment_first + 1;
	      if (overlap < ip4_full_reass_buffer_get_data_len (candidate_b))
		{
		  fvnb->ip.reass.range_first += overlap;
		  if (~0 != candidate_vnb->ip.reass.next_range_bi)
		    {
		      prev_range_bi = candidate_range_bi;
		      candidate_range_bi =
			candidate_vnb->ip.reass.next_range_bi;
		      continue;
		    }
		  else
		    {
		      // special case - last range discarded
		      rc =
			ip4_full_reass_insert_range_in_chain (vm, rm, rt,
							      reass,
							      candidate_range_bi,
							      *bi0);
		      if (IP4_REASS_RC_OK != rc)
			{
			  return rc;
			}
		      consumed = 1;
		    }
		}
	      else
		{
		  discard_candidate = 1;
		}
	    }
	  else
	    {
	      discard_candidate = 1;
	    }
	  if (discard_candidate)
	    {
	      u32 next_range_bi = candidate_vnb->ip.reass.next_range_bi;
	      // discard candidate range, probe next range
	      rc =
		ip4_full_reass_remove_range_from_chain (vm, node, rm, reass,
							prev_range_bi,
							candidate_range_bi);
	      if (IP4_REASS_RC_OK != rc)
		{
		  return rc;
		}
	      if (~0 != next_range_bi)
		{
		  candidate_range_bi = next_range_bi;
		  continue;
		}
	      else
		{
		  // special case - last range discarded
		  rc =
		    ip4_full_reass_insert_range_in_chain (vm, rm, rt, reass,
							  prev_range_bi,
							  *bi0);
		  if (IP4_REASS_RC_OK != rc)
		    {
		      return rc;
		    }
		  consumed = 1;
		}
	    }
	}
      break;
    }
  ++reass->fragments_n;
  if (consumed)
    {
      if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
	{
	  ip4_full_reass_add_trace (vm, node, rm, reass, *bi0, RANGE_NEW, 0,
				    ~0);
	}
    }
  /* complete when the total size is known and all octets collected */
  if (~0 != reass->last_packet_octet &&
      reass->data_len == reass->last_packet_octet + 1)
    {
      *handoff_thread_idx = reass->sendout_thread_index;
      int handoff =
	reass->memory_owner_thread_index != reass->sendout_thread_index;
      rc =
	ip4_full_reass_finalize (vm, node, rm, rt, reass, bi0, next0, error0,
				 is_custom_app);
      if (IP4_REASS_RC_OK == rc && handoff)
	{
	  rc = IP4_REASS_RC_HANDOFF;
	}
    }
  else
    {
      if (consumed)
	{
	  *bi0 = ~0;
	  if (reass->fragments_n > rm->max_reass_len)
	    {
	      rc = IP4_REASS_RC_TOO_MANY_FRAGMENTS;
	    }
	}
      else
	{
	  /* fragment not consumed - drop it as a duplicate */
	  *next0 = IP4_FULL_REASS_NEXT_DROP;
	  *error0 = IP4_ERROR_REASS_DUPLICATE_FRAGMENT;
	}
    }
  return rc;
}
1093
/**
 * @brief Frame dispatch shared by all ip4 full reassembly node variants.
 *
 * Walks the incoming frame; packets that are not fragments are passed
 * straight through, fragments are fed into the per-thread reassembly
 * state (with handoff to the owning thread when necessary).  The
 * per-thread spinlock is held for the duration of the whole frame.
 *
 * @param is_feature    true when running as an ip4-unicast feature node
 * @param is_custom_app true when the app supplies the next node index
 *                      in the buffer metadata
 */
always_inline uword
ip4_full_reass_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
		       vlib_frame_t * frame, bool is_feature,
		       bool is_custom_app)
{
  u32 *from = vlib_frame_vector_args (frame);
  u32 n_left_from, n_left_to_next, *to_next, next_index;
  ip4_full_reass_main_t *rm = &ip4_full_reass_main;
  ip4_full_reass_per_thread_t *rt = &rm->per_thread_data[vm->thread_index];
  /* lock covers the whole frame - released only at the bottom */
  clib_spinlock_lock (&rt->lock);

  n_left_from = frame->n_vectors;
  next_index = node->cached_next_index;
  while (n_left_from > 0)
    {
      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
	{
	  u32 bi0;
	  vlib_buffer_t *b0;
	  u32 next0;
	  u32 error0 = IP4_ERROR_NONE;

	  bi0 = from[0];
	  b0 = vlib_get_buffer (vm, bi0);

	  ip4_header_t *ip0 = vlib_buffer_get_current (b0);
	  if (!ip4_get_fragment_more (ip0) && !ip4_get_fragment_offset (ip0))
	    {
	      // this is a whole packet - no fragmentation
	      if (!is_custom_app)
		{
		  next0 = IP4_FULL_REASS_NEXT_INPUT;
		}
	      else
		{
		  next0 = vnet_buffer (b0)->ip.reass.next_index;
		}
	      goto packet_enqueue;
	    }
	  const u32 fragment_first = ip4_get_fragment_offset_bytes (ip0);
	  const u32 fragment_length =
	    clib_net_to_host_u16 (ip0->length) - ip4_header_bytes (ip0);
	  const u32 fragment_last = fragment_first + fragment_length - 1;
	  /* reject malformed geometry before touching any state:
	   * zero-length fragment, offset+length overflowing the 16-bit
	   * total length, or a short non-final fragment */
	  if (fragment_first > fragment_last || fragment_first + fragment_length > UINT16_MAX - 20 || (fragment_length < 8 && ip4_get_fragment_more (ip0)))	// 8 is minimum frag length per RFC 791
	    {
	      next0 = IP4_FULL_REASS_NEXT_DROP;
	      error0 = IP4_ERROR_REASS_MALFORMED_PACKET;
	      goto packet_enqueue;
	    }
	  ip4_full_reass_kv_t kv;
	  u8 do_handoff = 0;

	  /* flow key: fib index + src address in the first 8 bytes,
	   * dst address + fragment id + protocol in the second */
	  kv.k.as_u64[0] =
	    (u64) vec_elt (ip4_main.fib_index_by_sw_if_index,
			   vnet_buffer (b0)->sw_if_index[VLIB_RX]) |
	    (u64) ip0->src_address.as_u32 << 32;
	  kv.k.as_u64[1] =
	    (u64) ip0->dst_address.
	    as_u32 | (u64) ip0->fragment_id << 32 | (u64) ip0->protocol << 48;

	  ip4_full_reass_t *reass =
	    ip4_full_reass_find_or_create (vm, node, rm, rt, &kv,
					   &do_handoff);

	  if (reass)
	    {
	      const u32 fragment_first = ip4_get_fragment_offset_bytes (ip0);
	      /* the thread that sees the first fragment sends the
	       * reassembled packet out */
	      if (0 == fragment_first)
		{
		  reass->sendout_thread_index = vm->thread_index;
		}
	    }

	  if (PREDICT_FALSE (do_handoff))
	    {
	      /* reassembly context lives on another thread - hand off */
	      next0 = IP4_FULL_REASS_NEXT_HANDOFF;
	      vnet_buffer (b0)->ip.reass.owner_thread_index =
		kv.v.memory_owner_thread_index;
	    }
	  else if (reass)
	    {
	      u32 handoff_thread_idx;
	      switch (ip4_full_reass_update
		      (vm, node, rm, rt, reass, &bi0, &next0,
		       &error0, is_custom_app, &handoff_thread_idx))
		{
		case IP4_REASS_RC_OK:
		  /* nothing to do here */
		  break;
		case IP4_REASS_RC_HANDOFF:
		  /* finalized on this thread but must be sent by another */
		  next0 = IP4_FULL_REASS_NEXT_HANDOFF;
		  b0 = vlib_get_buffer (vm, bi0);
		  vnet_buffer (b0)->ip.reass.owner_thread_index =
		    handoff_thread_idx;
		  break;
		case IP4_REASS_RC_TOO_MANY_FRAGMENTS:
		  vlib_node_increment_counter (vm, node->node_index,
					       IP4_ERROR_REASS_FRAGMENT_CHAIN_TOO_LONG,
					       1);
		  ip4_full_reass_drop_all (vm, node, rm, reass);
		  ip4_full_reass_free (rm, rt, reass);
		  goto next_packet;
		  break;
		case IP4_REASS_RC_NO_BUF:
		  vlib_node_increment_counter (vm, node->node_index,
					       IP4_ERROR_REASS_NO_BUF, 1);
		  ip4_full_reass_drop_all (vm, node, rm, reass);
		  ip4_full_reass_free (rm, rt, reass);
		  goto next_packet;
		  break;
		case IP4_REASS_RC_INTERNAL_ERROR:
		  /* drop everything and start with a clean slate */
		  vlib_node_increment_counter (vm, node->node_index,
					       IP4_ERROR_REASS_INTERNAL_ERROR,
					       1);
		  ip4_full_reass_drop_all (vm, node, rm, reass);
		  ip4_full_reass_free (rm, rt, reass);
		  goto next_packet;
		  break;
		}
	    }
	  else
	    {
	      /* no context and no handoff - reassembly limit hit */
	      next0 = IP4_FULL_REASS_NEXT_DROP;
	      error0 = IP4_ERROR_REASS_LIMIT_REACHED;
	    }


	packet_enqueue:

	  /* bi0 == ~0 means the fragment was consumed into the
	   * reassembly context and there is nothing to enqueue yet */
	  if (bi0 != ~0)
	    {
	      to_next[0] = bi0;
	      to_next += 1;
	      n_left_to_next -= 1;

	      /* bi0 might have been updated by reass_finalize, reload */
	      b0 = vlib_get_buffer (vm, bi0);
	      if (IP4_ERROR_NONE != error0)
		{
		  b0->error = node->errors[error0];
		}

	      if (next0 == IP4_FULL_REASS_NEXT_HANDOFF)
		{
		  if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
		    {
		      ip4_full_reass_add_trace (vm, node, rm, NULL, bi0,
						HANDOFF, 0,
						vnet_buffer (b0)->ip.
						reass.owner_thread_index);
		    }
		}
	      else if (is_feature && IP4_ERROR_NONE == error0)
		{
		  /* advance along the feature arc */
		  vnet_feature_next (&next0, b0);
		}
	      vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
					       to_next, n_left_to_next,
					       bi0, next0);
	      IP4_REASS_DEBUG_BUFFER (bi0, enqueue_next);
	    }

	next_packet:
	  from += 1;
	  n_left_from -= 1;
	}

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  clib_spinlock_unlock (&rt->lock);
  return frame->n_vectors;
}
1270
/* error counter strings, generated from the shared ip4 error list */
static char *ip4_full_reass_error_strings[] = {
#define _(sym, string) string,
  foreach_ip4_error
#undef _
};
1276
/* entry point for the non-feature (ip4-input path) reassembly node */
VLIB_NODE_FN (ip4_full_reass_node) (vlib_main_t * vm,
				    vlib_node_runtime_t * node,
				    vlib_frame_t * frame)
{
  return ip4_full_reass_inline (vm, node, frame, false /* is_feature */ ,
				false /* is_custom_app */ );
}
1284
/* *INDENT-OFF* */
/* graph node registration for the non-feature reassembly node */
VLIB_REGISTER_NODE (ip4_full_reass_node) = {
    .name = "ip4-full-reassembly",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_full_reass_trace,
    .n_errors = ARRAY_LEN (ip4_full_reass_error_strings),
    .error_strings = ip4_full_reass_error_strings,
    .n_next_nodes = IP4_FULL_REASS_N_NEXT,
    .next_nodes =
        {
                [IP4_FULL_REASS_NEXT_INPUT] = "ip4-input",
                [IP4_FULL_REASS_NEXT_DROP] = "ip4-drop",
                [IP4_FULL_REASS_NEXT_HANDOFF] = "ip4-full-reassembly-handoff",

        },
};
/* *INDENT-ON* */
1302
/* entry point for the feature-arc variant of the reassembly node */
VLIB_NODE_FN (ip4_full_reass_node_feature) (vlib_main_t * vm,
					    vlib_node_runtime_t * node,
					    vlib_frame_t * frame)
{
  return ip4_full_reass_inline (vm, node, frame, true /* is_feature */ ,
				false /* is_custom_app */ );
}
1310
/* *INDENT-OFF* */
/* graph node registration for the feature-arc reassembly node */
VLIB_REGISTER_NODE (ip4_full_reass_node_feature) = {
    .name = "ip4-full-reassembly-feature",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_full_reass_trace,
    .n_errors = ARRAY_LEN (ip4_full_reass_error_strings),
    .error_strings = ip4_full_reass_error_strings,
    .n_next_nodes = IP4_FULL_REASS_N_NEXT,
    .next_nodes =
        {
                [IP4_FULL_REASS_NEXT_INPUT] = "ip4-input",
                [IP4_FULL_REASS_NEXT_DROP] = "ip4-drop",
                [IP4_FULL_REASS_NEXT_HANDOFF] = "ip4-full-reass-feature-hoff",
        },
};
/* *INDENT-ON* */
1327
/* *INDENT-OFF* */
/* hook the feature node into the ip4-unicast arc, before lookup/ipsec */
VNET_FEATURE_INIT (ip4_full_reass_feature, static) = {
    .arc_name = "ip4-unicast",
    .node_name = "ip4-full-reassembly-feature",
    .runs_before = VNET_FEATURES ("ip4-lookup",
                                  "ipsec4-input-feature"),
    .runs_after = 0,
};
/* *INDENT-ON* */
1337
1338#ifndef CLIB_MARCH_VARIANT
1339always_inline u32
1340ip4_full_reass_get_nbuckets ()
1341{
1342  ip4_full_reass_main_t *rm = &ip4_full_reass_main;
1343  u32 nbuckets;
1344  u8 i;
1345
1346  nbuckets = (u32) (rm->max_reass_n / IP4_REASS_HT_LOAD_FACTOR);
1347
1348  for (i = 0; i < 31; i++)
1349    if ((1 << i) >= nbuckets)
1350      break;
1351  nbuckets = 1 << i;
1352
1353  return nbuckets;
1354}
1355#endif /* CLIB_MARCH_VARIANT */
1356
/* events delivered to the expire-walk process node */
typedef enum
{
  IP4_EVENT_CONFIG_CHANGED = 1,
} ip4_full_reass_event_t;
1361
/* context passed through the bihash walk when rehashing into a bigger table */
typedef struct
{
  int failure;			/* set to 1 if any insert into new_hash failed */
  clib_bihash_16_8_t *new_hash;	/* destination table being populated */
} ip4_rehash_cb_ctx;
1367
1368#ifndef CLIB_MARCH_VARIANT
1369static int
1370ip4_rehash_cb (clib_bihash_kv_16_8_t * kv, void *_ctx)
1371{
1372  ip4_rehash_cb_ctx *ctx = _ctx;
1373  if (clib_bihash_add_del_16_8 (ctx->new_hash, kv, 1))
1374    {
1375      ctx->failure = 1;
1376    }
1377  return (BIHASH_WALK_CONTINUE);
1378}
1379
1380static void
1381ip4_full_reass_set_params (u32 timeout_ms, u32 max_reassemblies,
1382			   u32 max_reassembly_length,
1383			   u32 expire_walk_interval_ms)
1384{
1385  ip4_full_reass_main.timeout_ms = timeout_ms;
1386  ip4_full_reass_main.timeout = (f64) timeout_ms / (f64) MSEC_PER_SEC;
1387  ip4_full_reass_main.max_reass_n = max_reassemblies;
1388  ip4_full_reass_main.max_reass_len = max_reassembly_length;
1389  ip4_full_reass_main.expire_walk_interval_ms = expire_walk_interval_ms;
1390}
1391
/**
 * @brief API handler: apply new reassembly parameters.
 *
 * Stores the parameters, wakes the expire-walk process so it picks up the
 * new interval, and - if the configured table would need more buckets -
 * rehashes all existing entries into a larger bihash.
 *
 * @return 0 on success, -1 if the rehash failed (old table kept intact)
 */
vnet_api_error_t
ip4_full_reass_set (u32 timeout_ms, u32 max_reassemblies,
		    u32 max_reassembly_length, u32 expire_walk_interval_ms)
{
  /* must sample the bucket count before the new params overwrite
   * max_reass_n */
  u32 old_nbuckets = ip4_full_reass_get_nbuckets ();
  ip4_full_reass_set_params (timeout_ms, max_reassemblies,
			     max_reassembly_length, expire_walk_interval_ms);
  vlib_process_signal_event (ip4_full_reass_main.vlib_main,
			     ip4_full_reass_main.ip4_full_reass_expire_node_idx,
			     IP4_EVENT_CONFIG_CHANGED, 0);
  u32 new_nbuckets = ip4_full_reass_get_nbuckets ();
  /* only grow - shrinking would risk overloading existing entries */
  if (ip4_full_reass_main.max_reass_n > 0 && new_nbuckets > old_nbuckets)
    {
      clib_bihash_16_8_t new_hash;
      clib_memset (&new_hash, 0, sizeof (new_hash));
      ip4_rehash_cb_ctx ctx;
      ctx.failure = 0;
      ctx.new_hash = &new_hash;
      clib_bihash_init_16_8 (&new_hash, "ip4-dr", new_nbuckets,
			     new_nbuckets * 1024);
      /* copy every live entry into the new table */
      clib_bihash_foreach_key_value_pair_16_8 (&ip4_full_reass_main.hash,
					       ip4_rehash_cb, &ctx);
      if (ctx.failure)
	{
	  /* keep the old table; discard the partial copy */
	  clib_bihash_free_16_8 (&new_hash);
	  return -1;
	}
      else
	{
	  clib_bihash_free_16_8 (&ip4_full_reass_main.hash);
	  clib_memcpy_fast (&ip4_full_reass_main.hash, &new_hash,
			    sizeof (ip4_full_reass_main.hash));
	  clib_bihash_copied (&ip4_full_reass_main.hash, &new_hash);
	}
    }
  return 0;
}
1429
1430vnet_api_error_t
1431ip4_full_reass_get (u32 * timeout_ms, u32 * max_reassemblies,
1432		    u32 * max_reassembly_length,
1433		    u32 * expire_walk_interval_ms)
1434{
1435  *timeout_ms = ip4_full_reass_main.timeout_ms;
1436  *max_reassemblies = ip4_full_reass_main.max_reass_n;
1437  *max_reassembly_length = ip4_full_reass_main.max_reass_len;
1438  *expire_walk_interval_ms = ip4_full_reass_main.expire_walk_interval_ms;
1439  return 0;
1440}
1441
/**
 * @brief Plugin/module init: set up per-thread state, defaults and hash.
 *
 * Allocates per-thread reassembly pools and locks, resolves node indices
 * used at runtime, applies default parameters, initializes the bihash and
 * creates the handoff frame queues.
 */
static clib_error_t *
ip4_full_reass_init_function (vlib_main_t * vm)
{
  ip4_full_reass_main_t *rm = &ip4_full_reass_main;
  clib_error_t *error = 0;
  u32 nbuckets;
  vlib_node_t *node;

  rm->vlib_main = vm;

  /* one entry per worker plus the main thread */
  vec_validate (rm->per_thread_data, vlib_num_workers ());
  ip4_full_reass_per_thread_t *rt;
  vec_foreach (rt, rm->per_thread_data)
  {
    clib_spinlock_init (&rt->lock);
    pool_alloc (rt->pool, rm->max_reass_n);
  }

  node = vlib_get_node_by_name (vm, (u8 *) "ip4-full-reassembly-expire-walk");
  ASSERT (node);
  rm->ip4_full_reass_expire_node_idx = node->index;

  /* defaults must be in place before sizing the hash below */
  ip4_full_reass_set_params (IP4_REASS_TIMEOUT_DEFAULT_MS,
			     IP4_REASS_MAX_REASSEMBLIES_DEFAULT,
			     IP4_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT,
			     IP4_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS);

  nbuckets = ip4_full_reass_get_nbuckets ();
  clib_bihash_init_16_8 (&rm->hash, "ip4-dr", nbuckets, nbuckets * 1024);

  node = vlib_get_node_by_name (vm, (u8 *) "ip4-drop");
  ASSERT (node);
  rm->ip4_drop_idx = node->index;

  /* frame queues used by the handoff nodes */
  rm->fq_index = vlib_frame_queue_main_init (ip4_full_reass_node.index, 0);
  rm->fq_feature_index =
    vlib_frame_queue_main_init (ip4_full_reass_node_feature.index, 0);

  rm->feature_use_refcount_per_intf = NULL;
  return error;
}
1483
1484VLIB_INIT_FUNCTION (ip4_full_reass_init_function);
1485#endif /* CLIB_MARCH_VARIANT */
1486
/**
 * @brief Process node: periodically expire stale reassembly contexts.
 *
 * Sleeps for the configured walk interval (or until a config-change
 * event), then sweeps every thread's pool and drops reassemblies that
 * have not been heard from within the timeout.  Never returns.
 */
static uword
ip4_full_reass_walk_expired (vlib_main_t * vm,
			     vlib_node_runtime_t * node, vlib_frame_t * f)
{
  ip4_full_reass_main_t *rm = &ip4_full_reass_main;
  uword event_type, *event_data = 0;

  while (true)
    {
      vlib_process_wait_for_event_or_clock (vm,
					    (f64)
					    rm->expire_walk_interval_ms /
					    (f64) MSEC_PER_SEC);
      event_type = vlib_process_get_events (vm, &event_data);

      switch (event_type)
	{
	case ~0:		/* no events => timeout */
	  /* nothing to do here */
	  break;
	case IP4_EVENT_CONFIG_CHANGED:
	  /* wake up and re-read the (possibly new) walk interval */
	  break;
	default:
	  clib_warning ("BUG: event type 0x%wx", event_type);
	  break;
	}
      f64 now = vlib_time_now (vm);

      ip4_full_reass_t *reass;
      int *pool_indexes_to_free = NULL;

      uword thread_index = 0;
      int index;
      const uword nthreads = vlib_num_workers () + 1;
      for (thread_index = 0; thread_index < nthreads; ++thread_index)
	{
	  ip4_full_reass_per_thread_t *rt =
	    &rm->per_thread_data[thread_index];
	  clib_spinlock_lock (&rt->lock);

	  /* collect first, free second - freeing while iterating the
	   * pool would invalidate the foreach */
	  vec_reset_length (pool_indexes_to_free);
          /* *INDENT-OFF* */
          pool_foreach_index (index, rt->pool, ({
                                reass = pool_elt_at_index (rt->pool, index);
                                if (now > reass->last_heard + rm->timeout)
                                  {
                                    vec_add1 (pool_indexes_to_free, index);
                                  }
                              }));
          /* *INDENT-ON* */
	  int *i;
          /* *INDENT-OFF* */
          vec_foreach (i, pool_indexes_to_free)
          {
            ip4_full_reass_t *reass = pool_elt_at_index (rt->pool, i[0]);
            ip4_full_reass_drop_all (vm, node, rm, reass);
            ip4_full_reass_free (rm, rt, reass);
          }
          /* *INDENT-ON* */

	  clib_spinlock_unlock (&rt->lock);
	}

      vec_free (pool_indexes_to_free);
      if (event_data)
	{
	  _vec_len (event_data) = 0;
	}
    }

  return 0;
}
1559
/* *INDENT-OFF* */
/* registration of the expire-walk process node */
VLIB_REGISTER_NODE (ip4_full_reass_expire_node) = {
    .function = ip4_full_reass_walk_expired,
    .type = VLIB_NODE_TYPE_PROCESS,
    .name = "ip4-full-reassembly-expire-walk",
    .format_trace = format_ip4_full_reass_trace,
    .n_errors = ARRAY_LEN (ip4_full_reass_error_strings),
    .error_strings = ip4_full_reass_error_strings,

};
/* *INDENT-ON* */
1571
1572static u8 *
1573format_ip4_full_reass_key (u8 * s, va_list * args)
1574{
1575  ip4_full_reass_key_t *key = va_arg (*args, ip4_full_reass_key_t *);
1576  s =
1577    format (s,
1578	    "xx_id: %u, src: %U, dst: %U, frag_id: %u, proto: %u",
1579	    key->xx_id, format_ip4_address, &key->src, format_ip4_address,
1580	    &key->dst, clib_net_to_host_u16 (key->frag_id), key->proto);
1581  return s;
1582}
1583
1584static u8 *
1585format_ip4_reass (u8 * s, va_list * args)
1586{
1587  vlib_main_t *vm = va_arg (*args, vlib_main_t *);
1588  ip4_full_reass_t *reass = va_arg (*args, ip4_full_reass_t *);
1589
1590  s = format (s, "ID: %lu, key: %U\n  first_bi: %u, data_len: %u, "
1591	      "last_packet_octet: %u, trace_op_counter: %u\n",
1592	      reass->id, format_ip4_full_reass_key, &reass->key,
1593	      reass->first_bi, reass->data_len,
1594	      reass->last_packet_octet, reass->trace_op_counter);
1595
1596  u32 bi = reass->first_bi;
1597  u32 counter = 0;
1598  while (~0 != bi)
1599    {
1600      vlib_buffer_t *b = vlib_get_buffer (vm, bi);
1601      vnet_buffer_opaque_t *vnb = vnet_buffer (b);
1602      s =
1603	format (s,
1604		"  #%03u: range: [%u, %u], bi: %u, off: %d, len: %u, "
1605		"fragment[%u, %u]\n", counter, vnb->ip.reass.range_first,
1606		vnb->ip.reass.range_last, bi,
1607		ip4_full_reass_buffer_get_data_offset (b),
1608		ip4_full_reass_buffer_get_data_len (b),
1609		vnb->ip.reass.fragment_first, vnb->ip.reass.fragment_last);
1610      if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
1611	{
1612	  bi = b->next_buffer;
1613	}
1614      else
1615	{
1616	  bi = ~0;
1617	}
1618    }
1619  return s;
1620}
1621
/**
 * @brief CLI handler for "show ip4-full-reassembly [details]".
 *
 * Prints the configured limits plus the number of in-progress
 * reassemblies summed over all threads; with "details", also dumps every
 * reassembly context.
 */
static clib_error_t *
show_ip4_reass (vlib_main_t * vm,
		unformat_input_t * input,
		CLIB_UNUSED (vlib_cli_command_t * lmd))
{
  ip4_full_reass_main_t *rm = &ip4_full_reass_main;

  vlib_cli_output (vm, "---------------------");
  vlib_cli_output (vm, "IP4 reassembly status");
  vlib_cli_output (vm, "---------------------");
  bool details = false;
  if (unformat (input, "details"))
    {
      details = true;
    }

  u32 sum_reass_n = 0;
  ip4_full_reass_t *reass;
  uword thread_index;
  /* workers plus the main thread */
  const uword nthreads = vlib_num_workers () + 1;
  for (thread_index = 0; thread_index < nthreads; ++thread_index)
    {
      ip4_full_reass_per_thread_t *rt = &rm->per_thread_data[thread_index];
      /* lock out the datapath while reading this thread's pool */
      clib_spinlock_lock (&rt->lock);
      if (details)
	{
          /* *INDENT-OFF* */
          pool_foreach (reass, rt->pool, {
            vlib_cli_output (vm, "%U", format_ip4_reass, vm, reass);
          });
          /* *INDENT-ON* */
	}
      sum_reass_n += rt->reass_n;
      clib_spinlock_unlock (&rt->lock);
    }
  vlib_cli_output (vm, "---------------------");
  vlib_cli_output (vm, "Current full IP4 reassemblies count: %lu\n",
		   (long unsigned) sum_reass_n);
  vlib_cli_output (vm,
		   "Maximum configured concurrent full IP4 reassemblies per worker-thread: %lu\n",
		   (long unsigned) rm->max_reass_n);
  vlib_cli_output (vm,
		   "Maximum configured full IP4 reassembly timeout: %lums\n",
		   (long unsigned) rm->timeout_ms);
  vlib_cli_output (vm,
		   "Maximum configured full IP4 reassembly expire walk interval: %lums\n",
		   (long unsigned) rm->expire_walk_interval_ms);
  return 0;
}
1671
/* *INDENT-OFF* */
/* CLI command registration for the status display above */
VLIB_CLI_COMMAND (show_ip4_full_reass_cmd, static) = {
    .path = "show ip4-full-reassembly",
    .short_help = "show ip4-full-reassembly [details]",
    .function = show_ip4_reass,
};
/* *INDENT-ON* */
1679
1680#ifndef CLIB_MARCH_VARIANT
/**
 * @brief Enable/disable the reassembly feature on an interface
 *        (non-refcounted variant).
 *
 * @return result of vnet_feature_enable_disable
 */
vnet_api_error_t
ip4_full_reass_enable_disable (u32 sw_if_index, u8 enable_disable)
{
  return vnet_feature_enable_disable ("ip4-unicast",
				      "ip4-full-reassembly-feature",
				      sw_if_index, enable_disable, 0, 0);
}
1688#endif /* CLIB_MARCH_VARIANT */
1689
1690
/* error counters specific to the handoff nodes */
#define foreach_ip4_full_reass_handoff_error                       \
_(CONGESTION_DROP, "congestion drop")


typedef enum
{
#define _(sym,str) IP4_FULL_REASS_HANDOFF_ERROR_##sym,
  foreach_ip4_full_reass_handoff_error
#undef _
    IP4_FULL_REASS_HANDOFF_N_ERROR,
} ip4_full_reass_handoff_error_t;
1702
/* error strings matching the handoff error enum above */
static char *ip4_full_reass_handoff_error_strings[] = {
#define _(sym,string) string,
  foreach_ip4_full_reass_handoff_error
#undef _
};
1708
/* packet trace record for the handoff nodes */
typedef struct
{
  u32 next_worker_index;	/* thread the buffer was handed off to */
} ip4_full_reass_handoff_trace_t;
1713
1714static u8 *
1715format_ip4_full_reass_handoff_trace (u8 * s, va_list * args)
1716{
1717  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1718  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1719  ip4_full_reass_handoff_trace_t *t =
1720    va_arg (*args, ip4_full_reass_handoff_trace_t *);
1721
1722  s =
1723    format (s, "ip4-full-reassembly-handoff: next-worker %d",
1724	    t->next_worker_index);
1725
1726  return s;
1727}
1728
/**
 * @brief Hand buffers off to the thread that owns their reassembly
 *        context.
 *
 * Reads the owner thread index stamped into each buffer's metadata,
 * optionally traces, then enqueues the whole frame to the appropriate
 * per-thread frame queue.  Buffers that cannot be enqueued (queue full)
 * are counted as congestion drops.
 *
 * @param is_feature selects the feature-arc frame queue
 */
always_inline uword
ip4_full_reass_handoff_node_inline (vlib_main_t * vm,
				    vlib_node_runtime_t * node,
				    vlib_frame_t * frame, bool is_feature)
{
  ip4_full_reass_main_t *rm = &ip4_full_reass_main;

  vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
  u32 n_enq, n_left_from, *from;
  u16 thread_indices[VLIB_FRAME_SIZE], *ti;
  u32 fq_index;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;
  vlib_get_buffers (vm, from, bufs, n_left_from);

  b = bufs;
  ti = thread_indices;

  fq_index = (is_feature) ? rm->fq_feature_index : rm->fq_index;

  /* gather the destination thread for every buffer */
  while (n_left_from > 0)
    {
      ti[0] = vnet_buffer (b[0])->ip.reass.owner_thread_index;

      if (PREDICT_FALSE
	  ((node->flags & VLIB_NODE_FLAG_TRACE)
	   && (b[0]->flags & VLIB_BUFFER_IS_TRACED)))
	{
	  ip4_full_reass_handoff_trace_t *t =
	    vlib_add_trace (vm, node, b[0], sizeof (*t));
	  t->next_worker_index = ti[0];
	}

      n_left_from -= 1;
      ti += 1;
      b += 1;
    }
  /* enqueue whole frame; drop_on_congestion = 1 */
  n_enq =
    vlib_buffer_enqueue_to_thread (vm, fq_index, from, thread_indices,
				   frame->n_vectors, 1);

  if (n_enq < frame->n_vectors)
    vlib_node_increment_counter (vm, node->node_index,
				 IP4_FULL_REASS_HANDOFF_ERROR_CONGESTION_DROP,
				 frame->n_vectors - n_enq);
  return frame->n_vectors;
}
1777
/* entry point for the non-feature handoff node */
VLIB_NODE_FN (ip4_full_reass_handoff_node) (vlib_main_t * vm,
					    vlib_node_runtime_t * node,
					    vlib_frame_t * frame)
{
  return ip4_full_reass_handoff_node_inline (vm, node, frame,
					     false /* is_feature */ );
}
1785
1786
/* *INDENT-OFF* */
/* registration of the non-feature handoff node */
VLIB_REGISTER_NODE (ip4_full_reass_handoff_node) = {
  .name = "ip4-full-reassembly-handoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_full_reass_handoff_error_strings),
  .error_strings = ip4_full_reass_handoff_error_strings,
  .format_trace = format_ip4_full_reass_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
/* *INDENT-ON* */
1802
1803
/* *INDENT-OFF* */
/* entry point for the feature-arc handoff node */
VLIB_NODE_FN (ip4_full_reass_feature_handoff_node) (vlib_main_t * vm,
						    vlib_node_runtime_t *
						    node,
						    vlib_frame_t * frame)
{
  return ip4_full_reass_handoff_node_inline (vm, node, frame,
					     true /* is_feature */ );
}
/* *INDENT-ON* */
1814
1815
/* *INDENT-OFF* */
/* registration of the feature-arc handoff node */
VLIB_REGISTER_NODE (ip4_full_reass_feature_handoff_node) = {
  .name = "ip4-full-reass-feature-hoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_full_reass_handoff_error_strings),
  .error_strings = ip4_full_reass_handoff_error_strings,
  .format_trace = format_ip4_full_reass_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
/* *INDENT-ON* */
1831
1832#ifndef CLIB_MARCH_VARIANT
1833int
1834ip4_full_reass_enable_disable_with_refcnt (u32 sw_if_index, int is_enable)
1835{
1836  ip4_full_reass_main_t *rm = &ip4_full_reass_main;
1837  vec_validate (rm->feature_use_refcount_per_intf, sw_if_index);
1838  if (is_enable)
1839    {
1840      if (!rm->feature_use_refcount_per_intf[sw_if_index])
1841	{
1842	  ++rm->feature_use_refcount_per_intf[sw_if_index];
1843	  return vnet_feature_enable_disable ("ip4-unicast",
1844					      "ip4-full-reassembly-feature",
1845					      sw_if_index, 1, 0, 0);
1846	}
1847      ++rm->feature_use_refcount_per_intf[sw_if_index];
1848    }
1849  else
1850    {
1851      --rm->feature_use_refcount_per_intf[sw_if_index];
1852      if (!rm->feature_use_refcount_per_intf[sw_if_index])
1853	return vnet_feature_enable_disable ("ip4-unicast",
1854					    "ip4-full-reassembly-feature",
1855					    sw_if_index, 0, 0, 0);
1856    }
1857  return -1;
1858}
1859#endif
1860
1861/*
1862 * fd.io coding-style-patch-verification: ON
1863 *
1864 * Local Variables:
1865 * eval: (c-set-style "gnu")
1866 * End:
1867 */
1868