/* ip4_full_reass.c — revision 2d0ebd7e */
1/*
2 * Copyright (c) 2017 Cisco and/or its affiliates.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at:
6 *
7 *     http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16/**
17 * @file
18 * @brief IPv4 Full Reassembly.
19 *
20 * This file contains the source code for IPv4 full reassembly.
21 */
22
23#include <vppinfra/vec.h>
24#include <vnet/vnet.h>
25#include <vnet/ip/ip.h>
26#include <vppinfra/fifo.h>
27#include <vppinfra/bihash_16_8.h>
28#include <vnet/ip/reass/ip4_full_reass.h>
29#include <stddef.h>
30
#define MSEC_PER_SEC 1000
// how long to wait for missing fragments before giving up on a reassembly
#define IP4_REASS_TIMEOUT_DEFAULT_MS 100
#define IP4_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS 10000	// 10 seconds default
// cap on concurrently tracked reassembly contexts (per thread pool sizing)
#define IP4_REASS_MAX_REASSEMBLIES_DEFAULT 1024
// cap on fragments accepted into a single reassembly
#define IP4_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT 3
// target load factor used when sizing the bihash
#define IP4_REASS_HT_LOAD_FACTOR (0.75)

// set to 1 to dump buffer chains to stdout at interesting points
#define IP4_REASS_DEBUG_BUFFERS 0
#if IP4_REASS_DEBUG_BUFFERS
/* Print buffer index bi followed by every chained buffer index after it. */
#define IP4_REASS_DEBUG_BUFFER(bi, what)             \
  do                                                 \
    {                                                \
      u32 _bi = bi;                                  \
      printf (#what "buffer %u", _bi);               \
      vlib_buffer_t *_b = vlib_get_buffer (vm, _bi); \
      while (_b->flags & VLIB_BUFFER_NEXT_PRESENT)   \
        {                                            \
          _bi = _b->next_buffer;                     \
          printf ("[%u]", _bi);                      \
          _b = vlib_get_buffer (vm, _bi);            \
        }                                            \
      printf ("\n");                                 \
      fflush (stdout);                               \
    }                                                \
  while (0)
#else
#define IP4_REASS_DEBUG_BUFFER(...)
#endif
59
/** Return codes of the internal reassembly helpers. */
typedef enum
{
  IP4_REASS_RC_OK,
  // reassembly exceeded the configured max_reass_len fragment count
  IP4_REASS_RC_TOO_MANY_FRAGMENTS,
  // invariant violation - caller drops the whole reassembly
  IP4_REASS_RC_INTERNAL_ERROR,
  // buffer linearization failed (out of buffers)
  IP4_REASS_RC_NO_BUF,
  // finished packet must be handed off to the sendout thread
  IP4_REASS_RC_HANDOFF,
} ip4_full_reass_rc_t;
68
/** Hash key identifying one reassembly: addresses, fragment id, protocol. */
typedef struct
{
  union
  {
    struct
    {
      // caller-supplied discriminator (presumably fib/interface id - set
      // outside this view; confirm against the key-building code)
      u32 xx_id;
      ip4_address_t src;
      ip4_address_t dst;
      u16 frag_id;
      u8 proto;
      u8 unused;
    };
    // flat 128-bit view used directly as bihash_16_8 key
    u64 as_u64[2];
  };
} ip4_full_reass_key_t;
85
/** Hash value: where the reassembly context lives. */
typedef union
{
  struct
  {
    // index into the owning thread's context pool
    u32 reass_index;
    // thread whose pool owns the context
    u32 memory_owner_thread_index;
  };
  u64 as_u64;
} ip4_full_reass_val_t;
95
/** Combined key/value overlay of a clib_bihash_kv_16_8_t. */
typedef union
{
  struct
  {
    ip4_full_reass_key_t k;
    ip4_full_reass_val_t v;
  };
  clib_bihash_kv_16_8_t kv;
} ip4_full_reass_kv_t;
105
106always_inline u32
107ip4_full_reass_buffer_get_data_offset (vlib_buffer_t * b)
108{
109  vnet_buffer_opaque_t *vnb = vnet_buffer (b);
110  return vnb->ip.reass.range_first - vnb->ip.reass.fragment_first;
111}
112
113always_inline u16
114ip4_full_reass_buffer_get_data_len (vlib_buffer_t * b)
115{
116  vnet_buffer_opaque_t *vnb = vnet_buffer (b);
117  return clib_min (vnb->ip.reass.range_last, vnb->ip.reass.fragment_last) -
118    (vnb->ip.reass.fragment_first +
119     ip4_full_reass_buffer_get_data_offset (b)) + 1;
120}
121
/** One in-progress reassembly context. */
typedef struct
{
  // hash table key
  ip4_full_reass_key_t key;
  // time when last packet was received (used for timeout expiry)
  f64 last_heard;
  // internal id of this reassembly (thread index folded in, see creation)
  u64 id;
  // buffer index of first buffer in this reassembly context, ~0 when empty
  u32 first_bi;
  // last octet of packet, ~0 until fragment without more_fragments arrives
  u32 last_packet_octet;
  // length of data collected so far
  u32 data_len;
  // trace operation counter
  u32 trace_op_counter;
  // next index - used by non-feature node
  u32 next_index;
  // error next index - used by custom apps (~0 if not used)
  u32 error_next_index;
  // minimum fragment length for this reassembly - used to estimate MTU
  u16 min_fragment_length;
  // number of fragments in this reassembly
  u32 fragments_n;
  // thread owning memory for this context (whose pool contains this ctx)
  u32 memory_owner_thread_index;
  // thread which received fragment with offset 0 and which sends out the
  // completed reassembly
  u32 sendout_thread_index;
} ip4_full_reass_t;
152
/** Per-worker reassembly state; lock guards the pool and counters. */
typedef struct
{
  // pool of reassembly contexts owned by this thread
  ip4_full_reass_t *pool;
  // number of contexts currently in use (bounded by max_reass_n)
  u32 reass_n;
  // monotonically increasing id source for new contexts
  u32 id_counter;
  // protects pool/reass_n/id_counter against cross-thread access
  clib_spinlock_t lock;
} ip4_full_reass_per_thread_t;
160
/** Global reassembly state (configuration + runtime). */
typedef struct
{
  // IPv4 config
  u32 timeout_ms;
  // timeout_ms converted to seconds for comparison with vlib_time_now()
  f64 timeout;
  u32 expire_walk_interval_ms;
  // maximum number of fragments in one reassembly
  u32 max_reass_len;
  // maximum number of reassemblies
  u32 max_reass_n;

  // IPv4 runtime
  // key -> (reass_index, memory_owner_thread_index)
  clib_bihash_16_8_t hash;
  // per-thread data
  ip4_full_reass_per_thread_t *per_thread_data;

  // convenience
  vlib_main_t *vlib_main;

  // node index of ip4-drop node
  u32 ip4_drop_idx;
  u32 ip4_full_reass_expire_node_idx;

  /** Worker handoff */
  u32 fq_index;
  u32 fq_feature_index;

} ip4_full_reass_main_t;
189
190extern ip4_full_reass_main_t ip4_full_reass_main;
191
192#ifndef CLIB_MARCH_VARIANT
193ip4_full_reass_main_t ip4_full_reass_main;
194#endif /* CLIB_MARCH_VARIANT */
195
/** Next-node dispositions of the reassembly nodes. */
typedef enum
{
  // fully reassembled packet continues down the ip4 input path
  IP4_FULL_REASS_NEXT_INPUT,
  IP4_FULL_REASS_NEXT_DROP,
  // fragment belongs to a context owned by another thread
  IP4_FULL_REASS_NEXT_HANDOFF,
  IP4_FULL_REASS_N_NEXT,
} ip4_full_reass_next_t;
203
/** Operations recorded in packet traces. */
typedef enum
{
  RANGE_NEW,
  RANGE_SHRINK,
  RANGE_DISCARD,
  RANGE_OVERLAP,
  FINALIZE,
  HANDOFF,
} ip4_full_reass_trace_operation_e;
213
/** Snapshot of one fragment range, captured for tracing. */
typedef struct
{
  u16 range_first;
  u16 range_last;
  u32 range_bi;
  i32 data_offset;
  u32 data_len;
  u32 first_bi;
} ip4_full_reass_range_trace_t;
223
/** One trace record emitted by ip4_full_reass_add_trace(). */
typedef struct
{
  ip4_full_reass_trace_operation_e action;
  // ~0 when no reassembly context was involved
  u32 reass_id;
  ip4_full_reass_range_trace_t trace_range;
  // octets trimmed on a RANGE_SHRINK
  u32 size_diff;
  u32 op_id;
  u32 thread_id;
  // destination thread on HANDOFF, ~0 otherwise
  u32 thread_id_to;
  u32 fragment_first;
  u32 fragment_last;
  u32 total_data_len;
} ip4_full_reass_trace_t;
237
238extern vlib_node_registration_t ip4_full_reass_node;
239extern vlib_node_registration_t ip4_full_reass_node_feature;
240
241static void
242ip4_full_reass_trace_details (vlib_main_t * vm, u32 bi,
243			      ip4_full_reass_range_trace_t * trace)
244{
245  vlib_buffer_t *b = vlib_get_buffer (vm, bi);
246  vnet_buffer_opaque_t *vnb = vnet_buffer (b);
247  trace->range_first = vnb->ip.reass.range_first;
248  trace->range_last = vnb->ip.reass.range_last;
249  trace->data_offset = ip4_full_reass_buffer_get_data_offset (b);
250  trace->data_len = ip4_full_reass_buffer_get_data_len (b);
251  trace->range_bi = bi;
252}
253
254static u8 *
255format_ip4_full_reass_range_trace (u8 * s, va_list * args)
256{
257  ip4_full_reass_range_trace_t *trace =
258    va_arg (*args, ip4_full_reass_range_trace_t *);
259  s =
260    format (s, "range: [%u, %u], off %d, len %u, bi %u", trace->range_first,
261	    trace->range_last, trace->data_offset, trace->data_len,
262	    trace->range_bi);
263  return s;
264}
265
/**
 * @brief format() callback printing a full reassembly trace record.
 *
 * When a reassembly context was involved (reass_id != ~0) a summary line
 * is printed first, then the action-specific details.
 */
static u8 *
format_ip4_full_reass_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  ip4_full_reass_trace_t *t = va_arg (*args, ip4_full_reass_trace_t *);
  u32 indent = 0;
  if (~0 != t->reass_id)
    {
      s = format (s, "reass id: %u, op id: %u, ", t->reass_id, t->op_id);
      // align follow-up lines under the summary
      indent = format_get_indent (s);
      s =
	format (s,
		"first bi: %u, data len: %u, ip/fragment[%u, %u]",
		t->trace_range.first_bi, t->total_data_len, t->fragment_first,
		t->fragment_last);
    }
  switch (t->action)
    {
    case RANGE_SHRINK:
      s = format (s, "\n%Ushrink %U by %u", format_white_space, indent,
		  format_ip4_full_reass_range_trace, &t->trace_range,
		  t->size_diff);
      break;
    case RANGE_DISCARD:
      s = format (s, "\n%Udiscard %U", format_white_space, indent,
		  format_ip4_full_reass_range_trace, &t->trace_range);
      break;
    case RANGE_NEW:
      s = format (s, "\n%Unew %U", format_white_space, indent,
		  format_ip4_full_reass_range_trace, &t->trace_range);
      break;
    case RANGE_OVERLAP:
      s = format (s, "\n%Uoverlapping/ignored %U", format_white_space, indent,
		  format_ip4_full_reass_range_trace, &t->trace_range);
      break;
    case FINALIZE:
      s = format (s, "\n%Ufinalize reassembly", format_white_space, indent);
      break;
    case HANDOFF:
      s =
	format (s, "handoff from thread #%u to thread #%u", t->thread_id,
		t->thread_id_to);
      break;
    }
  return s;
}
313
/**
 * @brief Attach a reassembly trace record to buffer bi.
 *
 * @param reass          context involved, or NULL (reass_id set to ~0)
 * @param action         what happened to this fragment/range
 * @param size_diff      octets trimmed (meaningful for RANGE_SHRINK)
 * @param thread_id_to   handoff destination (meaningful for HANDOFF)
 */
static void
ip4_full_reass_add_trace (vlib_main_t * vm, vlib_node_runtime_t * node,
			  ip4_full_reass_main_t * rm,
			  ip4_full_reass_t * reass, u32 bi,
			  ip4_full_reass_trace_operation_e action,
			  u32 size_diff, u32 thread_id_to)
{
  vlib_buffer_t *b = vlib_get_buffer (vm, bi);
  vnet_buffer_opaque_t *vnb = vnet_buffer (b);
  ip4_full_reass_trace_t *t = vlib_add_trace (vm, node, b, sizeof (t[0]));
  if (reass)
    {
      t->reass_id = reass->id;
      t->op_id = reass->trace_op_counter;
      t->trace_range.first_bi = reass->first_bi;
      t->total_data_len = reass->data_len;
      // per-context operation sequence number
      ++reass->trace_op_counter;
    }
  else
    {
      t->reass_id = ~0;
      t->op_id = 0;
      t->trace_range.first_bi = 0;
      t->total_data_len = 0;
    }
  t->action = action;
  ip4_full_reass_trace_details (vm, bi, &t->trace_range);
  t->size_diff = size_diff;
  t->thread_id = vm->thread_index;
  t->thread_id_to = thread_id_to;
  t->fragment_first = vnb->ip.reass.fragment_first;
  t->fragment_last = vnb->ip.reass.fragment_last;
#if 0
  static u8 *s = NULL;
  s = format (s, "%U", format_ip4_full_reass_trace, NULL, NULL, t);
  printf ("%.*s\n", vec_len (s), s);
  fflush (stdout);
  vec_reset_length (s);
#endif
}
354
355always_inline void
356ip4_full_reass_free_ctx (ip4_full_reass_per_thread_t * rt,
357			 ip4_full_reass_t * reass)
358{
359  pool_put (rt->pool, reass);
360  --rt->reass_n;
361}
362
363always_inline void
364ip4_full_reass_free (ip4_full_reass_main_t * rm,
365		     ip4_full_reass_per_thread_t * rt,
366		     ip4_full_reass_t * reass)
367{
368  clib_bihash_kv_16_8_t kv;
369  kv.key[0] = reass->key.as_u64[0];
370  kv.key[1] = reass->key.as_u64[1];
371  clib_bihash_add_del_16_8 (&rm->hash, &kv, 0);
372  return ip4_full_reass_free_ctx (rt, reass);
373}
374
/**
 * @brief Drop every buffer belonging to a reassembly.
 *
 * Walks the range chain and, within each range, the buffer chain, unlinking
 * buffers as it goes.  Collected buffers are either forwarded to the custom
 * app's error_next_index node or freed outright.
 */
always_inline void
ip4_full_reass_drop_all (vlib_main_t * vm, vlib_node_runtime_t * node,
			 ip4_full_reass_main_t * rm, ip4_full_reass_t * reass)
{
  u32 range_bi = reass->first_bi;
  vlib_buffer_t *range_b;
  vnet_buffer_opaque_t *range_vnb;
  u32 *to_free = NULL;
  while (~0 != range_bi)
    {
      range_b = vlib_get_buffer (vm, range_bi);
      range_vnb = vnet_buffer (range_b);
      u32 bi = range_bi;
      // collect the whole buffer chain of this range, breaking the links
      while (~0 != bi)
	{
	  vec_add1 (to_free, bi);
	  vlib_buffer_t *b = vlib_get_buffer (vm, bi);
	  if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
	    {
	      bi = b->next_buffer;
	      b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
	    }
	  else
	    {
	      bi = ~0;
	    }
	}
      range_bi = range_vnb->ip.reass.next_range_bi;
    }
  /* send to next_error_index */
  if (~0 != reass->error_next_index)
    {
      u32 n_left_to_next, *to_next, next_index;

      next_index = reass->error_next_index;
      u32 bi = ~0;

      // enqueue collected buffers frame by frame
      while (vec_len (to_free) > 0)
	{
	  vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

	  while (vec_len (to_free) > 0 && n_left_to_next > 0)
	    {
	      bi = vec_pop (to_free);

	      if (~0 != bi)
		{
		  to_next[0] = bi;
		  to_next += 1;
		  n_left_to_next -= 1;
		}
	    }
	  vlib_put_next_frame (vm, node, next_index, n_left_to_next);
	}
    }
  else
    {
      vlib_buffer_free (vm, to_free, vec_len (to_free));
    }
}
435
436always_inline void
437ip4_full_reass_init (ip4_full_reass_t * reass)
438{
439  reass->first_bi = ~0;
440  reass->last_packet_octet = ~0;
441  reass->data_len = 0;
442  reass->next_index = ~0;
443  reass->error_next_index = ~0;
444}
445
/**
 * @brief Look up the reassembly for kv, creating one if none exists.
 *
 * @param[in,out] kv          lookup key; on create, its value part is filled
 *                            with the new context's index and owner thread
 * @param[out]    do_handoff  set to 1 when the context is owned by another
 *                            thread (caller must hand the fragment off)
 * @return context pointer, or NULL when expired/limit reached/insert failed
 */
always_inline ip4_full_reass_t *
ip4_full_reass_find_or_create (vlib_main_t * vm, vlib_node_runtime_t * node,
			       ip4_full_reass_main_t * rm,
			       ip4_full_reass_per_thread_t * rt,
			       ip4_full_reass_kv_t * kv, u8 * do_handoff)
{
  ip4_full_reass_t *reass;
  f64 now;

again:

  reass = NULL;
  now = vlib_time_now (vm);
  if (!clib_bihash_search_16_8
      (&rm->hash, (clib_bihash_kv_16_8_t *) kv, (clib_bihash_kv_16_8_t *) kv))
    {
      // entry found - fetch the context from its owner's pool
      reass =
	pool_elt_at_index (rm->per_thread_data
			   [kv->v.memory_owner_thread_index].pool,
			   kv->v.reass_index);
      if (vm->thread_index != reass->memory_owner_thread_index)
	{
	  *do_handoff = 1;
	  return reass;
	}

      // stale context - drop its fragments and start over
      if (now > reass->last_heard + rm->timeout)
	{
	  ip4_full_reass_drop_all (vm, node, rm, reass);
	  ip4_full_reass_free (rm, rt, reass);
	  reass = NULL;
	}
    }

  if (reass)
    {
      reass->last_heard = now;
      return reass;
    }

  // refuse to create beyond the configured limit
  if (rt->reass_n >= rm->max_reass_n)
    {
      reass = NULL;
      return reass;
    }
  else
    {
      pool_get (rt->pool, reass);
      clib_memset (reass, 0, sizeof (*reass));
      reass->id = ((u64) vm->thread_index * 1000000000) + rt->id_counter;
      reass->memory_owner_thread_index = vm->thread_index;
      ++rt->id_counter;
      ip4_full_reass_init (reass);
      ++rt->reass_n;
    }

  reass->key.as_u64[0] = ((clib_bihash_kv_16_8_t *) kv)->key[0];
  reass->key.as_u64[1] = ((clib_bihash_kv_16_8_t *) kv)->key[1];
  kv->v.reass_index = (reass - rt->pool);
  kv->v.memory_owner_thread_index = vm->thread_index;
  reass->last_heard = now;

  // is_add = 2: add but fail if the entry already exists
  int rv =
    clib_bihash_add_del_16_8 (&rm->hash, (clib_bihash_kv_16_8_t *) kv, 2);
  if (rv)
    {
      ip4_full_reass_free_ctx (rt, reass);
      reass = NULL;
      // if other worker created a context already work with the other copy
      if (-2 == rv)
	goto again;
    }

  return reass;
}
521
/**
 * @brief Glue the collected fragments into one packet and finish up.
 *
 * Walks the range chain, trimming each sub-chain to the octets its range
 * actually contributes, links the survivors into a single buffer chain,
 * rewrites the ip header (length, cleared fragment fields, checksum),
 * linearizes the chain, and frees the reassembly context.
 *
 * @param[out] bi0    buffer index of the finished packet
 * @param[out] next0  next node (NEXT_INPUT, or reass->next_index for apps)
 * @param[out] error0 set to IP4_ERROR_NONE on success
 * @return IP4_REASS_RC_OK, _NO_BUF on linearize failure, or
 *         _INTERNAL_ERROR on invariant violation
 */
always_inline ip4_full_reass_rc_t
ip4_full_reass_finalize (vlib_main_t * vm, vlib_node_runtime_t * node,
			 ip4_full_reass_main_t * rm,
			 ip4_full_reass_per_thread_t * rt,
			 ip4_full_reass_t * reass, u32 * bi0,
			 u32 * next0, u32 * error0, bool is_custom_app)
{
  vlib_buffer_t *first_b = vlib_get_buffer (vm, reass->first_bi);
  vlib_buffer_t *last_b = NULL;
  u32 sub_chain_bi = reass->first_bi;
  u32 total_length = 0;
  u32 buf_cnt = 0;
  do
    {
      u32 tmp_bi = sub_chain_bi;
      vlib_buffer_t *tmp = vlib_get_buffer (vm, tmp_bi);
      ip4_header_t *ip = vlib_buffer_get_current (tmp);
      vnet_buffer_opaque_t *vnb = vnet_buffer (tmp);
      if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
	  !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
	{
	  return IP4_REASS_RC_INTERNAL_ERROR;
	}

      u32 data_len = ip4_full_reass_buffer_get_data_len (tmp);
      // octets to drop from the front (header + overlap-shrunk prefix) ...
      u32 trim_front =
	ip4_header_bytes (ip) + ip4_full_reass_buffer_get_data_offset (tmp);
      // ... and from the back of this sub-chain
      u32 trim_end =
	vlib_buffer_length_in_chain (vm, tmp) - trim_front - data_len;
      if (tmp_bi == reass->first_bi)
	{
	  /* first buffer - keep ip4 header */
	  if (0 != ip4_full_reass_buffer_get_data_offset (tmp))
	    {
	      return IP4_REASS_RC_INTERNAL_ERROR;
	    }
	  trim_front = 0;
	  trim_end = vlib_buffer_length_in_chain (vm, tmp) - data_len -
	    ip4_header_bytes (ip);
	  if (!(vlib_buffer_length_in_chain (vm, tmp) - trim_end > 0))
	    {
	      return IP4_REASS_RC_INTERNAL_ERROR;
	    }
	}
      u32 keep_data =
	vlib_buffer_length_in_chain (vm, tmp) - trim_front - trim_end;
      while (1)
	{
	  ++buf_cnt;
	  if (trim_front)
	    {
	      if (trim_front > tmp->current_length)
		{
		  /* drop whole buffer */
		  u32 to_be_freed_bi = tmp_bi;
		  trim_front -= tmp->current_length;
		  if (!(tmp->flags & VLIB_BUFFER_NEXT_PRESENT))
		    {
		      return IP4_REASS_RC_INTERNAL_ERROR;
		    }
		  tmp->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
		  tmp_bi = tmp->next_buffer;
		  tmp->next_buffer = 0;
		  tmp = vlib_get_buffer (vm, tmp_bi);
		  vlib_buffer_free_one (vm, to_be_freed_bi);
		  continue;
		}
	      else
		{
		  vlib_buffer_advance (tmp, trim_front);
		  trim_front = 0;
		}
	    }
	  if (keep_data)
	    {
	      // link this buffer onto the tail of the merged chain
	      if (last_b)
		{
		  last_b->flags |= VLIB_BUFFER_NEXT_PRESENT;
		  last_b->next_buffer = tmp_bi;
		}
	      last_b = tmp;
	      if (keep_data <= tmp->current_length)
		{
		  // last kept buffer of this sub-chain - clip it
		  tmp->current_length = keep_data;
		  keep_data = 0;
		}
	      else
		{
		  keep_data -= tmp->current_length;
		  if (!(tmp->flags & VLIB_BUFFER_NEXT_PRESENT))
		    {
		      return IP4_REASS_RC_INTERNAL_ERROR;
		    }
		}
	      total_length += tmp->current_length;
	      if (tmp->flags & VLIB_BUFFER_NEXT_PRESENT)
		{
		  tmp_bi = tmp->next_buffer;
		  tmp = vlib_get_buffer (vm, tmp->next_buffer);
		}
	      else
		{
		  break;
		}
	    }
	  else
	    {
	      // nothing left to keep - free the trailing buffers
	      u32 to_be_freed_bi = tmp_bi;
	      if (reass->first_bi == tmp_bi)
		{
		  return IP4_REASS_RC_INTERNAL_ERROR;
		}
	      if (tmp->flags & VLIB_BUFFER_NEXT_PRESENT)
		{
		  tmp->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
		  tmp_bi = tmp->next_buffer;
		  tmp->next_buffer = 0;
		  tmp = vlib_get_buffer (vm, tmp_bi);
		  vlib_buffer_free_one (vm, to_be_freed_bi);
		}
	      else
		{
		  tmp->next_buffer = 0;
		  vlib_buffer_free_one (vm, to_be_freed_bi);
		  break;
		}
	    }
	}
      sub_chain_bi =
	vnet_buffer (vlib_get_buffer (vm, sub_chain_bi))->ip.
	reass.next_range_bi;
    }
  while (~0 != sub_chain_bi);

  if (!last_b)
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  last_b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;

  if (total_length < first_b->current_length)
    {
      return IP4_REASS_RC_INTERNAL_ERROR;
    }
  total_length -= first_b->current_length;
  first_b->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID;
  first_b->total_length_not_including_first_buffer = total_length;
  // rewrite the header of the now-complete packet
  ip4_header_t *ip = vlib_buffer_get_current (first_b);
  ip->flags_and_fragment_offset = 0;
  ip->length = clib_host_to_net_u16 (first_b->current_length + total_length);
  ip->checksum = ip4_header_checksum (ip);
  if (!vlib_buffer_chain_linearize (vm, first_b))
    {
      return IP4_REASS_RC_NO_BUF;
    }
  // reset to reconstruct the mbuf linking
  first_b->flags &= ~VLIB_BUFFER_EXT_HDR_VALID;
  if (PREDICT_FALSE (first_b->flags & VLIB_BUFFER_IS_TRACED))
    {
      ip4_full_reass_add_trace (vm, node, rm, reass, reass->first_bi,
				FINALIZE, 0, ~0);
#if 0
      // following code does a hexdump of packet fragments to stdout ...
      do
	{
	  u32 bi = reass->first_bi;
	  u8 *s = NULL;
	  while (~0 != bi)
	    {
	      vlib_buffer_t *b = vlib_get_buffer (vm, bi);
	      s = format (s, "%u: %U\n", bi, format_hexdump,
			  vlib_buffer_get_current (b), b->current_length);
	      if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
		{
		  bi = b->next_buffer;
		}
	      else
		{
		  break;
		}
	    }
	  printf ("%.*s\n", vec_len (s), s);
	  fflush (stdout);
	  vec_free (s);
	}
      while (0);
#endif
    }
  *bi0 = reass->first_bi;
  if (!is_custom_app)
    {
      *next0 = IP4_FULL_REASS_NEXT_INPUT;
    }
  else
    {
      *next0 = reass->next_index;
    }
  vnet_buffer (first_b)->ip.reass.estimated_mtu = reass->min_fragment_length;
  *error0 = IP4_ERROR_NONE;
  ip4_full_reass_free (rm, rt, reass);
  reass = NULL;
  return IP4_REASS_RC_OK;
}
725
726always_inline ip4_full_reass_rc_t
727ip4_full_reass_insert_range_in_chain (vlib_main_t * vm,
728				      ip4_full_reass_main_t * rm,
729				      ip4_full_reass_per_thread_t * rt,
730				      ip4_full_reass_t * reass,
731				      u32 prev_range_bi, u32 new_next_bi)
732{
733  vlib_buffer_t *new_next_b = vlib_get_buffer (vm, new_next_bi);
734  vnet_buffer_opaque_t *new_next_vnb = vnet_buffer (new_next_b);
735  if (~0 != prev_range_bi)
736    {
737      vlib_buffer_t *prev_b = vlib_get_buffer (vm, prev_range_bi);
738      vnet_buffer_opaque_t *prev_vnb = vnet_buffer (prev_b);
739      new_next_vnb->ip.reass.next_range_bi = prev_vnb->ip.reass.next_range_bi;
740      prev_vnb->ip.reass.next_range_bi = new_next_bi;
741    }
742  else
743    {
744      if (~0 != reass->first_bi)
745	{
746	  new_next_vnb->ip.reass.next_range_bi = reass->first_bi;
747	}
748      reass->first_bi = new_next_bi;
749    }
750  vnet_buffer_opaque_t *vnb = vnet_buffer (new_next_b);
751  if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
752      !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
753    {
754      return IP4_REASS_RC_INTERNAL_ERROR;
755    }
756  reass->data_len += ip4_full_reass_buffer_get_data_len (new_next_b);
757  return IP4_REASS_RC_OK;
758}
759
760always_inline ip4_full_reass_rc_t
761ip4_full_reass_remove_range_from_chain (vlib_main_t * vm,
762					vlib_node_runtime_t * node,
763					ip4_full_reass_main_t * rm,
764					ip4_full_reass_t * reass,
765					u32 prev_range_bi, u32 discard_bi)
766{
767  vlib_buffer_t *discard_b = vlib_get_buffer (vm, discard_bi);
768  vnet_buffer_opaque_t *discard_vnb = vnet_buffer (discard_b);
769  if (~0 != prev_range_bi)
770    {
771      vlib_buffer_t *prev_b = vlib_get_buffer (vm, prev_range_bi);
772      vnet_buffer_opaque_t *prev_vnb = vnet_buffer (prev_b);
773      if (!(prev_vnb->ip.reass.next_range_bi == discard_bi))
774	{
775	  return IP4_REASS_RC_INTERNAL_ERROR;
776	}
777      prev_vnb->ip.reass.next_range_bi = discard_vnb->ip.reass.next_range_bi;
778    }
779  else
780    {
781      reass->first_bi = discard_vnb->ip.reass.next_range_bi;
782    }
783  vnet_buffer_opaque_t *vnb = vnet_buffer (discard_b);
784  if (!(vnb->ip.reass.range_first >= vnb->ip.reass.fragment_first) &&
785      !(vnb->ip.reass.range_last > vnb->ip.reass.fragment_first))
786    {
787      return IP4_REASS_RC_INTERNAL_ERROR;
788    }
789  reass->data_len -= ip4_full_reass_buffer_get_data_len (discard_b);
790  while (1)
791    {
792      u32 to_be_freed_bi = discard_bi;
793      if (PREDICT_FALSE (discard_b->flags & VLIB_BUFFER_IS_TRACED))
794	{
795	  ip4_full_reass_add_trace (vm, node, rm, reass, discard_bi,
796				    RANGE_DISCARD, 0, ~0);
797	}
798      if (discard_b->flags & VLIB_BUFFER_NEXT_PRESENT)
799	{
800	  discard_b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
801	  discard_bi = discard_b->next_buffer;
802	  discard_b->next_buffer = 0;
803	  discard_b = vlib_get_buffer (vm, discard_bi);
804	  vlib_buffer_free_one (vm, to_be_freed_bi);
805	}
806      else
807	{
808	  discard_b->next_buffer = 0;
809	  vlib_buffer_free_one (vm, to_be_freed_bi);
810	  break;
811	}
812    }
813  return IP4_REASS_RC_OK;
814}
815
/**
 * @brief Feed one fragment into a reassembly and resolve overlaps.
 *
 * Inserts the fragment's range into the sorted chain, shrinking or
 * discarding conflicting ranges as needed.  When the reassembly becomes
 * complete, finalizes it (possibly requesting a handoff to the sendout
 * thread).
 *
 * @param[in,out] bi0  fragment buffer index; set to ~0 when consumed,
 *                     or to the finished packet on completion
 * @param[out] next0/error0  disposition for non-consumed buffers
 * @param[out] handoff_thread_idx  sendout thread when RC_HANDOFF is returned
 */
always_inline ip4_full_reass_rc_t
ip4_full_reass_update (vlib_main_t * vm, vlib_node_runtime_t * node,
		       ip4_full_reass_main_t * rm,
		       ip4_full_reass_per_thread_t * rt,
		       ip4_full_reass_t * reass, u32 * bi0, u32 * next0,
		       u32 * error0, bool is_custom_app,
		       u32 * handoff_thread_idx)
{
  vlib_buffer_t *fb = vlib_get_buffer (vm, *bi0);
  vnet_buffer_opaque_t *fvnb = vnet_buffer (fb);
  if (is_custom_app)
    {
      // store (error_)next_index before it's overwritten
      reass->next_index = fvnb->ip.reass.next_index;
      reass->error_next_index = fvnb->ip.reass.error_next_index;
    }
  ip4_full_reass_rc_t rc = IP4_REASS_RC_OK;
  int consumed = 0;
  ip4_header_t *fip = vlib_buffer_get_current (fb);
  const u32 fragment_first = ip4_get_fragment_offset_bytes (fip);
  const u32 fragment_length =
    clib_net_to_host_u16 (fip->length) - ip4_header_bytes (fip);
  const u32 fragment_last = fragment_first + fragment_length - 1;
  fvnb->ip.reass.fragment_first = fragment_first;
  fvnb->ip.reass.fragment_last = fragment_last;
  int more_fragments = ip4_get_fragment_more (fip);
  u32 candidate_range_bi = reass->first_bi;
  u32 prev_range_bi = ~0;
  // the fragment initially claims its full span; may be shrunk below
  fvnb->ip.reass.range_first = fragment_first;
  fvnb->ip.reass.range_last = fragment_last;
  fvnb->ip.reass.next_range_bi = ~0;
  if (!more_fragments)
    {
      // last fragment tells us the total payload length
      reass->last_packet_octet = fragment_last;
    }
  if (~0 == reass->first_bi)
    {
      // starting a new reassembly
      rc =
	ip4_full_reass_insert_range_in_chain (vm, rm, rt, reass,
					      prev_range_bi, *bi0);
      if (IP4_REASS_RC_OK != rc)
	{
	  return rc;
	}
      if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
	{
	  ip4_full_reass_add_trace (vm, node, rm, reass, *bi0, RANGE_NEW, 0,
				    ~0);
	}
      *bi0 = ~0;
      reass->min_fragment_length = clib_net_to_host_u16 (fip->length);
      reass->fragments_n = 1;
      return IP4_REASS_RC_OK;
    }
  reass->min_fragment_length =
    clib_min (clib_net_to_host_u16 (fip->length),
	      fvnb->ip.reass.estimated_mtu);
  // walk the sorted range chain and slot the fragment in
  while (~0 != candidate_range_bi)
    {
      vlib_buffer_t *candidate_b = vlib_get_buffer (vm, candidate_range_bi);
      vnet_buffer_opaque_t *candidate_vnb = vnet_buffer (candidate_b);
      if (fragment_first > candidate_vnb->ip.reass.range_last)
	{
	  // this fragments starts after candidate range
	  prev_range_bi = candidate_range_bi;
	  candidate_range_bi = candidate_vnb->ip.reass.next_range_bi;
	  if (candidate_vnb->ip.reass.range_last < fragment_last &&
	      ~0 == candidate_range_bi)
	    {
	      // special case - this fragment falls beyond all known ranges
	      rc =
		ip4_full_reass_insert_range_in_chain (vm, rm, rt, reass,
						      prev_range_bi, *bi0);
	      if (IP4_REASS_RC_OK != rc)
		{
		  return rc;
		}
	      consumed = 1;
	      break;
	    }
	  continue;
	}
      if (fragment_last < candidate_vnb->ip.reass.range_first)
	{
	  // this fragment ends before candidate range without any overlap
	  rc =
	    ip4_full_reass_insert_range_in_chain (vm, rm, rt, reass,
						  prev_range_bi, *bi0);
	  if (IP4_REASS_RC_OK != rc)
	    {
	      return rc;
	    }
	  consumed = 1;
	}
      else
	{
	  if (fragment_first >= candidate_vnb->ip.reass.range_first &&
	      fragment_last <= candidate_vnb->ip.reass.range_last)
	    {
	      // this fragment is a (sub)part of existing range, ignore it
	      if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
		{
		  ip4_full_reass_add_trace (vm, node, rm, reass, *bi0,
					    RANGE_OVERLAP, 0, ~0);
		}
	      break;
	    }
	  int discard_candidate = 0;
	  if (fragment_first < candidate_vnb->ip.reass.range_first)
	    {
	      // partial overlap at the candidate's front
	      u32 overlap =
		fragment_last - candidate_vnb->ip.reass.range_first + 1;
	      if (overlap < ip4_full_reass_buffer_get_data_len (candidate_b))
		{
		  // shrink the candidate from the front, keep the fragment
		  candidate_vnb->ip.reass.range_first += overlap;
		  if (reass->data_len < overlap)
		    {
		      return IP4_REASS_RC_INTERNAL_ERROR;
		    }
		  reass->data_len -= overlap;
		  if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
		    {
		      ip4_full_reass_add_trace (vm, node, rm, reass,
						candidate_range_bi,
						RANGE_SHRINK, 0, ~0);
		    }
		  rc =
		    ip4_full_reass_insert_range_in_chain (vm, rm, rt, reass,
							  prev_range_bi,
							  *bi0);
		  if (IP4_REASS_RC_OK != rc)
		    {
		      return rc;
		    }
		  consumed = 1;
		}
	      else
		{
		  discard_candidate = 1;
		}
	    }
	  else if (fragment_last > candidate_vnb->ip.reass.range_last)
	    {
	      // partial overlap at the candidate's back - shrink this
	      // fragment's range instead and keep probing
	      u32 overlap =
		candidate_vnb->ip.reass.range_last - fragment_first + 1;
	      if (overlap < ip4_full_reass_buffer_get_data_len (candidate_b))
		{
		  fvnb->ip.reass.range_first += overlap;
		  if (~0 != candidate_vnb->ip.reass.next_range_bi)
		    {
		      prev_range_bi = candidate_range_bi;
		      candidate_range_bi =
			candidate_vnb->ip.reass.next_range_bi;
		      continue;
		    }
		  else
		    {
		      // special case - last range discarded
		      rc =
			ip4_full_reass_insert_range_in_chain (vm, rm, rt,
							      reass,
							      candidate_range_bi,
							      *bi0);
		      if (IP4_REASS_RC_OK != rc)
			{
			  return rc;
			}
		      consumed = 1;
		    }
		}
	      else
		{
		  discard_candidate = 1;
		}
	    }
	  else
	    {
	      discard_candidate = 1;
	    }
	  if (discard_candidate)
	    {
	      u32 next_range_bi = candidate_vnb->ip.reass.next_range_bi;
	      // discard candidate range, probe next range
	      rc =
		ip4_full_reass_remove_range_from_chain (vm, node, rm, reass,
							prev_range_bi,
							candidate_range_bi);
	      if (IP4_REASS_RC_OK != rc)
		{
		  return rc;
		}
	      if (~0 != next_range_bi)
		{
		  candidate_range_bi = next_range_bi;
		  continue;
		}
	      else
		{
		  // special case - last range discarded
		  rc =
		    ip4_full_reass_insert_range_in_chain (vm, rm, rt, reass,
							  prev_range_bi,
							  *bi0);
		  if (IP4_REASS_RC_OK != rc)
		    {
		      return rc;
		    }
		  consumed = 1;
		}
	    }
	}
      break;
    }
  ++reass->fragments_n;
  if (consumed)
    {
      if (PREDICT_FALSE (fb->flags & VLIB_BUFFER_IS_TRACED))
	{
	  ip4_full_reass_add_trace (vm, node, rm, reass, *bi0, RANGE_NEW, 0,
				    ~0);
	}
    }
  // complete when the collected length covers up to the final octet
  if (~0 != reass->last_packet_octet &&
      reass->data_len == reass->last_packet_octet + 1)
    {
      *handoff_thread_idx = reass->sendout_thread_index;
      int handoff =
	reass->memory_owner_thread_index != reass->sendout_thread_index;
      rc =
	ip4_full_reass_finalize (vm, node, rm, rt, reass, bi0, next0, error0,
				 is_custom_app);
      if (IP4_REASS_RC_OK == rc && handoff)
	{
	  rc = IP4_REASS_RC_HANDOFF;
	}
    }
  else
    {
      if (consumed)
	{
	  *bi0 = ~0;
	  if (reass->fragments_n > rm->max_reass_len)
	    {
	      rc = IP4_REASS_RC_TOO_MANY_FRAGMENTS;
	    }
	}
      else
	{
	  *next0 = IP4_FULL_REASS_NEXT_DROP;
	  *error0 = IP4_ERROR_REASS_DUPLICATE_FRAGMENT;
	}
    }
  return rc;
}
1071
/**
 * @brief Shared worker for all ip4 full-reassembly graph nodes.
 *
 * Walks the input frame under the per-thread spinlock.  Non-fragmented
 * packets bypass reassembly entirely; fragments are validated, keyed by
 * (fib-index, src, dst, frag-id, proto) and fed into the reassembly state
 * machine, which may consume the buffer, hand it off to another thread, or
 * release a fully reassembled packet.
 *
 * @param vm            vlib main
 * @param node          this node's runtime
 * @param frame         frame of buffer indices to process
 * @param is_feature    true when running as an ip4-unicast feature (next is
 *                      chosen via vnet_feature_next on success)
 * @param is_custom_app true when a custom app supplies next_index in the
 *                      buffer opaque
 * @return number of vectors processed (frame->n_vectors)
 */
always_inline uword
ip4_full_reass_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
		       vlib_frame_t * frame, bool is_feature,
		       bool is_custom_app)
{
  u32 *from = vlib_frame_vector_args (frame);
  u32 n_left_from, n_left_to_next, *to_next, next_index;
  ip4_full_reass_main_t *rm = &ip4_full_reass_main;
  ip4_full_reass_per_thread_t *rt = &rm->per_thread_data[vm->thread_index];
  /* all per-thread reassembly state is guarded by this lock */
  clib_spinlock_lock (&rt->lock);

  n_left_from = frame->n_vectors;
  next_index = node->cached_next_index;
  while (n_left_from > 0)
    {
      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left_from > 0 && n_left_to_next > 0)
	{
	  u32 bi0;
	  vlib_buffer_t *b0;
	  u32 next0;
	  u32 error0 = IP4_ERROR_NONE;

	  bi0 = from[0];
	  b0 = vlib_get_buffer (vm, bi0);

	  ip4_header_t *ip0 = vlib_buffer_get_current (b0);
	  if (!ip4_get_fragment_more (ip0) && !ip4_get_fragment_offset (ip0))
	    {
	      // this is a whole packet - no fragmentation
	      if (!is_custom_app)
		{
		  next0 = IP4_FULL_REASS_NEXT_INPUT;
		}
	      else
		{
		  next0 = vnet_buffer (b0)->ip.reass.next_index;
		}
	      goto packet_enqueue;
	    }
	  /* sanity-check the fragment geometry before touching any state */
	  const u32 fragment_first = ip4_get_fragment_offset_bytes (ip0);
	  const u32 fragment_length =
	    clib_net_to_host_u16 (ip0->length) - ip4_header_bytes (ip0);
	  const u32 fragment_last = fragment_first + fragment_length - 1;
	  if (fragment_first > fragment_last || fragment_first + fragment_length > UINT16_MAX - 20 || (fragment_length < 8 && ip4_get_fragment_more (ip0)))	// 8 is minimum frag length per RFC 791
	    {
	      next0 = IP4_FULL_REASS_NEXT_DROP;
	      error0 = IP4_ERROR_REASS_MALFORMED_PACKET;
	      goto packet_enqueue;
	    }
	  ip4_full_reass_kv_t kv;
	  u8 do_handoff = 0;

	  /* reassembly key: fib index + src in word 0, dst + frag id +
	   * protocol in word 1 */
	  kv.k.as_u64[0] =
	    (u64) vec_elt (ip4_main.fib_index_by_sw_if_index,
			   vnet_buffer (b0)->sw_if_index[VLIB_RX]) |
	    (u64) ip0->src_address.as_u32 << 32;
	  kv.k.as_u64[1] =
	    (u64) ip0->dst_address.
	    as_u32 | (u64) ip0->fragment_id << 32 | (u64) ip0->protocol << 48;

	  ip4_full_reass_t *reass =
	    ip4_full_reass_find_or_create (vm, node, rm, rt, &kv,
					   &do_handoff);

	  if (reass)
	    {
	      /* first fragment decides which thread sends the result out */
	      const u32 fragment_first = ip4_get_fragment_offset_bytes (ip0);
	      if (0 == fragment_first)
		{
		  reass->sendout_thread_index = vm->thread_index;
		}
	    }

	  if (PREDICT_FALSE (do_handoff))
	    {
	      /* reassembly state lives on another thread - hand the buffer
	       * over to its owner */
	      next0 = IP4_FULL_REASS_NEXT_HANDOFF;
	      vnet_buffer (b0)->ip.reass.owner_thread_index =
		kv.v.memory_owner_thread_index;
	    }
	  else if (reass)
	    {
	      u32 handoff_thread_idx;
	      switch (ip4_full_reass_update
		      (vm, node, rm, rt, reass, &bi0, &next0,
		       &error0, is_custom_app, &handoff_thread_idx))
		{
		case IP4_REASS_RC_OK:
		  /* nothing to do here */
		  break;
		case IP4_REASS_RC_HANDOFF:
		  /* packet finished on this thread but must be sent out by
		   * another - bi0 may have changed to the reassembled chain */
		  next0 = IP4_FULL_REASS_NEXT_HANDOFF;
		  b0 = vlib_get_buffer (vm, bi0);
		  vnet_buffer (b0)->ip.reass.owner_thread_index =
		    handoff_thread_idx;
		  break;
		case IP4_REASS_RC_TOO_MANY_FRAGMENTS:
		  vlib_node_increment_counter (vm, node->node_index,
					       IP4_ERROR_REASS_FRAGMENT_CHAIN_TOO_LONG,
					       1);
		  ip4_full_reass_drop_all (vm, node, rm, reass);
		  ip4_full_reass_free (rm, rt, reass);
		  goto next_packet;
		  break;
		case IP4_REASS_RC_NO_BUF:
		  vlib_node_increment_counter (vm, node->node_index,
					       IP4_ERROR_REASS_NO_BUF, 1);
		  ip4_full_reass_drop_all (vm, node, rm, reass);
		  ip4_full_reass_free (rm, rt, reass);
		  goto next_packet;
		  break;
		case IP4_REASS_RC_INTERNAL_ERROR:
		  /* drop everything and start with a clean slate */
		  vlib_node_increment_counter (vm, node->node_index,
					       IP4_ERROR_REASS_INTERNAL_ERROR,
					       1);
		  ip4_full_reass_drop_all (vm, node, rm, reass);
		  ip4_full_reass_free (rm, rt, reass);
		  goto next_packet;
		  break;
		}
	    }
	  else
	    {
	      /* no reassembly context could be allocated */
	      next0 = IP4_FULL_REASS_NEXT_DROP;
	      error0 = IP4_ERROR_REASS_LIMIT_REACHED;
	    }


	packet_enqueue:
	  b0->error = node->errors[error0];

	  /* bi0 == ~0 means the fragment was consumed into a reassembly
	   * and there is nothing to enqueue yet */
	  if (bi0 != ~0)
	    {
	      to_next[0] = bi0;
	      to_next += 1;
	      n_left_to_next -= 1;
	      if (next0 == IP4_FULL_REASS_NEXT_HANDOFF)
		{
		  if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
		    {
		      ip4_full_reass_add_trace (vm, node, rm, NULL, bi0,
						HANDOFF, 0,
						vnet_buffer (b0)->ip.
						reass.owner_thread_index);
		    }
		}
	      else if (is_feature && IP4_ERROR_NONE == error0)
		{
		  /* in feature mode, continue along the feature arc */
		  b0 = vlib_get_buffer (vm, bi0);
		  vnet_feature_next (&next0, b0);
		}
	      vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
					       to_next, n_left_to_next,
					       bi0, next0);
	      IP4_REASS_DEBUG_BUFFER (bi0, enqueue_next);
	    }

	next_packet:
	  from += 1;
	  n_left_from -= 1;
	}

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  clib_spinlock_unlock (&rt->lock);
  return frame->n_vectors;
}
1242
/* Error strings for the reassembly nodes, expanded from the global IPv4
 * error list so that indices line up with the ip4 error enum used above. */
static char *ip4_full_reass_error_strings[] = {
#define _(sym, string) string,
  foreach_ip4_error
#undef _
};
1248
/**
 * @brief Node function for "ip4-full-reassembly" (non-feature path).
 *
 * Thin wrapper dispatching to the shared inline with feature handling and
 * custom-app handling both disabled.
 */
VLIB_NODE_FN (ip4_full_reass_node) (vlib_main_t * vm,
				    vlib_node_runtime_t * node,
				    vlib_frame_t * frame)
{
  return ip4_full_reass_inline (vm, node, frame, false /* is_feature */ ,
				false /* is_custom_app */ );
}
1256
/* *INDENT-OFF* */
/* Registration for the standalone reassembly node: reassembled packets go
 * back to ip4-input, errors to ip4-drop, cross-thread work to handoff. */
VLIB_REGISTER_NODE (ip4_full_reass_node) = {
    .name = "ip4-full-reassembly",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_full_reass_trace,
    .n_errors = ARRAY_LEN (ip4_full_reass_error_strings),
    .error_strings = ip4_full_reass_error_strings,
    .n_next_nodes = IP4_FULL_REASS_N_NEXT,
    .next_nodes =
        {
                [IP4_FULL_REASS_NEXT_INPUT] = "ip4-input",
                [IP4_FULL_REASS_NEXT_DROP] = "ip4-drop",
                [IP4_FULL_REASS_NEXT_HANDOFF] = "ip4-full-reassembly-handoff",

        },
};
/* *INDENT-ON* */
1274
/**
 * @brief Node function for "ip4-full-reassembly-feature".
 *
 * Same as the standalone node but runs as an ip4-unicast feature, so on
 * success the packet continues along the feature arc.
 */
VLIB_NODE_FN (ip4_full_reass_node_feature) (vlib_main_t * vm,
					    vlib_node_runtime_t * node,
					    vlib_frame_t * frame)
{
  return ip4_full_reass_inline (vm, node, frame, true /* is_feature */ ,
				false /* is_custom_app */ );
}
1282
/* *INDENT-OFF* */
/* Registration for the feature-arc variant; uses its own handoff node so
 * handed-off buffers re-enter the feature path on the target thread. */
VLIB_REGISTER_NODE (ip4_full_reass_node_feature) = {
    .name = "ip4-full-reassembly-feature",
    .vector_size = sizeof (u32),
    .format_trace = format_ip4_full_reass_trace,
    .n_errors = ARRAY_LEN (ip4_full_reass_error_strings),
    .error_strings = ip4_full_reass_error_strings,
    .n_next_nodes = IP4_FULL_REASS_N_NEXT,
    .next_nodes =
        {
                [IP4_FULL_REASS_NEXT_INPUT] = "ip4-input",
                [IP4_FULL_REASS_NEXT_DROP] = "ip4-drop",
                [IP4_FULL_REASS_NEXT_HANDOFF] = "ip4-full-reass-feature-hoff",
        },
};
/* *INDENT-ON* */
1299
/* *INDENT-OFF* */
/* Hook the feature node into the ip4-unicast arc, ahead of lookup and
 * IPsec so those see only whole packets. */
VNET_FEATURE_INIT (ip4_full_reass_feature, static) = {
    .arc_name = "ip4-unicast",
    .node_name = "ip4-full-reassembly-feature",
    .runs_before = VNET_FEATURES ("ip4-lookup",
                                  "ipsec4-input-feature"),
    .runs_after = 0,
};
/* *INDENT-ON* */
1309
1310#ifndef CLIB_MARCH_VARIANT
1311always_inline u32
1312ip4_full_reass_get_nbuckets ()
1313{
1314  ip4_full_reass_main_t *rm = &ip4_full_reass_main;
1315  u32 nbuckets;
1316  u8 i;
1317
1318  nbuckets = (u32) (rm->max_reass_n / IP4_REASS_HT_LOAD_FACTOR);
1319
1320  for (i = 0; i < 31; i++)
1321    if ((1 << i) >= nbuckets)
1322      break;
1323  nbuckets = 1 << i;
1324
1325  return nbuckets;
1326}
1327#endif /* CLIB_MARCH_VARIANT */
1328
/* Process events understood by the expire-walk process node below. */
typedef enum
{
  IP4_EVENT_CONFIG_CHANGED = 1,
} ip4_full_reass_event_t;

/* Context passed to ip4_rehash_cb while copying entries into a resized
 * hash table; failure is set if any insertion fails. */
typedef struct
{
  int failure;
  clib_bihash_16_8_t *new_hash;
} ip4_rehash_cb_ctx;
1339
1340#ifndef CLIB_MARCH_VARIANT
1341static void
1342ip4_rehash_cb (clib_bihash_kv_16_8_t * kv, void *_ctx)
1343{
1344  ip4_rehash_cb_ctx *ctx = _ctx;
1345  if (clib_bihash_add_del_16_8 (ctx->new_hash, kv, 1))
1346    {
1347      ctx->failure = 1;
1348    }
1349}
1350
1351static void
1352ip4_full_reass_set_params (u32 timeout_ms, u32 max_reassemblies,
1353			   u32 max_reassembly_length,
1354			   u32 expire_walk_interval_ms)
1355{
1356  ip4_full_reass_main.timeout_ms = timeout_ms;
1357  ip4_full_reass_main.timeout = (f64) timeout_ms / (f64) MSEC_PER_SEC;
1358  ip4_full_reass_main.max_reass_n = max_reassemblies;
1359  ip4_full_reass_main.max_reass_len = max_reassembly_length;
1360  ip4_full_reass_main.expire_walk_interval_ms = expire_walk_interval_ms;
1361}
1362
/**
 * @brief API handler: apply a new reassembly configuration.
 *
 * Stores the new parameters, wakes the expire-walk process so it picks up
 * the new interval, and - if the required bucket count grew - rebuilds the
 * hash table by copying all entries into a larger one.  The table is only
 * ever grown, never shrunk.
 *
 * @return 0 on success, -1 if rehashing into the larger table failed
 */
vnet_api_error_t
ip4_full_reass_set (u32 timeout_ms, u32 max_reassemblies,
		    u32 max_reassembly_length, u32 expire_walk_interval_ms)
{
  u32 old_nbuckets = ip4_full_reass_get_nbuckets ();
  ip4_full_reass_set_params (timeout_ms, max_reassemblies,
			     max_reassembly_length, expire_walk_interval_ms);
  /* nudge the expire-walk process so the new interval takes effect now */
  vlib_process_signal_event (ip4_full_reass_main.vlib_main,
			     ip4_full_reass_main.ip4_full_reass_expire_node_idx,
			     IP4_EVENT_CONFIG_CHANGED, 0);
  u32 new_nbuckets = ip4_full_reass_get_nbuckets ();
  if (ip4_full_reass_main.max_reass_n > 0 && new_nbuckets > old_nbuckets)
    {
      clib_bihash_16_8_t new_hash;
      clib_memset (&new_hash, 0, sizeof (new_hash));
      ip4_rehash_cb_ctx ctx;
      ctx.failure = 0;
      ctx.new_hash = &new_hash;
      clib_bihash_init_16_8 (&new_hash, "ip4-dr", new_nbuckets,
			     new_nbuckets * 1024);
      /* copy every existing entry into the larger table */
      clib_bihash_foreach_key_value_pair_16_8 (&ip4_full_reass_main.hash,
					       ip4_rehash_cb, &ctx);
      if (ctx.failure)
	{
	  /* keep the old table; discard the partially filled new one */
	  clib_bihash_free_16_8 (&new_hash);
	  return -1;
	}
      else
	{
	  clib_bihash_free_16_8 (&ip4_full_reass_main.hash);
	  clib_memcpy_fast (&ip4_full_reass_main.hash, &new_hash,
			    sizeof (ip4_full_reass_main.hash));
	  clib_bihash_copied (&ip4_full_reass_main.hash, &new_hash);
	}
    }
  return 0;
}
1400
1401vnet_api_error_t
1402ip4_full_reass_get (u32 * timeout_ms, u32 * max_reassemblies,
1403		    u32 * max_reassembly_length,
1404		    u32 * expire_walk_interval_ms)
1405{
1406  *timeout_ms = ip4_full_reass_main.timeout_ms;
1407  *max_reassemblies = ip4_full_reass_main.max_reass_n;
1408  *max_reassembly_length = ip4_full_reass_main.max_reass_len;
1409  *expire_walk_interval_ms = ip4_full_reass_main.expire_walk_interval_ms;
1410  return 0;
1411}
1412
/**
 * @brief Plugin/feature initialization for ip4 full reassembly.
 *
 * Allocates per-thread state (lock + reassembly pool), applies default
 * parameters, sizes and initializes the bihash, caches node indices and
 * sets up the handoff frame queues.
 *
 * @return 0 on success (always, currently)
 */
static clib_error_t *
ip4_full_reass_init_function (vlib_main_t * vm)
{
  ip4_full_reass_main_t *rm = &ip4_full_reass_main;
  clib_error_t *error = 0;
  u32 nbuckets;
  vlib_node_t *node;

  rm->vlib_main = vm;

  /* one entry per worker plus the main thread */
  vec_validate (rm->per_thread_data, vlib_num_workers ());
  ip4_full_reass_per_thread_t *rt;
  vec_foreach (rt, rm->per_thread_data)
  {
    clib_spinlock_init (&rt->lock);
    pool_alloc (rt->pool, rm->max_reass_n);
  }

  node = vlib_get_node_by_name (vm, (u8 *) "ip4-full-reassembly-expire-walk");
  ASSERT (node);
  rm->ip4_full_reass_expire_node_idx = node->index;

  ip4_full_reass_set_params (IP4_REASS_TIMEOUT_DEFAULT_MS,
			     IP4_REASS_MAX_REASSEMBLIES_DEFAULT,
			     IP4_REASS_MAX_REASSEMBLY_LENGTH_DEFAULT,
			     IP4_REASS_EXPIRE_WALK_INTERVAL_DEFAULT_MS);

  nbuckets = ip4_full_reass_get_nbuckets ();
  clib_bihash_init_16_8 (&rm->hash, "ip4-dr", nbuckets, nbuckets * 1024);

  node = vlib_get_node_by_name (vm, (u8 *) "ip4-drop");
  ASSERT (node);
  rm->ip4_drop_idx = node->index;

  /* frame queues used by the handoff nodes to move buffers across threads */
  rm->fq_index = vlib_frame_queue_main_init (ip4_full_reass_node.index, 0);
  rm->fq_feature_index =
    vlib_frame_queue_main_init (ip4_full_reass_node_feature.index, 0);

  return error;
}

VLIB_INIT_FUNCTION (ip4_full_reass_init_function);
1455#endif /* CLIB_MARCH_VARIANT */
1456
/**
 * @brief Process node that periodically expires stale reassemblies.
 *
 * Sleeps for the configured walk interval (or until a config-changed event
 * arrives), then, for every thread, collects reassemblies whose last
 * activity is older than the timeout and drops/frees them under that
 * thread's lock.
 *
 * @return never returns under normal operation (infinite process loop)
 */
static uword
ip4_full_reass_walk_expired (vlib_main_t * vm,
			     vlib_node_runtime_t * node, vlib_frame_t * f)
{
  ip4_full_reass_main_t *rm = &ip4_full_reass_main;
  uword event_type, *event_data = 0;

  while (true)
    {
      vlib_process_wait_for_event_or_clock (vm,
					    (f64)
					    rm->expire_walk_interval_ms /
					    (f64) MSEC_PER_SEC);
      event_type = vlib_process_get_events (vm, &event_data);

      switch (event_type)
	{
	case ~0:		/* no events => timeout */
	  /* nothing to do here */
	  break;
	case IP4_EVENT_CONFIG_CHANGED:
	  /* wake-up only - new interval is used on the next wait above */
	  break;
	default:
	  clib_warning ("BUG: event type 0x%wx", event_type);
	  break;
	}
      f64 now = vlib_time_now (vm);

      ip4_full_reass_t *reass;
      int *pool_indexes_to_free = NULL;

      uword thread_index = 0;
      int index;
      const uword nthreads = vlib_num_workers () + 1;
      for (thread_index = 0; thread_index < nthreads; ++thread_index)
	{
	  ip4_full_reass_per_thread_t *rt =
	    &rm->per_thread_data[thread_index];
	  clib_spinlock_lock (&rt->lock);

	  /* collect first, free second - freeing while iterating the pool
	   * is not safe */
	  vec_reset_length (pool_indexes_to_free);
          /* *INDENT-OFF* */
          pool_foreach_index (index, rt->pool, ({
                                reass = pool_elt_at_index (rt->pool, index);
                                if (now > reass->last_heard + rm->timeout)
                                  {
                                    vec_add1 (pool_indexes_to_free, index);
                                  }
                              }));
          /* *INDENT-ON* */
	  int *i;
          /* *INDENT-OFF* */
          vec_foreach (i, pool_indexes_to_free)
          {
            ip4_full_reass_t *reass = pool_elt_at_index (rt->pool, i[0]);
            ip4_full_reass_drop_all (vm, node, rm, reass);
            ip4_full_reass_free (rm, rt, reass);
          }
          /* *INDENT-ON* */

	  clib_spinlock_unlock (&rt->lock);
	}

      vec_free (pool_indexes_to_free);
      if (event_data)
	{
	  /* reuse the event vector on the next iteration */
	  _vec_len (event_data) = 0;
	}
    }

  return 0;
}
1529
/* *INDENT-OFF* */
/* Registration of the expiration walk as a process node. */
VLIB_REGISTER_NODE (ip4_full_reass_expire_node) = {
    .function = ip4_full_reass_walk_expired,
    .type = VLIB_NODE_TYPE_PROCESS,
    .name = "ip4-full-reassembly-expire-walk",
    .format_trace = format_ip4_full_reass_trace,
    .n_errors = ARRAY_LEN (ip4_full_reass_error_strings),
    .error_strings = ip4_full_reass_error_strings,

};
/* *INDENT-ON* */
1541
1542static u8 *
1543format_ip4_full_reass_key (u8 * s, va_list * args)
1544{
1545  ip4_full_reass_key_t *key = va_arg (*args, ip4_full_reass_key_t *);
1546  s =
1547    format (s,
1548	    "xx_id: %u, src: %U, dst: %U, frag_id: %u, proto: %u",
1549	    key->xx_id, format_ip4_address, &key->src, format_ip4_address,
1550	    &key->dst, clib_net_to_host_u16 (key->frag_id), key->proto);
1551  return s;
1552}
1553
/**
 * @brief vppinfra format helper printing one reassembly context.
 *
 * Prints the reassembly header (id, key, totals) followed by one line per
 * buffer in the fragment chain, walking next_buffer links from first_bi.
 */
static u8 *
format_ip4_reass (u8 * s, va_list * args)
{
  vlib_main_t *vm = va_arg (*args, vlib_main_t *);
  ip4_full_reass_t *reass = va_arg (*args, ip4_full_reass_t *);

  s = format (s, "ID: %lu, key: %U\n  first_bi: %u, data_len: %u, "
	      "last_packet_octet: %u, trace_op_counter: %u\n",
	      reass->id, format_ip4_full_reass_key, &reass->key,
	      reass->first_bi, reass->data_len,
	      reass->last_packet_octet, reass->trace_op_counter);

  u32 bi = reass->first_bi;
  u32 counter = 0;
  /* walk the buffer chain; ~0 marks the end */
  while (~0 != bi)
    {
      vlib_buffer_t *b = vlib_get_buffer (vm, bi);
      vnet_buffer_opaque_t *vnb = vnet_buffer (b);
      s =
	format (s,
		"  #%03u: range: [%u, %u], bi: %u, off: %d, len: %u, "
		"fragment[%u, %u]\n", counter, vnb->ip.reass.range_first,
		vnb->ip.reass.range_last, bi,
		ip4_full_reass_buffer_get_data_offset (b),
		ip4_full_reass_buffer_get_data_len (b),
		vnb->ip.reass.fragment_first, vnb->ip.reass.fragment_last);
      if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
	{
	  bi = b->next_buffer;
	}
      else
	{
	  bi = ~0;
	}
    }
  return s;
}
1591
/**
 * @brief CLI handler for "show ip4-full-reassembly [details]".
 *
 * Prints the per-thread reassembly counts (summed across all threads) and,
 * with "details", every active reassembly context.  Each thread's pool is
 * inspected under that thread's lock.
 *
 * @return 0 (always succeeds)
 */
static clib_error_t *
show_ip4_reass (vlib_main_t * vm,
		unformat_input_t * input,
		CLIB_UNUSED (vlib_cli_command_t * lmd))
{
  ip4_full_reass_main_t *rm = &ip4_full_reass_main;

  vlib_cli_output (vm, "---------------------");
  vlib_cli_output (vm, "IP4 reassembly status");
  vlib_cli_output (vm, "---------------------");
  bool details = false;
  if (unformat (input, "details"))
    {
      details = true;
    }

  u32 sum_reass_n = 0;
  ip4_full_reass_t *reass;
  uword thread_index;
  const uword nthreads = vlib_num_workers () + 1;
  for (thread_index = 0; thread_index < nthreads; ++thread_index)
    {
      ip4_full_reass_per_thread_t *rt = &rm->per_thread_data[thread_index];
      clib_spinlock_lock (&rt->lock);
      if (details)
	{
          /* *INDENT-OFF* */
          pool_foreach (reass, rt->pool, {
            vlib_cli_output (vm, "%U", format_ip4_reass, vm, reass);
          });
          /* *INDENT-ON* */
	}
      sum_reass_n += rt->reass_n;
      clib_spinlock_unlock (&rt->lock);
    }
  vlib_cli_output (vm, "---------------------");
  vlib_cli_output (vm, "Current IP4 reassemblies count: %lu\n",
		   (long unsigned) sum_reass_n);
  vlib_cli_output (vm,
		   "Maximum configured concurrent IP4 reassemblies per worker-thread: %lu\n",
		   (long unsigned) rm->max_reass_n);
  return 0;
}
1635
/* *INDENT-OFF* */
/* CLI command registration for the status display above. */
VLIB_CLI_COMMAND (show_ip4_full_reass_cmd, static) = {
    .path = "show ip4-full-reassembly",
    .short_help = "show ip4-full-reassembly [details]",
    .function = show_ip4_reass,
};
/* *INDENT-ON* */
1643
1644#ifndef CLIB_MARCH_VARIANT
1645vnet_api_error_t
1646ip4_full_reass_enable_disable (u32 sw_if_index, u8 enable_disable)
1647{
1648  return vnet_feature_enable_disable ("ip4-unicast",
1649				      "ip4-full-reassembly-feature",
1650				      sw_if_index, enable_disable, 0, 0);
1651}
1652#endif /* CLIB_MARCH_VARIANT */
1653
1654
/* Error counters specific to the handoff nodes. */
#define foreach_ip4_full_reass_handoff_error                       \
_(CONGESTION_DROP, "congestion drop")


/* Enum of handoff error indices, generated from the list above. */
typedef enum
{
#define _(sym,str) IP4_FULL_REASS_HANDOFF_ERROR_##sym,
  foreach_ip4_full_reass_handoff_error
#undef _
    IP4_FULL_REASS_HANDOFF_N_ERROR,
} ip4_full_reass_handoff_error_t;

/* Matching error strings, kept index-aligned with the enum. */
static char *ip4_full_reass_handoff_error_strings[] = {
#define _(sym,string) string,
  foreach_ip4_full_reass_handoff_error
#undef _
};
1672
/* Trace record captured by the handoff nodes: the worker a buffer was
 * forwarded to. */
typedef struct
{
  u32 next_worker_index;
} ip4_full_reass_handoff_trace_t;
1677
1678static u8 *
1679format_ip4_full_reass_handoff_trace (u8 * s, va_list * args)
1680{
1681  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1682  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1683  ip4_full_reass_handoff_trace_t *t =
1684    va_arg (*args, ip4_full_reass_handoff_trace_t *);
1685
1686  s =
1687    format (s, "ip4-full-reassembly-handoff: next-worker %d",
1688	    t->next_worker_index);
1689
1690  return s;
1691}
1692
/**
 * @brief Shared worker for both handoff graph nodes.
 *
 * Reads the owner thread index stashed in each buffer's opaque, traces
 * buffers when requested, and enqueues the whole frame to the appropriate
 * frame queue (feature or non-feature).  Buffers that cannot be enqueued
 * due to congestion are counted as drops.
 *
 * @param is_feature selects the feature-path frame queue
 * @return number of vectors processed (frame->n_vectors)
 */
always_inline uword
ip4_full_reass_handoff_node_inline (vlib_main_t * vm,
				    vlib_node_runtime_t * node,
				    vlib_frame_t * frame, bool is_feature)
{
  ip4_full_reass_main_t *rm = &ip4_full_reass_main;

  vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
  u32 n_enq, n_left_from, *from;
  u16 thread_indices[VLIB_FRAME_SIZE], *ti;
  u32 fq_index;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;
  vlib_get_buffers (vm, from, bufs, n_left_from);

  b = bufs;
  ti = thread_indices;

  fq_index = (is_feature) ? rm->fq_feature_index : rm->fq_index;

  /* gather the destination thread for every buffer in the frame */
  while (n_left_from > 0)
    {
      ti[0] = vnet_buffer (b[0])->ip.reass.owner_thread_index;

      if (PREDICT_FALSE
	  ((node->flags & VLIB_NODE_FLAG_TRACE)
	   && (b[0]->flags & VLIB_BUFFER_IS_TRACED)))
	{
	  ip4_full_reass_handoff_trace_t *t =
	    vlib_add_trace (vm, node, b[0], sizeof (*t));
	  t->next_worker_index = ti[0];
	}

      n_left_from -= 1;
      ti += 1;
      b += 1;
    }
  /* hand the whole frame over in one call; returns how many made it */
  n_enq =
    vlib_buffer_enqueue_to_thread (vm, fq_index, from, thread_indices,
				   frame->n_vectors, 1);

  if (n_enq < frame->n_vectors)
    vlib_node_increment_counter (vm, node->node_index,
				 IP4_FULL_REASS_HANDOFF_ERROR_CONGESTION_DROP,
				 frame->n_vectors - n_enq);
  return frame->n_vectors;
}
1741
/**
 * @brief Node function for "ip4-full-reassembly-handoff" (non-feature).
 */
VLIB_NODE_FN (ip4_full_reass_handoff_node) (vlib_main_t * vm,
					    vlib_node_runtime_t * node,
					    vlib_frame_t * frame)
{
  return ip4_full_reass_handoff_node_inline (vm, node, frame,
					     false /* is_feature */ );
}
1749
1750
/* *INDENT-OFF* */
/* Registration for the non-feature handoff node. */
VLIB_REGISTER_NODE (ip4_full_reass_handoff_node) = {
  .name = "ip4-full-reassembly-handoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_full_reass_handoff_error_strings),
  .error_strings = ip4_full_reass_handoff_error_strings,
  .format_trace = format_ip4_full_reass_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
/* *INDENT-ON* */
1766
1767
/* *INDENT-OFF* */
/**
 * @brief Node function for "ip4-full-reass-feature-hoff" (feature path).
 */
VLIB_NODE_FN (ip4_full_reass_feature_handoff_node) (vlib_main_t * vm,
						    vlib_node_runtime_t *
						    node,
						    vlib_frame_t * frame)
{
  return ip4_full_reass_handoff_node_inline (vm, node, frame,
					     true /* is_feature */ );
}
/* *INDENT-ON* */
1778
1779
/* *INDENT-OFF* */
/* Registration for the feature-path handoff node. */
VLIB_REGISTER_NODE (ip4_full_reass_feature_handoff_node) = {
  .name = "ip4-full-reass-feature-hoff",
  .vector_size = sizeof (u32),
  .n_errors = ARRAY_LEN(ip4_full_reass_handoff_error_strings),
  .error_strings = ip4_full_reass_handoff_error_strings,
  .format_trace = format_ip4_full_reass_handoff_trace,

  .n_next_nodes = 1,

  .next_nodes = {
    [0] = "error-drop",
  },
};
/* *INDENT-ON* */
1795
1796/*
1797 * fd.io coding-style-patch-verification: ON
1798 *
1799 * Local Variables:
1800 * eval: (c-set-style "gnu")
1801 * End:
1802 */
1803