adj_nbr.c revision e7ee3009
/*
 * Copyright (c) 2016 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <vnet/adj/adj_nbr.h>
#include <vnet/adj/adj_internal.h>
#include <vnet/ethernet/arp_packet.h>
#include <vnet/fib/fib_walk.h>

#include <vppinfra/bihash_24_8.h>

/*
 * Per-protocol vectors of hash tables of neighbour (traditional) adjacencies
 *  Key: interface (the vector index), address (and its proto),
 *       link-type/ether-type.
 */
static BVT(clib_bihash) **adj_nbr_tables[FIB_PROTOCOL_MAX];

// FIXME SIZE APPROPRIATELY. ASK DAVEB.
#define ADJ_NBR_DEFAULT_HASH_NUM_BUCKETS (64 * 64)
#define ADJ_NBR_DEFAULT_HASH_MEMORY_SIZE (32<<20)


#define ADJ_NBR_SET_KEY(_key, _lt, _nh)         \
{                                               \
    _key.key[0] = (_nh)->as_u64[0];             \
    _key.key[1] = (_nh)->as_u64[1];             \
    _key.key[2] = (_lt);                        \
}

#define ADJ_NBR_ITF_OK(_proto, _itf)                    \
    (((_itf) < vec_len(adj_nbr_tables[_proto])) &&      \
     (NULL != adj_nbr_tables[_proto][(_itf)]))

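/**
 * @brief Insert an adjacency into the neighbour DB for its interface,
 * keyed on link-type and next-hop address. The per-interface hash table
 * is created on first use.
 */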
static void
adj_nbr_insert (fib_protocol_t nh_proto,
                vnet_link_t link_type,
                const ip46_address_t *nh_addr,
                u32 sw_if_index,
                adj_index_t adj_index)
{
    BVT(clib_bihash_kv) kv;

    if (sw_if_index >= vec_len(adj_nbr_tables[nh_proto]))
    {
        vec_validate(adj_nbr_tables[nh_proto], sw_if_index);
    }
    if (NULL == adj_nbr_tables[nh_proto][sw_if_index])
    {
        adj_nbr_tables[nh_proto][sw_if_index] =
            clib_mem_alloc_aligned(sizeof(BVT(clib_bihash)),
                                   CLIB_CACHE_LINE_BYTES);
        clib_memset(adj_nbr_tables[nh_proto][sw_if_index],
               0,
               sizeof(BVT(clib_bihash)));

        BV(clib_bihash_init) (adj_nbr_tables[nh_proto][sw_if_index],
                              "Adjacency Neighbour table",
                              ADJ_NBR_DEFAULT_HASH_NUM_BUCKETS,
                              ADJ_NBR_DEFAULT_HASH_MEMORY_SIZE);
    }

    ADJ_NBR_SET_KEY(kv, link_type, nh_addr);
    kv.value = adj_index;

    BV(clib_bihash_add_del) (adj_nbr_tables[nh_proto][sw_if_index], &kv, 1);
}

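/**
 * @brief Remove the neighbour DB entry for the given adjacency.
 */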
void
adj_nbr_remove (adj_index_t ai,
                fib_protocol_t nh_proto,
                vnet_link_t link_type,
                const ip46_address_t *nh_addr,
                u32 sw_if_index)
{
    BVT(clib_bihash_kv) kv;

    if (!ADJ_NBR_ITF_OK(nh_proto, sw_if_index))
        return;

    ADJ_NBR_SET_KEY(kv, link_type, nh_addr);
    kv.value = ai;

    BV(clib_bihash_add_del) (adj_nbr_tables[nh_proto][sw_if_index], &kv, 0);
}

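/**
 * @brief Lookup an existing neighbour adjacency in the DB.
 * Returns ADJ_INDEX_INVALID if there is no match.
 */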
adj_index_t
adj_nbr_find (fib_protocol_t nh_proto,
              vnet_link_t link_type,
              const ip46_address_t *nh_addr,
              u32 sw_if_index)
{
    BVT(clib_bihash_kv) kv;

    ADJ_NBR_SET_KEY(kv, link_type, nh_addr);

    if (!ADJ_NBR_ITF_OK(nh_proto, sw_if_index))
        return (ADJ_INDEX_INVALID);

    if (BV(clib_bihash_search)(adj_nbr_tables[nh_proto][sw_if_index],
                               &kv, &kv) < 0)
    {
        return (ADJ_INDEX_INVALID);
    }
    else
    {
        return (kv.value);
    }
}

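/**
 * @brief Return the VLIB graph node used to resolve the next-hop for the
 * given protocol: ARP for IPv4, neighbour discovery for IPv6.
 */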
static inline u32
adj_get_nd_node (fib_protocol_t proto)
{
    switch (proto) {
    case FIB_PROTOCOL_IP4:
        return (ip4_arp_node.index);
    case FIB_PROTOCOL_IP6:
        return (ip6_discover_neighbor_node.index);
    case FIB_PROTOCOL_MPLS:
        break;
    }
    ASSERT(0);
    return (ip4_arp_node.index);
}

/**
 * @brief Check and set the feature flag if the output interface has any
 * output features enabled.
 */
static void
adj_nbr_evaluate_feature (adj_index_t ai)
{
    ip_adjacency_t *adj;
    vnet_feature_main_t *fm = &feature_main;
    i16 feature_count;
    u8 arc_index;
    u32 sw_if_index;

    adj = adj_get(ai);

    switch (adj->ia_link)
    {
    case VNET_LINK_IP4:
        arc_index = ip4_main.lookup_main.output_feature_arc_index;
        break;
    case VNET_LINK_IP6:
        arc_index = ip6_main.lookup_main.output_feature_arc_index;
        break;
    case VNET_LINK_MPLS:
        arc_index = mpls_main.output_feature_arc_index;
        break;
    default:
        return;
    }

    sw_if_index = adj->rewrite_header.sw_if_index;
    if (vec_len(fm->feature_count_by_sw_if_index[arc_index]) > sw_if_index)
    {
        feature_count = fm->feature_count_by_sw_if_index[arc_index][sw_if_index];
        if (feature_count > 0)
            adj->rewrite_header.flags |= VNET_REWRITE_HAS_FEATURES;
    }

    return;
}

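/**
 * @brief Allocate a new neighbour adjacency, add it to the DB and
 * initialise it as incomplete (i.e. pointing at ARP/ND).
 */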
static ip_adjacency_t*
adj_nbr_alloc (fib_protocol_t nh_proto,
               vnet_link_t link_type,
               const ip46_address_t *nh_addr,
               u32 sw_if_index)
{
    ip_adjacency_t *adj;

    adj = adj_alloc(nh_proto);

    adj_nbr_insert(nh_proto, link_type, nh_addr,
                   sw_if_index,
                   adj_get_index(adj));

    /*
     * since we just added the ADJ we have no rewrite string for it,
     * so it's for ARP
     */
    adj->lookup_next_index = IP_LOOKUP_NEXT_ARP;
    adj->sub_type.nbr.next_hop = *nh_addr;
    adj->ia_link = link_type;
    adj->ia_nh_proto = nh_proto;
    adj->rewrite_header.sw_if_index = sw_if_index;
    vnet_rewrite_update_mtu(vnet_get_main(), adj->ia_link,
                            &adj->rewrite_header);

    adj_nbr_evaluate_feature (adj_get_index(adj));
    return (adj);
}

/*
 * adj_nbr_add_or_lock
 *
 * Add an adjacency for the neighbour requested.
 *
 * The key for an adj is:
 *   - the next-hop's protocol (i.e. v4 or v6)
 *   - the address of the next-hop
 *   - the interface the next-hop is reachable through
 *   - the link-type the adjacency will carry
 */
adj_index_t
adj_nbr_add_or_lock (fib_protocol_t nh_proto,
                     vnet_link_t link_type,
                     const ip46_address_t *nh_addr,
                     u32 sw_if_index)
{
    adj_index_t adj_index;

    adj_index = adj_nbr_find(nh_proto, link_type, nh_addr, sw_if_index);

    if (ADJ_INDEX_INVALID == adj_index)
    {
        ip_adjacency_t *adj;
        vnet_main_t *vnm;

        vnm = vnet_get_main();
        adj = adj_nbr_alloc(nh_proto, link_type, nh_addr, sw_if_index);
        adj_index = adj_get_index(adj);
        adj_lock(adj_index);

        if (ip46_address_is_equal(&ADJ_BCAST_ADDR, nh_addr))
        {
            adj->lookup_next_index = IP_LOOKUP_NEXT_BCAST;
        }

        vnet_rewrite_init(vnm, sw_if_index, link_type,
                          adj_get_nd_node(nh_proto),
                          vnet_tx_node_index_for_sw_interface(vnm, sw_if_index),
                          &adj->rewrite_header);

        /*
         * we need a rewrite where the destination IP address is converted
         * to the appropriate link-layer address. This is interface specific.
         * So ask the interface to do it.
         */
        vnet_update_adjacency_for_sw_interface(vnm, sw_if_index, adj_index);
    }
    else
    {
        adj_lock(adj_index);
    }

    adj_delegate_adj_created(adj_get(adj_index));
    return (adj_index);
}

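/**
 * @brief As adj_nbr_add_or_lock, but the caller also supplies a complete
 * rewrite string to apply to the adjacency.
 */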
adj_index_t
adj_nbr_add_or_lock_w_rewrite (fib_protocol_t nh_proto,
                               vnet_link_t link_type,
                               const ip46_address_t *nh_addr,
                               u32 sw_if_index,
                               u8 *rewrite)
{
    adj_index_t adj_index;

    adj_index = adj_nbr_find(nh_proto, link_type, nh_addr, sw_if_index);

    if (ADJ_INDEX_INVALID == adj_index)
    {
        ip_adjacency_t *adj;

        adj = adj_nbr_alloc(nh_proto, link_type, nh_addr, sw_if_index);
        adj->rewrite_header.sw_if_index = sw_if_index;
        adj_index = adj_get_index(adj);
    }

    adj_lock(adj_index);
    adj_nbr_update_rewrite(adj_index,
                           ADJ_NBR_REWRITE_FLAG_COMPLETE,
                           rewrite);

    adj_delegate_adj_created(adj_get(adj_index));

    return (adj_index);
}

/**
 * adj_nbr_update_rewrite
 *
 * Update the adjacency's rewrite string. A NULL string implies the
 * rewrite is reset (i.e. when ARP/ND entry is gone).
 * NB: the adj being updated may be handling traffic in the DP.
 */
void
adj_nbr_update_rewrite (adj_index_t adj_index,
                        adj_nbr_rewrite_flag_t flags,
                        u8 *rewrite)
{
    ip_adjacency_t *adj;

    ASSERT(ADJ_INDEX_INVALID != adj_index);

    adj = adj_get(adj_index);

    if (flags & ADJ_NBR_REWRITE_FLAG_COMPLETE)
    {
        /*
         * update the adj's rewrite string and build the arc
         * from the rewrite node to the interface's TX node
         */
        adj_nbr_update_rewrite_internal(adj, IP_LOOKUP_NEXT_REWRITE,
                                        adj_get_rewrite_node(adj->ia_link),
                                        vnet_tx_node_index_for_sw_interface(
                                            vnet_get_main(),
                                            adj->rewrite_header.sw_if_index),
                                        rewrite);
    }
    else
    {
        adj_nbr_update_rewrite_internal(adj, IP_LOOKUP_NEXT_ARP,
                                        adj_get_nd_node(adj->ia_nh_proto),
                                        vnet_tx_node_index_for_sw_interface(
                                            vnet_get_main(),
                                            adj->rewrite_header.sw_if_index),
                                        rewrite);
    }
}

/**
 * adj_nbr_update_rewrite_internal
 *
 * Update the adjacency's rewrite string. A NULL string implies the
 * rewrite is reset (i.e. when ARP/ND entry is gone).
 * NB: the adj being updated may be handling traffic in the DP.
 */
void
adj_nbr_update_rewrite_internal (ip_adjacency_t *adj,
                                 ip_lookup_next_t adj_next_index,
                                 u32 this_node,
                                 u32 next_node,
                                 u8 *rewrite)
{
    ip_adjacency_t *walk_adj;
    adj_index_t walk_ai;
    vlib_main_t * vm;
    u32 old_next;
    int do_walk;

    vm = vlib_get_main();
    old_next = adj->lookup_next_index;

    walk_ai = adj_get_index(adj);
    if (VNET_LINK_MPLS == adj->ia_link)
    {
        /*
         * The link type MPLS has no children in the control-plane graph; it
         * only has children in the data-plane graph. The backwalk is up the
         * former, so we need to walk from its IP cousin.
         */
        walk_ai = adj_nbr_find(adj->ia_nh_proto,
                               fib_proto_to_link(adj->ia_nh_proto),
                               &adj->sub_type.nbr.next_hop,
                               adj->rewrite_header.sw_if_index);
    }

    /*
     * Don't call the walk re-entrantly
     */
    if (ADJ_INDEX_INVALID != walk_ai)
    {
        walk_adj = adj_get(walk_ai);
        if (ADJ_FLAG_SYNC_WALK_ACTIVE & walk_adj->ia_flags)
        {
            do_walk = 0;
        }
        else
        {
            /*
             * Prevent re-entrant walk of the same adj
             */
            walk_adj->ia_flags |= ADJ_FLAG_SYNC_WALK_ACTIVE;
            do_walk = 1;
        }
    }
    else
    {
        do_walk = 0;
    }

    /*
     * lock the adjacencies that are affected by updates this walk will provoke.
     * Since the aim of the walk is to update children to link to a different
     * DPO, this adj will no longer be in use and its lock count will drop to 0.
     * We don't want it to be deleted as part of this endeavour.
     */
    adj_lock(adj_get_index(adj));
    adj_lock(walk_ai);

    /*
     * Updating a rewrite string is not atomic:
     *  - the rewrite string is too long to write in one instruction
     *  - when swapping from incomplete to complete, we also need to update
     *    the VLIB graph next-index of the adj.
     * Ideally we would only want to suspend forwarding via this adj whilst we
     * do this, but we do not have that level of granularity - it's suspend all
     * worker threads or nothing.
     * The other choices are:
     *  - to mark the adj down and back walk so child load-balances drop this
     *    adj from the set.
     *  - to update the next_node index of this adj to point to error-drop.
     * Both of these mean that for a MAC change we will drop traffic for this
     * adj, which is not acceptable. However, when the adj changes type (from
     * complete to incomplete and vice-versa) the child DPOs, which hold the
     * VLIB graph next node index, will be sending packets to the wrong graph
     * node. So from the options above, updating the next_node of the adj to
     * be drop will work, but it relies on each graph node (v4/v6/mpls, rewrite/
     * arp/midchain) always being valid w.r.t. a mismatch of adj type and node
     * type (i.e. a rewrite adj in the arp node). This is not enforceable, and
     * getting it wrong leads to hard-to-find bugs since it's a race condition.
     * So we choose the more reliable method: update the children to use the
     * drop, then switch the adj's type, then update the children again. Did I
     * mention that this doesn't happen often...
     * So we need to distinguish between the two cases:
     *  1 - MAC change
     *  2 - adj type change
     */
    if (do_walk &&
        old_next != adj_next_index &&
        ADJ_INDEX_INVALID != walk_ai)
    {
        /*
         * the adj is changing type. we need to fix all children so that they
         * stack momentarily on a drop, while the adj changes. If we don't do
         * this the children will send packets to a VLIB graph node that does
         * not correspond to the adj's type - and it goes downhill from there.
         */
        fib_node_back_walk_ctx_t bw_ctx = {
            .fnbw_reason = FIB_NODE_BW_REASON_FLAG_ADJ_DOWN,
            /*
             * force this walk to be synchronous. if we don't and a node in the graph
             * (a heavily shared path-list) chooses to back-ground the walk (make it
             * async) then it will pause and we will do the adj update below, before
             * all the children are updated. not good.
             */
            .fnbw_flags = FIB_NODE_BW_FLAG_FORCE_SYNC,
        };

        fib_walk_sync(FIB_NODE_TYPE_ADJ, walk_ai, &bw_ctx);
    }

    /*
     * If we are just updating the MAC string of the adj (which we also can't
     * do atomically), then we need to stop packets switching through the adj.
     * We can't do that on a per-adj basis, so it's all the packets.
     * If we are updating the type, and we walked back to the children above,
     * then this barrier serves to flush the queues/frames.
     */
    vlib_worker_thread_barrier_sync(vm);

    adj->lookup_next_index = adj_next_index;

    if (NULL != rewrite)
    {
        /*
         * new rewrite provided.
         * fill in the adj's rewrite string, and build the VLIB graph arc.
         */
        vnet_rewrite_set_data_internal(&adj->rewrite_header,
                                       sizeof(adj->rewrite_data),
                                       rewrite,
                                       vec_len(rewrite));
        vec_free(rewrite);
    }
    else
    {
        vnet_rewrite_clear_data_internal(&adj->rewrite_header,
                                         sizeof(adj->rewrite_data));
    }
    adj->rewrite_header.next_index = vlib_node_add_next(vlib_get_main(),
                                                        this_node,
                                                        next_node);

    /*
     * done with the rewrite update - let the workers loose.
     */
    vlib_worker_thread_barrier_release(vm);

    if (do_walk &&
        (old_next != adj->lookup_next_index) &&
        (ADJ_INDEX_INVALID != walk_ai))
    {
        /*
         * backwalk to the children so they can stack on the now updated
         * adjacency
         */
        fib_node_back_walk_ctx_t bw_ctx = {
            .fnbw_reason = FIB_NODE_BW_REASON_FLAG_ADJ_UPDATE,
        };

        fib_walk_sync(FIB_NODE_TYPE_ADJ, walk_ai, &bw_ctx);
    }
    /*
     * walk complete - allow walks of this adj again
     */
    if (do_walk)
    {
        walk_adj->ia_flags &= ~ADJ_FLAG_SYNC_WALK_ACTIVE;
    }

    adj_unlock(adj_get_index(adj));
    adj_unlock(walk_ai);
}

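/**
 * @brief Context used when counting the number of entries in the
 * neighbour DBs.
 */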
typedef struct adj_db_count_ctx_t_ {
    u64 count;
} adj_db_count_ctx_t;

static int
adj_db_count (BVT(clib_bihash_kv) * kvp,
              void *arg)
{
    adj_db_count_ctx_t * ctx = arg;
    ctx->count++;
    return (BIHASH_WALK_CONTINUE);
}

u32
adj_nbr_db_size (void)
{
    adj_db_count_ctx_t ctx = {
        .count = 0,
    };
    fib_protocol_t proto;
    u32 sw_if_index = 0;

    for (proto = FIB_PROTOCOL_IP4; proto <= FIB_PROTOCOL_IP6; proto++)
    {
        vec_foreach_index(sw_if_index, adj_nbr_tables[proto])
        {
            if (NULL != adj_nbr_tables[proto][sw_if_index])
            {
                BV(clib_bihash_foreach_key_value_pair) (
                    adj_nbr_tables[proto][sw_if_index],
                    adj_db_count,
                    &ctx);
            }
        }
    }
    return (ctx.count);
}

/**
 * @brief Context for a walk of the adjacency neighbour DB
 */
typedef struct adj_walk_ctx_t_
{
    adj_walk_cb_t awc_cb;
    void *awc_ctx;
} adj_walk_ctx_t;

static int
adj_nbr_walk_cb (BVT(clib_bihash_kv) * kvp,
                 void *arg)
{
    adj_walk_ctx_t *ctx = arg;

    /* stop the walk if the client's callback asks us to */
    if (ADJ_WALK_RC_STOP == ctx->awc_cb(kvp->value, ctx->awc_ctx))
        return (BIHASH_WALK_STOP);
    return (BIHASH_WALK_CONTINUE);
}

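/**
 * @brief Walk all adjacencies in the neighbour DB for the given interface
 * and next-hop protocol, invoking the callback for each.
 */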
void
adj_nbr_walk (u32 sw_if_index,
              fib_protocol_t adj_nh_proto,
              adj_walk_cb_t cb,
              void *ctx)
{
    if (!ADJ_NBR_ITF_OK(adj_nh_proto, sw_if_index))
        return;

    adj_walk_ctx_t awc = {
        .awc_ctx = ctx,
        .awc_cb = cb,
    };

    BV(clib_bihash_foreach_key_value_pair) (
        adj_nbr_tables[adj_nh_proto][sw_if_index],
        adj_nbr_walk_cb,
        &awc);
}

/**
 * @brief Walk adjacencies on a link with a given v4 next-hop.
 * That is, visit the adjacencies of each link type for that next-hop.
 */
void
adj_nbr_walk_nh4 (u32 sw_if_index,
                 const ip4_address_t *addr,
                 adj_walk_cb_t cb,
                 void *ctx)
{
    if (!ADJ_NBR_ITF_OK(FIB_PROTOCOL_IP4, sw_if_index))
        return;

    ip46_address_t nh = {
        .ip4 = *addr,
    };
    vnet_link_t linkt;
    adj_index_t ai;

    FOR_EACH_VNET_LINK(linkt)
    {
        ai = adj_nbr_find (FIB_PROTOCOL_IP4, linkt, &nh, sw_if_index);

        if (INDEX_INVALID != ai)
            cb(ai, ctx);
    }
}

/**
 * @brief Walk adjacencies on a link with a given v6 next-hop.
 * That is, visit the adjacencies of each link type for that next-hop.
 */
void
adj_nbr_walk_nh6 (u32 sw_if_index,
                 const ip6_address_t *addr,
                 adj_walk_cb_t cb,
                 void *ctx)
{
    if (!ADJ_NBR_ITF_OK(FIB_PROTOCOL_IP6, sw_if_index))
        return;

    ip46_address_t nh = {
        .ip6 = *addr,
    };
    vnet_link_t linkt;
    adj_index_t ai;

    FOR_EACH_VNET_LINK(linkt)
    {
        ai = adj_nbr_find (FIB_PROTOCOL_IP6, linkt, &nh, sw_if_index);

        if (INDEX_INVALID != ai)
            cb(ai, ctx);
    }
}

/**
 * @brief Walk adjacencies on a link with a given next-hop.
 * That is, visit the adjacencies of each link type for that next-hop.
 */
void
adj_nbr_walk_nh (u32 sw_if_index,
                 fib_protocol_t adj_nh_proto,
                 const ip46_address_t *nh,
                 adj_walk_cb_t cb,
                 void *ctx)
{
    if (!ADJ_NBR_ITF_OK(adj_nh_proto, sw_if_index))
        return;

    vnet_link_t linkt;
    adj_index_t ai;

    FOR_EACH_VNET_LINK(linkt)
    {
        ai = adj_nbr_find (adj_nh_proto, linkt, nh, sw_if_index);

        if (INDEX_INVALID != ai)
            cb(ai, ctx);
    }
}

/**
 * Flags associated with the interface state walks
 */
typedef enum adj_nbr_interface_flags_t_
{
    ADJ_NBR_INTERFACE_UP = (1 << 0),
} adj_nbr_interface_flags_t;

/**
 * Context for the state change walk of the DB
 */
typedef struct adj_nbr_interface_state_change_ctx_t_
{
    /**
     * Flags on the interface
     */
    adj_nbr_interface_flags_t flags;
} adj_nbr_interface_state_change_ctx_t;

static adj_walk_rc_t
adj_nbr_interface_state_change_one (adj_index_t ai,
                                    void *arg)
{
    /*
     * Back walk the graph to inform the forwarding entries
     * that this interface state has changed. Do this synchronously
     * since this is the walk that provides convergence
     */
    adj_nbr_interface_state_change_ctx_t *ctx = arg;
    fib_node_back_walk_ctx_t bw_ctx = {
        .fnbw_reason = ((ctx->flags & ADJ_NBR_INTERFACE_UP) ?
                        FIB_NODE_BW_REASON_FLAG_INTERFACE_UP :
                        FIB_NODE_BW_REASON_FLAG_INTERFACE_DOWN),
        /*
         * the force-sync applies only as far as the first fib_entry.
         * And it's the fib_entries we need to converge away from
         * the adjacencies on the now-down link
         */
        .fnbw_flags = (!(ctx->flags & ADJ_NBR_INTERFACE_UP) ?
                       FIB_NODE_BW_FLAG_FORCE_SYNC :
                       FIB_NODE_BW_FLAG_NONE),
    };
    ip_adjacency_t *adj;

    adj = adj_get(ai);

    adj->ia_flags |= ADJ_FLAG_SYNC_WALK_ACTIVE;
    fib_walk_sync(FIB_NODE_TYPE_ADJ, ai, &bw_ctx);
    adj->ia_flags &= ~ADJ_FLAG_SYNC_WALK_ACTIVE;

    return (ADJ_WALK_RC_CONTINUE);
}

/**
 * @brief Registered function for SW interface state changes
 */
static clib_error_t *
adj_nbr_sw_interface_state_change (vnet_main_t * vnm,
                                   u32 sw_if_index,
                                   u32 flags)
{
    fib_protocol_t proto;

    /*
     * walk each adj on the interface and trigger a walk from that adj
     */
    for (proto = FIB_PROTOCOL_IP4; proto <= FIB_PROTOCOL_IP6; proto++)
    {
        adj_nbr_interface_state_change_ctx_t ctx = {
            .flags = ((flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) ?
                      ADJ_NBR_INTERFACE_UP :
                      0),
        };

        adj_nbr_walk(sw_if_index, proto,
                     adj_nbr_interface_state_change_one,
                     &ctx);
    }

    return (NULL);
}

VNET_SW_INTERFACE_ADMIN_UP_DOWN_FUNCTION_PRIO(
    adj_nbr_sw_interface_state_change,
    VNET_ITF_FUNC_PRIORITY_HIGH);

/**
 * @brief Invoked on each SW interface of a HW interface when the
 * HW interface state changes
 */
static walk_rc_t
adj_nbr_hw_sw_interface_state_change (vnet_main_t * vnm,
                                      u32 sw_if_index,
                                      void *arg)
{
    adj_nbr_interface_state_change_ctx_t *ctx = arg;
    fib_protocol_t proto;

    /*
     * walk each adj on the interface and trigger a walk from that adj
     */
    for (proto = FIB_PROTOCOL_IP4; proto <= FIB_PROTOCOL_IP6; proto++)
    {
        adj_nbr_walk(sw_if_index, proto,
                     adj_nbr_interface_state_change_one,
                     ctx);
    }
    return (WALK_CONTINUE);
}

/**
 * @brief Registered callback for HW interface state changes
 */
static clib_error_t *
adj_nbr_hw_interface_state_change (vnet_main_t * vnm,
                                   u32 hw_if_index,
                                   u32 flags)
{
    /*
     * walk each SW interface on the HW
     */
    adj_nbr_interface_state_change_ctx_t ctx = {
        .flags = ((flags & VNET_HW_INTERFACE_FLAG_LINK_UP) ?
                  ADJ_NBR_INTERFACE_UP :
                  0),
    };

    vnet_hw_interface_walk_sw(vnm, hw_if_index,
                              adj_nbr_hw_sw_interface_state_change,
                              &ctx);

    return (NULL);
}

VNET_HW_INTERFACE_LINK_UP_DOWN_FUNCTION_PRIO(
    adj_nbr_hw_interface_state_change,
    VNET_ITF_FUNC_PRIORITY_HIGH);

static adj_walk_rc_t
adj_nbr_interface_delete_one (adj_index_t ai,
                              void *arg)
{
    /*
     * Back walk the graph to inform the forwarding entries
     * that this interface has been deleted.
     */
    fib_node_back_walk_ctx_t bw_ctx = {
        .fnbw_reason = FIB_NODE_BW_REASON_FLAG_INTERFACE_DELETE,
    };
    ip_adjacency_t *adj;

    adj_lock(ai);

    adj = adj_get(ai);

    adj->ia_flags |= ADJ_FLAG_SYNC_WALK_ACTIVE;
    fib_walk_sync(FIB_NODE_TYPE_ADJ, ai, &bw_ctx);
    adj->ia_flags &= ~ADJ_FLAG_SYNC_WALK_ACTIVE;

    adj_unlock(ai);
    return (ADJ_WALK_RC_CONTINUE);
}

/**
 * adj_nbr_interface_add_del
 *
 * Registered to receive interface add and delete notifications
 */
static clib_error_t *
adj_nbr_interface_add_del (vnet_main_t * vnm,
                           u32 sw_if_index,
                           u32 is_add)
{
    fib_protocol_t proto;

    if (is_add)
    {
        /*
         * not interested in interface additions. we will not back walk
         * to resolve paths through newly added interfaces. Why? The control
         * plane should have the brains to add interfaces first, then routes.
         * So the only case where a path refers to an interface that matches
         * one just created is when the path resolved through an interface
         * that was deleted and has still not been removed. The fact that an
         * interface has just been added is NO GUARANTEE that it is, even
         * though it may have the same sw_if_index, the same interface that
         * the path needs. So tough!
         * If the control plane wants these routes to resolve it needs to
         * remove and add them again.
         */
        return (NULL);
    }

    for (proto = FIB_PROTOCOL_IP4; proto <= FIB_PROTOCOL_IP6; proto++)
    {
        adj_nbr_walk(sw_if_index, proto,
                     adj_nbr_interface_delete_one,
                     NULL);
    }

    return (NULL);

}

VNET_SW_INTERFACE_ADD_DEL_FUNCTION(adj_nbr_interface_add_del);


static adj_walk_rc_t
adj_nbr_show_one (adj_index_t ai,
                  void *arg)
{
    vlib_cli_output (arg, "[@%d]  %U",
                     ai,
                     format_ip_adjacency, ai,
                     FORMAT_IP_ADJACENCY_NONE);

    return (ADJ_WALK_RC_CONTINUE);
}

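/**
 * @brief CLI handler for 'show adj nbr'; shows one adjacency, all those on
 * a given interface, or the entire neighbour DB.
 */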
static clib_error_t *
adj_nbr_show (vlib_main_t * vm,
              unformat_input_t * input,
              vlib_cli_command_t * cmd)
{
    adj_index_t ai = ADJ_INDEX_INVALID;
    u32 sw_if_index = ~0;

    while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
    {
        if (unformat (input, "%d", &ai))
            ;
        else if (unformat (input, "%U",
                           unformat_vnet_sw_interface, vnet_get_main(),
                           &sw_if_index))
            ;
        else
            break;
    }

    if (ADJ_INDEX_INVALID != ai)
    {
        vlib_cli_output (vm, "[@%d] %U",
                         ai,
                         format_ip_adjacency, ai,
                         FORMAT_IP_ADJACENCY_DETAIL);
    }
    else if (~0 != sw_if_index)
    {
        fib_protocol_t proto;

        for (proto = FIB_PROTOCOL_IP4; proto <= FIB_PROTOCOL_IP6; proto++)
        {
            adj_nbr_walk(sw_if_index, proto,
                         adj_nbr_show_one,
                         vm);
        }
    }
    else
    {
        fib_protocol_t proto;

        for (proto = FIB_PROTOCOL_IP4; proto <= FIB_PROTOCOL_IP6; proto++)
        {
            vec_foreach_index(sw_if_index, adj_nbr_tables[proto])
            {
                adj_nbr_walk(sw_if_index, proto,
                             adj_nbr_show_one,
                             vm);
            }
        }
    }

    return 0;
}

/*?
 * Show all neighbour adjacencies.
 * @cliexpar
 * @cliexstart{sh adj nbr}
 * [@2] ipv4 via 1.0.0.2 loop0: IP4: 00:00:22:aa:bb:cc -> 00:00:11:aa:bb:cc
 * [@3] mpls via 1.0.0.2 loop0: MPLS_UNICAST: 00:00:22:aa:bb:cc -> 00:00:11:aa:bb:cc
 * [@4] ipv4 via 1.0.0.3 loop0: IP4: 00:00:22:aa:bb:cc -> 00:00:11:aa:bb:cc
 * [@5] mpls via 1.0.0.3 loop0: MPLS_UNICAST: 00:00:22:aa:bb:cc -> 00:00:11:aa:bb:cc
 * @cliexend
 ?*/
VLIB_CLI_COMMAND (ip4_show_fib_command, static) = {
    .path = "show adj nbr",
    .short_help = "show adj nbr [<adj_index>] [interface]",
    .function = adj_nbr_show,
};

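/**
 * @brief Format an incomplete neighbour adjacency, i.e. one that is still
 * pointing at ARP/ND and has no rewrite string yet.
 */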
u8*
format_adj_nbr_incomplete (u8* s, va_list *ap)
{
    index_t index = va_arg(*ap, index_t);
    CLIB_UNUSED(u32 indent) = va_arg(*ap, u32);
    vnet_main_t * vnm = vnet_get_main();
    ip_adjacency_t * adj = adj_get(index);

    s = format (s, "arp-%U", format_vnet_link, adj->ia_link);
    s = format (s, ": via %U",
                format_ip46_address, &adj->sub_type.nbr.next_hop,
                adj_proto_to_46(adj->ia_nh_proto));
    s = format (s, " %U",
                format_vnet_sw_if_index_name,
                vnm, adj->rewrite_header.sw_if_index);

    return (s);
}

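/**
 * @brief Format a complete neighbour adjacency, including its rewrite string.
 */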
u8*
format_adj_nbr (u8* s, va_list *ap)
{
    index_t index = va_arg(*ap, index_t);
    CLIB_UNUSED(u32 indent) = va_arg(*ap, u32);
    ip_adjacency_t * adj = adj_get(index);

    s = format (s, "%U", format_vnet_link, adj->ia_link);
    s = format (s, " via %U ",
                format_ip46_address, &adj->sub_type.nbr.next_hop,
                adj_proto_to_46(adj->ia_nh_proto));
    s = format (s, "%U",
                format_vnet_rewrite,
                &adj->rewrite_header, sizeof (adj->rewrite_data), 0);

    return (s);
}

static void
adj_dpo_lock (dpo_id_t *dpo)
{
    adj_lock(dpo->dpoi_index);
}
static void
adj_dpo_unlock (dpo_id_t *dpo)
{
    adj_unlock(dpo->dpoi_index);
}

static void
adj_mem_show (void)
{
    fib_show_memory_usage("Adjacency",
                          pool_elts(adj_pool),
                          pool_len(adj_pool),
                          sizeof(ip_adjacency_t));
}

const static dpo_vft_t adj_nbr_dpo_vft = {
    .dv_lock = adj_dpo_lock,
    .dv_unlock = adj_dpo_unlock,
    .dv_format = format_adj_nbr,
    .dv_mem_show = adj_mem_show,
    .dv_get_urpf = adj_dpo_get_urpf,
};
const static dpo_vft_t adj_nbr_incompl_dpo_vft = {
    .dv_lock = adj_dpo_lock,
    .dv_unlock = adj_dpo_unlock,
    .dv_format = format_adj_nbr_incomplete,
    .dv_get_urpf = adj_dpo_get_urpf,
};

/**
 * @brief The per-protocol VLIB graph nodes that are assigned to an adjacency
 *        object.
 *
 * This means that these graph nodes are the ones for which a nbr is the
 * parent object in the DPO graph.
 */
const static char* const nbr_ip4_nodes[] =
{
    "ip4-rewrite",
    NULL,
};
const static char* const nbr_ip6_nodes[] =
{
    "ip6-rewrite",
    NULL,
};
const static char* const nbr_mpls_nodes[] =
{
    "mpls-output",
    NULL,
};
const static char* const nbr_ethernet_nodes[] =
{
    "adj-l2-rewrite",
    NULL,
};
const static char* const * const nbr_nodes[DPO_PROTO_NUM] =
{
    [DPO_PROTO_IP4]  = nbr_ip4_nodes,
    [DPO_PROTO_IP6]  = nbr_ip6_nodes,
    [DPO_PROTO_MPLS] = nbr_mpls_nodes,
    [DPO_PROTO_ETHERNET] = nbr_ethernet_nodes,
};

const static char* const nbr_incomplete_ip4_nodes[] =
{
    "ip4-arp",
    NULL,
};
const static char* const nbr_incomplete_ip6_nodes[] =
{
    "ip6-discover-neighbor",
    NULL,
};
const static char* const nbr_incomplete_mpls_nodes[] =
{
    "mpls-adj-incomplete",
    NULL,
};

const static char* const * const nbr_incomplete_nodes[DPO_PROTO_NUM] =
{
    [DPO_PROTO_IP4]  = nbr_incomplete_ip4_nodes,
    [DPO_PROTO_IP6]  = nbr_incomplete_ip6_nodes,
    [DPO_PROTO_MPLS] = nbr_incomplete_mpls_nodes,
};

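/**
 * @brief Module init: register the complete and incomplete neighbour
 * adjacency DPO types and their graph nodes.
 */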
void
adj_nbr_module_init (void)
{
    dpo_register(DPO_ADJACENCY,
                 &adj_nbr_dpo_vft,
                 nbr_nodes);
    dpo_register(DPO_ADJACENCY_INCOMPLETE,
                 &adj_nbr_incompl_dpo_vft,
                 nbr_incomplete_nodes);
}