rtnl.c revision 9f07da4a
1/*
2 * Copyright (c) 2016 Cisco and/or its affiliates.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at:
6 *
7 *     http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16#define _GNU_SOURCE
17#include <sched.h>
18
19#include <librtnl/rtnl.h>
20#include <librtnl/netns.h>
21
22#include <vlib/vlib.h>
23#include <vlib/unix/unix.h>
24#include <vppinfra/error.h>
25
26#include <sys/socket.h>
27#include <linux/netlink.h>
28#include <linux/rtnetlink.h>
29#include <float.h>
30#include <fcntl.h>
31
32#include <sys/types.h>
33#include <sys/wait.h>
34
/* Event types signalled to the rtnl process node and dispatched in rtnl_process(). */
typedef enum {
  RTNL_E_OPEN = 0,   /* signalled by rtnl_stream_open() */
  RTNL_E_CLOSE = 1,  /* signalled by rtnl_stream_close() */
  RTNL_E_READ = 2,   /* signalled by rtnl_read_cb() */
} rtnl_event_t;
40
/* Top-level life cycle of a stream. */
typedef enum {
  RTNL_S_INIT = 0,   /* allocated, RTNL_E_OPEN not processed yet */
  RTNL_S_SYNC = 1,   /* dump requests in progress (see rtnl_sync_state_t) */
  RTNL_S_READY = 2,  /* all dumps completed */
} rtnl_state_t;
46
/*
 * Step reached while a stream is in RTNL_S_SYNC. Each value names the dump
 * whose NLMSG_DONE is currently awaited, except OPENING.
 */
typedef enum {
  RTNL_SS_OPENING = 0,  /* waiting for the timeout that opens the socket */
  RTNL_SS_LINK = 1,     /* RTM_GETLINK dump requested */
  RTNL_SS_ADDR = 2,     /* RTM_GETADDR dump requested */
  RTNL_SS_ROUTE4 = 3,   /* RTM_GETROUTE (AF_INET) dump requested */
  RTNL_SS_ROUTE6 = 4,   /* RTM_GETROUTE (AF_INET6) dump requested */
  RTNL_SS_NEIGH = 5,    /* RTM_GETNEIGH dump requested */
} rtnl_sync_state_t;
55
56typedef struct {
57  rtnl_stream_t stream;
58  rtnl_state_t state;
59  rtnl_sync_state_t sync_state;
60  int ns_fd;
61  int rtnl_socket;
62  u32 unix_index;
63  u32 rtnl_seq;
64  f64 timeout;
65} rtnl_ns_t;
66
67typedef struct {
68  f64 now;
69  rtnl_ns_t *streams;
70} rtnl_main_t;
71
72static rtnl_main_t rtnl_main;
73static vlib_node_registration_t rtnl_process_node;
74
/* Size in bytes of the on-stack receive buffer used by rtnl_process_read(). */
#define RTNL_BUFFSIZ (16 * 1024)

/* Delay, added to rtnl_main.now, allowed for each dump issued from rtnl_sync_done(). */
#define RTNL_DUMP_TIMEOUT (1)
77
78u8 *format_rtnl_nsname2path(u8 *s, va_list *args)
79{
80  char *nsname = va_arg(*args, char *);
81  if (!nsname || !strlen(nsname)) {
82    return format(s, "/proc/self/ns/net");
83  } else if (strpbrk(nsname, "/") != NULL) {
84    return format(s, "%s", nsname);
85  } else {
86    return format((u8 *)0, "/var/run/netns/%s", nsname);
87  }
88}
89
90static_always_inline void
91rtnl_schedule_timeout(rtnl_ns_t *ns, f64 when)
92{
93  ns->timeout = when;
94}
95
96static_always_inline void
97rtnl_cancel_timeout(rtnl_ns_t *ns)
98{
99  ns->timeout = DBL_MAX;
100}
101
102static clib_error_t *rtnl_read_cb(struct unix_file * f)
103{
104  rtnl_main_t *rm = &rtnl_main;
105  vlib_main_t *vm = vlib_get_main();
106  rtnl_ns_t *ns = &rm->streams[f->private_data];
107  vlib_process_signal_event(vm, rtnl_process_node.index, RTNL_E_READ, (uword)(ns - rm->streams));
108  return 0;
109}
110
111int rtnl_dump_request(rtnl_ns_t *ns, int type, void *req, size_t len)
112{
113  struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
114  struct nlmsghdr nlh = {
115      .nlmsg_len = NLMSG_LENGTH(len),
116      .nlmsg_type = type,
117      .nlmsg_flags = NLM_F_DUMP|NLM_F_REQUEST,
118      .nlmsg_pid = 0,
119      .nlmsg_seq = ++ns->rtnl_seq,
120  };
121  struct iovec iov[2] = {
122      { .iov_base = &nlh, .iov_len = sizeof(nlh) },
123      { .iov_base = req, .iov_len = len }
124  };
125  struct msghdr msg = {
126      .msg_name = &nladdr,
127      .msg_namelen =  sizeof(nladdr),
128      .msg_iov = iov,
129      .msg_iovlen = 2,
130  };
131  if(sendmsg(ns->rtnl_socket, &msg, 0) < 0)
132    return -1;
133  return 0;
134}
135
136static void rtnl_socket_close(rtnl_ns_t *ns)
137{
138  unix_file_del(&unix_main, &unix_main.file_pool[ns->unix_index]);
139  close(ns->rtnl_socket);
140}
141
/* Arguments handed to rtnl_exec_in_thread_fn() through pthread_create(). */
struct rtnl_thread_exec {
  int fd;               /* namespace fd passed to setns() */
  void *(*fn)(void *);  /* function to run once inside the namespace */
  void *arg;            /* argument forwarded to fn */
  void **ret;           /* where fn's return value is stored */
};
148
149static void *rtnl_exec_in_thread_fn(void *p)
150{
151  struct rtnl_thread_exec *ex = (struct rtnl_thread_exec *) p;
152  if (setns(ex->fd, 0))
153    return (void *) ((uword) (-errno));
154
155  *ex->ret = ex->fn(ex->arg);
156  return NULL;
157}
158
159static int rtnl_exec_in_namespace_byfd(int fd, void *(*fn)(void *), void *arg, void **ret)
160{
161  pthread_t thread;
162  void *thread_ret;
163  struct rtnl_thread_exec ex = {
164      .fd = fd,
165      .fn = fn,
166      .arg = arg,
167      .ret = ret
168  };
169  if(pthread_create(&thread, NULL, rtnl_exec_in_thread_fn, &ex))
170    return -errno;
171
172  if(pthread_join(thread, &thread_ret))
173    return -errno;
174
175  if (thread_ret)
176    return (int) ((uword)thread_ret);
177
178  return 0;
179}
180
181int rtnl_exec_in_namespace(u32 stream_index, void *(*fn)(void *), void *arg, void **ret)
182{
183  rtnl_main_t *rm = &rtnl_main;
184  if (pool_is_free_index(rm->streams, stream_index))
185    return -EBADR;
186
187  rtnl_ns_t *ns = pool_elt_at_index(rm->streams, stream_index);
188  return rtnl_exec_in_namespace_byfd(ns->ns_fd, fn, arg, ret);
189}
190
191int rtnl_exec_in_namespace_by_name(char *nsname, void *(*fn)(void *), void *arg, void **ret)
192{
193  int fd;
194  u8 *s = format((u8 *)0, "%U", format_rtnl_nsname2path, nsname);
195
196  if ((fd = open((char *)s, O_RDONLY)) < 0) {
197    vec_free(s);
198    return -errno;
199  }
200
201  int r = rtnl_exec_in_namespace_byfd(fd, fn, arg, ret);
202  vec_free(s);
203  close(fd);
204  return r;
205}
206
207/* this function is run by the second thread */
208static void *rtnl_thread_fn(void *p)
209{
210  rtnl_ns_t *ns = (rtnl_ns_t *) p;
211  if (setns(ns->ns_fd, 0)) {
212    clib_warning("setns(%d, %d) error %d", ns->ns_fd, CLONE_NEWNET, errno);
213    return (void *) -1;
214  }
215
216  if ((ns->rtnl_socket = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE)) == -1) {
217    clib_warning("Cannot open socket");
218    return (void *) -2;
219  }
220
221  return NULL;
222}
223
224static int rtnl_socket_open(rtnl_ns_t *ns)
225{
226  rtnl_main_t *rm = &rtnl_main;
227  pthread_t thread;
228  void *thread_ret;
229  if(pthread_create(&thread, NULL, rtnl_thread_fn, ns)) {
230    clib_warning("Can't create opening thread");
231    return -1;
232  }
233
234  if(pthread_join(thread, &thread_ret)) {
235    clib_warning("Can't join opening thread");
236    return -2;
237  }
238
239  if (thread_ret) {
240    clib_warning("Could not open netlink socket");
241    return -3;
242  }
243
244  struct sockaddr_nl addr = {
245      .nl_family = AF_NETLINK,
246      .nl_pad = 0,
247      .nl_pid = 0,
248      .nl_groups =
249           RTMGRP_LINK | RTMGRP_IPV6_IFADDR | RTMGRP_IPV4_IFADDR |
250           RTMGRP_IPV4_ROUTE | RTMGRP_IPV6_ROUTE | RTMGRP_NEIGH |
251           RTMGRP_NOTIFY,
252  };
253
254  if (bind(ns->rtnl_socket, (struct sockaddr*) &addr, sizeof(addr))) {
255    close(ns->rtnl_socket);
256    return -3;
257  }
258
259  unix_file_t template = {0};
260  template.read_function = rtnl_read_cb;
261  template.file_descriptor = ns->rtnl_socket;
262  template.private_data = (uword) (ns - rm->streams);
263  ns->unix_index = unix_file_add (&unix_main, &template);
264  return 0;
265}
266
267static int
268rtnl_rcv_error(rtnl_ns_t *ns, struct nlmsghdr *hdr, int *error)
269{
270  struct nlmsgerr *err = NLMSG_DATA(hdr);
271  size_t datalen = hdr->nlmsg_len - NLMSG_ALIGN(sizeof(*hdr));
272  if(datalen < sizeof(*err))
273    return -1;
274
275  *error = err->error;
276  return 0;
277}
278
279static void
280rtnl_sync_reset(rtnl_ns_t *ns)
281{
282  if (ns->sync_state == RTNL_SS_OPENING)
283    return;
284
285  rtnl_socket_close(ns);
286  ns->sync_state = RTNL_SS_OPENING;
287}
288
289static void
290rtnl_sync_done(rtnl_ns_t *ns)
291{
292  rtnl_main_t *rm = &rtnl_main;
293  struct ifaddrmsg addrmsg;
294  struct rtmsg rtmsg;
295  struct ndmsg ndmsg;
296  switch (ns->sync_state) {
297    case RTNL_SS_OPENING:
298      //Cannot happen here
299      break;
300    case RTNL_SS_LINK:
301      memset(&addrmsg, 0, sizeof(addrmsg));
302      addrmsg.ifa_family = AF_UNSPEC;
303      if(rtnl_dump_request(ns, RTM_GETADDR, &addrmsg, sizeof(addrmsg))) {
304        rtnl_sync_reset(ns);
305        rtnl_schedule_timeout(ns, rm->now + 1);
306        return;
307      }
308      rtnl_schedule_timeout(ns, rm->now + RTNL_DUMP_TIMEOUT);
309      ns->sync_state = RTNL_SS_ADDR;
310      break;
311    case RTNL_SS_ADDR:
312    case RTNL_SS_ROUTE4:
313      memset(&rtmsg, 0, sizeof(rtmsg));
314      rtmsg.rtm_family = (ns->sync_state == RTNL_SS_ADDR)?AF_INET:AF_INET6;
315      rtmsg.rtm_table = RT_TABLE_UNSPEC;
316      if(rtnl_dump_request(ns, RTM_GETROUTE, &rtmsg, sizeof(rtmsg))) {
317        rtnl_sync_reset(ns);
318        rtnl_schedule_timeout(ns, rm->now + 1);
319        return;
320      }
321      rtnl_schedule_timeout(ns, rm->now + RTNL_DUMP_TIMEOUT);
322      ns->sync_state = (ns->sync_state == RTNL_SS_ADDR)?RTNL_SS_ROUTE4:RTNL_SS_ROUTE6;
323      break;
324    case RTNL_SS_ROUTE6:
325      memset(&ndmsg, 0, sizeof(ndmsg));
326      ndmsg.ndm_family = AF_UNSPEC;
327      if(rtnl_dump_request(ns, RTM_GETNEIGH, &ndmsg, sizeof(ndmsg))) {
328        rtnl_sync_reset(ns);
329        rtnl_schedule_timeout(ns, rm->now + 1);
330        return;
331      }
332      rtnl_schedule_timeout(ns, rm->now + RTNL_DUMP_TIMEOUT);
333      ns->sync_state = RTNL_SS_NEIGH;
334      break;
335    case RTNL_SS_NEIGH:
336      ns->state = RTNL_S_READY;
337      ns->sync_state = 0;
338      rtnl_cancel_timeout(ns);
339      break;
340  }
341}
342
343static void
344rtnl_sync_timeout(rtnl_ns_t *ns)
345{
346  rtnl_main_t *rm = &rtnl_main;
347  struct ifinfomsg imsg = {};
348  switch (ns->sync_state) {
349    case RTNL_SS_OPENING:
350      if (rtnl_socket_open(ns)) {
351        rtnl_schedule_timeout(ns, rm->now + 10);
352        return;
353      }
354      imsg.ifi_family = AF_UNSPEC;
355      if (rtnl_dump_request(ns, RTM_GETLINK, &imsg, sizeof(imsg))) {
356        rtnl_sync_reset(ns);
357        rtnl_schedule_timeout(ns, rm->now + 10);
358      }
359      ns->sync_state = RTNL_SS_LINK;
360      rtnl_schedule_timeout(ns, rm->now + 2);
361      break;
362    case RTNL_SS_LINK:
363    case RTNL_SS_ADDR:
364    case RTNL_SS_ROUTE4:
365    case RTNL_SS_ROUTE6:
366    case RTNL_SS_NEIGH:
367      //Timeout happened while synchronizing
368      rtnl_sync_reset(ns);
369      rtnl_schedule_timeout(ns, rm->now + 1);
370      break;
371  }
372}
373
374static int
375rtnl_ns_recv(rtnl_ns_t *ns, struct nlmsghdr *hdr)
376{
377  rtnl_main_t *rm = &rtnl_main;
378  int ret, error = 0;
379
380  if (ns->state == RTNL_S_SYNC &&
381      ((hdr->nlmsg_flags & RTM_F_NOTIFY) ||
382          (hdr->nlmsg_seq != (ns->rtnl_seq)))) {
383    clib_warning("Received notification while in sync. Restart synchronization.");
384    rtnl_sync_reset(ns);
385    rtnl_schedule_timeout(ns, rm->now);
386  }
387
388  switch (hdr->nlmsg_type) {
389    case NLMSG_DONE:
390      rtnl_sync_done(ns);
391      break;
392    case NLMSG_ERROR:
393      if((ret = rtnl_rcv_error(ns, hdr, &error)))
394        return ret;
395      break;
396    case RTM_NEWROUTE:
397    case RTM_DELROUTE:
398    case RTM_NEWLINK:
399    case RTM_DELLINK:
400    case RTM_NEWADDR:
401    case RTM_DELADDR:
402    case RTM_NEWNEIGH:
403    case RTM_DELNEIGH:
404      if (ns->stream.recv_message)
405        ns->stream.recv_message(hdr, ns->stream.opaque);
406      break;
407    default:
408      clib_warning("Unknown rtnetlink type %d", hdr->nlmsg_type);
409      break;
410  }
411  return 0;
412}
413
414static void
415rtnl_process_open(rtnl_ns_t *ns)
416{
417  rtnl_main_t *rm = &rtnl_main;
418  if (ns->state != RTNL_S_INIT)
419    return;
420
421  ns->state = RTNL_S_SYNC;
422  ns->sync_state = RTNL_SS_OPENING;
423  rtnl_schedule_timeout(ns, rm->now);
424}
425
426static void
427rtnl_process_close(rtnl_ns_t *ns)
428{
429  rtnl_main_t *rm = &rtnl_main;
430  if (ns->state == RTNL_S_INIT)
431    return;
432
433  rtnl_socket_close(ns);
434  close(ns->ns_fd);
435  pool_put(rm->streams, ns);
436}
437
438static int
439rtnl_process_read(rtnl_ns_t *ns)
440{
441  uint8_t buff[RTNL_BUFFSIZ];
442  ssize_t len;
443  struct nlmsghdr *hdr;
444  while(1) {
445    if((len = recv(ns->rtnl_socket, buff, RTNL_BUFFSIZ, MSG_DONTWAIT)) < 0) {
446      if(errno != EAGAIN) {
447        clib_warning("rtnetlink recv error (%d) [%s]: %s", ns->rtnl_socket, ns->stream.name, strerror(errno));
448        return -1;
449      }
450      return 0;
451    }
452
453    for(hdr = (struct nlmsghdr *) buff;
454        len > 0;
455        len -= NLMSG_ALIGN(hdr->nlmsg_len),
456            hdr = (struct nlmsghdr *) (((uint8_t *) hdr) + NLMSG_ALIGN(hdr->nlmsg_len))) {
457      if((sizeof(*hdr) > (size_t)len) || (hdr->nlmsg_len > (size_t)len)) {
458        clib_warning("rtnetlink buffer too small (%d Vs %d)", (int) hdr->nlmsg_len, (int) len);
459        return -1;
460      }
461      if (rtnl_ns_recv(ns, hdr))
462        return -1;
463    }
464  }
465  return 0;
466}
467
468static void
469rtnl_process_timeout(rtnl_ns_t *ns)
470{
471  switch (ns->state) {
472    case RTNL_S_SYNC:
473      rtnl_sync_timeout(ns);
474      break;
475    case RTNL_S_INIT:
476    case RTNL_S_READY:
477      clib_warning("Should not happen");
478      break;
479  }
480}
481
482static uword
483rtnl_process (vlib_main_t * vm,
484              vlib_node_runtime_t * node,
485              vlib_frame_t * frame)
486{
487  rtnl_main_t *rm = &rtnl_main;
488  uword event_type;
489  uword *event_data = 0;
490  rm->now = vlib_time_now(vm);
491  f64 timeout = DBL_MAX;
492  rtnl_ns_t *ns;
493
494  //Setting up
495  while (1) {
496    vlib_process_wait_for_event_or_clock(vm, timeout - rm->now);
497    event_type = vlib_process_get_events(vm, &event_data);
498    rm->now = vlib_time_now(vm);
499
500    if (event_type == ~0) { //Clock event or no event
501      pool_foreach(ns, rm->streams, {
502         if (ns->timeout < rm->now) {
503           ns->timeout = DBL_MAX;
504           rtnl_process_timeout(ns);
505         }
506      });
507    } else {
508      rtnl_ns_t *ns;
509      uword *d;
510      vec_foreach(d, event_data) {
511        ns = &rm->streams[d[0]];
512        switch (event_type)
513        {
514          case RTNL_E_CLOSE:
515            rtnl_process_close(ns);
516            break;
517          case RTNL_E_OPEN:
518            rtnl_process_open(ns);
519            break;
520          case RTNL_E_READ:
521            rtnl_process_read(ns);
522            break;
523        }
524      }
525    }
526
527    vec_reset_length (event_data);
528
529    timeout = DBL_MAX;
530    pool_foreach(ns, rm->streams, {
531        if (ns->timeout < timeout)
532          timeout = ns->timeout;
533    });
534  }
535  return frame->n_vectors;
536}
537
538VLIB_REGISTER_NODE(rtnl_process_node, static) = {
539    .function = rtnl_process,
540    .name = "rtnl-process",
541    .type = VLIB_NODE_TYPE_PROCESS,
542};
543
544u32
545rtnl_stream_open(rtnl_stream_t *template)
546{
547  vlib_main_t *vm = vlib_get_main();
548  rtnl_main_t *rm = &rtnl_main;
549  rtnl_ns_t *ns;
550  int fd;
551  u8 *s = format((u8 *)0, "%U", format_rtnl_nsname2path, template->name);
552
553  if ((fd = open((char *)s, O_RDONLY)) < 0) {
554    vec_free(s);
555    return ~0;
556  }
557
558  vec_free(s);
559  pool_get(rm->streams, ns);
560  ns->state = RTNL_S_INIT;
561  ns->ns_fd = fd;
562  ns->stream = *template;
563  vlib_process_signal_event(vm, rtnl_process_node.index, RTNL_E_OPEN, (uword)(ns - rm->streams));
564  return ns - rm->streams;
565}
566
567void
568rtnl_stream_close(u32 stream_index)
569{
570  vlib_main_t *vm = vlib_get_main();
571  rtnl_main_t *rm = &rtnl_main;
572  ASSERT(!pool_is_free_index(rm->streams, stream_index));
573  vlib_process_signal_event(vm, rtnl_process_node.index, RTNL_E_CLOSE, stream_index);
574}
575
576clib_error_t *
577rtnl_init (vlib_main_t * vm)
578{
579  rtnl_main_t *rm = &rtnl_main;
580  rm->streams = 0;
581  return 0;
582}
583
584VLIB_INIT_FUNCTION (rtnl_init);
585