/* rtnl.c revision 2ab698c9 */
/*
 * Copyright (c) 2016 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
15
16#define _GNU_SOURCE
17#include <sched.h>
18
19#include <librtnl/rtnl.h>
20#include <librtnl/netns.h>
21
22#include <vlib/vlib.h>
23#include <vlib/unix/unix.h>
24#include <vppinfra/error.h>
25
26#include <sys/socket.h>
27#include <linux/netlink.h>
28#include <linux/rtnetlink.h>
29#include <float.h>
30#include <fcntl.h>
31
32#include <sys/types.h>
33#include <sys/wait.h>
34#include <errno.h>
35
/*
 * Override the <float.h> DBL_MAX with a large but modest value (1e9).
 * In this file DBL_MAX is the "no timeout scheduled" sentinel stored in
 * rtnl_ns_t.timeout and used as the initial wake-up time of the process loop.
 * NOTE(review): presumably the override keeps "timeout - now" a sane wait
 * interval for the vlib clock wait -- confirm before changing the value.
 */
#ifdef DBL_MAX
#undef DBL_MAX
#endif
#define DBL_MAX 1.0e9
38
/*
 * Event types signaled to the "rtnl-process" node. The event data is always
 * the pool index of the stream concerned.
 */
typedef enum {
  RTNL_E_OPEN  = 0, /* sent by rtnl_stream_open() */
  RTNL_E_CLOSE = 1, /* sent by rtnl_stream_close() */
  RTNL_E_READ  = 2, /* sent by rtnl_read_cb() when the socket is readable */
} rtnl_event_t;
44
/* Top-level life cycle of a namespace stream. */
typedef enum {
  RTNL_S_INIT  = 0, /* allocated, OPEN event not processed yet */
  RTNL_S_SYNC  = 1, /* opening the socket / dumping kernel state */
  RTNL_S_READY = 2, /* all dumps completed */
} rtnl_state_t;
50
/*
 * Progress of the initial synchronization. Each value names the dump that
 * is currently outstanding; rtnl_sync_done() moves to the next one when the
 * kernel answers NLMSG_DONE.
 */
typedef enum {
  RTNL_SS_OPENING = 0, /* no socket yet (or socket closed after a reset) */
  RTNL_SS_LINK    = 1, /* RTM_GETLINK dump in flight */
  RTNL_SS_ADDR    = 2, /* RTM_GETADDR dump in flight */
  RTNL_SS_ROUTE4  = 3, /* RTM_GETROUTE (AF_INET) dump in flight */
  RTNL_SS_ROUTE6  = 4, /* RTM_GETROUTE (AF_INET6) dump in flight */
  RTNL_SS_NEIGH   = 5, /* RTM_GETNEIGH dump in flight */
} rtnl_sync_state_t;
59
60typedef struct {
61  rtnl_stream_t stream;
62  rtnl_state_t state;
63  rtnl_sync_state_t sync_state;
64  int ns_fd;
65  int rtnl_socket;
66  u32 unix_index;
67  u32 rtnl_seq;
68  f64 timeout;
69} rtnl_ns_t;
70
71typedef struct {
72  f64 now;
73  rtnl_ns_t *streams;
74} rtnl_main_t;
75
76static rtnl_main_t rtnl_main;
77static vlib_node_registration_t rtnl_process_node;
78
79#define RTNL_BUFFSIZ 16384
80#define RTNL_DUMP_TIMEOUT 1
81
82static inline u32 grpmask(u32 g)
83{
84  ASSERT (g <= 31);
85  if (g) {
86    return 1 << (g - 1);
87  } else
88    return 0;
89}
90
91
92u8 *format_rtnl_nsname2path(u8 *s, va_list *args)
93{
94  char *nsname = va_arg(*args, char *);
95  if (!nsname || !strlen(nsname)) {
96    return format(s, "/proc/self/ns/net");
97  } else if (strpbrk(nsname, "/") != NULL) {
98    return format(s, "%s", nsname);
99  } else {
100    return format(s, "/var/run/netns/%s", nsname);
101  }
102}
103
104static_always_inline void
105rtnl_schedule_timeout(rtnl_ns_t *ns, f64 when)
106{
107  ns->timeout = when;
108}
109
110static_always_inline void
111rtnl_cancel_timeout(rtnl_ns_t *ns)
112{
113  ns->timeout = DBL_MAX;
114}
115
116static clib_error_t *rtnl_read_cb(struct clib_file * f)
117{
118  rtnl_main_t *rm = &rtnl_main;
119  vlib_main_t *vm = vlib_get_main();
120  rtnl_ns_t *ns = &rm->streams[f->private_data];
121  vlib_process_signal_event(vm, rtnl_process_node.index, RTNL_E_READ, (uword)(ns - rm->streams));
122  return 0;
123}
124
125int rtnl_dump_request(rtnl_ns_t *ns, int type, void *req, size_t len)
126{
127  struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
128  struct nlmsghdr nlh = {
129    .nlmsg_len = NLMSG_LENGTH(len),
130    .nlmsg_type = type,
131    .nlmsg_flags = NLM_F_DUMP|NLM_F_REQUEST,
132    .nlmsg_pid = 0,
133    .nlmsg_seq = ++ns->rtnl_seq,
134  };
135  struct iovec iov[2] = {
136    { .iov_base = &nlh, .iov_len = sizeof(nlh) },
137    { .iov_base = req, .iov_len = len }
138  };
139  struct msghdr msg = {
140    .msg_name = &nladdr,
141    .msg_namelen =  sizeof(nladdr),
142    .msg_iov = iov,
143    .msg_iovlen = 2,
144  };
145  if(sendmsg(ns->rtnl_socket, &msg, 0) < 0)
146    return -1;
147  return 0;
148}
149
150static void rtnl_socket_close(rtnl_ns_t *ns)
151{
152  clib_file_del(&file_main, &file_main.file_pool[ns->unix_index]);
153  close(ns->rtnl_socket);
154}
155
/* Arguments handed to the helper thread that runs a callback in a namespace. */
struct rtnl_thread_exec {
  int fd;              /* namespace fd passed to setns() */
  void *(*fn)(void *); /* callback to run once inside the namespace */
  void *arg;           /* argument passed to fn */
  void **ret;          /* where the value returned by fn is stored */
};
162
163static void *rtnl_exec_in_thread_fn(void *p)
164{
165  struct rtnl_thread_exec *ex = (struct rtnl_thread_exec *) p;
166  if (setns(ex->fd, 0))
167    return (void *) ((uword) (-errno));
168
169  *ex->ret = ex->fn(ex->arg);
170  return NULL;
171}
172
173static int rtnl_exec_in_namespace_byfd(int fd, void *(*fn)(void *), void *arg, void **ret)
174{
175  pthread_t thread;
176  void *thread_ret;
177  struct rtnl_thread_exec ex = {
178    .fd = fd,
179    .fn = fn,
180    .arg = arg,
181    .ret = ret
182  };
183  if(pthread_create(&thread, NULL, rtnl_exec_in_thread_fn, &ex))
184    return -errno;
185
186  if(pthread_join(thread, &thread_ret))
187    return -errno;
188
189  if (thread_ret)
190    return (int) ((uword)thread_ret);
191
192  return 0;
193}
194
195int rtnl_exec_in_namespace(u32 stream_index, void *(*fn)(void *), void *arg, void **ret)
196{
197  rtnl_main_t *rm = &rtnl_main;
198  if (pool_is_free_index(rm->streams, stream_index))
199    return -EBADR;
200
201  rtnl_ns_t *ns = pool_elt_at_index(rm->streams, stream_index);
202  return rtnl_exec_in_namespace_byfd(ns->ns_fd, fn, arg, ret);
203}
204
205int rtnl_exec_in_namespace_by_name(char *nsname, void *(*fn)(void *), void *arg, void **ret)
206{
207  int fd;
208  u8 *s = format((u8 *)0, "%U", format_rtnl_nsname2path, nsname);
209
210  if ((fd = open((char *)s, O_RDONLY)) < 0) {
211    vec_free(s);
212    return -errno;
213  }
214
215  int r = rtnl_exec_in_namespace_byfd(fd, fn, arg, ret);
216  vec_free(s);
217  close(fd);
218  return r;
219}
220
221/* this function is run by the second thread */
222static void *rtnl_thread_fn(void *p)
223{
224  rtnl_ns_t *ns = (rtnl_ns_t *) p;
225  if (setns(ns->ns_fd, 0)) {
226    clib_warning("setns(%d, %d) error %d", ns->ns_fd, CLONE_NEWNET, errno);
227    return (void *) -1;
228  }
229
230  if ((ns->rtnl_socket = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE)) == -1) {
231    clib_warning("Cannot open socket");
232    return (void *) -2;
233  }
234
235  return NULL;
236}
237
238static int rtnl_socket_open(rtnl_ns_t *ns)
239{
240  rtnl_main_t *rm = &rtnl_main;
241  pthread_t thread;
242  void *thread_ret;
243  if(pthread_create(&thread, NULL, rtnl_thread_fn, ns)) {
244    clib_warning("Can't create opening thread");
245    return -1;
246  }
247
248  if(pthread_join(thread, &thread_ret)) {
249    clib_warning("Can't join opening thread");
250    return -2;
251  }
252
253  if (thread_ret) {
254    clib_warning("Could not open netlink socket");
255    return -3;
256  }
257
258  struct sockaddr_nl addr = {
259    .nl_family = AF_NETLINK,
260    .nl_pad = 0,
261    .nl_pid = 0,
262    /*add mpls message group*/
263    .nl_groups = grpmask(RTNLGRP_LINK)| grpmask(RTNLGRP_IPV6_IFADDR) |
264    grpmask(RTNLGRP_IPV4_IFADDR) | grpmask(RTNLGRP_IPV4_ROUTE) |
265    grpmask(RTNLGRP_IPV6_ROUTE) | grpmask(RTNLGRP_NEIGH) |
266    grpmask(RTNLGRP_NOTIFY) | grpmask(RTNLGRP_MPLS_ROUTE),
267  };
268
269  if (bind(ns->rtnl_socket, (struct sockaddr*) &addr, sizeof(addr))) {
270    close(ns->rtnl_socket);
271    return -3;
272  }
273
274  clib_file_t template = {0};
275  template.read_function = rtnl_read_cb;
276  template.file_descriptor = ns->rtnl_socket;
277  template.private_data = (uword) (ns - rm->streams);
278  ns->unix_index = clib_file_add (&file_main, &template);
279  return 0;
280}
281
282static int
283rtnl_rcv_error(rtnl_ns_t *ns, struct nlmsghdr *hdr, int *error)
284{
285  struct nlmsgerr *err = NLMSG_DATA(hdr);
286  size_t datalen = hdr->nlmsg_len - NLMSG_ALIGN(sizeof(*hdr));
287  if(datalen < sizeof(*err))
288    return -1;
289
290  *error = err->error;
291  return 0;
292}
293
294static void
295rtnl_sync_reset(rtnl_ns_t *ns)
296{
297  if (ns->sync_state == RTNL_SS_OPENING)
298    return;
299
300  rtnl_socket_close(ns);
301  ns->sync_state = RTNL_SS_OPENING;
302}
303
304static void
305rtnl_sync_done(rtnl_ns_t *ns)
306{
307  rtnl_main_t *rm = &rtnl_main;
308  struct ifaddrmsg addrmsg;
309  struct rtmsg rtmsg;
310  struct ndmsg ndmsg;
311  switch (ns->sync_state) {
312  case RTNL_SS_OPENING:
313    //Cannot happen here
314    break;
315  case RTNL_SS_LINK:
316    memset(&addrmsg, 0, sizeof(addrmsg));
317    addrmsg.ifa_family = AF_UNSPEC;
318    if(rtnl_dump_request(ns, RTM_GETADDR, &addrmsg, sizeof(addrmsg))) {
319      rtnl_sync_reset(ns);
320      rtnl_schedule_timeout(ns, rm->now + 1);
321      return;
322    }
323    rtnl_schedule_timeout(ns, rm->now + RTNL_DUMP_TIMEOUT);
324    ns->sync_state = RTNL_SS_ADDR;
325    break;
326  case RTNL_SS_ADDR:
327  case RTNL_SS_ROUTE4:
328    memset(&rtmsg, 0, sizeof(rtmsg));
329    rtmsg.rtm_family = (ns->sync_state == RTNL_SS_ADDR)?AF_INET:AF_INET6;
330    rtmsg.rtm_table = RT_TABLE_UNSPEC;
331    if(rtnl_dump_request(ns, RTM_GETROUTE, &rtmsg, sizeof(rtmsg))) {
332      rtnl_sync_reset(ns);
333      rtnl_schedule_timeout(ns, rm->now + 1);
334      return;
335    }
336    rtnl_schedule_timeout(ns, rm->now + RTNL_DUMP_TIMEOUT);
337    ns->sync_state = (ns->sync_state == RTNL_SS_ADDR)?RTNL_SS_ROUTE4:RTNL_SS_ROUTE6;
338    break;
339  case RTNL_SS_ROUTE6:
340    memset(&ndmsg, 0, sizeof(ndmsg));
341    ndmsg.ndm_family = AF_UNSPEC;
342    if(rtnl_dump_request(ns, RTM_GETNEIGH, &ndmsg, sizeof(ndmsg))) {
343      rtnl_sync_reset(ns);
344      rtnl_schedule_timeout(ns, rm->now + 1);
345      return;
346    }
347    rtnl_schedule_timeout(ns, rm->now + RTNL_DUMP_TIMEOUT);
348    ns->sync_state = RTNL_SS_NEIGH;
349    break;
350  case RTNL_SS_NEIGH:
351    ns->state = RTNL_S_READY;
352    ns->sync_state = 0;
353    rtnl_cancel_timeout(ns);
354    break;
355  }
356}
357
358static void
359rtnl_sync_timeout(rtnl_ns_t *ns)
360{
361  rtnl_main_t *rm = &rtnl_main;
362  struct ifinfomsg imsg = {};
363  switch (ns->sync_state) {
364  case RTNL_SS_OPENING:
365    if (rtnl_socket_open(ns)) {
366      rtnl_schedule_timeout(ns, rm->now + 10);
367      return;
368    }
369    imsg.ifi_family = AF_UNSPEC;
370    if (rtnl_dump_request(ns, RTM_GETLINK, &imsg, sizeof(imsg))) {
371      rtnl_sync_reset(ns);
372      rtnl_schedule_timeout(ns, rm->now + 10);
373    }
374    ns->sync_state = RTNL_SS_LINK;
375    rtnl_schedule_timeout(ns, rm->now + 2);
376    break;
377  case RTNL_SS_LINK:
378  case RTNL_SS_ADDR:
379  case RTNL_SS_ROUTE4:
380  case RTNL_SS_ROUTE6:
381  case RTNL_SS_NEIGH:
382    //Timeout happened while synchronizing
383    rtnl_sync_reset(ns);
384    rtnl_schedule_timeout(ns, rm->now + 1);
385    break;
386  }
387}
388
389static int
390rtnl_ns_recv(rtnl_ns_t *ns, struct nlmsghdr *hdr)
391{
392  rtnl_main_t *rm = &rtnl_main;
393  int ret, error = 0;
394
395  if (ns->state == RTNL_S_SYNC &&
396      ((hdr->nlmsg_flags & RTM_F_NOTIFY) ||
397       (hdr->nlmsg_seq != (ns->rtnl_seq)))) {
398    clib_warning("Received notification while in sync. Restart synchronization.");
399    rtnl_sync_reset(ns);
400    rtnl_schedule_timeout(ns, rm->now);
401  }
402
403  switch (hdr->nlmsg_type) {
404  case NLMSG_DONE:
405    rtnl_sync_done(ns);
406    break;
407  case NLMSG_ERROR:
408    if((ret = rtnl_rcv_error(ns, hdr, &error)))
409      return ret;
410    break;
411  case RTM_NEWROUTE:
412  case RTM_DELROUTE:
413  case RTM_NEWLINK:
414  case RTM_DELLINK:
415  case RTM_NEWADDR:
416  case RTM_DELADDR:
417  case RTM_NEWNEIGH:
418  case RTM_DELNEIGH:
419    if (ns->stream.recv_message)
420      ns->stream.recv_message(hdr, ns->stream.opaque);
421    break;
422  default:
423    clib_warning("Unknown rtnetlink type %d", hdr->nlmsg_type);
424    break;
425  }
426  return 0;
427}
428
429static void
430rtnl_process_open(rtnl_ns_t *ns)
431{
432  rtnl_main_t *rm = &rtnl_main;
433  if (ns->state != RTNL_S_INIT)
434    return;
435
436  ns->state = RTNL_S_SYNC;
437  ns->sync_state = RTNL_SS_OPENING;
438  rtnl_schedule_timeout(ns, rm->now);
439}
440
441static void
442rtnl_process_close(rtnl_ns_t *ns)
443{
444  rtnl_main_t *rm = &rtnl_main;
445  if (ns->state == RTNL_S_INIT)
446    return;
447
448  rtnl_socket_close(ns);
449  close(ns->ns_fd);
450  pool_put(rm->streams, ns);
451}
452
453static int
454rtnl_process_read(rtnl_ns_t *ns)
455{
456  uint8_t buff[RTNL_BUFFSIZ];
457  ssize_t len;
458  struct nlmsghdr *hdr;
459  while(1) {
460    if((len = recv(ns->rtnl_socket, buff, RTNL_BUFFSIZ, MSG_DONTWAIT)) < 0) {
461      if(errno != EAGAIN) {
462        clib_warning("rtnetlink recv error (%d) [%s]: %s", ns->rtnl_socket, ns->stream.name, strerror(errno));
463        return -1;
464      }
465      return 0;
466    }
467
468    for(hdr = (struct nlmsghdr *) buff;
469        len > 0;
470        len -= NLMSG_ALIGN(hdr->nlmsg_len),
471          hdr = (struct nlmsghdr *) (((uint8_t *) hdr) + NLMSG_ALIGN(hdr->nlmsg_len))) {
472      if((sizeof(*hdr) > (size_t)len) || (hdr->nlmsg_len > (size_t)len)) {
473        clib_warning("rtnetlink buffer too small (%d Vs %d)", (int) hdr->nlmsg_len, (int) len);
474        return -1;
475      }
476      if (rtnl_ns_recv(ns, hdr))
477        return -1;
478    }
479  }
480  return 0;
481}
482
483static void
484rtnl_process_timeout(rtnl_ns_t *ns)
485{
486  switch (ns->state) {
487  case RTNL_S_SYNC:
488    rtnl_sync_timeout(ns);
489    break;
490  case RTNL_S_INIT:
491  case RTNL_S_READY:
492    clib_warning("Should not happen");
493    break;
494  }
495}
496
497static uword
498rtnl_process (vlib_main_t * vm,
499              vlib_node_runtime_t * node,
500              vlib_frame_t * frame)
501{
502  rtnl_main_t *rm = &rtnl_main;
503  uword event_type;
504  uword *event_data = 0;
505  rm->now = vlib_time_now(vm);
506  f64 timeout = DBL_MAX;
507  rtnl_ns_t *ns;
508
509  //Setting up
510  while (1) {
511    vlib_process_wait_for_event_or_clock(vm, timeout - rm->now);
512    event_type = vlib_process_get_events(vm, &event_data);
513    rm->now = vlib_time_now(vm);
514
515    if (event_type == ~0) { //Clock event or no event
516      pool_foreach(ns, rm->streams, {
517          if (ns->timeout < rm->now) {
518            ns->timeout = DBL_MAX;
519            rtnl_process_timeout(ns);
520          }
521        });
522    } else {
523      rtnl_ns_t *ns;
524      uword *d;
525      vec_foreach(d, event_data) {
526        ns = &rm->streams[d[0]];
527        switch (event_type)
528          {
529          case RTNL_E_CLOSE:
530            rtnl_process_close(ns);
531            break;
532          case RTNL_E_OPEN:
533            rtnl_process_open(ns);
534            break;
535          case RTNL_E_READ:
536            rtnl_process_read(ns);
537            break;
538          }
539      }
540    }
541
542    vec_reset_length (event_data);
543
544    timeout = DBL_MAX;
545    pool_foreach(ns, rm->streams, {
546        if (ns->timeout < timeout)
547          timeout = ns->timeout;
548      });
549  }
550  return frame->n_vectors;
551}
552
553VLIB_REGISTER_NODE(rtnl_process_node, static) = {
554  .function = rtnl_process,
555  .name = "rtnl-process",
556  .type = VLIB_NODE_TYPE_PROCESS,
557};
558
559u32
560rtnl_stream_open(rtnl_stream_t *template)
561{
562  vlib_main_t *vm = vlib_get_main();
563  rtnl_main_t *rm = &rtnl_main;
564  rtnl_ns_t *ns;
565  int fd;
566  u8 *s = format((u8 *)0, "%U", format_rtnl_nsname2path, template->name);
567  vec_add1(s, 0);
568
569  if ((fd = open((char *)s, O_RDONLY)) < 0) {
570    clib_unix_warning("open stream %s: ", s);
571    vec_free(s);
572    return ~0;
573  }
574
575  vec_free(s);
576  pool_get(rm->streams, ns);
577  ns->state = RTNL_S_INIT;
578  ns->ns_fd = fd;
579  ns->stream = *template;
580  vlib_process_signal_event(vm, rtnl_process_node.index, RTNL_E_OPEN, (uword)(ns - rm->streams));
581  return ns - rm->streams;
582}
583
584void
585rtnl_stream_close(u32 stream_index)
586{
587  vlib_main_t *vm = vlib_get_main();
588  rtnl_main_t *rm = &rtnl_main;
589  ASSERT(!pool_is_free_index(rm->streams, stream_index));
590  vlib_process_signal_event(vm, rtnl_process_node.index, RTNL_E_CLOSE, stream_index);
591}
592
593clib_error_t *
594rtnl_init (vlib_main_t * vm)
595{
596  rtnl_main_t *rm = &rtnl_main;
597  rm->streams = 0;
598  return 0;
599}
600
601VLIB_INIT_FUNCTION (rtnl_init);
602