rtnl.c revision 042a782c
1/*
2 * Copyright (c) 2016 Cisco and/or its affiliates.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at:
6 *
7 *     http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16#define _GNU_SOURCE
17#include <sched.h>
18
19#include <librtnl/rtnl.h>
20#include <librtnl/netns.h>
21
22#include <vlib/vlib.h>
23#include <vlib/unix/unix.h>
24#include <vppinfra/error.h>
25
26#include <sys/socket.h>
27#include <linux/netlink.h>
28#include <linux/rtnetlink.h>
29#include <float.h>
30#include <fcntl.h>
31
32#include <sys/types.h>
33#include <sys/wait.h>
34#include <errno.h>
35
36#undef DBL_MAX
37#define DBL_MAX 1000000000.0
38
/*
 * Event types signaled to the rtnl process node.
 * The event data attached to each of them is the index of the stream
 * in rtnl_main.streams.
 */
typedef enum {
  RTNL_E_OPEN  = 0, /* a stream was created and must start synchronizing */
  RTNL_E_CLOSE = 1, /* a stream must be torn down */
  RTNL_E_READ  = 2, /* the netlink socket of a stream is readable */
} rtnl_event_t;
44
/* Top-level life cycle of a stream. */
typedef enum {
  RTNL_S_INIT  = 0, /* created, open event not processed yet */
  RTNL_S_SYNC  = 1, /* dumping the kernel state (see rtnl_sync_state_t) */
  RTNL_S_READY = 2, /* all dumps completed */
} rtnl_state_t;
50
/*
 * Progress of the initial synchronization. Each value names the dump
 * that is currently awaited; RTNL_SS_OPENING means no netlink socket is
 * open yet. rtnl_sync_done() stores a literal 0 once the sync is over,
 * which is RTNL_SS_OPENING.
 */
typedef enum {
  RTNL_SS_OPENING = 0,
  RTNL_SS_LINK    = 1,
  RTNL_SS_ADDR    = 2,
  RTNL_SS_ROUTE4  = 3,
  RTNL_SS_ROUTE6  = 4,
  RTNL_SS_NEIGH   = 5,
} rtnl_sync_state_t;
59
60typedef struct {
61  rtnl_stream_t stream;
62  rtnl_state_t state;
63  rtnl_sync_state_t sync_state;
64  int ns_fd;
65  int rtnl_socket;
66  u32 unix_index;
67  u32 rtnl_seq;
68  f64 timeout;
69} rtnl_ns_t;
70
71typedef struct {
72  f64 now;
73  rtnl_ns_t *streams;
74} rtnl_main_t;
75
76static rtnl_main_t rtnl_main;
77static vlib_node_registration_t rtnl_process_node;
78
79#define RTNL_BUFFSIZ 16384
80#define RTNL_DUMP_TIMEOUT 1
81
82u8 *format_rtnl_nsname2path(u8 *s, va_list *args)
83{
84  char *nsname = va_arg(*args, char *);
85  if (!nsname || !strlen(nsname)) {
86    return format(s, "/proc/self/ns/net");
87  } else if (strpbrk(nsname, "/") != NULL) {
88    return format(s, "%s", nsname);
89  } else {
90    return format(s, "/var/run/netns/%s", nsname);
91  }
92}
93
94static_always_inline void
95rtnl_schedule_timeout(rtnl_ns_t *ns, f64 when)
96{
97  ns->timeout = when;
98}
99
100static_always_inline void
101rtnl_cancel_timeout(rtnl_ns_t *ns)
102{
103  ns->timeout = DBL_MAX;
104}
105
106static clib_error_t *rtnl_read_cb(struct clib_file * f)
107{
108  rtnl_main_t *rm = &rtnl_main;
109  vlib_main_t *vm = vlib_get_main();
110  rtnl_ns_t *ns = &rm->streams[f->private_data];
111  vlib_process_signal_event(vm, rtnl_process_node.index, RTNL_E_READ, (uword)(ns - rm->streams));
112  return 0;
113}
114
115int rtnl_dump_request(rtnl_ns_t *ns, int type, void *req, size_t len)
116{
117  struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
118  struct nlmsghdr nlh = {
119      .nlmsg_len = NLMSG_LENGTH(len),
120      .nlmsg_type = type,
121      .nlmsg_flags = NLM_F_DUMP|NLM_F_REQUEST,
122      .nlmsg_pid = 0,
123      .nlmsg_seq = ++ns->rtnl_seq,
124  };
125  struct iovec iov[2] = {
126      { .iov_base = &nlh, .iov_len = sizeof(nlh) },
127      { .iov_base = req, .iov_len = len }
128  };
129  struct msghdr msg = {
130      .msg_name = &nladdr,
131      .msg_namelen =  sizeof(nladdr),
132      .msg_iov = iov,
133      .msg_iovlen = 2,
134  };
135  if(sendmsg(ns->rtnl_socket, &msg, 0) < 0)
136    return -1;
137  return 0;
138}
139
140static void rtnl_socket_close(rtnl_ns_t *ns)
141{
142  clib_file_del(&file_main, &file_main.file_pool[ns->unix_index]);
143  close(ns->rtnl_socket);
144}
145
/* Arguments handed to the helper thread by rtnl_exec_in_namespace_byfd(). */
struct rtnl_thread_exec {
  int fd;               /* namespace fd passed to setns() */
  void *(*fn)(void *);  /* function to run once inside the namespace */
  void *arg;            /* argument given to fn */
  void **ret;           /* where the value returned by fn is stored */
};
152
153static void *rtnl_exec_in_thread_fn(void *p)
154{
155  struct rtnl_thread_exec *ex = (struct rtnl_thread_exec *) p;
156  if (setns(ex->fd, 0))
157    return (void *) ((uword) (-errno));
158
159  *ex->ret = ex->fn(ex->arg);
160  return NULL;
161}
162
163static int rtnl_exec_in_namespace_byfd(int fd, void *(*fn)(void *), void *arg, void **ret)
164{
165  pthread_t thread;
166  void *thread_ret;
167  struct rtnl_thread_exec ex = {
168      .fd = fd,
169      .fn = fn,
170      .arg = arg,
171      .ret = ret
172  };
173  if(pthread_create(&thread, NULL, rtnl_exec_in_thread_fn, &ex))
174    return -errno;
175
176  if(pthread_join(thread, &thread_ret))
177    return -errno;
178
179  if (thread_ret)
180    return (int) ((uword)thread_ret);
181
182  return 0;
183}
184
185int rtnl_exec_in_namespace(u32 stream_index, void *(*fn)(void *), void *arg, void **ret)
186{
187  rtnl_main_t *rm = &rtnl_main;
188  if (pool_is_free_index(rm->streams, stream_index))
189    return -EBADR;
190
191  rtnl_ns_t *ns = pool_elt_at_index(rm->streams, stream_index);
192  return rtnl_exec_in_namespace_byfd(ns->ns_fd, fn, arg, ret);
193}
194
195int rtnl_exec_in_namespace_by_name(char *nsname, void *(*fn)(void *), void *arg, void **ret)
196{
197  int fd;
198  u8 *s = format((u8 *)0, "%U", format_rtnl_nsname2path, nsname);
199
200  if ((fd = open((char *)s, O_RDONLY)) < 0) {
201    vec_free(s);
202    return -errno;
203  }
204
205  int r = rtnl_exec_in_namespace_byfd(fd, fn, arg, ret);
206  vec_free(s);
207  close(fd);
208  return r;
209}
210
211/* this function is run by the second thread */
212static void *rtnl_thread_fn(void *p)
213{
214  rtnl_ns_t *ns = (rtnl_ns_t *) p;
215  if (setns(ns->ns_fd, 0)) {
216    clib_warning("setns(%d, %d) error %d", ns->ns_fd, CLONE_NEWNET, errno);
217    return (void *) -1;
218  }
219
220  if ((ns->rtnl_socket = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE)) == -1) {
221    clib_warning("Cannot open socket");
222    return (void *) -2;
223  }
224
225  return NULL;
226}
227
228static int rtnl_socket_open(rtnl_ns_t *ns)
229{
230  rtnl_main_t *rm = &rtnl_main;
231  pthread_t thread;
232  void *thread_ret;
233  if(pthread_create(&thread, NULL, rtnl_thread_fn, ns)) {
234    clib_warning("Can't create opening thread");
235    return -1;
236  }
237
238  if(pthread_join(thread, &thread_ret)) {
239    clib_warning("Can't join opening thread");
240    return -2;
241  }
242
243  if (thread_ret) {
244    clib_warning("Could not open netlink socket");
245    return -3;
246  }
247
248  struct sockaddr_nl addr = {
249      .nl_family = AF_NETLINK,
250      .nl_pad = 0,
251      .nl_pid = 0,
252      .nl_groups =
253           RTMGRP_LINK | RTMGRP_IPV6_IFADDR | RTMGRP_IPV4_IFADDR |
254           RTMGRP_IPV4_ROUTE | RTMGRP_IPV6_ROUTE | RTMGRP_NEIGH |
255           RTMGRP_NOTIFY,
256  };
257
258  if (bind(ns->rtnl_socket, (struct sockaddr*) &addr, sizeof(addr))) {
259    close(ns->rtnl_socket);
260    return -3;
261  }
262
263  clib_file_t template = {0};
264  template.read_function = rtnl_read_cb;
265  template.file_descriptor = ns->rtnl_socket;
266  template.private_data = (uword) (ns - rm->streams);
267  ns->unix_index = clib_file_add (&file_main, &template);
268  return 0;
269}
270
271static int
272rtnl_rcv_error(rtnl_ns_t *ns, struct nlmsghdr *hdr, int *error)
273{
274  struct nlmsgerr *err = NLMSG_DATA(hdr);
275  size_t datalen = hdr->nlmsg_len - NLMSG_ALIGN(sizeof(*hdr));
276  if(datalen < sizeof(*err))
277    return -1;
278
279  *error = err->error;
280  return 0;
281}
282
283static void
284rtnl_sync_reset(rtnl_ns_t *ns)
285{
286  if (ns->sync_state == RTNL_SS_OPENING)
287    return;
288
289  rtnl_socket_close(ns);
290  ns->sync_state = RTNL_SS_OPENING;
291}
292
293static void
294rtnl_sync_done(rtnl_ns_t *ns)
295{
296  rtnl_main_t *rm = &rtnl_main;
297  struct ifaddrmsg addrmsg;
298  struct rtmsg rtmsg;
299  struct ndmsg ndmsg;
300  switch (ns->sync_state) {
301    case RTNL_SS_OPENING:
302      //Cannot happen here
303      break;
304    case RTNL_SS_LINK:
305      memset(&addrmsg, 0, sizeof(addrmsg));
306      addrmsg.ifa_family = AF_UNSPEC;
307      if(rtnl_dump_request(ns, RTM_GETADDR, &addrmsg, sizeof(addrmsg))) {
308        rtnl_sync_reset(ns);
309        rtnl_schedule_timeout(ns, rm->now + 1);
310        return;
311      }
312      rtnl_schedule_timeout(ns, rm->now + RTNL_DUMP_TIMEOUT);
313      ns->sync_state = RTNL_SS_ADDR;
314      break;
315    case RTNL_SS_ADDR:
316    case RTNL_SS_ROUTE4:
317      memset(&rtmsg, 0, sizeof(rtmsg));
318      rtmsg.rtm_family = (ns->sync_state == RTNL_SS_ADDR)?AF_INET:AF_INET6;
319      rtmsg.rtm_table = RT_TABLE_UNSPEC;
320      if(rtnl_dump_request(ns, RTM_GETROUTE, &rtmsg, sizeof(rtmsg))) {
321        rtnl_sync_reset(ns);
322        rtnl_schedule_timeout(ns, rm->now + 1);
323        return;
324      }
325      rtnl_schedule_timeout(ns, rm->now + RTNL_DUMP_TIMEOUT);
326      ns->sync_state = (ns->sync_state == RTNL_SS_ADDR)?RTNL_SS_ROUTE4:RTNL_SS_ROUTE6;
327      break;
328    case RTNL_SS_ROUTE6:
329      memset(&ndmsg, 0, sizeof(ndmsg));
330      ndmsg.ndm_family = AF_UNSPEC;
331      if(rtnl_dump_request(ns, RTM_GETNEIGH, &ndmsg, sizeof(ndmsg))) {
332        rtnl_sync_reset(ns);
333        rtnl_schedule_timeout(ns, rm->now + 1);
334        return;
335      }
336      rtnl_schedule_timeout(ns, rm->now + RTNL_DUMP_TIMEOUT);
337      ns->sync_state = RTNL_SS_NEIGH;
338      break;
339    case RTNL_SS_NEIGH:
340      ns->state = RTNL_S_READY;
341      ns->sync_state = 0;
342      rtnl_cancel_timeout(ns);
343      break;
344  }
345}
346
347static void
348rtnl_sync_timeout(rtnl_ns_t *ns)
349{
350  rtnl_main_t *rm = &rtnl_main;
351  struct ifinfomsg imsg = {};
352  switch (ns->sync_state) {
353    case RTNL_SS_OPENING:
354      if (rtnl_socket_open(ns)) {
355        rtnl_schedule_timeout(ns, rm->now + 10);
356        return;
357      }
358      imsg.ifi_family = AF_UNSPEC;
359      if (rtnl_dump_request(ns, RTM_GETLINK, &imsg, sizeof(imsg))) {
360        rtnl_sync_reset(ns);
361        rtnl_schedule_timeout(ns, rm->now + 10);
362      }
363      ns->sync_state = RTNL_SS_LINK;
364      rtnl_schedule_timeout(ns, rm->now + 2);
365      break;
366    case RTNL_SS_LINK:
367    case RTNL_SS_ADDR:
368    case RTNL_SS_ROUTE4:
369    case RTNL_SS_ROUTE6:
370    case RTNL_SS_NEIGH:
371      //Timeout happened while synchronizing
372      rtnl_sync_reset(ns);
373      rtnl_schedule_timeout(ns, rm->now + 1);
374      break;
375  }
376}
377
378static int
379rtnl_ns_recv(rtnl_ns_t *ns, struct nlmsghdr *hdr)
380{
381  rtnl_main_t *rm = &rtnl_main;
382  int ret, error = 0;
383
384  if (ns->state == RTNL_S_SYNC &&
385      ((hdr->nlmsg_flags & RTM_F_NOTIFY) ||
386          (hdr->nlmsg_seq != (ns->rtnl_seq)))) {
387    clib_warning("Received notification while in sync. Restart synchronization.");
388    rtnl_sync_reset(ns);
389    rtnl_schedule_timeout(ns, rm->now);
390  }
391
392  switch (hdr->nlmsg_type) {
393    case NLMSG_DONE:
394      rtnl_sync_done(ns);
395      break;
396    case NLMSG_ERROR:
397      if((ret = rtnl_rcv_error(ns, hdr, &error)))
398        return ret;
399      break;
400    case RTM_NEWROUTE:
401    case RTM_DELROUTE:
402    case RTM_NEWLINK:
403    case RTM_DELLINK:
404    case RTM_NEWADDR:
405    case RTM_DELADDR:
406    case RTM_NEWNEIGH:
407    case RTM_DELNEIGH:
408      if (ns->stream.recv_message)
409        ns->stream.recv_message(hdr, ns->stream.opaque);
410      break;
411    default:
412      clib_warning("Unknown rtnetlink type %d", hdr->nlmsg_type);
413      break;
414  }
415  return 0;
416}
417
418static void
419rtnl_process_open(rtnl_ns_t *ns)
420{
421  rtnl_main_t *rm = &rtnl_main;
422  if (ns->state != RTNL_S_INIT)
423    return;
424
425  ns->state = RTNL_S_SYNC;
426  ns->sync_state = RTNL_SS_OPENING;
427  rtnl_schedule_timeout(ns, rm->now);
428}
429
430static void
431rtnl_process_close(rtnl_ns_t *ns)
432{
433  rtnl_main_t *rm = &rtnl_main;
434  if (ns->state == RTNL_S_INIT)
435    return;
436
437  rtnl_socket_close(ns);
438  close(ns->ns_fd);
439  pool_put(rm->streams, ns);
440}
441
442static int
443rtnl_process_read(rtnl_ns_t *ns)
444{
445  uint8_t buff[RTNL_BUFFSIZ];
446  ssize_t len;
447  struct nlmsghdr *hdr;
448  while(1) {
449    if((len = recv(ns->rtnl_socket, buff, RTNL_BUFFSIZ, MSG_DONTWAIT)) < 0) {
450      if(errno != EAGAIN) {
451        clib_warning("rtnetlink recv error (%d) [%s]: %s", ns->rtnl_socket, ns->stream.name, strerror(errno));
452        return -1;
453      }
454      return 0;
455    }
456
457    for(hdr = (struct nlmsghdr *) buff;
458        len > 0;
459        len -= NLMSG_ALIGN(hdr->nlmsg_len),
460            hdr = (struct nlmsghdr *) (((uint8_t *) hdr) + NLMSG_ALIGN(hdr->nlmsg_len))) {
461      if((sizeof(*hdr) > (size_t)len) || (hdr->nlmsg_len > (size_t)len)) {
462        clib_warning("rtnetlink buffer too small (%d Vs %d)", (int) hdr->nlmsg_len, (int) len);
463        return -1;
464      }
465      if (rtnl_ns_recv(ns, hdr))
466        return -1;
467    }
468  }
469  return 0;
470}
471
472static void
473rtnl_process_timeout(rtnl_ns_t *ns)
474{
475  switch (ns->state) {
476    case RTNL_S_SYNC:
477      rtnl_sync_timeout(ns);
478      break;
479    case RTNL_S_INIT:
480    case RTNL_S_READY:
481      clib_warning("Should not happen");
482      break;
483  }
484}
485
486static uword
487rtnl_process (vlib_main_t * vm,
488              vlib_node_runtime_t * node,
489              vlib_frame_t * frame)
490{
491  rtnl_main_t *rm = &rtnl_main;
492  uword event_type;
493  uword *event_data = 0;
494  rm->now = vlib_time_now(vm);
495  f64 timeout = DBL_MAX;
496  rtnl_ns_t *ns;
497
498  //Setting up
499  while (1) {
500    vlib_process_wait_for_event_or_clock(vm, timeout - rm->now);
501    event_type = vlib_process_get_events(vm, &event_data);
502    rm->now = vlib_time_now(vm);
503
504    if (event_type == ~0) { //Clock event or no event
505      pool_foreach(ns, rm->streams, {
506         if (ns->timeout < rm->now) {
507           ns->timeout = DBL_MAX;
508           rtnl_process_timeout(ns);
509         }
510      });
511    } else {
512      rtnl_ns_t *ns;
513      uword *d;
514      vec_foreach(d, event_data) {
515        ns = &rm->streams[d[0]];
516        switch (event_type)
517        {
518          case RTNL_E_CLOSE:
519            rtnl_process_close(ns);
520            break;
521          case RTNL_E_OPEN:
522            rtnl_process_open(ns);
523            break;
524          case RTNL_E_READ:
525            rtnl_process_read(ns);
526            break;
527        }
528      }
529    }
530
531    vec_reset_length (event_data);
532
533    timeout = DBL_MAX;
534    pool_foreach(ns, rm->streams, {
535        if (ns->timeout < timeout)
536          timeout = ns->timeout;
537    });
538  }
539  return frame->n_vectors;
540}
541
542VLIB_REGISTER_NODE(rtnl_process_node, static) = {
543    .function = rtnl_process,
544    .name = "rtnl-process",
545    .type = VLIB_NODE_TYPE_PROCESS,
546};
547
548u32
549rtnl_stream_open(rtnl_stream_t *template)
550{
551  vlib_main_t *vm = vlib_get_main();
552  rtnl_main_t *rm = &rtnl_main;
553  rtnl_ns_t *ns;
554  int fd;
555  u8 *s = format((u8 *)0, "%U", format_rtnl_nsname2path, template->name);
556  vec_add1(s, 0);
557
558  if ((fd = open((char *)s, O_RDONLY)) < 0) {
559    clib_unix_warning("open stream %s: ", s);
560    vec_free(s);
561    return ~0;
562  }
563
564  vec_free(s);
565  pool_get(rm->streams, ns);
566  ns->state = RTNL_S_INIT;
567  ns->ns_fd = fd;
568  ns->stream = *template;
569  vlib_process_signal_event(vm, rtnl_process_node.index, RTNL_E_OPEN, (uword)(ns - rm->streams));
570  return ns - rm->streams;
571}
572
573void
574rtnl_stream_close(u32 stream_index)
575{
576  vlib_main_t *vm = vlib_get_main();
577  rtnl_main_t *rm = &rtnl_main;
578  ASSERT(!pool_is_free_index(rm->streams, stream_index));
579  vlib_process_signal_event(vm, rtnl_process_node.index, RTNL_E_CLOSE, stream_index);
580}
581
582clib_error_t *
583rtnl_init (vlib_main_t * vm)
584{
585  rtnl_main_t *rm = &rtnl_main;
586  rm->streams = 0;
587  return 0;
588}
589
590VLIB_INIT_FUNCTION (rtnl_init);
591