rtnl.c revision 41d2e788
1/*
2 * Copyright (c) 2016 Cisco and/or its affiliates.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at:
6 *
7 *     http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16#define _GNU_SOURCE
17#include <sched.h>
18
19#include <librtnl/rtnl.h>
20#include <librtnl/netns.h>
21
22#include <vlib/vlib.h>
23#include <vlib/unix/unix.h>
24#include <vppinfra/error.h>
25
26#include <sys/socket.h>
27#include <linux/netlink.h>
28#include <linux/rtnetlink.h>
29#include <float.h>
30#include <fcntl.h>
31
32#include <sys/types.h>
33#include <sys/wait.h>
34
/*
 * Replace the <float.h> DBL_MAX with a much smaller sentinel. In this file
 * the value means "no timeout pending" (rtnl_cancel_timeout) and is the
 * starting point when rtnl_process searches for the earliest timeout.
 * NOTE(review): presumably lowered so that "timeout - now", which is handed
 * to vlib_process_wait_for_event_or_clock, stays a sane interval - confirm.
 */
#undef DBL_MAX
#define DBL_MAX 1000000000.0
37
/*
 * Event codes signalled to the rtnl-process node. The event data that
 * accompanies each of them is the index of the stream concerned.
 */
typedef enum {
  RTNL_E_OPEN = 0,   /* stream was created, start synchronizing it */
  RTNL_E_CLOSE = 1,  /* stream must be torn down */
  RTNL_E_READ = 2,   /* netlink socket of the stream is readable */
} rtnl_event_t;
43
/* Top-level life cycle of a stream. */
typedef enum {
  RTNL_S_INIT = 0,   /* allocated, RTNL_E_OPEN not processed yet */
  RTNL_S_SYNC = 1,   /* opening the socket / running the initial dumps */
  RTNL_S_READY = 2,  /* all dumps completed */
} rtnl_state_t;
49
/*
 * Progress of the initial synchronization. Each value after OPENING names
 * the dump whose completion (NLMSG_DONE) is currently awaited.
 */
typedef enum {
  RTNL_SS_OPENING = 0,  /* netlink socket is not open */
  RTNL_SS_LINK = 1,     /* RTM_GETLINK dump requested */
  RTNL_SS_ADDR = 2,     /* RTM_GETADDR dump requested */
  RTNL_SS_ROUTE4 = 3,   /* RTM_GETROUTE dump requested for AF_INET */
  RTNL_SS_ROUTE6 = 4,   /* RTM_GETROUTE dump requested for AF_INET6 */
  RTNL_SS_NEIGH = 5,    /* RTM_GETNEIGH dump requested */
} rtnl_sync_state_t;
58
/* Per-stream state: one entry of the rtnl_main.streams pool. */
typedef struct {
  rtnl_stream_t stream;          /* copy of the template given to rtnl_stream_open */
  rtnl_state_t state;            /* life cycle state */
  rtnl_sync_state_t sync_state;  /* dump currently awaited while synchronizing */
  int ns_fd;                     /* descriptor opened on the namespace path, used with setns() */
  int rtnl_socket;               /* NETLINK_ROUTE socket created inside the namespace */
  u32 unix_index;                /* value returned by unix_file_add for rtnl_socket */
  u32 rtnl_seq;                  /* sequence number of the last dump request sent */
  f64 timeout;                   /* absolute time of the next timeout, DBL_MAX when none */
} rtnl_ns_t;
69
/* Module-wide state. */
typedef struct {
  f64 now;             /* vlib_time_now() sampled by rtnl_process on each wake-up */
  rtnl_ns_t *streams;  /* pool of streams, indexed by the value rtnl_stream_open returns */
} rtnl_main_t;
74
/* Single instance of the module state. */
static rtnl_main_t rtnl_main;
/* Forward declaration; the node is registered further down with VLIB_REGISTER_NODE. */
static vlib_node_registration_t rtnl_process_node;

/* Size in bytes of the on-stack receive buffer used by rtnl_process_read. */
#define RTNL_BUFFSIZ 16384
/* Delay added to the current time when a dump request is sent during sync. */
#define RTNL_DUMP_TIMEOUT 1
80
81u8 *format_rtnl_nsname2path(u8 *s, va_list *args)
82{
83  char *nsname = va_arg(*args, char *);
84  if (!nsname || !strlen(nsname)) {
85    return format(s, "/proc/self/ns/net");
86  } else if (strpbrk(nsname, "/") != NULL) {
87    return format(s, "%s", nsname);
88  } else {
89    return format(s, "/var/run/netns/%s", nsname);
90  }
91}
92
93static_always_inline void
94rtnl_schedule_timeout(rtnl_ns_t *ns, f64 when)
95{
96  ns->timeout = when;
97}
98
99static_always_inline void
100rtnl_cancel_timeout(rtnl_ns_t *ns)
101{
102  ns->timeout = DBL_MAX;
103}
104
105static clib_error_t *rtnl_read_cb(struct unix_file * f)
106{
107  rtnl_main_t *rm = &rtnl_main;
108  vlib_main_t *vm = vlib_get_main();
109  rtnl_ns_t *ns = &rm->streams[f->private_data];
110  vlib_process_signal_event(vm, rtnl_process_node.index, RTNL_E_READ, (uword)(ns - rm->streams));
111  return 0;
112}
113
114int rtnl_dump_request(rtnl_ns_t *ns, int type, void *req, size_t len)
115{
116  struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
117  struct nlmsghdr nlh = {
118      .nlmsg_len = NLMSG_LENGTH(len),
119      .nlmsg_type = type,
120      .nlmsg_flags = NLM_F_DUMP|NLM_F_REQUEST,
121      .nlmsg_pid = 0,
122      .nlmsg_seq = ++ns->rtnl_seq,
123  };
124  struct iovec iov[2] = {
125      { .iov_base = &nlh, .iov_len = sizeof(nlh) },
126      { .iov_base = req, .iov_len = len }
127  };
128  struct msghdr msg = {
129      .msg_name = &nladdr,
130      .msg_namelen =  sizeof(nladdr),
131      .msg_iov = iov,
132      .msg_iovlen = 2,
133  };
134  if(sendmsg(ns->rtnl_socket, &msg, 0) < 0)
135    return -1;
136  return 0;
137}
138
139static void rtnl_socket_close(rtnl_ns_t *ns)
140{
141  unix_file_del(&unix_main, &unix_main.file_pool[ns->unix_index]);
142  close(ns->rtnl_socket);
143}
144
/* Arguments handed to rtnl_exec_in_thread_fn through pthread_create. */
struct rtnl_thread_exec {
  int fd;               /* namespace descriptor passed to setns() */
  void *(*fn)(void *);  /* function to run once inside the namespace */
  void *arg;            /* argument passed to fn */
  void **ret;           /* where the return value of fn is stored */
};
151
152static void *rtnl_exec_in_thread_fn(void *p)
153{
154  struct rtnl_thread_exec *ex = (struct rtnl_thread_exec *) p;
155  if (setns(ex->fd, 0))
156    return (void *) ((uword) (-errno));
157
158  *ex->ret = ex->fn(ex->arg);
159  return NULL;
160}
161
162static int rtnl_exec_in_namespace_byfd(int fd, void *(*fn)(void *), void *arg, void **ret)
163{
164  pthread_t thread;
165  void *thread_ret;
166  struct rtnl_thread_exec ex = {
167      .fd = fd,
168      .fn = fn,
169      .arg = arg,
170      .ret = ret
171  };
172  if(pthread_create(&thread, NULL, rtnl_exec_in_thread_fn, &ex))
173    return -errno;
174
175  if(pthread_join(thread, &thread_ret))
176    return -errno;
177
178  if (thread_ret)
179    return (int) ((uword)thread_ret);
180
181  return 0;
182}
183
184int rtnl_exec_in_namespace(u32 stream_index, void *(*fn)(void *), void *arg, void **ret)
185{
186  rtnl_main_t *rm = &rtnl_main;
187  if (pool_is_free_index(rm->streams, stream_index))
188    return -EBADR;
189
190  rtnl_ns_t *ns = pool_elt_at_index(rm->streams, stream_index);
191  return rtnl_exec_in_namespace_byfd(ns->ns_fd, fn, arg, ret);
192}
193
194int rtnl_exec_in_namespace_by_name(char *nsname, void *(*fn)(void *), void *arg, void **ret)
195{
196  int fd;
197  u8 *s = format((u8 *)0, "%U", format_rtnl_nsname2path, nsname);
198
199  if ((fd = open((char *)s, O_RDONLY)) < 0) {
200    vec_free(s);
201    return -errno;
202  }
203
204  int r = rtnl_exec_in_namespace_byfd(fd, fn, arg, ret);
205  vec_free(s);
206  close(fd);
207  return r;
208}
209
210/* this function is run by the second thread */
211static void *rtnl_thread_fn(void *p)
212{
213  rtnl_ns_t *ns = (rtnl_ns_t *) p;
214  if (setns(ns->ns_fd, 0)) {
215    clib_warning("setns(%d, %d) error %d", ns->ns_fd, CLONE_NEWNET, errno);
216    return (void *) -1;
217  }
218
219  if ((ns->rtnl_socket = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE)) == -1) {
220    clib_warning("Cannot open socket");
221    return (void *) -2;
222  }
223
224  return NULL;
225}
226
227static int rtnl_socket_open(rtnl_ns_t *ns)
228{
229  rtnl_main_t *rm = &rtnl_main;
230  pthread_t thread;
231  void *thread_ret;
232  if(pthread_create(&thread, NULL, rtnl_thread_fn, ns)) {
233    clib_warning("Can't create opening thread");
234    return -1;
235  }
236
237  if(pthread_join(thread, &thread_ret)) {
238    clib_warning("Can't join opening thread");
239    return -2;
240  }
241
242  if (thread_ret) {
243    clib_warning("Could not open netlink socket");
244    return -3;
245  }
246
247  struct sockaddr_nl addr = {
248      .nl_family = AF_NETLINK,
249      .nl_pad = 0,
250      .nl_pid = 0,
251      .nl_groups =
252           RTMGRP_LINK | RTMGRP_IPV6_IFADDR | RTMGRP_IPV4_IFADDR |
253           RTMGRP_IPV4_ROUTE | RTMGRP_IPV6_ROUTE | RTMGRP_NEIGH |
254           RTMGRP_NOTIFY,
255  };
256
257  if (bind(ns->rtnl_socket, (struct sockaddr*) &addr, sizeof(addr))) {
258    close(ns->rtnl_socket);
259    return -3;
260  }
261
262  unix_file_t template = {0};
263  template.read_function = rtnl_read_cb;
264  template.file_descriptor = ns->rtnl_socket;
265  template.private_data = (uword) (ns - rm->streams);
266  ns->unix_index = unix_file_add (&unix_main, &template);
267  return 0;
268}
269
270static int
271rtnl_rcv_error(rtnl_ns_t *ns, struct nlmsghdr *hdr, int *error)
272{
273  struct nlmsgerr *err = NLMSG_DATA(hdr);
274  size_t datalen = hdr->nlmsg_len - NLMSG_ALIGN(sizeof(*hdr));
275  if(datalen < sizeof(*err))
276    return -1;
277
278  *error = err->error;
279  return 0;
280}
281
282static void
283rtnl_sync_reset(rtnl_ns_t *ns)
284{
285  if (ns->sync_state == RTNL_SS_OPENING)
286    return;
287
288  rtnl_socket_close(ns);
289  ns->sync_state = RTNL_SS_OPENING;
290}
291
292static void
293rtnl_sync_done(rtnl_ns_t *ns)
294{
295  rtnl_main_t *rm = &rtnl_main;
296  struct ifaddrmsg addrmsg;
297  struct rtmsg rtmsg;
298  struct ndmsg ndmsg;
299  switch (ns->sync_state) {
300    case RTNL_SS_OPENING:
301      //Cannot happen here
302      break;
303    case RTNL_SS_LINK:
304      memset(&addrmsg, 0, sizeof(addrmsg));
305      addrmsg.ifa_family = AF_UNSPEC;
306      if(rtnl_dump_request(ns, RTM_GETADDR, &addrmsg, sizeof(addrmsg))) {
307        rtnl_sync_reset(ns);
308        rtnl_schedule_timeout(ns, rm->now + 1);
309        return;
310      }
311      rtnl_schedule_timeout(ns, rm->now + RTNL_DUMP_TIMEOUT);
312      ns->sync_state = RTNL_SS_ADDR;
313      break;
314    case RTNL_SS_ADDR:
315    case RTNL_SS_ROUTE4:
316      memset(&rtmsg, 0, sizeof(rtmsg));
317      rtmsg.rtm_family = (ns->sync_state == RTNL_SS_ADDR)?AF_INET:AF_INET6;
318      rtmsg.rtm_table = RT_TABLE_UNSPEC;
319      if(rtnl_dump_request(ns, RTM_GETROUTE, &rtmsg, sizeof(rtmsg))) {
320        rtnl_sync_reset(ns);
321        rtnl_schedule_timeout(ns, rm->now + 1);
322        return;
323      }
324      rtnl_schedule_timeout(ns, rm->now + RTNL_DUMP_TIMEOUT);
325      ns->sync_state = (ns->sync_state == RTNL_SS_ADDR)?RTNL_SS_ROUTE4:RTNL_SS_ROUTE6;
326      break;
327    case RTNL_SS_ROUTE6:
328      memset(&ndmsg, 0, sizeof(ndmsg));
329      ndmsg.ndm_family = AF_UNSPEC;
330      if(rtnl_dump_request(ns, RTM_GETNEIGH, &ndmsg, sizeof(ndmsg))) {
331        rtnl_sync_reset(ns);
332        rtnl_schedule_timeout(ns, rm->now + 1);
333        return;
334      }
335      rtnl_schedule_timeout(ns, rm->now + RTNL_DUMP_TIMEOUT);
336      ns->sync_state = RTNL_SS_NEIGH;
337      break;
338    case RTNL_SS_NEIGH:
339      ns->state = RTNL_S_READY;
340      ns->sync_state = 0;
341      rtnl_cancel_timeout(ns);
342      break;
343  }
344}
345
346static void
347rtnl_sync_timeout(rtnl_ns_t *ns)
348{
349  rtnl_main_t *rm = &rtnl_main;
350  struct ifinfomsg imsg = {};
351  switch (ns->sync_state) {
352    case RTNL_SS_OPENING:
353      if (rtnl_socket_open(ns)) {
354        rtnl_schedule_timeout(ns, rm->now + 10);
355        return;
356      }
357      imsg.ifi_family = AF_UNSPEC;
358      if (rtnl_dump_request(ns, RTM_GETLINK, &imsg, sizeof(imsg))) {
359        rtnl_sync_reset(ns);
360        rtnl_schedule_timeout(ns, rm->now + 10);
361      }
362      ns->sync_state = RTNL_SS_LINK;
363      rtnl_schedule_timeout(ns, rm->now + 2);
364      break;
365    case RTNL_SS_LINK:
366    case RTNL_SS_ADDR:
367    case RTNL_SS_ROUTE4:
368    case RTNL_SS_ROUTE6:
369    case RTNL_SS_NEIGH:
370      //Timeout happened while synchronizing
371      rtnl_sync_reset(ns);
372      rtnl_schedule_timeout(ns, rm->now + 1);
373      break;
374  }
375}
376
377static int
378rtnl_ns_recv(rtnl_ns_t *ns, struct nlmsghdr *hdr)
379{
380  rtnl_main_t *rm = &rtnl_main;
381  int ret, error = 0;
382
383  if (ns->state == RTNL_S_SYNC &&
384      ((hdr->nlmsg_flags & RTM_F_NOTIFY) ||
385          (hdr->nlmsg_seq != (ns->rtnl_seq)))) {
386    clib_warning("Received notification while in sync. Restart synchronization.");
387    rtnl_sync_reset(ns);
388    rtnl_schedule_timeout(ns, rm->now);
389  }
390
391  switch (hdr->nlmsg_type) {
392    case NLMSG_DONE:
393      rtnl_sync_done(ns);
394      break;
395    case NLMSG_ERROR:
396      if((ret = rtnl_rcv_error(ns, hdr, &error)))
397        return ret;
398      break;
399    case RTM_NEWROUTE:
400    case RTM_DELROUTE:
401    case RTM_NEWLINK:
402    case RTM_DELLINK:
403    case RTM_NEWADDR:
404    case RTM_DELADDR:
405    case RTM_NEWNEIGH:
406    case RTM_DELNEIGH:
407      if (ns->stream.recv_message)
408        ns->stream.recv_message(hdr, ns->stream.opaque);
409      break;
410    default:
411      clib_warning("Unknown rtnetlink type %d", hdr->nlmsg_type);
412      break;
413  }
414  return 0;
415}
416
417static void
418rtnl_process_open(rtnl_ns_t *ns)
419{
420  rtnl_main_t *rm = &rtnl_main;
421  if (ns->state != RTNL_S_INIT)
422    return;
423
424  ns->state = RTNL_S_SYNC;
425  ns->sync_state = RTNL_SS_OPENING;
426  rtnl_schedule_timeout(ns, rm->now);
427}
428
429static void
430rtnl_process_close(rtnl_ns_t *ns)
431{
432  rtnl_main_t *rm = &rtnl_main;
433  if (ns->state == RTNL_S_INIT)
434    return;
435
436  rtnl_socket_close(ns);
437  close(ns->ns_fd);
438  pool_put(rm->streams, ns);
439}
440
441static int
442rtnl_process_read(rtnl_ns_t *ns)
443{
444  uint8_t buff[RTNL_BUFFSIZ];
445  ssize_t len;
446  struct nlmsghdr *hdr;
447  while(1) {
448    if((len = recv(ns->rtnl_socket, buff, RTNL_BUFFSIZ, MSG_DONTWAIT)) < 0) {
449      if(errno != EAGAIN) {
450        clib_warning("rtnetlink recv error (%d) [%s]: %s", ns->rtnl_socket, ns->stream.name, strerror(errno));
451        return -1;
452      }
453      return 0;
454    }
455
456    for(hdr = (struct nlmsghdr *) buff;
457        len > 0;
458        len -= NLMSG_ALIGN(hdr->nlmsg_len),
459            hdr = (struct nlmsghdr *) (((uint8_t *) hdr) + NLMSG_ALIGN(hdr->nlmsg_len))) {
460      if((sizeof(*hdr) > (size_t)len) || (hdr->nlmsg_len > (size_t)len)) {
461        clib_warning("rtnetlink buffer too small (%d Vs %d)", (int) hdr->nlmsg_len, (int) len);
462        return -1;
463      }
464      if (rtnl_ns_recv(ns, hdr))
465        return -1;
466    }
467  }
468  return 0;
469}
470
471static void
472rtnl_process_timeout(rtnl_ns_t *ns)
473{
474  switch (ns->state) {
475    case RTNL_S_SYNC:
476      rtnl_sync_timeout(ns);
477      break;
478    case RTNL_S_INIT:
479    case RTNL_S_READY:
480      clib_warning("Should not happen");
481      break;
482  }
483}
484
485static uword
486rtnl_process (vlib_main_t * vm,
487              vlib_node_runtime_t * node,
488              vlib_frame_t * frame)
489{
490  rtnl_main_t *rm = &rtnl_main;
491  uword event_type;
492  uword *event_data = 0;
493  rm->now = vlib_time_now(vm);
494  f64 timeout = DBL_MAX;
495  rtnl_ns_t *ns;
496
497  //Setting up
498  while (1) {
499    vlib_process_wait_for_event_or_clock(vm, timeout - rm->now);
500    event_type = vlib_process_get_events(vm, &event_data);
501    rm->now = vlib_time_now(vm);
502
503    if (event_type == ~0) { //Clock event or no event
504      pool_foreach(ns, rm->streams, {
505         if (ns->timeout < rm->now) {
506           ns->timeout = DBL_MAX;
507           rtnl_process_timeout(ns);
508         }
509      });
510    } else {
511      rtnl_ns_t *ns;
512      uword *d;
513      vec_foreach(d, event_data) {
514        ns = &rm->streams[d[0]];
515        switch (event_type)
516        {
517          case RTNL_E_CLOSE:
518            rtnl_process_close(ns);
519            break;
520          case RTNL_E_OPEN:
521            rtnl_process_open(ns);
522            break;
523          case RTNL_E_READ:
524            rtnl_process_read(ns);
525            break;
526        }
527      }
528    }
529
530    vec_reset_length (event_data);
531
532    timeout = DBL_MAX;
533    pool_foreach(ns, rm->streams, {
534        if (ns->timeout < timeout)
535          timeout = ns->timeout;
536    });
537  }
538  return frame->n_vectors;
539}
540
/*
 * Register rtnl_process as a process-type node named "rtnl-process".
 * Events are delivered to it through rtnl_process_node.index.
 */
VLIB_REGISTER_NODE(rtnl_process_node, static) = {
    .function = rtnl_process,
    .name = "rtnl-process",
    .type = VLIB_NODE_TYPE_PROCESS,
};
546
547u32
548rtnl_stream_open(rtnl_stream_t *template)
549{
550  vlib_main_t *vm = vlib_get_main();
551  rtnl_main_t *rm = &rtnl_main;
552  rtnl_ns_t *ns;
553  int fd;
554  u8 *s = format((u8 *)0, "%U", format_rtnl_nsname2path, template->name);
555  vec_add1(s, 0);
556
557  if ((fd = open((char *)s, O_RDONLY)) < 0) {
558    clib_unix_warning("open stream %s: ", s);
559    vec_free(s);
560    return ~0;
561  }
562
563  vec_free(s);
564  pool_get(rm->streams, ns);
565  ns->state = RTNL_S_INIT;
566  ns->ns_fd = fd;
567  ns->stream = *template;
568  vlib_process_signal_event(vm, rtnl_process_node.index, RTNL_E_OPEN, (uword)(ns - rm->streams));
569  return ns - rm->streams;
570}
571
572void
573rtnl_stream_close(u32 stream_index)
574{
575  vlib_main_t *vm = vlib_get_main();
576  rtnl_main_t *rm = &rtnl_main;
577  ASSERT(!pool_is_free_index(rm->streams, stream_index));
578  vlib_process_signal_event(vm, rtnl_process_node.index, RTNL_E_CLOSE, stream_index);
579}
580
581clib_error_t *
582rtnl_init (vlib_main_t * vm)
583{
584  rtnl_main_t *rm = &rtnl_main;
585  rm->streams = 0;
586  return 0;
587}
588
589VLIB_INIT_FUNCTION (rtnl_init);
590