rtnl.c revision 83ddb4c4
1/*
2 * Copyright (c) 2016 Cisco and/or its affiliates.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at:
6 *
7 *     http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16#define _GNU_SOURCE
17#include <sched.h>
18
19#include <librtnl/rtnl.h>
20#include <librtnl/netns.h>
21
22#include <vlib/vlib.h>
23#include <vlib/unix/unix.h>
24#include <vppinfra/error.h>
25
26#include <sys/socket.h>
27#include <linux/netlink.h>
28#include <linux/rtnetlink.h>
29#include <float.h>
30#include <fcntl.h>
31
32#include <sys/types.h>
33#include <sys/wait.h>
34
/*
 * Event types signalled to the rtnl process node.
 * Each event carries a pointer to the rtnl_ns_t it applies to.
 */
typedef enum {
  RTNL_E_OPEN  = 0, /* sent by rtnl_stream_open() */
  RTNL_E_CLOSE = 1, /* sent by rtnl_stream_close() */
  RTNL_E_READ  = 2, /* sent by rtnl_read_cb() */
} rtnl_event_t;
40
/* Top-level state of one namespace stream. */
typedef enum {
  RTNL_S_INIT  = 0, /* allocated, open event not processed yet */
  RTNL_S_SYNC  = 1, /* running the sequence of dump requests */
  RTNL_S_READY = 2, /* last dump of the sequence completed */
} rtnl_state_t;
46
/*
 * Step reached within RTNL_S_SYNC. Each value other than OPENING names
 * the dump whose NLMSG_DONE is currently awaited.
 */
typedef enum {
  RTNL_SS_OPENING = 0, /* no dump in flight; socket is opened on timeout */
  RTNL_SS_LINK    = 1, /* RTM_GETLINK */
  RTNL_SS_ADDR    = 2, /* RTM_GETADDR */
  RTNL_SS_ROUTE4  = 3, /* RTM_GETROUTE, AF_INET */
  RTNL_SS_ROUTE6  = 4, /* RTM_GETROUTE, AF_INET6 */
  RTNL_SS_NEIGH   = 5, /* RTM_GETNEIGH */
} rtnl_sync_state_t;
55
56typedef struct {
57  rtnl_stream_t stream;
58  rtnl_state_t state;
59  rtnl_sync_state_t sync_state;
60  int ns_fd;
61  int rtnl_socket;
62  u32 unix_index;
63  u32 rtnl_seq;
64  f64 timeout;
65} rtnl_ns_t;
66
67typedef struct {
68  f64 now;
69  rtnl_ns_t *streams;
70} rtnl_main_t;
71
72static rtnl_main_t rtnl_main;
73static vlib_node_registration_t rtnl_process_node;
74
75#define RTNL_BUFFSIZ 16384
76#define RTNL_DUMP_TIMEOUT 1
77
78u8 *format_rtnl_nsname2path(u8 *s, va_list *args)
79{
80  char *nsname = va_arg(*args, char *);
81  if (!nsname || !strlen(nsname)) {
82    return format(s, "/proc/self/ns/net");
83  } else if (strpbrk(nsname, "/") != NULL) {
84    return format(s, "%s", nsname);
85  } else {
86    return format((u8 *)0, "/var/run/netns/%s", nsname);
87  }
88}
89
90static_always_inline void
91rtnl_schedule_timeout(rtnl_ns_t *ns, f64 when)
92{
93  ns->timeout = when;
94}
95
96static_always_inline void
97rtnl_cancel_timeout(rtnl_ns_t *ns)
98{
99  ns->timeout = DBL_MAX;
100}
101
102static clib_error_t *rtnl_read_cb(struct unix_file * f)
103{
104  vlib_main_t *vm = vlib_get_main();
105  rtnl_ns_t *ns = (rtnl_ns_t *) f->private_data;
106  vlib_process_signal_event_pointer(vm, rtnl_process_node.index, RTNL_E_READ, ns);
107  return 0;
108}
109
110int rtnl_dump_request(rtnl_ns_t *ns, int type, void *req, size_t len)
111{
112  struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
113  struct nlmsghdr nlh = {
114      .nlmsg_len = NLMSG_LENGTH(len),
115      .nlmsg_type = type,
116      .nlmsg_flags = NLM_F_DUMP|NLM_F_REQUEST,
117      .nlmsg_pid = 0,
118      .nlmsg_seq = ++ns->rtnl_seq,
119  };
120  struct iovec iov[2] = {
121      { .iov_base = &nlh, .iov_len = sizeof(nlh) },
122      { .iov_base = req, .iov_len = len }
123  };
124  struct msghdr msg = {
125      .msg_name = &nladdr,
126      .msg_namelen =  sizeof(nladdr),
127      .msg_iov = iov,
128      .msg_iovlen = 2,
129  };
130  if(sendmsg(ns->rtnl_socket, &msg, 0) < 0)
131    return -1;
132  return 0;
133}
134
135static void rtnl_socket_close(rtnl_ns_t *ns)
136{
137  unix_file_del(&unix_main, &unix_main.file_pool[ns->unix_index]);
138  close(ns->rtnl_socket);
139}
140
/* Arguments handed to rtnl_exec_in_thread_fn() through pthread_create(). */
struct rtnl_thread_exec {
  int    fd;             /* fd passed to setns() */
  void *(*fn)(void *);   /* function to call once setns() succeeded */
  void  *arg;            /* argument passed to fn */
  void **ret;            /* where the value returned by fn is stored */
};
147
148static void *rtnl_exec_in_thread_fn(void *p)
149{
150  struct rtnl_thread_exec *ex = (struct rtnl_thread_exec *) p;
151  if (setns(ex->fd, 0))
152    return (void *) ((uword) (-errno));
153
154  *ex->ret = ex->fn(ex->arg);
155  return NULL;
156}
157
158static int rtnl_exec_in_namespace_byfd(int fd, void *(*fn)(void *), void *arg, void **ret)
159{
160  pthread_t thread;
161  void *thread_ret;
162  struct rtnl_thread_exec ex = {
163      .fd = fd,
164      .fn = fn,
165      .arg = arg,
166      .ret = ret
167  };
168  if(pthread_create(&thread, NULL, rtnl_exec_in_thread_fn, &ex))
169    return -errno;
170
171  if(pthread_join(thread, &thread_ret))
172    return -errno;
173
174  if (thread_ret)
175    return (int) ((uword)thread_ret);
176
177  return 0;
178}
179
180int rtnl_exec_in_namespace(u32 stream_index, void *(*fn)(void *), void *arg, void **ret)
181{
182  rtnl_main_t *rm = &rtnl_main;
183  if (pool_is_free_index(rm->streams, stream_index))
184    return -EBADR;
185
186  rtnl_ns_t *ns = pool_elt_at_index(rm->streams, stream_index);
187  return rtnl_exec_in_namespace_byfd(ns->ns_fd, fn, arg, ret);
188}
189
190int rtnl_exec_in_namespace_by_name(char *nsname, void *(*fn)(void *), void *arg, void **ret)
191{
192  int fd;
193  u8 *s = format((u8 *)0, "%U", format_rtnl_nsname2path, nsname);
194
195  if ((fd = open((char *)s, O_RDONLY)) < 0) {
196    vec_free(s);
197    return -errno;
198  }
199
200  int r = rtnl_exec_in_namespace_byfd(fd, fn, arg, ret);
201  vec_free(s);
202  close(fd);
203  return r;
204}
205
206/* this function is run by the second thread */
207static void *rtnl_thread_fn(void *p)
208{
209  rtnl_ns_t *ns = (rtnl_ns_t *) p;
210  if (setns(ns->ns_fd, 0)) {
211    clib_warning("setns(%d, %d) error %d", ns->ns_fd, CLONE_NEWNET, errno);
212    return (void *) -1;
213  }
214
215  if ((ns->rtnl_socket = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE)) == -1) {
216    clib_warning("Cannot open socket");
217    return (void *) -2;
218  }
219
220  return NULL;
221}
222
223static int rtnl_socket_open(rtnl_ns_t *ns)
224{
225  pthread_t thread;
226  void *thread_ret;
227  if(pthread_create(&thread, NULL, rtnl_thread_fn, ns)) {
228    clib_warning("Can't create opening thread");
229    return -1;
230  }
231
232  if(pthread_join(thread, &thread_ret)) {
233    clib_warning("Can't join opening thread");
234    return -2;
235  }
236
237  if (thread_ret) {
238    clib_warning("Could not open netlink socket");
239    return -3;
240  }
241
242  struct sockaddr_nl addr = {
243      .nl_family = AF_NETLINK,
244      .nl_pad = 0,
245      .nl_pid = 0,
246      .nl_groups =
247           RTMGRP_LINK | RTMGRP_IPV6_IFADDR | RTMGRP_IPV4_IFADDR |
248           RTMGRP_IPV4_ROUTE | RTMGRP_IPV6_ROUTE | RTMGRP_NEIGH |
249           RTMGRP_NOTIFY,
250  };
251
252  if (bind(ns->rtnl_socket, (struct sockaddr*) &addr, sizeof(addr))) {
253    close(ns->rtnl_socket);
254    return -3;
255  }
256
257  unix_file_t template = {0};
258  template.read_function = rtnl_read_cb;
259  template.file_descriptor = ns->rtnl_socket;
260  template.private_data = (uword) ns;
261  ns->unix_index = unix_file_add (&unix_main, &template);
262  return 0;
263}
264
265static int
266rtnl_rcv_error(rtnl_ns_t *ns, struct nlmsghdr *hdr, int *error)
267{
268  struct nlmsgerr *err = NLMSG_DATA(hdr);
269  size_t datalen = hdr->nlmsg_len - NLMSG_ALIGN(sizeof(*hdr));
270  if(datalen < sizeof(*err))
271    return -1;
272
273  *error = err->error;
274  return 0;
275}
276
277static void
278rtnl_sync_reset(rtnl_ns_t *ns)
279{
280  if (ns->sync_state == RTNL_SS_OPENING)
281    return;
282
283  rtnl_socket_close(ns);
284  ns->sync_state = RTNL_SS_OPENING;
285}
286
287static void
288rtnl_sync_done(rtnl_ns_t *ns)
289{
290  rtnl_main_t *rm = &rtnl_main;
291  struct ifaddrmsg addrmsg;
292  struct rtmsg rtmsg;
293  struct ndmsg ndmsg;
294  switch (ns->sync_state) {
295    case RTNL_SS_OPENING:
296      //Cannot happen here
297      break;
298    case RTNL_SS_LINK:
299      memset(&addrmsg, 0, sizeof(addrmsg));
300      addrmsg.ifa_family = AF_UNSPEC;
301      if(rtnl_dump_request(ns, RTM_GETADDR, &addrmsg, sizeof(addrmsg))) {
302        rtnl_sync_reset(ns);
303        rtnl_schedule_timeout(ns, rm->now + 1);
304        return;
305      }
306      rtnl_schedule_timeout(ns, rm->now + RTNL_DUMP_TIMEOUT);
307      ns->sync_state = RTNL_SS_ADDR;
308      break;
309    case RTNL_SS_ADDR:
310    case RTNL_SS_ROUTE4:
311      memset(&rtmsg, 0, sizeof(rtmsg));
312      rtmsg.rtm_family = (ns->sync_state == RTNL_SS_ADDR)?AF_INET:AF_INET6;
313      rtmsg.rtm_table = RT_TABLE_UNSPEC;
314      if(rtnl_dump_request(ns, RTM_GETROUTE, &rtmsg, sizeof(rtmsg))) {
315        rtnl_sync_reset(ns);
316        rtnl_schedule_timeout(ns, rm->now + 1);
317        return;
318      }
319      rtnl_schedule_timeout(ns, rm->now + RTNL_DUMP_TIMEOUT);
320      ns->sync_state = (ns->sync_state == RTNL_SS_ADDR)?RTNL_SS_ROUTE4:RTNL_SS_ROUTE6;
321      break;
322    case RTNL_SS_ROUTE6:
323      memset(&ndmsg, 0, sizeof(ndmsg));
324      ndmsg.ndm_family = AF_UNSPEC;
325      if(rtnl_dump_request(ns, RTM_GETNEIGH, &ndmsg, sizeof(ndmsg))) {
326        rtnl_sync_reset(ns);
327        rtnl_schedule_timeout(ns, rm->now + 1);
328        return;
329      }
330      rtnl_schedule_timeout(ns, rm->now + RTNL_DUMP_TIMEOUT);
331      ns->sync_state = RTNL_SS_NEIGH;
332      break;
333    case RTNL_SS_NEIGH:
334      ns->state = RTNL_S_READY;
335      ns->sync_state = 0;
336      rtnl_cancel_timeout(ns);
337      break;
338  }
339}
340
341static void
342rtnl_sync_timeout(rtnl_ns_t *ns)
343{
344  rtnl_main_t *rm = &rtnl_main;
345  struct ifinfomsg imsg = {};
346  switch (ns->sync_state) {
347    case RTNL_SS_OPENING:
348      if (rtnl_socket_open(ns)) {
349        rtnl_schedule_timeout(ns, rm->now + 10);
350        return;
351      }
352      imsg.ifi_family = AF_UNSPEC;
353      if (rtnl_dump_request(ns, RTM_GETLINK, &imsg, sizeof(imsg))) {
354        rtnl_sync_reset(ns);
355        rtnl_schedule_timeout(ns, rm->now + 10);
356      }
357      ns->sync_state = RTNL_SS_LINK;
358      rtnl_schedule_timeout(ns, rm->now + 2);
359      break;
360    case RTNL_SS_LINK:
361    case RTNL_SS_ADDR:
362    case RTNL_SS_ROUTE4:
363    case RTNL_SS_ROUTE6:
364    case RTNL_SS_NEIGH:
365      //Timeout happened while synchronizing
366      rtnl_sync_reset(ns);
367      rtnl_schedule_timeout(ns, rm->now + 1);
368      break;
369  }
370}
371
372static int
373rtnl_ns_recv(rtnl_ns_t *ns, struct nlmsghdr *hdr)
374{
375  rtnl_main_t *rm = &rtnl_main;
376  int ret, error = 0;
377
378  if (ns->state == RTNL_S_SYNC &&
379      ((hdr->nlmsg_flags & RTM_F_NOTIFY) ||
380          (hdr->nlmsg_seq != (ns->rtnl_seq)))) {
381    clib_warning("Received notification while in sync. Restart synchronization.");
382    rtnl_sync_reset(ns);
383    rtnl_schedule_timeout(ns, rm->now);
384  }
385
386  switch (hdr->nlmsg_type) {
387    case NLMSG_DONE:
388      rtnl_sync_done(ns);
389      break;
390    case NLMSG_ERROR:
391      if((ret = rtnl_rcv_error(ns, hdr, &error)))
392        return ret;
393      break;
394    case RTM_NEWROUTE:
395    case RTM_DELROUTE:
396    case RTM_NEWLINK:
397    case RTM_DELLINK:
398    case RTM_NEWADDR:
399    case RTM_DELADDR:
400    case RTM_NEWNEIGH:
401    case RTM_DELNEIGH:
402      if (ns->stream.recv_message)
403        ns->stream.recv_message(hdr, ns->stream.opaque);
404      break;
405    default:
406      clib_warning("Unknown rtnetlink type %d", hdr->nlmsg_type);
407      break;
408  }
409  return 0;
410}
411
412static void
413rtnl_process_open(rtnl_ns_t *ns)
414{
415  rtnl_main_t *rm = &rtnl_main;
416  if (ns->state != RTNL_S_INIT)
417    return;
418
419  ns->state = RTNL_S_SYNC;
420  ns->sync_state = RTNL_SS_OPENING;
421  rtnl_schedule_timeout(ns, rm->now);
422}
423
424static void
425rtnl_process_close(rtnl_ns_t *ns)
426{
427  rtnl_main_t *rm = &rtnl_main;
428  if (ns->state == RTNL_S_INIT)
429    return;
430
431  rtnl_socket_close(ns);
432  close(ns->ns_fd);
433  pool_put(rm->streams, ns);
434}
435
436static int
437rtnl_process_read(rtnl_ns_t *ns)
438{
439  uint8_t buff[RTNL_BUFFSIZ];
440  ssize_t len;
441  struct nlmsghdr *hdr;
442  while(1) {
443    if((len = recv(ns->rtnl_socket, buff, RTNL_BUFFSIZ, MSG_DONTWAIT)) < 0) {
444      if(errno != EAGAIN) {
445        clib_warning("rtnetlink recv error: %s", strerror(errno));
446        return -1;
447      }
448      return 0;
449    }
450
451    for(hdr = (struct nlmsghdr *) buff;
452        len > 0;
453        len -= NLMSG_ALIGN(hdr->nlmsg_len),
454            hdr = (struct nlmsghdr *) (((uint8_t *) hdr) + NLMSG_ALIGN(hdr->nlmsg_len))) {
455      if((sizeof(*hdr) > (size_t)len) || (hdr->nlmsg_len > (size_t)len)) {
456        clib_warning("rtnetlink buffer too small (%d Vs %d)", (int) hdr->nlmsg_len, (int) len);
457        return -1;
458      }
459      if (rtnl_ns_recv(ns, hdr))
460        return -1;
461    }
462  }
463  return 0;
464}
465
466static void
467rtnl_process_timeout(rtnl_ns_t *ns)
468{
469  switch (ns->state) {
470    case RTNL_S_SYNC:
471      rtnl_sync_timeout(ns);
472      break;
473    case RTNL_S_INIT:
474    case RTNL_S_READY:
475      clib_warning("Should not happen");
476      break;
477  }
478}
479
480static uword
481rtnl_process (vlib_main_t * vm,
482              vlib_node_runtime_t * node,
483              vlib_frame_t * frame)
484{
485  rtnl_main_t *rm = &rtnl_main;
486  uword event_type;
487  uword *event_data = 0;
488  rm->now = vlib_time_now(vm);
489  f64 timeout = DBL_MAX;
490  rtnl_ns_t *ns;
491
492  //Setting up
493  while (1) {
494    vlib_process_wait_for_event_or_clock(vm, timeout - rm->now);
495    event_type = vlib_process_get_events(vm, &event_data);
496    rm->now = vlib_time_now(vm);
497
498    if (event_type == ~0) { //Clock event or no event
499      pool_foreach(ns, rm->streams, {
500         if (ns->timeout < rm->now) {
501           ns->timeout = DBL_MAX;
502           rtnl_process_timeout(ns);
503         }
504      });
505    } else {
506      rtnl_ns_t *ns;
507      uword *d;
508      vec_foreach(d, event_data) {
509        ns = (rtnl_ns_t *)d[0];
510        switch (event_type)
511        {
512          case RTNL_E_CLOSE:
513            rtnl_process_close(ns);
514            break;
515          case RTNL_E_OPEN:
516            rtnl_process_open(ns);
517            break;
518          case RTNL_E_READ:
519            rtnl_process_read(ns);
520            break;
521        }
522      }
523    }
524
525    vec_reset_length (event_data);
526
527    timeout = DBL_MAX;
528    pool_foreach(ns, rm->streams, {
529        if (ns->timeout < timeout)
530          timeout = ns->timeout;
531    });
532  }
533  return frame->n_vectors;
534}
535
536VLIB_REGISTER_NODE(rtnl_process_node, static) = {
537    .function = rtnl_process,
538    .name = "rtnl-process",
539    .type = VLIB_NODE_TYPE_PROCESS,
540};
541
542u32
543rtnl_stream_open(rtnl_stream_t *template)
544{
545  vlib_main_t *vm = vlib_get_main();
546  rtnl_main_t *rm = &rtnl_main;
547  rtnl_ns_t *ns;
548  int fd;
549  u8 *s = format((u8 *)0, "%U", format_rtnl_nsname2path, template->name);
550
551  if ((fd = open((char *)s, O_RDONLY)) < 0) {
552    vec_free(s);
553    return ~0;
554  }
555
556  vec_free(s);
557  pool_get(rm->streams, ns);
558  ns->state = RTNL_S_INIT;
559  ns->ns_fd = fd;
560  ns->stream = *template;
561  vlib_process_signal_event_pointer(vm, rtnl_process_node.index, RTNL_E_OPEN, ns);
562  return ns - rm->streams;
563}
564
565void
566rtnl_stream_close(u32 stream_index)
567{
568  vlib_main_t *vm = vlib_get_main();
569  rtnl_main_t *rm = &rtnl_main;
570  ASSERT(!pool_is_free_index(rm->streams, stream_index));
571  rtnl_ns_t *ns = pool_elt_at_index(rm->streams, stream_index);
572  vlib_process_signal_event_pointer(vm, rtnl_process_node.index, RTNL_E_CLOSE, ns);
573}
574
575clib_error_t *
576rtnl_init (vlib_main_t * vm)
577{
578  rtnl_main_t *rm = &rtnl_main;
579  rm->streams = 0;
580  return 0;
581}
582
583VLIB_INIT_FUNCTION (rtnl_init);
584