1/*
2 * mc_socket.c: socket based multicast for vlib mc
3 *
4 * Copyright (c) 2010 Cisco and/or its affiliates.
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 *     http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18#include <vlib/vlib.h>
19#include <vlib/unix/mc_socket.h>
20
21#include <sys/ioctl.h>		/* for FIONBIO */
22#include <netinet/tcp.h>	/* for TCP_NODELAY */
23#include <net/if.h>		/* for struct ifreq */
24
25static u8 *
26format_socket_peer_id (u8 * s, va_list * args)
27{
28  u64 peer_id_as_u64 = va_arg (*args, u64);
29  mc_peer_id_t peer_id;
30  peer_id.as_u64 = peer_id_as_u64;
31  u32 a = mc_socket_peer_id_get_address (peer_id);
32  u32 p = mc_socket_peer_id_get_port (peer_id);
33
34  s = format (s, "%U:%04x", format_network_address, AF_INET, &a, ntohs (p));
35
36  return s;
37}
38
39typedef void (mc_msg_handler_t) (mc_main_t * mcm, void *msg,
40				 u32 buffer_index);
41
42always_inline void
43msg_handler (mc_main_t * mcm,
44	     u32 buffer_index, u32 handler_frees_buffer, void *_h)
45{
46  vlib_main_t *vm = mcm->vlib_main;
47  mc_msg_handler_t *h = _h;
48  vlib_buffer_t *b = vlib_get_buffer (vm, buffer_index);
49  void *the_msg = vlib_buffer_get_current (b);
50
51  h (mcm, the_msg, buffer_index);
52  if (!handler_frees_buffer)
53    vlib_buffer_free_one (vm, buffer_index);
54}
55
56static uword
57append_buffer_index_to_iovec (vlib_main_t * vm,
58			      u32 buffer_index, struct iovec **iovs_return)
59{
60  struct iovec *i;
61  vlib_buffer_t *b;
62  u32 bi = buffer_index;
63  u32 l = 0;
64
65  while (1)
66    {
67      b = vlib_get_buffer (vm, bi);
68      vec_add2 (*iovs_return, i, 1);
69      i->iov_base = vlib_buffer_get_current (b);
70      i->iov_len = b->current_length;
71      l += i->iov_len;
72      if (!(b->flags & VLIB_BUFFER_NEXT_PRESENT))
73	break;
74      bi = b->next_buffer;
75    }
76
77  return l;
78}
79
80static clib_error_t *
81sendmsg_helper (mc_socket_main_t * msm,
82		int socket, struct sockaddr_in *tx_addr, u32 buffer_index)
83{
84  vlib_main_t *vm = msm->mc_main.vlib_main;
85  struct msghdr h;
86  word n_bytes, n_bytes_tx, n_retries;
87
88  clib_memset (&h, 0, sizeof (h));
89  h.msg_name = tx_addr;
90  h.msg_namelen = sizeof (tx_addr[0]);
91
92  if (msm->iovecs)
93    _vec_len (msm->iovecs) = 0;
94
95  n_bytes = append_buffer_index_to_iovec (vm, buffer_index, &msm->iovecs);
96  ASSERT (n_bytes <= msm->mc_main.transport.max_packet_size);
97  if (n_bytes > msm->mc_main.transport.max_packet_size)
98    clib_error ("sending packet larger than interace MTU %d bytes", n_bytes);
99
100  h.msg_iov = msm->iovecs;
101  h.msg_iovlen = vec_len (msm->iovecs);
102
103  n_retries = 0;
104  while ((n_bytes_tx = sendmsg (socket, &h, /* flags */ 0)) != n_bytes
105	 && errno == EAGAIN)
106    n_retries++;
107  if (n_bytes_tx != n_bytes)
108    {
109      clib_unix_warning ("sendmsg");
110      return 0;
111    }
112  if (n_retries)
113    {
114      ELOG_TYPE_DECLARE (e) =
115      {
116      .format = "sendmsg-helper: %d retries",.format_args = "i4",};
117      struct
118      {
119	u32 retries;
120      } *ed = 0;
121
122      ed = ELOG_DATA (&vm->elog_main, e);
123      ed->retries = n_retries;
124    }
125  return 0;
126}
127
128static clib_error_t *
129tx_buffer (void *transport, mc_transport_type_t type, u32 buffer_index)
130{
131  mc_socket_main_t *msm = (mc_socket_main_t *) transport;
132  vlib_main_t *vm = msm->mc_main.vlib_main;
133  mc_multicast_socket_t *ms = &msm->multicast_sockets[type];
134  clib_error_t *error;
135  error = sendmsg_helper (msm, ms->socket, &ms->tx_addr, buffer_index);
136  if (type != MC_TRANSPORT_USER_REQUEST_TO_RELAY)
137    vlib_buffer_free_one (vm, buffer_index);
138  return error;
139}
140
141static clib_error_t *
142tx_ack (void *transport, mc_peer_id_t dest_peer_id, u32 buffer_index)
143{
144  struct sockaddr_in tx_addr;
145  mc_socket_main_t *msm = (mc_socket_main_t *) transport;
146  vlib_main_t *vm = msm->mc_main.vlib_main;
147  clib_error_t *error;
148
149  clib_memset (&tx_addr, 0, sizeof (tx_addr));
150  tx_addr.sin_family = AF_INET;
151  tx_addr.sin_addr.s_addr = mc_socket_peer_id_get_address (dest_peer_id);
152  tx_addr.sin_port = mc_socket_peer_id_get_port (dest_peer_id);
153
154  error = sendmsg_helper (msm, msm->ack_socket, &tx_addr, buffer_index);
155  vlib_buffer_free_one (vm, buffer_index);
156  return error;
157}
158
159static clib_error_t *
160recvmsg_helper (mc_socket_main_t * msm,
161		int socket,
162		struct sockaddr_in *rx_addr,
163		u32 * buffer_index, u32 drop_message)
164{
165  vlib_main_t *vm = msm->mc_main.vlib_main;
166  vlib_buffer_t *b;
167  uword n_left, n_alloc, n_mtu, i, i_rx;
168  const uword buffer_size = vlib_buffer_get_default_data_size (vm);
169  word n_bytes_left;
170
171  /* Make sure we have at least a MTU worth of buffers. */
172  n_mtu = msm->rx_mtu_n_buffers;
173  n_left = vec_len (msm->rx_buffers);
174  if (n_left < n_mtu)
175    {
176      uword max_alloc = 8 * n_mtu;
177      vec_validate (msm->rx_buffers, max_alloc - 1);
178      n_alloc =
179	vlib_buffer_alloc (vm, msm->rx_buffers + n_left, max_alloc - n_left);
180      _vec_len (msm->rx_buffers) = n_left + n_alloc;
181    }
182
183  ASSERT (vec_len (msm->rx_buffers) >= n_mtu);
184  vec_validate (msm->iovecs, n_mtu - 1);
185
186  /* Allocate RX buffers from end of rx_buffers.
187     Turn them into iovecs to pass to readv. */
188  i_rx = vec_len (msm->rx_buffers) - 1;
189  for (i = 0; i < n_mtu; i++)
190    {
191      b = vlib_get_buffer (vm, msm->rx_buffers[i_rx - i]);
192      msm->iovecs[i].iov_base = b->data;
193      msm->iovecs[i].iov_len = buffer_size;
194    }
195  _vec_len (msm->iovecs) = n_mtu;
196
197  {
198    struct msghdr h;
199
200    clib_memset (&h, 0, sizeof (h));
201    if (rx_addr)
202      {
203	h.msg_name = rx_addr;
204	h.msg_namelen = sizeof (rx_addr[0]);
205      }
206    h.msg_iov = msm->iovecs;
207    h.msg_iovlen = vec_len (msm->iovecs);
208
209    n_bytes_left = recvmsg (socket, &h, 0);
210    if (n_bytes_left < 0)
211      return clib_error_return_unix (0, "recvmsg");
212  }
213
214  if (drop_message)
215    {
216      *buffer_index = ~0;
217      return 0;
218    }
219
220  *buffer_index = msm->rx_buffers[i_rx];
221  while (1)
222    {
223      b = vlib_get_buffer (vm, msm->rx_buffers[i_rx]);
224
225      b->flags = 0;
226      b->current_data = 0;
227      b->current_length =
228	n_bytes_left < buffer_size ? n_bytes_left : buffer_size;
229
230      n_bytes_left -= buffer_size;
231
232      if (n_bytes_left <= 0)
233	break;
234
235      i_rx--;
236      b->flags |= VLIB_BUFFER_NEXT_PRESENT;
237      b->next_buffer = msm->rx_buffers[i_rx];
238    }
239
240  _vec_len (msm->rx_buffers) = i_rx;
241
242  return 0 /* no error */ ;
243}
244
245static clib_error_t *
246mastership_socket_read_ready (clib_file_t * uf)
247{
248  mc_socket_main_t *msm = (mc_socket_main_t *) uf->private_data;
249  mc_main_t *mcm = &msm->mc_main;
250  mc_multicast_socket_t *ms =
251    &msm->multicast_sockets[MC_TRANSPORT_MASTERSHIP];
252  clib_error_t *error;
253  u32 bi = 0;
254
255  error = recvmsg_helper (msm, ms->socket, /* rx_addr */ 0, &bi,	/* drop_message */
256			  0);
257  if (!error)
258    msg_handler (mcm, bi,
259		 /* handler_frees_buffer */ 0,
260		 mc_msg_master_assert_handler);
261
262  return error;
263}
264
265static clib_error_t *
266to_relay_socket_read_ready (clib_file_t * uf)
267{
268  mc_socket_main_t *msm = (mc_socket_main_t *) uf->private_data;
269  mc_main_t *mcm = &msm->mc_main;
270  vlib_main_t *vm = msm->mc_main.vlib_main;
271  mc_multicast_socket_t *ms_to_relay =
272    &msm->multicast_sockets[MC_TRANSPORT_USER_REQUEST_TO_RELAY];
273  mc_multicast_socket_t *ms_from_relay =
274    &msm->multicast_sockets[MC_TRANSPORT_USER_REQUEST_FROM_RELAY];
275  clib_error_t *error;
276  u32 bi = 0;
277  u32 is_master = mcm->relay_state == MC_RELAY_STATE_MASTER;
278
279  /* Not the ordering master? Turf the msg */
280  error = recvmsg_helper (msm, ms_to_relay->socket, /* rx_addr */ 0, &bi,
281			  /* drop_message */ !is_master);
282
283  /* If we are the master, number and rebroadcast the msg. */
284  if (!error && is_master)
285    {
286      vlib_buffer_t *b = vlib_get_buffer (vm, bi);
287      mc_msg_user_request_t *mp = vlib_buffer_get_current (b);
288      mp->global_sequence = clib_host_to_net_u32 (mcm->relay_global_sequence);
289      mcm->relay_global_sequence++;
290      error =
291	sendmsg_helper (msm, ms_from_relay->socket, &ms_from_relay->tx_addr,
292			bi);
293      vlib_buffer_free_one (vm, bi);
294    }
295
296  return error;
297}
298
299static clib_error_t *
300from_relay_socket_read_ready (clib_file_t * uf)
301{
302  mc_socket_main_t *msm = (mc_socket_main_t *) uf->private_data;
303  mc_main_t *mcm = &msm->mc_main;
304  mc_multicast_socket_t *ms =
305    &msm->multicast_sockets[MC_TRANSPORT_USER_REQUEST_FROM_RELAY];
306  clib_error_t *error;
307  u32 bi = 0;
308
309  error = recvmsg_helper (msm, ms->socket, /* rx_addr */ 0, &bi,	/* drop_message */
310			  0);
311  if (!error)
312    {
313      msg_handler (mcm, bi, /* handler_frees_buffer */ 1,
314		   mc_msg_user_request_handler);
315    }
316  return error;
317}
318
319static clib_error_t *
320join_socket_read_ready (clib_file_t * uf)
321{
322  mc_socket_main_t *msm = (mc_socket_main_t *) uf->private_data;
323  mc_main_t *mcm = &msm->mc_main;
324  vlib_main_t *vm = mcm->vlib_main;
325  mc_multicast_socket_t *ms = &msm->multicast_sockets[MC_TRANSPORT_JOIN];
326  clib_error_t *error;
327  u32 bi = 0;
328
329  error = recvmsg_helper (msm, ms->socket, /* rx_addr */ 0, &bi,	/* drop_message */
330			  0);
331  if (!error)
332    {
333      vlib_buffer_t *b = vlib_get_buffer (vm, bi);
334      mc_msg_join_or_leave_request_t *mp = vlib_buffer_get_current (b);
335
336      switch (clib_host_to_net_u32 (mp->type))
337	{
338	case MC_MSG_TYPE_join_or_leave_request:
339	  msg_handler (mcm, bi, /* handler_frees_buffer */ 0,
340		       mc_msg_join_or_leave_request_handler);
341	  break;
342
343	case MC_MSG_TYPE_join_reply:
344	  msg_handler (mcm, bi, /* handler_frees_buffer */ 0,
345		       mc_msg_join_reply_handler);
346	  break;
347
348	default:
349	  ASSERT (0);
350	  break;
351	}
352    }
353  return error;
354}
355
356static clib_error_t *
357ack_socket_read_ready (clib_file_t * uf)
358{
359  mc_socket_main_t *msm = (mc_socket_main_t *) uf->private_data;
360  mc_main_t *mcm = &msm->mc_main;
361  clib_error_t *error;
362  u32 bi = 0;
363
364  error = recvmsg_helper (msm, msm->ack_socket, /* rx_addr */ 0, &bi,
365			  /* drop_message */ 0);
366  if (!error)
367    msg_handler (mcm, bi, /* handler_frees_buffer */ 0,
368		 mc_msg_user_ack_handler);
369  return error;
370}
371
372static void
373catchup_cleanup (mc_socket_main_t * msm,
374		 mc_socket_catchup_t * c, clib_file_main_t * um,
375		 clib_file_t * uf)
376{
377  hash_unset (msm->catchup_index_by_file_descriptor, uf->file_descriptor);
378  clib_file_del (um, uf);
379  vec_free (c->input_vector);
380  vec_free (c->output_vector);
381  pool_put (msm->catchups, c);
382}
383
384static mc_socket_catchup_t *
385find_catchup_from_file_descriptor (mc_socket_main_t * msm,
386				   int file_descriptor)
387{
388  uword *p =
389    hash_get (msm->catchup_index_by_file_descriptor, file_descriptor);
390  return p ? pool_elt_at_index (msm->catchups, p[0]) : 0;
391}
392
393static clib_error_t *
394catchup_socket_read_ready (clib_file_t * uf, int is_server)
395{
396  clib_file_main_t *um = &file_main;
397  mc_socket_main_t *msm = (mc_socket_main_t *) uf->private_data;
398  mc_main_t *mcm = &msm->mc_main;
399  mc_socket_catchup_t *c =
400    find_catchup_from_file_descriptor (msm, uf->file_descriptor);
401  word l, n, is_eof;
402
403  l = vec_len (c->input_vector);
404  vec_resize (c->input_vector, 4096);
405  n =
406    read (uf->file_descriptor, c->input_vector + l,
407	  vec_len (c->input_vector) - l);
408  is_eof = n == 0;
409
410  if (n < 0)
411    {
412      if (errno == EAGAIN)
413	n = 0;
414      else
415	{
416	  catchup_cleanup (msm, c, um, uf);
417	  return clib_error_return_unix (0, "read");
418	}
419    }
420
421  _vec_len (c->input_vector) = l + n;
422
423  if (is_eof && vec_len (c->input_vector) > 0)
424    {
425      if (is_server)
426	{
427	  mc_msg_catchup_request_handler (mcm, (void *) c->input_vector,
428					  c - msm->catchups);
429	  _vec_len (c->input_vector) = 0;
430	}
431      else
432	{
433	  mc_msg_catchup_reply_handler (mcm, (void *) c->input_vector,
434					c - msm->catchups);
435	  c->input_vector = 0;	/* reply handler is responsible for freeing vector */
436	  catchup_cleanup (msm, c, um, uf);
437	}
438    }
439
440  return 0 /* no error */ ;
441}
442
443static clib_error_t *
444catchup_server_read_ready (clib_file_t * uf)
445{
446  return catchup_socket_read_ready (uf, /* is_server */ 1);
447}
448
449static clib_error_t *
450catchup_client_read_ready (clib_file_t * uf)
451{
452  if (MC_EVENT_LOGGING)
453    {
454      mc_socket_main_t *msm = (mc_socket_main_t *) uf->private_data;
455      vlib_main_t *vm = msm->mc_main.vlib_main;
456
457      ELOG_TYPE (e, "catchup_client_read_ready");
458      ELOG (&vm->elog_main, e, 0);
459    }
460  return catchup_socket_read_ready (uf, /* is_server */ 0);
461}
462
463static clib_error_t *
464catchup_socket_write_ready (clib_file_t * uf, int is_server)
465{
466  clib_file_main_t *um = &file_main;
467  mc_socket_main_t *msm = (mc_socket_main_t *) uf->private_data;
468  mc_socket_catchup_t *c =
469    find_catchup_from_file_descriptor (msm, uf->file_descriptor);
470  clib_error_t *error = 0;
471  int n;
472
473  if (c->connect_in_progress)
474    {
475      u32 len, value;
476
477      c->connect_in_progress = 0;
478      len = sizeof (value);
479      if (getsockopt (c->socket, SOL_SOCKET, SO_ERROR, &value, &len) < 0)
480	{
481	  error = clib_error_return_unix (0, "getsockopt SO_ERROR");
482	  goto error_quit;
483	}
484      if (value != 0)
485	{
486	  error =
487	    clib_error_return_code (0, value, CLIB_ERROR_ERRNO_VALID,
488				    "connect fails");
489	  goto error_quit;
490	}
491    }
492
493  while (1)
494    {
495      u32 n_this_write;
496
497      n_this_write =
498	clib_min (vec_len (c->output_vector) - c->output_vector_n_written,
499		  msm->rx_mtu_n_bytes -
500		  64 /* ip + tcp + option allowance */ );
501
502      if (n_this_write <= 0)
503	break;
504
505      do
506	{
507	  n = write (uf->file_descriptor,
508		     c->output_vector + c->output_vector_n_written,
509		     n_this_write);
510	}
511      while (n < 0 && errno == EAGAIN);
512
513      if (n < 0)
514	{
515	  error = clib_error_return_unix (0, "write");
516	  goto error_quit;
517	}
518      c->output_vector_n_written += n;
519    }
520
521  if (c->output_vector_n_written >= vec_len (c->output_vector))
522    {
523      if (!is_server)
524	{
525	  uf->flags &= ~UNIX_FILE_DATA_AVAILABLE_TO_WRITE;
526	  file_main.file_update (uf, UNIX_FILE_UPDATE_MODIFY);
527	  /* Send EOF to other side. */
528	  shutdown (uf->file_descriptor, SHUT_WR);
529	  return error;
530	}
531      else
532	{
533	error_quit:
534	  catchup_cleanup (msm, c, um, uf);
535	}
536    }
537  return error;
538}
539
540static clib_error_t *
541catchup_server_write_ready (clib_file_t * uf)
542{
543  return catchup_socket_write_ready (uf, /* is_server */ 1);
544}
545
546static clib_error_t *
547catchup_client_write_ready (clib_file_t * uf)
548{
549  return catchup_socket_write_ready (uf, /* is_server */ 0);
550}
551
552static clib_error_t *
553catchup_socket_error_ready (clib_file_t * uf)
554{
555  clib_file_main_t *um = &file_main;
556  mc_socket_main_t *msm = (mc_socket_main_t *) uf->private_data;
557  mc_socket_catchup_t *c =
558    find_catchup_from_file_descriptor (msm, uf->file_descriptor);
559  catchup_cleanup (msm, c, um, uf);
560  return clib_error_return (0, "error");
561}
562
563static clib_error_t *
564catchup_listen_read_ready (clib_file_t * uf)
565{
566  mc_socket_main_t *msm = (mc_socket_main_t *) uf->private_data;
567  struct sockaddr_in client_addr;
568  int client_len;
569  mc_socket_catchup_t *c;
570  clib_file_t template = { 0 };
571
572  pool_get (msm->catchups, c);
573  clib_memset (c, 0, sizeof (c[0]));
574
575  client_len = sizeof (client_addr);
576
577  /* Acquires the non-blocking attrib from the server socket. */
578  c->socket = accept (uf->file_descriptor,
579		      (struct sockaddr *) &client_addr,
580		      (socklen_t *) & client_len);
581
582  if (c->socket < 0)
583    {
584      pool_put (msm->catchups, c);
585      return clib_error_return_unix (0, "accept");
586    }
587
588  if (MC_EVENT_LOGGING)
589    {
590      mc_main_t *mcm = &msm->mc_main;
591      vlib_main_t *vm = mcm->vlib_main;
592
593      ELOG_TYPE_DECLARE (e) =
594      {
595      .format = "catchup accepted from 0x%lx",.format_args = "i4",};
596      struct
597      {
598	u32 addr;
599      } *ed = 0;
600
601      ed = ELOG_DATA (&vm->elog_main, e);
602      ed->addr = ntohl (client_addr.sin_addr.s_addr);
603    }
604
605  /* Disable the Nagle algorithm, ship catchup pkts immediately */
606  {
607    int one = 1;
608    if ((setsockopt (c->socket, IPPROTO_TCP,
609		     TCP_NODELAY, (void *) &one, sizeof (one))) < 0)
610      {
611	clib_unix_warning ("catchup socket: set TCP_NODELAY");
612      }
613  }
614
615  template.read_function = catchup_server_read_ready;
616  template.write_function = catchup_server_write_ready;
617  template.error_function = catchup_socket_error_ready;
618  template.file_descriptor = c->socket;
619  template.private_data = pointer_to_uword (msm);
620  c->clib_file_index = clib_file_add (&file_main, &template);
621  hash_set (msm->catchup_index_by_file_descriptor, c->socket,
622	    c - msm->catchups);
623
624  return 0;
625}
626
627/* Return and bind to an unused port. */
628static word
629find_and_bind_to_free_port (word sock, word port)
630{
631  for (; port < 1 << 16; port++)
632    {
633      struct sockaddr_in a;
634
635      clib_memset (&a, 0, sizeof (a));	/* Warnings be gone */
636
637      a.sin_family = PF_INET;
638      a.sin_addr.s_addr = INADDR_ANY;
639      a.sin_port = htons (port);
640
641      if (bind (sock, (struct sockaddr *) &a, sizeof (a)) >= 0)
642	break;
643    }
644
645  return port < 1 << 16 ? port : -1;
646}
647
648static clib_error_t *
649setup_mutlicast_socket (mc_socket_main_t * msm,
650			mc_multicast_socket_t * ms,
651			char *type, uword udp_port)
652{
653  int one = 1;
654  struct ip_mreq mcast_req;
655
656  if (!msm->multicast_ttl)
657    msm->multicast_ttl = 1;
658
659  /* mastership (multicast) TX socket */
660  if ((ms->socket = socket (PF_INET, SOCK_DGRAM, IPPROTO_UDP)) < 0)
661    return clib_error_return_unix (0, "%s socket", type);
662
663  {
664    u8 ttl = msm->multicast_ttl;
665
666    if ((setsockopt (ms->socket, IPPROTO_IP,
667		     IP_MULTICAST_TTL, (void *) &ttl, sizeof (ttl))) < 0)
668      return clib_error_return_unix (0, "%s set multicast ttl", type);
669  }
670
671  if (setsockopt (ms->socket, SOL_SOCKET, SO_REUSEADDR, &one, sizeof (one)) <
672      0)
673    return clib_error_return_unix (0, "%s setsockopt SO_REUSEADDR", type);
674
675  clib_memset (&ms->tx_addr, 0, sizeof (ms->tx_addr));
676  ms->tx_addr.sin_family = AF_INET;
677  ms->tx_addr.sin_addr.s_addr =
678    htonl (msm->multicast_tx_ip4_address_host_byte_order);
679  ms->tx_addr.sin_port = htons (udp_port);
680
681  if (bind (ms->socket, (struct sockaddr *) &ms->tx_addr,
682	    sizeof (ms->tx_addr)) < 0)
683    return clib_error_return_unix (0, "%s bind", type);
684
685  clib_memset (&mcast_req, 0, sizeof (mcast_req));
686  mcast_req.imr_multiaddr.s_addr =
687    htonl (msm->multicast_tx_ip4_address_host_byte_order);
688  mcast_req.imr_interface.s_addr = msm->if_ip4_address_net_byte_order;
689
690  if ((setsockopt (ms->socket, IPPROTO_IP,
691		   IP_ADD_MEMBERSHIP, (void *) &mcast_req,
692		   sizeof (mcast_req))) < 0)
693    return clib_error_return_unix (0, "%s IP_ADD_MEMBERSHIP setsockopt",
694				   type);
695
696  if (ioctl (ms->socket, FIONBIO, &one) < 0)
697    return clib_error_return_unix (0, "%s set FIONBIO", type);
698
699  /* FIXME remove this when we support tx_ready. */
700  {
701    u32 len = 1 << 20;
702    socklen_t sl = sizeof (len);
703    if (setsockopt (ms->socket, SOL_SOCKET, SO_SNDBUF, &len, sl) < 0)
704      clib_unix_error ("setsockopt");
705  }
706
707  return 0;
708}
709
710static clib_error_t *
711socket_setup (mc_socket_main_t * msm)
712{
713  int one = 1;
714  clib_error_t *error;
715  u32 port;
716
717  if (!msm->base_multicast_udp_port_host_byte_order)
718    msm->base_multicast_udp_port_host_byte_order =
719      0xffff - ((MC_N_TRANSPORT_TYPE + 2 /* ack socket, catchup socket */ )
720		- 1);
721
722  port = msm->base_multicast_udp_port_host_byte_order;
723
724  error = setup_mutlicast_socket (msm,
725				  &msm->multicast_sockets
726				  [MC_TRANSPORT_MASTERSHIP], "mastership",
727				  port++);
728  if (error)
729    return error;
730
731  error = setup_mutlicast_socket (msm,
732				  &msm->multicast_sockets[MC_TRANSPORT_JOIN],
733				  "join", port++);
734  if (error)
735    return error;
736
737  error = setup_mutlicast_socket (msm,
738				  &msm->multicast_sockets
739				  [MC_TRANSPORT_USER_REQUEST_TO_RELAY],
740				  "to relay", port++);
741  if (error)
742    return error;
743
744  error = setup_mutlicast_socket (msm,
745				  &msm->multicast_sockets
746				  [MC_TRANSPORT_USER_REQUEST_FROM_RELAY],
747				  "from relay", port++);
748  if (error)
749    return error;
750
751  /* ACK rx socket */
752  msm->ack_socket = socket (PF_INET, SOCK_DGRAM, IPPROTO_UDP);
753  if (msm->ack_socket < 0)
754    return clib_error_return_unix (0, "ack socket");
755
756  msm->ack_udp_port = find_and_bind_to_free_port (msm->ack_socket, port++);
757
758  if (ioctl (msm->ack_socket, FIONBIO, &one) < 0)
759    return clib_error_return_unix (0, "ack socket FIONBIO");
760
761  msm->catchup_server_socket = socket (AF_INET, SOCK_STREAM, 0);
762  if (msm->catchup_server_socket < 0)
763    return clib_error_return_unix (0, "catchup server socket");
764
765  msm->catchup_tcp_port =
766    find_and_bind_to_free_port (msm->catchup_server_socket, port++);
767
768  if (ioctl (msm->catchup_server_socket, FIONBIO, &one) < 0)
769    return clib_error_return_unix (0, "catchup server socket FIONBIO");
770
771  if (listen (msm->catchup_server_socket, 5) < 0)
772    return clib_error_return_unix (0, "catchup server socket listen");
773
774  /* epoll setup for multicast mastership socket */
775  {
776    clib_file_t template = { 0 };
777
778    template.read_function = mastership_socket_read_ready;
779    template.file_descriptor =
780      msm->multicast_sockets[MC_TRANSPORT_MASTERSHIP].socket;
781    template.private_data = (uword) msm;
782    clib_file_add (&file_main, &template);
783
784    /* epoll setup for multicast to_relay socket */
785    template.read_function = to_relay_socket_read_ready;
786    template.file_descriptor =
787      msm->multicast_sockets[MC_TRANSPORT_USER_REQUEST_TO_RELAY].socket;
788    template.private_data = (uword) msm;
789    clib_file_add (&file_main, &template);
790
791    /* epoll setup for multicast from_relay socket */
792    template.read_function = from_relay_socket_read_ready;
793    template.file_descriptor =
794      msm->multicast_sockets[MC_TRANSPORT_USER_REQUEST_FROM_RELAY].socket;
795    template.private_data = (uword) msm;
796    clib_file_add (&file_main, &template);
797
798    template.read_function = join_socket_read_ready;
799    template.file_descriptor =
800      msm->multicast_sockets[MC_TRANSPORT_JOIN].socket;
801    template.private_data = (uword) msm;
802    clib_file_add (&file_main, &template);
803
804    /* epoll setup for ack rx socket */
805    template.read_function = ack_socket_read_ready;
806    template.file_descriptor = msm->ack_socket;
807    template.private_data = (uword) msm;
808    clib_file_add (&file_main, &template);
809
810    /* epoll setup for TCP catchup server */
811    template.read_function = catchup_listen_read_ready;
812    template.file_descriptor = msm->catchup_server_socket;
813    template.private_data = (uword) msm;
814    clib_file_add (&file_main, &template);
815  }
816
817  return 0;
818}
819
820static void *
821catchup_add_pending_output (mc_socket_catchup_t * c, uword n_bytes,
822			    u8 * set_output_vector)
823{
824  clib_file_t *uf = pool_elt_at_index (file_main.file_pool,
825				       c->clib_file_index);
826  u8 *result = 0;
827
828  if (set_output_vector)
829    c->output_vector = set_output_vector;
830  else
831    vec_add2 (c->output_vector, result, n_bytes);
832  if (vec_len (c->output_vector) > 0)
833    {
834      int skip_update = 0 != (uf->flags & UNIX_FILE_DATA_AVAILABLE_TO_WRITE);
835      uf->flags |= UNIX_FILE_DATA_AVAILABLE_TO_WRITE;
836      if (!skip_update)
837	file_main.file_update (uf, UNIX_FILE_UPDATE_MODIFY);
838    }
839  return result;
840}
841
842static uword
843catchup_request_fun (void *transport_main,
844		     u32 stream_index, mc_peer_id_t catchup_peer_id)
845{
846  mc_socket_main_t *msm = (mc_socket_main_t *) transport_main;
847  mc_main_t *mcm = &msm->mc_main;
848  vlib_main_t *vm = mcm->vlib_main;
849  mc_socket_catchup_t *c;
850  struct sockaddr_in addr;
851  clib_file_main_t *um = &file_main;
852  int one = 1;
853
854  pool_get (msm->catchups, c);
855  clib_memset (c, 0, sizeof (*c));
856
857  c->socket = socket (AF_INET, SOCK_STREAM, 0);
858  if (c->socket < 0)
859    {
860      clib_unix_warning ("socket");
861      return 0;
862    }
863
864  if (ioctl (c->socket, FIONBIO, &one) < 0)
865    {
866      clib_unix_warning ("FIONBIO");
867      return 0;
868    }
869
870  clib_memset (&addr, 0, sizeof (addr));
871  addr.sin_family = AF_INET;
872  addr.sin_addr.s_addr = mc_socket_peer_id_get_address (catchup_peer_id);
873  addr.sin_port = mc_socket_peer_id_get_port (catchup_peer_id);
874
875  c->connect_in_progress = 1;
876
877  if (MC_EVENT_LOGGING)
878    {
879      ELOG_TYPE_DECLARE (e) =
880      {
881      .format = "connecting to peer 0x%Lx",.format_args = "i8",};
882      struct
883      {
884	u64 peer;
885      } *ed;
886      ed = ELOG_DATA (&vm->elog_main, e);
887      ed->peer = catchup_peer_id.as_u64;
888    }
889
890  if (connect (c->socket, (const void *) &addr, sizeof (addr))
891      < 0 && errno != EINPROGRESS)
892    {
893      clib_unix_warning ("connect to %U fails",
894			 format_socket_peer_id, catchup_peer_id);
895      return 0;
896    }
897
898  {
899    clib_file_t template = { 0 };
900
901    template.read_function = catchup_client_read_ready;
902    template.write_function = catchup_client_write_ready;
903    template.error_function = catchup_socket_error_ready;
904    template.file_descriptor = c->socket;
905    template.private_data = (uword) msm;
906    c->clib_file_index = clib_file_add (um, &template);
907
908    hash_set (msm->catchup_index_by_file_descriptor, c->socket,
909	      c - msm->catchups);
910  }
911
912  {
913    mc_msg_catchup_request_t *mp;
914    mp = catchup_add_pending_output (c, sizeof (mp[0]),	/* set_output_vector */
915				     0);
916    mp->peer_id = msm->mc_main.transport.our_catchup_peer_id;
917    mp->stream_index = stream_index;
918    mc_byte_swap_msg_catchup_request (mp);
919  }
920
921  return c - msm->catchups;
922}
923
924static void
925catchup_send_fun (void *transport_main, uword opaque, u8 * data)
926{
927  mc_socket_main_t *msm = (mc_socket_main_t *) transport_main;
928  mc_socket_catchup_t *c = pool_elt_at_index (msm->catchups, opaque);
929  catchup_add_pending_output (c, 0, data);
930}
931
932static int
933find_interface_ip4_address (char *if_name, u32 * ip4_address, u32 * mtu)
934{
935  int fd;
936  struct ifreq ifr;
937  struct sockaddr_in *sa;
938
939  /* Dig up our IP address */
940  fd = socket (PF_INET, AF_INET, 0);
941  if (fd < 0)
942    {
943      clib_unix_error ("socket");
944      return -1;
945    }
946
947  ifr.ifr_addr.sa_family = AF_INET;
948  strncpy (ifr.ifr_name, if_name, sizeof (ifr.ifr_name) - 1);
949  if (ioctl (fd, SIOCGIFADDR, &ifr) < 0)
950    {
951      clib_unix_error ("ioctl(SIOCFIGADDR)");
952      close (fd);
953      return -1;
954    }
955
956  sa = (void *) &ifr.ifr_addr;
957  clib_memcpy (ip4_address, &sa->sin_addr.s_addr, sizeof (ip4_address[0]));
958
959  if (ioctl (fd, SIOCGIFMTU, &ifr) < 0)
960    {
961      close (fd);
962      return -1;
963    }
964  if (mtu)
965    *mtu = ifr.ifr_mtu - ( /* IP4 header */ 20 + /* UDP header */ 8);
966
967  close (fd);
968
969  return 0;
970}
971
972clib_error_t *
973mc_socket_main_init (mc_socket_main_t * msm, char **intfc_probe_list,
974		     int n_intfcs_to_probe)
975{
976  clib_error_t *error;
977  mc_main_t *mcm;
978  u32 mtu;
979
980  mcm = &msm->mc_main;
981
982  /* 239.255.0.7 */
983  if (!msm->multicast_tx_ip4_address_host_byte_order)
984    msm->multicast_tx_ip4_address_host_byte_order = 0xefff0007;
985
986  {
987    u32 i, a, win;
988
989    win = 0;
990    if (msm->multicast_interface_name)
991      {
992	win =
993	  !find_interface_ip4_address (msm->multicast_interface_name, &a,
994				       &mtu);
995      }
996    else
997      {
998	for (i = 0; i < n_intfcs_to_probe; i++)
999	  if (!find_interface_ip4_address (intfc_probe_list[i], &a, &mtu))
1000	    {
1001	      win = 1;
1002	      msm->multicast_interface_name = intfc_probe_list[i];
1003	      break;
1004	    }
1005      }
1006
1007    if (!win)
1008      return clib_error_return (0, "can't find interface ip4 address");
1009
1010    msm->if_ip4_address_net_byte_order = a;
1011  }
1012
1013  msm->rx_mtu_n_bytes = mtu;
1014  msm->rx_mtu_n_buffers =
1015    msm->rx_mtu_n_bytes / vlib_buffer_get_default_data_size (vm);
1016  msm->rx_mtu_n_buffers +=
1017    (msm->rx_mtu_n_bytes % vlib_buffer_get_default_data_size (vm)) != 0;
1018
1019  error = socket_setup (msm);
1020  if (error)
1021    return error;
1022
1023  mcm->transport.our_ack_peer_id =
1024    mc_socket_set_peer_id (msm->if_ip4_address_net_byte_order,
1025			   msm->ack_udp_port);
1026
1027  mcm->transport.our_catchup_peer_id =
1028    mc_socket_set_peer_id (msm->if_ip4_address_net_byte_order,
1029			   msm->catchup_tcp_port);
1030
1031  mcm->transport.tx_buffer = tx_buffer;
1032  mcm->transport.tx_ack = tx_ack;
1033  mcm->transport.catchup_request_fun = catchup_request_fun;
1034  mcm->transport.catchup_send_fun = catchup_send_fun;
1035  mcm->transport.format_peer_id = format_socket_peer_id;
1036  mcm->transport.opaque = msm;
1037  mcm->transport.max_packet_size = mtu;
1038
1039  mc_main_init (mcm, "socket");
1040
1041  return error;
1042}
1043
1044/*
1045 * fd.io coding-style-patch-verification: ON
1046 *
1047 * Local Variables:
1048 * eval: (c-set-style "gnu")
1049 * End:
1050 */
1051