svm.c revision 9f4ac587
1/*
2 *------------------------------------------------------------------
3 * svm.c - shared VM allocation, mmap(...MAP_FIXED...)
4 * library
5 *
6 * Copyright (c) 2009 Cisco and/or its affiliates.
7 * Licensed under the Apache License, Version 2.0 (the "License");
8 * you may not use this file except in compliance with the License.
9 * You may obtain a copy of the License at:
10 *
11 *     http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 *------------------------------------------------------------------
19 */
20
21#include <stdio.h>
22#include <stdlib.h>
23#include <sys/types.h>
24#include <sys/mman.h>
25#include <sys/stat.h>
26#include <netinet/in.h>
27#include <signal.h>
28#include <pthread.h>
29#include <unistd.h>
30#include <time.h>
31#include <fcntl.h>
32#include <string.h>
33#include <vppinfra/clib.h>
34#include <vppinfra/vec.h>
35#include <vppinfra/hash.h>
36#include <vppinfra/bitmap.h>
37#include <vppinfra/fifo.h>
38#include <vppinfra/time.h>
39#include <vppinfra/mheap.h>
40#include <vppinfra/heap.h>
41#include <vppinfra/pool.h>
42#include <vppinfra/format.h>
43
44#include "svm.h"
45
46static svm_region_t *root_rp;
47static int root_rp_refcount;
48
49#define MAXLOCK 2
50static pthread_mutex_t *mutexes_held[MAXLOCK];
51static int nheld;
52
53svm_region_t *
54svm_get_root_rp (void)
55{
56  return root_rp;
57}
58
59#define MUTEX_DEBUG
60
/**
 * Compute the base virtual address for the global SVM region.
 *
 * Under address sanitizer a fixed VA is used. On AArch64 the usable
 * VA width varies (36..48 bits), so it is detected by scanning
 * /proc/self/maps for the highest mapped address. Otherwise a
 * hard-coded default is returned.
 */
u64
svm_get_global_region_base_va ()
{
#ifdef CLIB_SANITIZE_ADDR
  /* Fixed VA chosen to stay clear of ASAN shadow mappings. */
  return 0x200000000000;
#endif

#if __aarch64__
  /* On AArch64 VA space can have different size, from 36 to 48 bits.
     Here we are trying to detect VA bits by parsing /proc/self/maps
     address ranges */
  int fd;
  unformat_input_t input;
  u64 start, end = 0;
  u8 bits = 0;

  if ((fd = open ("/proc/self/maps", 0)) < 0)
    clib_unix_error ("open '/proc/self/maps'");

  unformat_init_clib_file (&input, fd);
  while (unformat_check_input (&input) != UNFORMAT_END_OF_INPUT)
    {
      /* 'end' ends up holding the last (highest) mapping's end address. */
      if (unformat (&input, "%llx-%llx", &start, &end))
	end--;
      unformat_skip_line (&input);
    }
  unformat_free (&input);
  close (fd);

  /* Number of significant VA bits in the highest mapped address. */
  bits = count_leading_zeros (end);
  bits = 64 - bits;
  if (bits >= 36 && bits <= 48)
    /* Place the region at 1/4 of the VA space, backed off by twice the
       global region size. */
    return ((1ul << bits) / 4) - (2 * SVM_GLOBAL_REGION_SIZE);
  else
    clib_unix_error ("unexpected va bits '%u'", bits);
#endif

  /* default value */
  return 0x130000000ULL;
}
101
/**
 * Take a region's process-shared mutex and record it in the
 * held-mutex table.
 *
 * @param rp  region whose mutex to acquire
 * @param tag small integer identifying the call site, stored for
 *            post-mortem debugging when MUTEX_DEBUG is defined
 */
static void
region_lock (svm_region_t * rp, int tag)
{
  pthread_mutex_lock (&rp->mutex);
#ifdef MUTEX_DEBUG
  /* Record the owner so a later mapper can detect a dead lock holder. */
  rp->mutex_owner_pid = getpid ();
  rp->mutex_owner_tag = tag;
#endif
  ASSERT (nheld < MAXLOCK);
  /*
   * Keep score of held mutexes so we can try to exit
   * cleanly if the world comes to an end at the worst possible
   * moment
   */
  mutexes_held[nheld++] = &rp->mutex;
}
118
/**
 * Drop a region's mutex from the held-mutex table and unlock it.
 *
 * Asserts if the mutex was never recorded by region_lock().
 */
static void
region_unlock (svm_region_t * rp)
{
  int i, j;
#ifdef MUTEX_DEBUG
  rp->mutex_owner_pid = 0;
  rp->mutex_owner_tag = 0;
#endif

  /* Find the entry (newest first) and compact the table over it. */
  for (i = nheld - 1; i >= 0; i--)
    {
      if (mutexes_held[i] == &rp->mutex)
	{
	  for (j = i; j < MAXLOCK - 1; j++)
	    mutexes_held[j] = mutexes_held[j + 1];
	  nheld--;
	  goto found;
	}
    }
  ASSERT (0);

found:
  /* Make all prior shared-memory stores visible before releasing. */
  CLIB_MEMORY_BARRIER ();
  pthread_mutex_unlock (&rp->mutex);
}
144
145
146static u8 *
147format_svm_flags (u8 * s, va_list * args)
148{
149  uword f = va_arg (*args, uword);
150
151  if (f & SVM_FLAGS_MHEAP)
152    s = format (s, "MHEAP ");
153  if (f & SVM_FLAGS_FILE)
154    s = format (s, "FILE ");
155  if (f & SVM_FLAGS_NODATA)
156    s = format (s, "NODATA ");
157  if (f & SVM_FLAGS_NEED_DATA_INIT)
158    s = format (s, "INIT ");
159
160  return (s);
161}
162
163static u8 *
164format_svm_size (u8 * s, va_list * args)
165{
166  uword size = va_arg (*args, uword);
167
168  if (size >= (1 << 20))
169    {
170      s = format (s, "(%d mb)", size >> 20);
171    }
172  else if (size >= (1 << 10))
173    {
174      s = format (s, "(%d kb)", size >> 10);
175    }
176  else
177    {
178      s = format (s, "(%d bytes)", size);
179    }
180  return (s);
181}
182
/**
 * format() helper: human-readable dump of a region.
 *
 * varargs: (svm_region_t *rp, int verbose). Always prints name,
 * base/size, user context and client pids. With verbose set, also
 * prints flags, heap pointers, the in-use VM ranges reconstructed
 * from the page-allocation bitmap, and (mheap builds) heap stats.
 */
u8 *
format_svm_region (u8 * s, va_list * args)
{
  svm_region_t *rp = va_arg (*args, svm_region_t *);
  int verbose = va_arg (*args, int);
  int i;
  uword lo, hi;

  s = format (s, "%s: base va 0x%x size 0x%x %U\n",
	      rp->region_name, rp->virtual_base,
	      rp->virtual_size, format_svm_size, rp->virtual_size);
  s = format (s, "  user_ctx 0x%x, bitmap_size %d\n",
	      rp->user_ctx, rp->bitmap_size);

  if (verbose)
    {
      s = format (s, "  flags: 0x%x %U\n", rp->flags,
		  format_svm_flags, rp->flags);
      s = format (s,
		  "  region_heap 0x%x data_base 0x%x data_heap 0x%x\n",
		  rp->region_heap, rp->data_base, rp->data_heap);
    }

  s = format (s, "  %d clients, pids: ", vec_len (rp->client_pids));

  for (i = 0; i < vec_len (rp->client_pids); i++)
    s = format (s, "%d ", rp->client_pids[i]);

  s = format (s, "\n");

  if (verbose)
    {
      /* Coalesce runs of set bitmap bits into [lo, hi] address
         ranges; ~0 is the "no run in progress" sentinel. */
      lo = hi = ~0;

      s = format (s, "  VM in use: ");

      for (i = 0; i < rp->bitmap_size; i++)
	{
	  if (clib_bitmap_get_no_check (rp->bitmap, i) != 0)
	    {
	      if (lo == ~0)
		{
		  /* Start of a new run. */
		  hi = lo = rp->virtual_base + i * MMAP_PAGESIZE;
		}
	      else
		{
		  hi = rp->virtual_base + i * MMAP_PAGESIZE;
		}
	    }
	  else
	    {
	      /* End of a run: print it, reset the sentinel. */
	      if (lo != ~0)
		{
		  hi = rp->virtual_base + i * MMAP_PAGESIZE - 1;
		  s = format (s, "   0x%x - 0x%x (%dk)\n", lo, hi,
			      (hi - lo) >> 10);
		  lo = hi = ~0;
		}
	    }
	}
#if USE_DLMALLOC == 0
      s = format (s, "  rgn heap stats: %U", format_mheap,
		  rp->region_heap, 0);
      if ((rp->flags & SVM_FLAGS_MHEAP) && rp->data_heap)
	{
	  s = format (s, "\n  data heap stats: %U", format_mheap,
		      rp->data_heap, 1);
	}
      s = format (s, "\n");
#endif
    }

  return (s);
}
257
258/*
259 * rnd_pagesize
260 * Round to a pagesize multiple, presumably 4k works
261 */
262static u64
263rnd_pagesize (u64 size)
264{
265  u64 rv;
266
267  rv = (size + (MMAP_PAGESIZE - 1)) & ~(MMAP_PAGESIZE - 1);
268  return (rv);
269}
270
271/*
272 * svm_data_region_setup
273 */
274static int
275svm_data_region_create (svm_map_region_args_t * a, svm_region_t * rp)
276{
277  int fd;
278  u8 junk = 0;
279  uword map_size;
280
281  map_size = rp->virtual_size - (MMAP_PAGESIZE +
282				 (a->pvt_heap_size ? a->pvt_heap_size :
283				  SVM_PVT_MHEAP_SIZE));
284
285  if (a->flags & SVM_FLAGS_FILE)
286    {
287      struct stat statb;
288
289      fd = open (a->backing_file, O_RDWR | O_CREAT, 0777);
290
291      if (fd < 0)
292	{
293	  clib_unix_warning ("open");
294	  return -1;
295	}
296
297      if (fstat (fd, &statb) < 0)
298	{
299	  clib_unix_warning ("fstat");
300	  close (fd);
301	  return -2;
302	}
303
304      if (statb.st_mode & S_IFREG)
305	{
306	  if (statb.st_size == 0)
307	    {
308	      if (lseek (fd, map_size, SEEK_SET) == (off_t) - 1)
309		{
310		  clib_unix_warning ("seek region size");
311		  close (fd);
312		  return -3;
313		}
314	      if (write (fd, &junk, 1) != 1)
315		{
316		  clib_unix_warning ("set region size");
317		  close (fd);
318		  return -3;
319		}
320	    }
321	  else
322	    {
323	      map_size = rnd_pagesize (statb.st_size);
324	    }
325	}
326      else
327	{
328	  map_size = a->backing_mmap_size;
329	}
330
331      ASSERT (map_size <= rp->virtual_size -
332	      (MMAP_PAGESIZE + SVM_PVT_MHEAP_SIZE));
333
334      if (mmap (rp->data_base, map_size, PROT_READ | PROT_WRITE,
335		MAP_SHARED | MAP_FIXED, fd, 0) == MAP_FAILED)
336	{
337	  clib_unix_warning ("mmap");
338	  close (fd);
339	  return -3;
340	}
341      close (fd);
342      rp->backing_file = (char *) format (0, "%s\0", a->backing_file);
343      rp->flags |= SVM_FLAGS_FILE;
344    }
345
346  if (a->flags & SVM_FLAGS_MHEAP)
347    {
348#if USE_DLMALLOC == 0
349      mheap_t *heap_header;
350      rp->data_heap =
351	mheap_alloc_with_flags ((void *) (rp->data_base), map_size,
352				MHEAP_FLAG_DISABLE_VM);
353      heap_header = mheap_header (rp->data_heap);
354      heap_header->flags |= MHEAP_FLAG_THREAD_SAFE;
355#else
356      rp->data_heap = create_mspace_with_base (rp->data_base,
357					       map_size, 1 /* locked */ );
358      mspace_disable_expand (rp->data_heap);
359#endif
360
361      rp->flags |= SVM_FLAGS_MHEAP;
362    }
363  return 0;
364}
365
/**
 * Attach (map) the data portion of an existing region, the client-side
 * counterpart of svm_data_region_create(). For file-backed regions,
 * sizes the file if it is empty and mmap()s it MAP_FIXED at
 * rp->data_base.
 *
 * @return 0 on success, negative on failure (warning already logged,
 *         file descriptor closed on every error path)
 */
static int
svm_data_region_map (svm_map_region_args_t * a, svm_region_t * rp)
{
  int fd;
  u8 junk = 0;
  uword map_size;
  struct stat statb;

  /* Data space = whole region minus the header page and private heap. */
  map_size = rp->virtual_size -
    (MMAP_PAGESIZE
     + (a->pvt_heap_size ? a->pvt_heap_size : SVM_PVT_MHEAP_SIZE));

  if (a->flags & SVM_FLAGS_FILE)
    {

      fd = open (a->backing_file, O_RDWR, 0777);

      if (fd < 0)
	{
	  clib_unix_warning ("open");
	  return -1;
	}

      if (fstat (fd, &statb) < 0)
	{
	  clib_unix_warning ("fstat");
	  close (fd);
	  return -2;
	}

      if (statb.st_mode & S_IFREG)
	{
	  if (statb.st_size == 0)
	    {
	      /* Empty file: extend it to map_size by writing one byte
	         at the end. */
	      if (lseek (fd, map_size, SEEK_SET) == (off_t) - 1)
		{
		  clib_unix_warning ("seek region size");
		  close (fd);
		  return -3;
		}
	      if (write (fd, &junk, 1) != 1)
		{
		  clib_unix_warning ("set region size");
		  close (fd);
		  return -3;
		}
	    }
	  else
	    {
	      /* Existing file: map exactly its (page-rounded) size. */
	      map_size = rnd_pagesize (statb.st_size);
	    }
	}
      else
	{
	  /* Not a regular file: caller supplies the mapping size. */
	  map_size = a->backing_mmap_size;
	}

      ASSERT (map_size <= rp->virtual_size
	      - (MMAP_PAGESIZE
		 +
		 (a->pvt_heap_size ? a->pvt_heap_size : SVM_PVT_MHEAP_SIZE)));

      if (mmap (rp->data_base, map_size, PROT_READ | PROT_WRITE,
		MAP_SHARED | MAP_FIXED, fd, 0) == MAP_FAILED)
	{
	  clib_unix_warning ("mmap");
	  close (fd);
	  return -3;
	}
      close (fd);
    }
  return 0;
}
439
440u8 *
441shm_name_from_svm_map_region_args (svm_map_region_args_t * a)
442{
443  u8 *shm_name;
444  int root_path_offset = 0;
445  int name_offset = 0;
446
447  if (a->root_path)
448    {
449      /* Tolerate present or absent slashes */
450      if (a->root_path[0] == '/')
451	root_path_offset++;
452
453      if (a->name[0] == '/')
454	name_offset = 1;
455
456      shm_name = format (0, "/%s-%s%c", &a->root_path[root_path_offset],
457			 &a->name[name_offset], 0);
458    }
459  else
460    shm_name = format (0, "%s%c", a->name, 0);
461  return (shm_name);
462}
463
/**
 * Initialize a freshly created (all-zero) mapped region: set up the
 * process-shared mutex and condvar, the private region heap, the
 * page-allocation bitmap, and the data portion. Publishing
 * rp->version last is what releases waiting clients in
 * svm_map_region().
 */
void
svm_region_init_mapped_region (svm_map_region_args_t * a, svm_region_t * rp)
{
  pthread_mutexattr_t attr;
  pthread_condattr_t cattr;
  int nbits, words, bit;
  int overhead_space;
  void *oldheap;
  uword data_base;
  ASSERT (rp);
  int rv;

  clib_memset (rp, 0, sizeof (*rp));

  /* Mutex and condvar must be PTHREAD_PROCESS_SHARED: they live in
     shared memory and are used across processes. */
  if (pthread_mutexattr_init (&attr))
    clib_unix_warning ("mutexattr_init");

  if (pthread_mutexattr_setpshared (&attr, PTHREAD_PROCESS_SHARED))
    clib_unix_warning ("mutexattr_setpshared");

  if (pthread_mutex_init (&rp->mutex, &attr))
    clib_unix_warning ("mutex_init");

  if (pthread_mutexattr_destroy (&attr))
    clib_unix_warning ("mutexattr_destroy");

  if (pthread_condattr_init (&cattr))
    clib_unix_warning ("condattr_init");

  if (pthread_condattr_setpshared (&cattr, PTHREAD_PROCESS_SHARED))
    clib_unix_warning ("condattr_setpshared");

  if (pthread_cond_init (&rp->condvar, &cattr))
    clib_unix_warning ("cond_init");

  if (pthread_condattr_destroy (&cattr))
    clib_unix_warning ("condattr_destroy");

  region_lock (rp, 1);

  rp->virtual_base = a->baseva;
  rp->virtual_size = a->size;

  /* Private region heap lives just past the header page. */
#if USE_DLMALLOC == 0
  rp->region_heap =
    mheap_alloc_with_flags (uword_to_pointer
			    (a->baseva + MMAP_PAGESIZE, void *),
			    (a->pvt_heap_size !=
			     0) ? a->pvt_heap_size : SVM_PVT_MHEAP_SIZE,
			    MHEAP_FLAG_DISABLE_VM);
#else
  rp->region_heap = create_mspace_with_base
    (uword_to_pointer (a->baseva + MMAP_PAGESIZE, void *),
     (a->pvt_heap_size !=
      0) ? a->pvt_heap_size : SVM_PVT_MHEAP_SIZE, 1 /* locked */ );

  mspace_disable_expand (rp->region_heap);
#endif

  oldheap = svm_push_pvt_heap (rp);

  rp->region_name = (char *) format (0, "%s%c", a->name, 0);
  vec_add1 (rp->client_pids, getpid ());

  /* One bitmap bit per page of virtual space. */
  nbits = rp->virtual_size / MMAP_PAGESIZE;

  ASSERT (nbits > 0);
  rp->bitmap_size = nbits;
  words = (nbits + BITS (uword) - 1) / BITS (uword);
  vec_validate (rp->bitmap, words - 1);

  /* Mark the header page and private heap pages as allocated. */
  overhead_space = MMAP_PAGESIZE /* header */  +
    ((a->pvt_heap_size != 0) ? a->pvt_heap_size : SVM_PVT_MHEAP_SIZE);

  bit = 0;
  data_base = (uword) rp->virtual_base;

  if (a->flags & SVM_FLAGS_NODATA)
    rp->flags |= SVM_FLAGS_NEED_DATA_INIT;

  do
    {
      clib_bitmap_set_no_check (rp->bitmap, bit, 1);
      bit++;
      overhead_space -= MMAP_PAGESIZE;
      data_base += MMAP_PAGESIZE;
    }
  while (overhead_space > 0);

  rp->data_base = (void *) data_base;

  /*
   * Note: although the POSIX spec guarantees that only one
   * process enters this block, we have to play games
   * to hold off clients until e.g. the mutex is ready
   */
  rp->version = SVM_VERSION;

  /* setup the data portion of the region */

  rv = svm_data_region_create (a, rp);
  if (rv)
    {
      clib_warning ("data_region_create: %d", rv);
    }

  region_unlock (rp);

  svm_pop_heap (oldheap);
}
574
575/*
576 * svm_map_region
577 */
/*
 * svm_map_region
 *
 * Map (create or attach) a shared-memory region.
 *
 * The O_CREAT|O_EXCL shm_open decides the race: the winner sizes the
 * segment, maps it MAP_FIXED at a->baseva and initializes it; losers
 * open the existing segment, wait for the creator to size it and
 * publish rp->version, then remap at the creator's recorded base
 * address. Also attempts to recover the region mutex if its holder
 * died.
 *
 * @return region pointer on success, 0 on failure.
 */
void *
svm_map_region (svm_map_region_args_t * a)
{
  int svm_fd;
  svm_region_t *rp;
  int deadman = 0;
  u8 junk = 0;
  void *oldheap;
  int rv;
  int pid_holding_region_lock;
  u8 *shm_name;
  int dead_region_recovery = 0;
  int time_left;
  struct stat stat;
  struct timespec ts, tsrem;

  /* Size must already be page-aligned. */
  ASSERT ((a->size & ~(MMAP_PAGESIZE - 1)) == a->size);
  ASSERT (a->name);

  shm_name = shm_name_from_svm_map_region_args (a);

  if (CLIB_DEBUG > 1)
    clib_warning ("[%d] map region %s: shm_open (%s)",
		  getpid (), a->name, shm_name);

  /* O_EXCL: exactly one process wins the creation race. */
  svm_fd = shm_open ((char *) shm_name, O_RDWR | O_CREAT | O_EXCL, 0777);

  if (svm_fd >= 0)
    {
      /* Creator path. */
      if (fchmod (svm_fd, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP) < 0)
	clib_unix_warning ("segment chmod");
      /* This turns out to fail harmlessly if the client starts first */
      if (fchown (svm_fd, a->uid, a->gid) < 0)
	clib_unix_warning ("segment chown [ok if client starts first]");

      vec_free (shm_name);

      /* Size the segment: seek to the end and write one byte. */
      if (lseek (svm_fd, a->size, SEEK_SET) == (off_t) - 1)
	{
	  clib_warning ("seek region size");
	  close (svm_fd);
	  return (0);
	}
      if (write (svm_fd, &junk, 1) != 1)
	{
	  clib_warning ("set region size");
	  close (svm_fd);
	  return (0);
	}

      rp = mmap (uword_to_pointer (a->baseva, void *), a->size,
		 PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, svm_fd, 0);

      if (rp == (svm_region_t *) MAP_FAILED)
	{
	  clib_unix_warning ("mmap create");
	  close (svm_fd);
	  return (0);
	}
      close (svm_fd);

      svm_region_init_mapped_region (a, rp);

      return ((void *) rp);
    }
  else
    {
      /* Attach path: someone else created (or is creating) it. */
      svm_fd = shm_open ((char *) shm_name, O_RDWR, 0777);

      vec_free (shm_name);

      if (svm_fd < 0)
	{
	  perror ("svm_region_map(mmap open)");
	  return (0);
	}

      /* Reset ownership in case the client started first */
      if (fchown (svm_fd, a->uid, a->gid) < 0)
	clib_unix_warning ("segment chown [ok if client starts first]");

      /* Wait (up to ~2s) for the creator to size the segment. */
      time_left = 20;
      while (1)
	{
	  if (0 != fstat (svm_fd, &stat))
	    {
	      clib_warning ("fstat failed: %d", errno);
	      close (svm_fd);
	      return (0);
	    }
	  if (stat.st_size > 0)
	    {
	      break;
	    }
	  if (0 == time_left)
	    {
	      clib_warning ("waiting for resize of shm file timed out");
	      close (svm_fd);
	      return (0);
	    }
	  /* Sleep 100ms, restarting if interrupted by a signal. */
	  ts.tv_sec = 0;
	  ts.tv_nsec = 100000000;
	  while (nanosleep (&ts, &tsrem) < 0)
	    ts = tsrem;
	  time_left--;
	}

      /* Map just the header page first, to learn base/size. */
      rp = mmap (0, MMAP_PAGESIZE,
		 PROT_READ | PROT_WRITE, MAP_SHARED, svm_fd, 0);

      if (rp == (svm_region_t *) MAP_FAILED)
	{
	  close (svm_fd);
	  clib_warning ("mmap");
	  return (0);
	}
      /*
       * We lost the footrace to create this region; make sure
       * the winner has crossed the finish line.
       */
      while (rp->version == 0 && deadman++ < 5)
	{
	  sleep (1);
	}

      /*
       * <bleep>-ed?
       */
      if (rp->version == 0)
	{
	  clib_warning ("rp->version %d not %d", rp->version, SVM_VERSION);
	  close (svm_fd);
	  munmap (rp, a->size);
	  return (0);
	}
      /* Remap now that the region has been placed */
      a->baseva = rp->virtual_base;
      a->size = rp->virtual_size;
      munmap (rp, MMAP_PAGESIZE);

      rp = (void *) mmap (uword_to_pointer (a->baseva, void *), a->size,
			  PROT_READ | PROT_WRITE,
			  MAP_SHARED | MAP_FIXED, svm_fd, 0);
      if ((uword) rp == (uword) MAP_FAILED)
	{
	  clib_unix_warning ("mmap");
	  close (svm_fd);
	  return (0);
	}

      close (svm_fd);

      if ((uword) rp != rp->virtual_base)
	{
	  clib_warning ("mmap botch");
	}

      /*
       * Try to fix the region mutex if it is held by
       * a dead process
       */
      pid_holding_region_lock = rp->mutex_owner_pid;
      if (pid_holding_region_lock && kill (pid_holding_region_lock, 0) < 0)
	{
	  clib_warning
	    ("region %s mutex held by dead pid %d, tag %d, force unlock",
	     rp->region_name, pid_holding_region_lock, rp->mutex_owner_tag);
	  /* owner pid is nonexistent */
	  /* NOTE(review): pokes glibc pthread internals directly;
	     non-portable by design — confirm against the target libc. */
	  rp->mutex.__data.__owner = 0;
	  rp->mutex.__data.__lock = 0;
	  dead_region_recovery = 1;
	}

      if (dead_region_recovery)
	clib_warning ("recovery: attempt to re-lock region");

      region_lock (rp, 2);
      oldheap = svm_push_pvt_heap (rp);
      vec_add1 (rp->client_pids, getpid ());

      if (dead_region_recovery)
	clib_warning ("recovery: attempt svm_data_region_map");

      rv = svm_data_region_map (a, rp);
      if (rv)
	{
	  clib_warning ("data_region_map: %d", rv);
	}

      if (dead_region_recovery)
	clib_warning ("unlock and continue");

      region_unlock (rp);

      svm_pop_heap (oldheap);

      return ((void *) rp);

    }
  return 0;			/* NOTREACHED */
}
779
780static void
781svm_mutex_cleanup (void)
782{
783  int i;
784  for (i = 0; i < nheld; i++)
785    {
786      pthread_mutex_unlock (mutexes_held[i]);
787    }
788}
789
/**
 * Map the root (global) region and, if this process created it,
 * initialize the main-region data structures (name hash, subregion
 * pool metadata). Idempotent per process via root_rp / refcount.
 *
 * @return 0 on success, -1 if already initialized or mapping failed.
 */
static int
svm_region_init_internal (svm_map_region_args_t * a)
{
  svm_region_t *rp;
  u64 ticks = clib_cpu_time_now ();
  uword randomize_baseva;

  /* guard against klutz calls */
  if (root_rp)
    return -1;

  root_rp_refcount++;

  atexit (svm_mutex_cleanup);

  /* Randomize the shared-VM base at init time */
  if (MMAP_PAGESIZE <= (4 << 10))
    randomize_baseva = (ticks & 15) * MMAP_PAGESIZE;
  else
    randomize_baseva = (ticks & 3) * MMAP_PAGESIZE;

  a->baseva += randomize_baseva;

  rp = svm_map_region (a);
  if (!rp)
    return -1;

  region_lock (rp, 3);

  /* Set up the main region data structures */
  if (rp->flags & SVM_FLAGS_NEED_DATA_INIT)
    {
      /* First process in: build the svm_main_region_t in the
         region's private heap. */
      svm_main_region_t *mp = 0;
      void *oldheap;

      rp->flags &= ~(SVM_FLAGS_NEED_DATA_INIT);

      oldheap = svm_push_pvt_heap (rp);
      vec_validate (mp, 0);
      mp->name_hash = hash_create_string (0, sizeof (uword));
      mp->root_path = a->root_path ? format (0, "%s%c", a->root_path, 0) : 0;
      mp->uid = a->uid;
      mp->gid = a->gid;
      rp->data_base = mp;
      svm_pop_heap (oldheap);
    }
  region_unlock (rp);
  root_rp = rp;

  return 0;
}
841
842void
843svm_region_init (void)
844{
845  svm_map_region_args_t _a, *a = &_a;
846
847  clib_memset (a, 0, sizeof (*a));
848  a->root_path = 0;
849  a->name = SVM_GLOBAL_REGION_NAME;
850  a->baseva = svm_get_global_region_base_va ();
851  a->size = SVM_GLOBAL_REGION_SIZE;
852  a->flags = SVM_FLAGS_NODATA;
853  a->uid = 0;
854  a->gid = 0;
855
856  svm_region_init_internal (a);
857}
858
859int
860svm_region_init_chroot (const char *root_path)
861{
862  svm_map_region_args_t _a, *a = &_a;
863
864  clib_memset (a, 0, sizeof (*a));
865  a->root_path = root_path;
866  a->name = SVM_GLOBAL_REGION_NAME;
867  a->baseva = svm_get_global_region_base_va ();
868  a->size = SVM_GLOBAL_REGION_SIZE;
869  a->flags = SVM_FLAGS_NODATA;
870  a->uid = 0;
871  a->gid = 0;
872
873  return svm_region_init_internal (a);
874}
875
876void
877svm_region_init_chroot_uid_gid (const char *root_path, int uid, int gid)
878{
879  svm_map_region_args_t _a, *a = &_a;
880
881  clib_memset (a, 0, sizeof (*a));
882  a->root_path = root_path;
883  a->name = SVM_GLOBAL_REGION_NAME;
884  a->baseva = svm_get_global_region_base_va ();
885  a->size = SVM_GLOBAL_REGION_SIZE;
886  a->flags = SVM_FLAGS_NODATA;
887  a->uid = uid;
888  a->gid = gid;
889
890  svm_region_init_internal (a);
891}
892
893void
894svm_region_init_args (svm_map_region_args_t * a)
895{
896  svm_region_init_internal (a);
897}
898
/**
 * Look up a named subregion; create it if it does not exist.
 *
 * Known regions are simply re-mapped. New regions get virtual space
 * carved out of the root region via a first-fit scan of its
 * page-allocation bitmap, and are registered in the root region's
 * name hash and subregion pool.
 *
 * @return region pointer, or 0 when the root region is out of VM.
 */
void *
svm_region_find_or_create (svm_map_region_args_t * a)
{
  svm_main_region_t *mp;
  svm_region_t *rp;
  uword need_nbits;
  int index, i;
  void *oldheap;
  uword *p;
  u8 *name;
  svm_subregion_t *subp;

  ASSERT (root_rp);

  /* Grow the request to cover header page + private heap, rounded. */
  a->size += MMAP_PAGESIZE +
    ((a->pvt_heap_size != 0) ? a->pvt_heap_size : SVM_PVT_MHEAP_SIZE);
  a->size = rnd_pagesize (a->size);

  region_lock (root_rp, 4);
  oldheap = svm_push_pvt_heap (root_rp);
  mp = root_rp->data_base;

  ASSERT (mp);

  /* Map the named region from the correct chroot environment */
  if (a->root_path == NULL)
    a->root_path = (char *) mp->root_path;

  /*
   * See if this region is already known. If it is, we're
   * almost done...
   */
  p = hash_get_mem (mp->name_hash, a->name);

  if (p)
    {
      rp = svm_map_region (a);
      region_unlock (root_rp);
      svm_pop_heap (oldheap);
      return rp;
    }

  /* Create the region. */
  ASSERT ((a->size & ~(MMAP_PAGESIZE - 1)) == a->size);

  need_nbits = a->size / MMAP_PAGESIZE;

  index = 1;			/* $$$ fixme, figure out how many bit to really skip */

  /*
   * Scan the virtual space allocation bitmap, looking for a large
   * enough chunk
   */
  do
    {
      if (clib_bitmap_get_no_check (root_rp->bitmap, index) == 0)
	{
	  /* Candidate start: check the remaining need_nbits-1 pages. */
	  for (i = 0; i < (need_nbits - 1); i++)
	    {
	      if (clib_bitmap_get_no_check (root_rp->bitmap, index + i) == 1)
		{
		  /* Hole too small: resume scanning past the clash. */
		  index = index + i;
		  goto next;
		}
	    }
	  break;
	}
      index++;
    next:;
    }
  while (index < root_rp->bitmap_size);

  /* Completely out of VM? */
  if (index >= root_rp->bitmap_size)
    {
      clib_warning ("region %s: not enough VM to allocate 0x%llx (%lld)",
		    root_rp->region_name, a->size, a->size);
      svm_pop_heap (oldheap);
      region_unlock (root_rp);
      return 0;
    }

  /*
   * Mark virtual space allocated
   */
#if CLIB_DEBUG > 1
  clib_warning ("set %d bits at index %d", need_nbits, index);
#endif

  for (i = 0; i < need_nbits; i++)
    {
      clib_bitmap_set_no_check (root_rp->bitmap, index + i, 1);
    }

  /* Place this region where it goes... */
  a->baseva = root_rp->virtual_base + index * MMAP_PAGESIZE;

  rp = svm_map_region (a);

  /* Register the new subregion in the root region's name hash. */
  pool_get (mp->subregions, subp);
  name = format (0, "%s%c", a->name, 0);
  subp->subregion_name = name;

  hash_set_mem (mp->name_hash, name, subp - mp->subregions);

  svm_pop_heap (oldheap);

  region_unlock (root_rp);

  return (rp);
}
1010
1011void
1012svm_region_unlink (svm_region_t * rp)
1013{
1014  svm_map_region_args_t _a, *a = &_a;
1015  svm_main_region_t *mp;
1016  u8 *shm_name;
1017
1018  ASSERT (root_rp);
1019  ASSERT (rp);
1020  ASSERT (vec_c_string_is_terminated (rp->region_name));
1021
1022  mp = root_rp->data_base;
1023  ASSERT (mp);
1024
1025  a->root_path = (char *) mp->root_path;
1026  a->name = rp->region_name;
1027  shm_name = shm_name_from_svm_map_region_args (a);
1028  if (CLIB_DEBUG > 1)
1029    clib_warning ("[%d] shm_unlink (%s)", getpid (), shm_name);
1030  shm_unlink ((const char *) shm_name);
1031  vec_free (shm_name);
1032}
1033
1034/*
1035 * svm_region_unmap
1036 *
1037 * Let go of the indicated region. If the calling process
1038 * is the last customer, throw it away completely.
1039 * The root region mutex guarantees atomicity with respect to
1040 * a new region client showing up at the wrong moment.
1041 */
/**
 * Detach the calling process from a region. If it was the last
 * client, tear the region down: return its pages to the root region
 * bitmap, remove it from the name hash / subregion pool, optionally
 * shm_unlink the backing segment (server side only), and munmap it.
 *
 * @param rp_arg    region to release
 * @param is_client nonzero: skip shm_unlink — the file may have been
 *                  recreated by a restarted server
 */
void
svm_region_unmap_internal (void *rp_arg, u8 is_client)
{
  int i, mypid = getpid ();
  int nclients_left;
  void *oldheap;
  uword virtual_base, virtual_size;
  svm_region_t *rp = rp_arg;
  char *name;

  /*
   * If we take a signal while holding one or more shared-memory
   * mutexes, we may end up back here from an otherwise
   * benign exit handler. Bail out to avoid a recursive
   * mutex screw-up.
   */
  if (nheld)
    return;

  ASSERT (rp);
  ASSERT (root_rp);

  if (CLIB_DEBUG > 1)
    clib_warning ("[%d] unmap region %s", getpid (), rp->region_name);

  /* Root lock first, then region lock: consistent order avoids
     deadlock with svm_region_find_or_create(). */
  region_lock (root_rp, 5);
  region_lock (rp, 6);

  oldheap = svm_push_pvt_heap (rp);	/* nb vec_delete() in the loop */

  /* Remove the caller from the list of mappers */
  for (i = 0; i < vec_len (rp->client_pids); i++)
    {
      if (rp->client_pids[i] == mypid)
	{
	  vec_delete (rp->client_pids, 1, i);
	  goto found;
	}
    }
  clib_warning ("pid %d AWOL", mypid);

found:

  svm_pop_heap (oldheap);

  nclients_left = vec_len (rp->client_pids);
  virtual_base = rp->virtual_base;
  virtual_size = rp->virtual_size;

  if (nclients_left == 0)
    {
      int index, nbits, i;
      svm_main_region_t *mp;
      uword *p;
      svm_subregion_t *subp;

      /* Kill the region, last guy on his way out */

      oldheap = svm_push_pvt_heap (root_rp);
      /* Copy the name out: rp's heap is about to go away. */
      name = vec_dup (rp->region_name);

      virtual_base = rp->virtual_base;
      virtual_size = rp->virtual_size;

      /* Figure out which bits to clear in the root region bitmap */
      index = (virtual_base - root_rp->virtual_base) / MMAP_PAGESIZE;

      nbits = (virtual_size + MMAP_PAGESIZE - 1) / MMAP_PAGESIZE;

#if CLIB_DEBUG > 1
      clib_warning ("clear %d bits at index %d", nbits, index);
#endif
      /* Give back the allocated VM */
      for (i = 0; i < nbits; i++)
	{
	  clib_bitmap_set_no_check (root_rp->bitmap, index + i, 0);
	}

      mp = root_rp->data_base;

      p = hash_get_mem (mp->name_hash, name);

      /* Better never happen ... */
      if (p == NULL)
	{
	  region_unlock (rp);
	  region_unlock (root_rp);
	  svm_pop_heap (oldheap);
	  clib_warning ("Region name '%s' not found?", name);
	  return;
	}

      /* Remove from the root region subregion pool */
      subp = mp->subregions + p[0];
      pool_put (mp->subregions, subp);

      hash_unset_mem (mp->name_hash, name);

      vec_free (name);

      region_unlock (rp);

      /* If a client asks for the cleanup, don't unlink the backing
       * file since we can't tell if it has been recreated. */
      if (!is_client)
	svm_region_unlink (rp);

      munmap ((void *) virtual_base, virtual_size);
      region_unlock (root_rp);
      svm_pop_heap (oldheap);
      return;
    }

  /* Other clients remain: just unmap our view. */
  region_unlock (rp);
  region_unlock (root_rp);

  munmap ((void *) virtual_base, virtual_size);
}
1160
/** Server-side unmap: may shm_unlink the backing file if last out. */
void
svm_region_unmap (void *rp_arg)
{
  svm_region_unmap_internal (rp_arg, 0 /* is_client */ );
}
1166
/** Client-side unmap: never shm_unlinks the backing file. */
void
svm_region_unmap_client (void *rp_arg)
{
  svm_region_unmap_internal (rp_arg, 1 /* is_client */ );
}
1172
1173/*
1174 * svm_region_exit
1175 */
/*
 * svm_region_exit
 *
 * Detach this process from the root region. Refcounted, so nested
 * init/exit pairs are safe; actual teardown happens only when the
 * refcount reaches zero. The server variant also shm_unlinks the
 * root segment when no clients remain.
 */
static void
svm_region_exit_internal (u8 is_client)
{
  void *oldheap;
  int i, mypid = getpid ();
  uword virtual_base, virtual_size;

  /* It felt so nice we did it twice... */
  if (root_rp == 0)
    return;

  if (--root_rp_refcount > 0)
    return;

  /*
   * If we take a signal while holding one or more shared-memory
   * mutexes, we may end up back here from an otherwise
   * benign exit handler. Bail out to avoid a recursive
   * mutex screw-up.
   */
  if (nheld)
    return;

  region_lock (root_rp, 7);
  oldheap = svm_push_pvt_heap (root_rp);

  /* Save base/size: munmap happens after root_rp is cleared. */
  virtual_base = root_rp->virtual_base;
  virtual_size = root_rp->virtual_size;

  for (i = 0; i < vec_len (root_rp->client_pids); i++)
    {
      if (root_rp->client_pids[i] == mypid)
	{
	  vec_delete (root_rp->client_pids, 1, i);
	  goto found;
	}
    }
  clib_warning ("pid %d AWOL", mypid);

found:

  if (!is_client && vec_len (root_rp->client_pids) == 0)
    svm_region_unlink (root_rp);

  region_unlock (root_rp);
  svm_pop_heap (oldheap);

  root_rp = 0;
  munmap ((void *) virtual_base, virtual_size);
}
1226
/** Server-side root-region detach; may unlink the root segment. */
void
svm_region_exit (void)
{
  svm_region_exit_internal (0 /* is_client */ );
}
1232
/** Client-side root-region detach; never unlinks the root segment. */
void
svm_region_exit_client (void)
{
  svm_region_exit_internal (1 /* is_client */ );
}
1238
/**
 * Remove dead clients (pids that no longer respond to kill(pid, 0))
 * from a region's client list. Caller must hold the region mutex.
 */
void
svm_client_scan_this_region_nolock (svm_region_t * rp)
{
  int j;
  int mypid = getpid ();
  void *oldheap;

  for (j = 0; j < vec_len (rp->client_pids); j++)
    {
      if (mypid == rp->client_pids[j])
	continue;
      /* kill(pid, 0) probes existence without sending a signal. */
      if (rp->client_pids[j] && (kill (rp->client_pids[j], 0) < 0))
	{
	  clib_warning ("%s: cleanup ghost pid %d",
			rp->region_name, rp->client_pids[j]);
	  /* nb: client vec in rp->region_heap */
	  oldheap = svm_push_pvt_heap (rp);
	  vec_delete (rp->client_pids, 1, j);
	  /* Re-examine this slot: vec_delete shifted elements down. */
	  j--;
	  svm_pop_heap (oldheap);
	}
    }
}
1262
1263
1264/*
1265 * Scan svm regions for dead clients
1266 */
1267void
1268svm_client_scan (const char *root_path)
1269{
1270  int i, j;
1271  svm_main_region_t *mp;
1272  svm_map_region_args_t *a = 0;
1273  svm_region_t *root_rp;
1274  svm_region_t *rp;
1275  svm_subregion_t *subp;
1276  u8 *name = 0;
1277  u8 **svm_names = 0;
1278  void *oldheap;
1279  int mypid = getpid ();
1280
1281  vec_validate (a, 0);
1282
1283  svm_region_init_chroot (root_path);
1284
1285  root_rp = svm_get_root_rp ();
1286
1287  pthread_mutex_lock (&root_rp->mutex);
1288
1289  mp = root_rp->data_base;
1290
1291  for (j = 0; j < vec_len (root_rp->client_pids); j++)
1292    {
1293      if (mypid == root_rp->client_pids[j])
1294	continue;
1295      if (root_rp->client_pids[j] && (kill (root_rp->client_pids[j], 0) < 0))
1296	{
1297	  clib_warning ("%s: cleanup ghost pid %d",
1298			root_rp->region_name, root_rp->client_pids[j]);
1299	  /* nb: client vec in root_rp->region_heap */
1300	  oldheap = svm_push_pvt_heap (root_rp);
1301	  vec_delete (root_rp->client_pids, 1, j);
1302	  j--;
1303	  svm_pop_heap (oldheap);
1304	}
1305    }
1306
1307  /*
1308   * Snapshoot names, can't hold root rp mutex across
1309   * find_or_create.
1310   */
1311  /* *INDENT-OFF* */
1312  pool_foreach (subp, mp->subregions, ({
1313        name = vec_dup (subp->subregion_name);
1314        vec_add1(svm_names, name);
1315      }));
1316  /* *INDENT-ON* */
1317
1318  pthread_mutex_unlock (&root_rp->mutex);
1319
1320  for (i = 0; i < vec_len (svm_names); i++)
1321    {
1322      vec_validate (a, 0);
1323      a->root_path = root_path;
1324      a->name = (char *) svm_names[i];
1325      rp = svm_region_find_or_create (a);
1326      if (rp)
1327	{
1328	  pthread_mutex_lock (&rp->mutex);
1329
1330	  svm_client_scan_this_region_nolock (rp);
1331
1332	  pthread_mutex_unlock (&rp->mutex);
1333	  svm_region_unmap (rp);
1334	  vec_free (svm_names[i]);
1335	}
1336      vec_free (a);
1337    }
1338  vec_free (svm_names);
1339
1340  svm_region_exit ();
1341
1342  vec_free (a);
1343}
1344
1345/*
1346 * fd.io coding-style-patch-verification: ON
1347 *
1348 * Local Variables:
1349 * eval: (c-set-style "gnu")
1350 * End:
1351 */
1352