1/*
2 Itay Marom
3 Cisco Systems, Inc.
4*/
5
6/*
7Copyright (c) 2015-2015 Cisco Systems, Inc.
8
9Licensed under the Apache License, Version 2.0 (the "License");
10you may not use this file except in compliance with the License.
11You may obtain a copy of the License at
12
13    http://www.apache.org/licenses/LICENSE-2.0
14
15Unless required by applicable law or agreed to in writing, software
16distributed under the License is distributed on an "AS IS" BASIS,
17WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18See the License for the specific language governing permissions and
19limitations under the License.
20*/
21
22#include "trex_watchdog.h"
23#include "trex_exception.h"
24
25#include <assert.h>
26#include <unistd.h>
27#include <sstream>
28
29#include <sys/ptrace.h>
30#include <execinfo.h>
31#include <cxxabi.h>
32#include <dlfcn.h>
33#include <pthread.h>
34#include <signal.h>
35#include <string.h>
36#include <iostream>
37#include  <stdexcept>
38
39
40static TrexMonitor *global_monitor;
41
42const char *get_exe_name();
43
/**
 * run a command through popen() and capture everything it writes to stdout
 *
 * @param cmd  command line handed to popen() in read mode
 * @return     the captured output (empty if the command printed nothing)
 * @throws std::runtime_error if the pipe could not be opened
 */
std::string exec(const char* cmd) {
    /* validate the raw handle before wrapping it: a shared_ptr invokes its
       deleter even when it holds NULL, and pclose(NULL) is not a valid call */
    FILE *raw_pipe = popen(cmd, "r");
    if (raw_pipe == NULL) {
        throw std::runtime_error("popen() failed!");
    }
    std::shared_ptr<FILE> pipe(raw_pipe, pclose);

    char buffer[128];
    std::string result;

    /* loop on the fgets() result: it returns NULL on EOF and on a read error
       alike, whereas polling feof() would spin forever after an error */
    while (fgets(buffer, sizeof(buffer), pipe.get()) != NULL) {
        result += buffer;
    }
    return result;
}
56
57// This function produces a stack backtrace with demangled function & method names.
58__attribute__((noinline))
59std::string Backtrace(int skip = 1)
60{
61    void *callstack[128];
62    const int nMaxFrames = sizeof(callstack) / sizeof(callstack[0]);
63    char buf[1024];
64    int nFrames = backtrace(callstack, nMaxFrames);
65    char **symbols = backtrace_symbols(callstack, nFrames);
66
67    std::ostringstream trace_buf;
68    for (int i = skip; i < nFrames; i++) {
69
70        Dl_info info;
71        if (dladdr(callstack[i], &info) && info.dli_sname) {
72            char *demangled = NULL;
73            int status = -1;
74            if (info.dli_sname[0] == '_')
75                demangled = abi::__cxa_demangle(info.dli_sname, NULL, 0, &status);
76            snprintf(buf, sizeof(buf), "%-3d %*p %s + %zd\n",
77                     i, int(2 + sizeof(void*) * 2), callstack[i],
78                     status == 0 ? demangled :
79                     info.dli_sname == 0 ? symbols[i] : info.dli_sname,
80                     (char *)callstack[i] - (char *)info.dli_saddr);
81            free(demangled);
82        } else {
83            snprintf(buf, sizeof(buf), "%-3d %*p %s\n",
84                     i, int(2 + sizeof(void*) * 2), callstack[i], symbols[i]);
85        }
86        trace_buf << buf;
87    }
88    free(symbols);
89    if (nFrames == nMaxFrames)
90        trace_buf << "[truncated]\n";
91
92    /* add the addr2line info */
93    std::stringstream addr2line;
94
95    addr2line << "/usr/bin/addr2line -s -e " << get_exe_name() << " ";
96    for (int i = skip; i < nFrames; i++) {
97        addr2line << callstack[i] << " ";
98    }
99
100    trace_buf << "\n\n*** addr2line information follows ***\n\n";
101    try {
102        trace_buf << exec(addr2line.str().c_str());
103    } catch (std::runtime_error &e) {
104        trace_buf << "\n" << e.what();
105    }
106
107    return trace_buf.str();
108}
109
110__attribute__((noinline))
111static void _callstack_signal_handler(int signr, siginfo_t *info, void *secret) {
112    std::stringstream ss;
113
114    double now = now_sec();
115
116    ss << "WATCHDOG: task '" << global_monitor->get_name() << "' has not responded for more than " << global_monitor->get_interval(now) << " seconds - timeout is " << global_monitor->get_timeout_sec() << " seconds";
117
118    std::string backtrace = Backtrace();
119    ss << "\n\n*** traceback follows ***\n\n" << backtrace << "\n";
120
121    throw std::runtime_error(ss.str());
122}
123
124/**************************************
125 * Trex Monitor object
126 *************************************/
127
128void TrexMonitor::create(const std::string &name, double timeout_sec) {
129    m_tid              = pthread_self();
130    m_name             = name;
131    m_timeout_sec      = timeout_sec;
132    m_base_timeout_sec = timeout_sec;
133    m_tickled          = true;
134    m_ts               = 0;
135    m_io_ref_cnt       = 0;
136
137    /* the rare case of m_active_time_sec set out of order with tickled */
138    asm volatile("mfence" ::: "memory");
139}
140
141/**************************************
142 * Trex watchdog
143 *************************************/
144
145void TrexWatchDog::init(bool enable){
146    m_enable = enable;
147    if (m_enable) {
148        register_signal();
149    }
150}
151
152/**
153 * get pointer to monitor of current thread
154 * (NULL if no monitor)
155 *
156 */
157TrexMonitor * TrexWatchDog::get_current_monitor() {
158
159    for (int i = 0; i < m_mon_count; i++) {
160        if ( m_monitors[i]->get_tid() == pthread_self() ) {
161            return m_monitors[i];
162        }
163    }
164
165    return NULL;
166}
167
168
169/**
170 * register a monitor
171 * this function is thread safe
172 *
173 */
174void TrexWatchDog::register_monitor(TrexMonitor *monitor) {
175    if (!m_enable){
176        return;
177    }
178
179    /* critical section start */
180    std::unique_lock<std::mutex> lock(m_lock);
181
182    /* sanity - not a must but why not... */
183    TrexMonitor * cur_monitor = get_current_monitor();
184    if ( cur_monitor != NULL || cur_monitor == monitor ) {
185        std::stringstream ss;
186        ss << "WATCHDOG: double register detected\n\n" << Backtrace();
187        throw TrexException(ss.str());
188    }
189
190    /* check capacity */
191    if (m_mon_count == MAX_MONITORS) {
192        std::stringstream ss;
193        ss << "WATCHDOG: too many registered monitors\n\n" << Backtrace();
194        throw TrexException(ss.str());
195    }
196
197    /* add monitor */
198    m_monitors[m_mon_count++] = monitor;
199
200    /* critical section end */
201    lock.unlock();
202
203}
204
205void TrexWatchDog::start() {
206
207    if (!m_enable){
208        return ;
209    }
210
211    m_active = true;
212    m_thread = new std::thread(&TrexWatchDog::_main, this);
213    if (!m_thread) {
214        throw TrexException("unable to create watchdog thread");
215    }
216}
217
218void TrexWatchDog::stop() {
219
220    if (!m_enable){
221        return ;
222    }
223
224    m_active = false;
225
226    if (m_thread) {
227        m_thread->join();
228        delete m_thread;
229        m_thread = NULL;
230    }
231}
232
233
234
235/**
236 * main loop
237 *
238 */
239void TrexWatchDog::_main() {
240
241    pthread_setname_np(pthread_self(), "Trex Watchdog");
242
243    assert(m_enable == true);
244
245    /* start main loop */
246    while (m_active) {
247
248        dsec_t now = now_sec();
249
250        /* to be on the safe side - read the count with a lock */
251        std::unique_lock<std::mutex> lock(m_lock);
252        int count = m_mon_count;
253        lock.unlock();
254
255        for (int i = 0; i < count; i++) {
256            TrexMonitor *monitor = m_monitors[i];
257
258            /* skip non expired monitors */
259            if (!monitor->is_expired(now)) {
260                continue;
261            }
262
263            /* it has expired but it was tickled */
264            if (monitor->is_tickled()) {
265                monitor->reset(now);
266                continue;
267            }
268
269            /* crash */
270            global_monitor = monitor;
271
272            pthread_kill(monitor->get_tid(), SIGALRM);
273
274            /* nothing to do more... the other thread will terminate, but if not - we terminate */
275            sleep(5);
276            fprintf(stderr, "\n\n*** WATCHDOG violation detected on task '%s' which have failed to response to the signal ***\n\n", monitor->get_name().c_str());
277            abort();
278        }
279
280        /* the internal clock - 250 ms */
281        delay(250);
282    }
283}
284
285
286void TrexWatchDog::register_signal() {
287    /* do this once */
288    if (g_signal_init) {
289        return;
290    }
291
292    /* register a handler on SIG ALARM */
293    struct sigaction sa;
294    memset (&sa, '\0', sizeof(sa));
295
296    sa.sa_flags = SA_SIGINFO;
297    sa.sa_sigaction = _callstack_signal_handler;
298
299    int rc = sigaction(SIGALRM , &sa, NULL);
300    assert(rc == 0);
301
302    g_signal_init = true;
303}
304
305bool TrexWatchDog::g_signal_init = false;
306
307