1b639fb45Simarom/*
2b639fb45Simarom Itay Marom
3b639fb45Simarom Cisco Systems, Inc.
4b639fb45Simarom*/
5b639fb45Simarom
6b639fb45Simarom/*
7b639fb45SimaromCopyright (c) 2015-2015 Cisco Systems, Inc.
8b639fb45Simarom
9b639fb45SimaromLicensed under the Apache License, Version 2.0 (the "License");
10b639fb45Simaromyou may not use this file except in compliance with the License.
11b639fb45SimaromYou may obtain a copy of the License at
12b639fb45Simarom
13b639fb45Simarom    http://www.apache.org/licenses/LICENSE-2.0
14b639fb45Simarom
15b639fb45SimaromUnless required by applicable law or agreed to in writing, software
16b639fb45Simaromdistributed under the License is distributed on an "AS IS" BASIS,
17b639fb45SimaromWITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18b639fb45SimaromSee the License for the specific language governing permissions and
19b639fb45Simaromlimitations under the License.
20b639fb45Simarom*/
21b639fb45Simarom
22b639fb45Simarom#include "trex_watchdog.h"
23b639fb45Simarom#include "trex_exception.h"
24b639fb45Simarom
25b639fb45Simarom#include <assert.h>
26b639fb45Simarom#include <unistd.h>
27b639fb45Simarom#include <sstream>
28b639fb45Simarom
293c4a29e1Simarom#include <sys/ptrace.h>
303c4a29e1Simarom#include <execinfo.h>
313c4a29e1Simarom#include <cxxabi.h>
323c4a29e1Simarom#include <dlfcn.h>
333c4a29e1Simarom#include <pthread.h>
343c4a29e1Simarom#include <signal.h>
353c4a29e1Simarom#include <string.h>
363c4a29e1Simarom#include <iostream>
373c4a29e1Simarom#include  <stdexcept>
383c4a29e1Simarom
398feef53bSimarom
403ca8be80Simaromstatic TrexMonitor *global_monitor;
413c4a29e1Simarom
423c4a29e1Simaromconst char *get_exe_name();
433c4a29e1Simarom
/**
 * run a command through popen() and collect everything it writes to stdout
 *
 * @param cmd  command line handed to popen() in read mode
 * @return     the captured output (empty if the command printed nothing)
 * @throws std::runtime_error if popen() fails
 */
std::string exec(const char* cmd) {
    char buffer[128];
    std::string result;

    /* check the raw handle BEFORE handing it to the smart pointer:
       a shared_ptr invokes its deleter even for a null pointer, so wrapping
       a failed popen() would end up calling pclose(NULL) */
    FILE *raw = popen(cmd, "r");
    if (raw == NULL) {
        throw std::runtime_error("popen() failed!");
    }
    std::shared_ptr<FILE> pipe(raw, pclose);

    /* fgets() returns NULL on EOF *and* on a read error - looping on its
       result (instead of feof()) cannot spin forever on an error */
    while (fgets(buffer, sizeof(buffer), pipe.get()) != NULL) {
        result += buffer;
    }

    return result;
}
563c4a29e1Simarom
573c4a29e1Simarom// This function produces a stack backtrace with demangled function & method names.
583c4a29e1Simarom__attribute__((noinline))
593c4a29e1Simaromstd::string Backtrace(int skip = 1)
603c4a29e1Simarom{
613c4a29e1Simarom    void *callstack[128];
623c4a29e1Simarom    const int nMaxFrames = sizeof(callstack) / sizeof(callstack[0]);
633c4a29e1Simarom    char buf[1024];
643c4a29e1Simarom    int nFrames = backtrace(callstack, nMaxFrames);
653c4a29e1Simarom    char **symbols = backtrace_symbols(callstack, nFrames);
663c4a29e1Simarom
673c4a29e1Simarom    std::ostringstream trace_buf;
683c4a29e1Simarom    for (int i = skip; i < nFrames; i++) {
693c4a29e1Simarom
703c4a29e1Simarom        Dl_info info;
713c4a29e1Simarom        if (dladdr(callstack[i], &info) && info.dli_sname) {
723c4a29e1Simarom            char *demangled = NULL;
733c4a29e1Simarom            int status = -1;
743c4a29e1Simarom            if (info.dli_sname[0] == '_')
753c4a29e1Simarom                demangled = abi::__cxa_demangle(info.dli_sname, NULL, 0, &status);
763c4a29e1Simarom            snprintf(buf, sizeof(buf), "%-3d %*p %s + %zd\n",
773c4a29e1Simarom                     i, int(2 + sizeof(void*) * 2), callstack[i],
783c4a29e1Simarom                     status == 0 ? demangled :
793c4a29e1Simarom                     info.dli_sname == 0 ? symbols[i] : info.dli_sname,
803c4a29e1Simarom                     (char *)callstack[i] - (char *)info.dli_saddr);
813c4a29e1Simarom            free(demangled);
823c4a29e1Simarom        } else {
833c4a29e1Simarom            snprintf(buf, sizeof(buf), "%-3d %*p %s\n",
843c4a29e1Simarom                     i, int(2 + sizeof(void*) * 2), callstack[i], symbols[i]);
853c4a29e1Simarom        }
863c4a29e1Simarom        trace_buf << buf;
873c4a29e1Simarom    }
883c4a29e1Simarom    free(symbols);
893c4a29e1Simarom    if (nFrames == nMaxFrames)
903c4a29e1Simarom        trace_buf << "[truncated]\n";
913c4a29e1Simarom
923c4a29e1Simarom    /* add the addr2line info */
933c4a29e1Simarom    std::stringstream addr2line;
943c4a29e1Simarom
954127b043Simarom    addr2line << "/usr/bin/addr2line -s -e " << get_exe_name() << " ";
963c4a29e1Simarom    for (int i = skip; i < nFrames; i++) {
973c4a29e1Simarom        addr2line << callstack[i] << " ";
983c4a29e1Simarom    }
993c4a29e1Simarom
1003c4a29e1Simarom    trace_buf << "\n\n*** addr2line information follows ***\n\n";
1013c4a29e1Simarom    try {
1023c4a29e1Simarom        trace_buf << exec(addr2line.str().c_str());
1033c4a29e1Simarom    } catch (std::runtime_error &e) {
1043c4a29e1Simarom        trace_buf << "\n" << e.what();
1053c4a29e1Simarom    }
1063c4a29e1Simarom
1073c4a29e1Simarom    return trace_buf.str();
1083c4a29e1Simarom}
1093c4a29e1Simarom
1103c4a29e1Simarom__attribute__((noinline))
1113c4a29e1Simaromstatic void _callstack_signal_handler(int signr, siginfo_t *info, void *secret) {
1123c4a29e1Simarom    std::stringstream ss;
1133c4a29e1Simarom
1143c4a29e1Simarom    double now = now_sec();
1153c4a29e1Simarom
1163ca8be80Simarom    ss << "WATCHDOG: task '" << global_monitor->get_name() << "' has not responded for more than " << global_monitor->get_interval(now) << " seconds - timeout is " << global_monitor->get_timeout_sec() << " seconds";
1173c4a29e1Simarom
1183c4a29e1Simarom    std::string backtrace = Backtrace();
1193c4a29e1Simarom    ss << "\n\n*** traceback follows ***\n\n" << backtrace << "\n";
1203c4a29e1Simarom
1213c4a29e1Simarom    throw std::runtime_error(ss.str());
1223c4a29e1Simarom}
1233c4a29e1Simarom
1243ca8be80Simarom/**************************************
1253ca8be80Simarom * Trex Monitor object
1263ca8be80Simarom *************************************/
1273ca8be80Simarom
1283ca8be80Simaromvoid TrexMonitor::create(const std::string &name, double timeout_sec) {
129fff19c8cSimarom    m_tid              = pthread_self();
130fff19c8cSimarom    m_name             = name;
131fff19c8cSimarom    m_timeout_sec      = timeout_sec;
1329ad36b3dSimarom    m_base_timeout_sec = timeout_sec;
133fff19c8cSimarom    m_tickled          = true;
134fff19c8cSimarom    m_ts               = 0;
1359ad36b3dSimarom    m_io_ref_cnt       = 0;
136fff19c8cSimarom
137fff19c8cSimarom    /* the rare case of m_active_time_sec set out of order with tickled */
138fff19c8cSimarom    asm volatile("mfence" ::: "memory");
1393ca8be80Simarom}
1403ca8be80Simarom
1413ca8be80Simarom/**************************************
1423ca8be80Simarom * Trex watchdog
1433ca8be80Simarom *************************************/
144ca8b613fSHanoh Haim
145ca8b613fSHanoh Haimvoid TrexWatchDog::init(bool enable){
1463ca8be80Simarom    m_enable = enable;
147ca8b613fSHanoh Haim    if (m_enable) {
148ca8b613fSHanoh Haim        register_signal();
149ca8b613fSHanoh Haim    }
150ca8b613fSHanoh Haim}
151ca8b613fSHanoh Haim
152f5350dfbSYaroslav Brustinov/**
153f5350dfbSYaroslav Brustinov * get pointer to monitor of current thread
154f5350dfbSYaroslav Brustinov * (NULL if no monitor)
155f5350dfbSYaroslav Brustinov *
156f5350dfbSYaroslav Brustinov */
157f5350dfbSYaroslav BrustinovTrexMonitor * TrexWatchDog::get_current_monitor() {
158f5350dfbSYaroslav Brustinov
159f5350dfbSYaroslav Brustinov    for (int i = 0; i < m_mon_count; i++) {
160f5350dfbSYaroslav Brustinov        if ( m_monitors[i]->get_tid() == pthread_self() ) {
161a56ae96bSYaroslav Brustinov            return m_monitors[i];
162f5350dfbSYaroslav Brustinov        }
163f5350dfbSYaroslav Brustinov    }
164f5350dfbSYaroslav Brustinov
165a56ae96bSYaroslav Brustinov    return NULL;
166f5350dfbSYaroslav Brustinov}
167f5350dfbSYaroslav Brustinov
168f5350dfbSYaroslav Brustinov
1693c4a29e1Simarom/**
1703c4a29e1Simarom * register a monitor
1713c4a29e1Simarom * this function is thread safe
1723c4a29e1Simarom *
1733c4a29e1Simarom */
1743ca8be80Simaromvoid TrexWatchDog::register_monitor(TrexMonitor *monitor) {
175ca8b613fSHanoh Haim    if (!m_enable){
1763ca8be80Simarom        return;
177ca8b613fSHanoh Haim    }
178b639fb45Simarom
1793c4a29e1Simarom    /* critical section start */
1803c4a29e1Simarom    std::unique_lock<std::mutex> lock(m_lock);
1813c4a29e1Simarom
1823ca8be80Simarom    /* sanity - not a must but why not... */
183f5350dfbSYaroslav Brustinov    TrexMonitor * cur_monitor = get_current_monitor();
184f5350dfbSYaroslav Brustinov    if ( cur_monitor != NULL || cur_monitor == monitor ) {
185f5350dfbSYaroslav Brustinov        std::stringstream ss;
186f5350dfbSYaroslav Brustinov        ss << "WATCHDOG: double register detected\n\n" << Backtrace();
187f5350dfbSYaroslav Brustinov        throw TrexException(ss.str());
1883c4a29e1Simarom    }
1893c4a29e1Simarom
1903ca8be80Simarom    /* check capacity */
1913ca8be80Simarom    if (m_mon_count == MAX_MONITORS) {
1923ca8be80Simarom        std::stringstream ss;
1933ca8be80Simarom        ss << "WATCHDOG: too many registered monitors\n\n" << Backtrace();
1943ca8be80Simarom        throw TrexException(ss.str());
1953ca8be80Simarom    }
196b639fb45Simarom
1973ca8be80Simarom    /* add monitor */
1983ca8be80Simarom    m_monitors[m_mon_count++] = monitor;
1993c4a29e1Simarom
2003c4a29e1Simarom    /* critical section end */
2013c4a29e1Simarom    lock.unlock();
2023c4a29e1Simarom
2033c4a29e1Simarom}
2043c4a29e1Simarom
2053c4a29e1Simaromvoid TrexWatchDog::start() {
2063c4a29e1Simarom
207ca8b613fSHanoh Haim    if (!m_enable){
208ca8b613fSHanoh Haim        return ;
209ca8b613fSHanoh Haim    }
210ca8b613fSHanoh Haim
211b639fb45Simarom    m_active = true;
2123c4a29e1Simarom    m_thread = new std::thread(&TrexWatchDog::_main, this);
213b639fb45Simarom    if (!m_thread) {
214b639fb45Simarom        throw TrexException("unable to create watchdog thread");
215b639fb45Simarom    }
216b639fb45Simarom}
217b639fb45Simarom
2183c4a29e1Simaromvoid TrexWatchDog::stop() {
2193ca8be80Simarom
220ca8b613fSHanoh Haim    if (!m_enable){
221ca8b613fSHanoh Haim        return ;
222ca8b613fSHanoh Haim    }
223ca8b613fSHanoh Haim
2243c4a29e1Simarom    m_active = false;
2253c4a29e1Simarom
2263c4a29e1Simarom    if (m_thread) {
2273c4a29e1Simarom        m_thread->join();
2283c4a29e1Simarom        delete m_thread;
2293c4a29e1Simarom        m_thread = NULL;
2303c4a29e1Simarom    }
231b639fb45Simarom}
232b639fb45Simarom
2333c4a29e1Simarom
2343c4a29e1Simarom
235b639fb45Simarom/**
236b639fb45Simarom * main loop
237b639fb45Simarom *
238b639fb45Simarom */
2393c4a29e1Simaromvoid TrexWatchDog::_main() {
2403c4a29e1Simarom
2415ab7411fSimarom    pthread_setname_np(pthread_self(), "Trex Watchdog");
2425ab7411fSimarom
2433ca8be80Simarom    assert(m_enable == true);
2443c4a29e1Simarom
2453c4a29e1Simarom    /* start main loop */
246b639fb45Simarom    while (m_active) {
247b639fb45Simarom
248b639fb45Simarom        dsec_t now = now_sec();
249b639fb45Simarom
250b4a17c0dSimarom        /* to be on the safe side - read the count with a lock */
251b4a17c0dSimarom        std::unique_lock<std::mutex> lock(m_lock);
2523ca8be80Simarom        int count = m_mon_count;
253b4a17c0dSimarom        lock.unlock();
2543ca8be80Simarom
2553ca8be80Simarom        for (int i = 0; i < count; i++) {
2563ca8be80Simarom            TrexMonitor *monitor = m_monitors[i];
2578feef53bSimarom
2589ad36b3dSimarom            /* skip non expired monitors */
2599ad36b3dSimarom            if (!monitor->is_expired(now)) {
2608feef53bSimarom                continue;
2618feef53bSimarom            }
2629ad36b3dSimarom
2639ad36b3dSimarom            /* it has expired but it was tickled */
2643ca8be80Simarom            if (monitor->is_tickled()) {
2653ca8be80Simarom                monitor->reset(now);
266b639fb45Simarom                continue;
267b639fb45Simarom            }
268b639fb45Simarom
2699ad36b3dSimarom            /* crash */
2709ad36b3dSimarom            global_monitor = monitor;
2713c4a29e1Simarom
2729ad36b3dSimarom            pthread_kill(monitor->get_tid(), SIGALRM);
273b639fb45Simarom
2749ad36b3dSimarom            /* nothing to do more... the other thread will terminate, but if not - we terminate */
2759ad36b3dSimarom            sleep(5);
2769ad36b3dSimarom            fprintf(stderr, "\n\n*** WATCHDOG violation detected on task '%s' which have failed to response to the signal ***\n\n", monitor->get_name().c_str());
2779ad36b3dSimarom            abort();
278b639fb45Simarom        }
279b639fb45Simarom
2803c4a29e1Simarom        /* the internal clock - 250 ms */
2813c4a29e1Simarom        delay(250);
282b639fb45Simarom    }
283b639fb45Simarom}
284b639fb45Simarom
2853ca8be80Simarom
2863ca8be80Simaromvoid TrexWatchDog::register_signal() {
2873ca8be80Simarom    /* do this once */
2883ca8be80Simarom    if (g_signal_init) {
2893ca8be80Simarom        return;
2903ca8be80Simarom    }
2913ca8be80Simarom
2923ca8be80Simarom    /* register a handler on SIG ALARM */
2933ca8be80Simarom    struct sigaction sa;
2943ca8be80Simarom    memset (&sa, '\0', sizeof(sa));
2953ca8be80Simarom
2963ca8be80Simarom    sa.sa_flags = SA_SIGINFO;
2973ca8be80Simarom    sa.sa_sigaction = _callstack_signal_handler;
2983ca8be80Simarom
2993ca8be80Simarom    int rc = sigaction(SIGALRM , &sa, NULL);
3003ca8be80Simarom    assert(rc == 0);
3013ca8be80Simarom
3023ca8be80Simarom    g_signal_init = true;
3033ca8be80Simarom}
3043ca8be80Simarom
3053c4a29e1Simarombool TrexWatchDog::g_signal_init = false;
3063ca8be80Simarom
307