[ovs-dev] [PATCHv2] fatal-signal: Catch SIGSEGV and print backtrace.

William Tu u9012063 at gmail.com
Wed Sep 25 23:15:31 UTC 2019


The patch catches the SIGSEGV signal, collects a backtrace using
libunwind, and has the monitor daemon print it. This makes debugging
easier when no debug symbol package or gdb is installed on production
systems.  The patch works even when ovs-vswitchd is compiled without
debug symbols (no -g option), because the object files still have function
symbols. For example:
 |daemon_unix(monitor)|WARN|SIGSEGV detected, backtrace:
 |daemon_unix(monitor)|WARN|0x0000000000482752 <fatal_signal_handler+0x52>
 |daemon_unix(monitor)|WARN|0x00007fb4900734b0 <killpg+0x40>
 |daemon_unix(monitor)|WARN|0x00007fb49013974d <__poll+0x2d>
 |daemon_unix(monitor)|WARN|0x000000000052b348 <time_poll+0x108>
 |daemon_unix(monitor)|WARN|0x00000000005153ec <poll_block+0x8c>
 |daemon_unix(monitor)|WARN|0x000000000058630a <clean_thread_main+0x1aa>
 |daemon_unix(monitor)|WARN|0x00000000004ffd1d <ovsthread_wrapper+0x7d>
 |daemon_unix(monitor)|WARN|0x00007fb490b3b6ba <start_thread+0xca>
 |daemon_unix(monitor)|WARN|0x00007fb49014541d <clone+0x6d>
 |daemon_unix(monitor)|ERR|1 crashes: pid 122849 died, killed \
    (Segmentation fault), core dumped, restarting

However, if the object files' symbols are stripped, then for functions in
the stripped binary we can only get _init plus an offset value. This is
still useful when trying to see if two bugs have the same root cause. Example:
 |daemon_unix(monitor)|WARN|SIGSEGV detected, backtrace:
 |daemon_unix(monitor)|WARN|0x0000000000482752 <_init+0x7d68a>
 |daemon_unix(monitor)|WARN|0x00007f5f7c8cf4b0 <killpg+0x40>
 |daemon_unix(monitor)|WARN|0x00007f5f7c99574d <__poll+0x2d>
 |daemon_unix(monitor)|WARN|0x000000000052b348 <_init+0x126280>
 |daemon_unix(monitor)|WARN|0x00000000005153ec <_init+0x110324>
 |daemon_unix(monitor)|WARN|0x0000000000407439 <_init+0x2371>
 |daemon_unix(monitor)|WARN|0x00007f5f7c8ba830 <__libc_start_main+0xf0>
 |daemon_unix(monitor)|WARN|0x0000000000408329 <_init+0x3261>
 |daemon_unix(monitor)|ERR|1 crashes: pid 106155 died, killed \
	(Segmentation fault), core dumped, restarting

Signed-off-by: William Tu <u9012063 at gmail.com>
---
v2:
  Address comments from Ben about async-signal-safety
---
 .travis.yml        |  1 +
 configure.ac       |  1 +
 lib/backtrace.h    | 18 ++++++++++++++++++
 lib/daemon-unix.c  | 30 ++++++++++++++++++++++++------
 lib/fatal-signal.c | 47 ++++++++++++++++++++++++++++++++++++++++++++++-
 m4/openvswitch.m4  | 10 ++++++++++
 6 files changed, 100 insertions(+), 7 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 68026312ba84..b547eb041791 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -25,6 +25,7 @@ addons:
       - selinux-policy-dev
       - libunbound-dev
       - libunbound-dev:i386
+      - libunwind-dev
 
 before_install: ./.travis/${TRAVIS_OS_NAME}-prepare.sh
 
diff --git a/configure.ac b/configure.ac
index 1d45c4fdd153..15922418062b 100644
--- a/configure.ac
+++ b/configure.ac
@@ -139,6 +139,7 @@ OVS_LIBTOOL_VERSIONS
 OVS_CHECK_CXX
 AX_FUNC_POSIX_MEMALIGN
 OVS_CHECK_UNBOUND
+OVS_CHECK_UNWIND
 
 OVS_CHECK_INCLUDE_NEXT([stdio.h string.h])
 AC_CONFIG_FILES([
diff --git a/lib/backtrace.h b/lib/backtrace.h
index 384f2700d94c..daf035d4f322 100644
--- a/lib/backtrace.h
+++ b/lib/backtrace.h
@@ -20,6 +20,11 @@
 #include <stdint.h>
 #include "openvswitch/dynamic-string.h"
 
+#ifdef HAVE_UNWIND
+#define UNW_LOCAL_ONLY
+#include <libunwind.h>
+#endif
+
 /* log_backtrace() will save the backtrace of a running program
  * into the log at the DEBUG level.
  *
@@ -71,4 +76,17 @@ struct backtrace {
 void backtrace_capture(struct backtrace *);
 void log_backtrace_at(const char *msg, const char *where);
 
+#ifdef HAVE_UNWIND
+#define UNW_MAX_DEPTH 32
+#define UNW_MAX_FUNCN 32
+#define UNW_MAX_BUF \
+    (UNW_MAX_DEPTH * sizeof(struct unw_backtrace))
+
+struct unw_backtrace {
+    char func[UNW_MAX_FUNCN];
+    unw_word_t ip;
+    unw_word_t offset;
+};
+#endif
+
 #endif /* backtrace.h */
diff --git a/lib/daemon-unix.c b/lib/daemon-unix.c
index 6169763c294c..b45d1ffad043 100644
--- a/lib/daemon-unix.c
+++ b/lib/daemon-unix.c
@@ -15,6 +15,7 @@
  */
 
 #include <config.h>
+#include "backtrace.h"
 #include "daemon.h"
 #include "daemon-private.h"
 #include <errno.h>
@@ -75,7 +76,7 @@ static bool overwrite_pidfile;
 static bool chdir_ = true;
 
 /* File descriptor used by daemonize_start() and daemonize_complete(). */
-static int daemonize_fd = -1;
+int daemonize_fd = -1;
 
 /* --monitor: Should a supervisory process monitor the daemon and restart it if
  * it dies due to an error signal? */
@@ -291,8 +292,7 @@ fork_and_wait_for_startup(int *fdp, pid_t *child_pid)
                 OVS_NOT_REACHED();
             }
         }
-        close(fds[0]);
-        *fdp = -1;
+        *fdp = fds[0];
     } else if (!pid) {
         /* Running in child process. */
         close(fds[0]);
@@ -313,8 +313,6 @@ fork_notify_startup(int fd)
         if (error) {
             VLOG_FATAL("pipe write failed (%s)", ovs_strerror(error));
         }
-
-        close(fd);
     }
 }
 
@@ -373,6 +371,8 @@ monitor_daemon(pid_t daemon_pid)
         }
 
         if (!child_ready || retval == daemon_pid) {
+            int byte_read;
+            struct unw_backtrace backtrace[UNW_MAX_DEPTH];
             char *s = process_status_msg(status);
             if (should_restart(status)) {
                 free(status_msg);
@@ -393,6 +393,25 @@ monitor_daemon(pid_t daemon_pid)
                     }
                 }
 
+                fcntl(daemonize_fd, F_SETFL, O_NONBLOCK);
+                memset(backtrace, 0, UNW_MAX_BUF);
+                byte_read = read(daemonize_fd, backtrace, UNW_MAX_BUF);
+                if (byte_read < 0) {
+                    VLOG_ERR("Read fd %d failed: %s", daemonize_fd,
+                             ovs_strerror(errno));
+                } else if (byte_read > 0) {
+                    VLOG_WARN("SIGSEGV detected, backtrace:");
+                    for (int i = 0; i < UNW_MAX_DEPTH; i++) {
+                        if (backtrace[i].func[0] == 0) {
+                            break;
+                        }
+                        VLOG_WARN("0x%016lx <%s+0x%lx>\n",
+                                   backtrace[i].ip,
+                                   backtrace[i].func,
+                                   backtrace[i].offset);
+                    }
+                }
+
                 /* Throttle restarts to no more than once every 10 seconds. */
                 if (time(NULL) < last_restart + 10) {
                     VLOG_WARN("%s, waiting until 10 seconds since last "
@@ -508,7 +527,6 @@ daemonize_complete(void)
         detached = true;
 
         fork_notify_startup(daemonize_fd);
-        daemonize_fd = -1;
         daemonize_post_detach();
     }
 }
diff --git a/lib/fatal-signal.c b/lib/fatal-signal.c
index 3b905b6de766..4a2f4745d568 100644
--- a/lib/fatal-signal.c
+++ b/lib/fatal-signal.c
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 #include <config.h>
+#include "backtrace.h"
 #include "fatal-signal.h"
 #include <errno.h>
 #include <signal.h>
@@ -42,7 +43,8 @@ VLOG_DEFINE_THIS_MODULE(fatal_signal);
 
 /* Signals to catch. */
 #ifndef _WIN32
-static const int fatal_signals[] = { SIGTERM, SIGINT, SIGHUP, SIGALRM };
+static const int fatal_signals[] = { SIGTERM, SIGINT, SIGHUP, SIGALRM,
+                                     SIGSEGV };
 #else
 static const int fatal_signals[] = { SIGTERM };
 #endif
@@ -151,6 +153,42 @@ fatal_signal_add_hook(void (*hook_cb)(void *aux), void (*cancel_cb)(void *aux),
     ovs_mutex_unlock(&mutex);
 }
 
+#ifdef HAVE_UNWIND
+extern int daemonize_fd;
+
+/* Send the backtrace buffer to the monitor process.
+ *
+ * Note that this runs in signal-handling context, so any system
+ * library functions used here must be async-signal-safe.
+ */
+static void
+send_backtrace_to_monitor(void) {
+    int dep;
+    struct unw_backtrace unw_bt[UNW_MAX_DEPTH];
+    unw_cursor_t cursor;
+    unw_context_t uc;
+
+    dep = 0;
+    unw_getcontext(&uc);
+    unw_init_local(&cursor, &uc);
+
+    while (dep < UNW_MAX_DEPTH && unw_step(&cursor)) {
+        memset(unw_bt[dep].func, 0, UNW_MAX_FUNCN);
+        unw_get_reg(&cursor, UNW_REG_IP, &unw_bt[dep].ip);
+        unw_get_proc_name(&cursor, unw_bt[dep].func, UNW_MAX_FUNCN,
+                          &unw_bt[dep].offset);
+       dep++;
+    }
+
+    ignore(write(daemonize_fd, unw_bt, dep * sizeof(struct unw_backtrace)));
+}
+#else
+static void
+send_backtrace_to_monitor(void) {
+    /* Nothing. */
+}
+#endif
+
 /* Handles fatal signal number 'sig_nr'.
  *
  * Ordinarily this is the actual signal handler.  When other code needs to
@@ -164,6 +202,13 @@ void
 fatal_signal_handler(int sig_nr)
 {
 #ifndef _WIN32
+    if (sig_nr == SIGSEGV) {
+        signal(sig_nr, SIG_DFL); /* Set it back immediately. */
+        if (daemonize_fd != -1) {
+            send_backtrace_to_monitor();
+        }
+        raise(sig_nr);
+    }
     ignore(write(signal_fds[1], "", 1));
 #else
     SetEvent(wevent);
diff --git a/m4/openvswitch.m4 b/m4/openvswitch.m4
index ba912e58780a..465d9c5ba368 100644
--- a/m4/openvswitch.m4
+++ b/m4/openvswitch.m4
@@ -700,3 +700,13 @@ AC_DEFUN([OVS_CHECK_UNBOUND],
    fi
    AM_CONDITIONAL([HAVE_UNBOUND], [test "$HAVE_UNBOUND" = yes])
    AC_SUBST([HAVE_UNBOUND])])
+
+dnl Checks for libunwind.
+AC_DEFUN([OVS_CHECK_UNWIND],
+  [AC_CHECK_LIB(unwind, unw_backtrace, [HAVE_UNWIND=yes], [HAVE_UNWIND=no])
+   if test "$HAVE_UNWIND" = yes; then
+     AC_DEFINE([HAVE_UNWIND], [1], [Define to 1 if unwind is detected.])
+     LIBS="$LIBS -lunwind"
+   fi
+   AM_CONDITIONAL([HAVE_UNWIND], [test "$HAVE_UNWIND" = yes])
+   AC_SUBST([HAVE_UNWIND])])
-- 
2.7.4



More information about the dev mailing list