Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make SIGSEGV handler emit nicer backtraces #113565

Merged
merged 3 commits into from
Aug 30, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 9 additions & 66 deletions compiler/rustc_driver_impl/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,15 @@ pub mod pretty;
#[macro_use]
mod print;
mod session_diagnostics;
#[cfg(all(unix, any(target_env = "gnu", target_os = "macos")))]
mod signal_handler;

#[cfg(not(all(unix, any(target_env = "gnu", target_os = "macos"))))]
mod signal_handler {
/// On platforms which don't support our signal handler's requirements,
/// simply use the default signal handler provided by std.
pub(super) fn install() {}
}

use crate::session_diagnostics::{
RLinkEmptyVersionNumber, RLinkEncodingVersionMismatch, RLinkRustcVersionMismatch,
Expand Down Expand Up @@ -1427,72 +1436,6 @@ pub fn init_env_logger(handler: &EarlyErrorHandler, env: &str) {
}
}

#[cfg(all(unix, any(target_env = "gnu", target_os = "macos")))]
mod signal_handler {
extern "C" {
fn backtrace_symbols_fd(
buffer: *const *mut libc::c_void,
size: libc::c_int,
fd: libc::c_int,
);
}

extern "C" fn print_stack_trace(_: libc::c_int) {
const MAX_FRAMES: usize = 256;
static mut STACK_TRACE: [*mut libc::c_void; MAX_FRAMES] =
[std::ptr::null_mut(); MAX_FRAMES];
unsafe {
let depth = libc::backtrace(STACK_TRACE.as_mut_ptr(), MAX_FRAMES as i32);
if depth == 0 {
return;
}
backtrace_symbols_fd(STACK_TRACE.as_ptr(), depth, 2);
}
}

/// When an error signal (such as SIGABRT or SIGSEGV) is delivered to the
/// process, print a stack trace and then exit.
pub(super) fn install() {
use std::alloc::{alloc, Layout};

unsafe {
let alt_stack_size: usize = min_sigstack_size() + 64 * 1024;
let mut alt_stack: libc::stack_t = std::mem::zeroed();
alt_stack.ss_sp = alloc(Layout::from_size_align(alt_stack_size, 1).unwrap()).cast();
alt_stack.ss_size = alt_stack_size;
libc::sigaltstack(&alt_stack, std::ptr::null_mut());

let mut sa: libc::sigaction = std::mem::zeroed();
sa.sa_sigaction = print_stack_trace as libc::sighandler_t;
sa.sa_flags = libc::SA_NODEFER | libc::SA_RESETHAND | libc::SA_ONSTACK;
libc::sigemptyset(&mut sa.sa_mask);
libc::sigaction(libc::SIGSEGV, &sa, std::ptr::null_mut());
}
}

/// Modern kernels on modern hardware can have dynamic signal stack sizes.
#[cfg(any(target_os = "linux", target_os = "android"))]
fn min_sigstack_size() -> usize {
const AT_MINSIGSTKSZ: core::ffi::c_ulong = 51;
let dynamic_sigstksz = unsafe { libc::getauxval(AT_MINSIGSTKSZ) };
// If getauxval couldn't find the entry, it returns 0,
// so take the higher of the "constant" and auxval.
// This transparently supports older kernels which don't provide AT_MINSIGSTKSZ
libc::MINSIGSTKSZ.max(dynamic_sigstksz as _)
}

/// Not all OS support hardware where this is needed.
#[cfg(not(any(target_os = "linux", target_os = "android")))]
fn min_sigstack_size() -> usize {
libc::MINSIGSTKSZ
}
}

#[cfg(not(all(unix, any(target_env = "gnu", target_os = "macos"))))]
mod signal_handler {
pub(super) fn install() {}
}

pub fn main() -> ! {
let start_time = Instant::now();
let start_rss = get_resident_set_size();
Expand Down
142 changes: 142 additions & 0 deletions compiler/rustc_driver_impl/src/signal_handler.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
//! Signal handler for rustc
//! Primarily used to extract a backtrace from stack overflow

use std::alloc::{alloc, Layout};
use std::{fmt, mem, ptr};

extern "C" {
fn backtrace_symbols_fd(buffer: *const *mut libc::c_void, size: libc::c_int, fd: libc::c_int);
}

fn backtrace_stderr(buffer: &[*mut libc::c_void]) {
let size = buffer.len().try_into().unwrap_or_default();
unsafe { backtrace_symbols_fd(buffer.as_ptr(), size, libc::STDERR_FILENO) };
}

/// Unbuffered, unsynchronized writer to stderr.
///
/// Only acceptable because everything will end soon anyways.
struct RawStderr(());

impl fmt::Write for RawStderr {
fn write_str(&mut self, s: &str) -> Result<(), fmt::Error> {
let ret = unsafe { libc::write(libc::STDERR_FILENO, s.as_ptr().cast(), s.len()) };
if ret == -1 { Err(fmt::Error) } else { Ok(()) }
}
}

/// We don't really care how many bytes we actually get out. SIGSEGV comes for our head.
/// Splash stderr with letters of our own blood to warn our friends about the monster.
macro raw_errln($tokens:tt) {
let _ = ::core::fmt::Write::write_fmt(&mut RawStderr(()), format_args!($tokens));
let _ = ::core::fmt::Write::write_char(&mut RawStderr(()), '\n');
}

/// Signal handler installed for SIGSEGV
extern "C" fn print_stack_trace(_: libc::c_int) {
const MAX_FRAMES: usize = 256;
// Reserve data segment so we don't have to malloc in a signal handler, which might fail
// in incredibly undesirable and unexpected ways due to e.g. the allocator deadlocking
static mut STACK_TRACE: [*mut libc::c_void; MAX_FRAMES] = [ptr::null_mut(); MAX_FRAMES];
let stack = unsafe {
// Collect return addresses
let depth = libc::backtrace(STACK_TRACE.as_mut_ptr(), MAX_FRAMES as i32);
if depth == 0 {
return;
}
&STACK_TRACE.as_slice()[0..(depth as _)]
};

// Just a stack trace is cryptic. Explain what we're doing.
raw_errln!("error: rustc interrupted by SIGSEGV, printing backtrace\n");
let mut written = 1;
let mut consumed = 0;
// Begin elaborating return addrs into symbols and writing them directly to stderr
// Most backtraces are stack overflow, most stack overflows are from recursion
// Check for cycles before writing 250 lines of the same ~5 symbols
let cycled = |(runner, walker)| runner == walker;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is interesting... I'm trying to decide whether there's any world where the "normal" stack backtrace mechanism could benefit from an attempt to compress its presentation in the same manner that you are doing here. (e.g. if code is tracking a depth counter or some other exhaustion-prevention mechanism, and then panicking explicitly rather than allowing the stack to actually overflow...)

But I suspect that this code belongs here and only here because in vast majority of cases, that kind of unrestricted recursion is going to end with a SIGSEGV and handled here.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we could lift cycle-detection algorithms up into other parts of rustc, since in many cases we run afoul of essentially unbounded recursion, but it's probably not the case that this one is sufficiently general yet to be used thusly.

let mut cyclic = false;
if let Some(period) = stack.iter().skip(1).step_by(2).zip(stack).position(cycled) {
let period = period.saturating_add(1); // avoid "what if wrapped?" branches
let Some(offset) = stack.iter().skip(period).zip(stack).position(cycled) else {
// impossible.
return;
};

Copy link
Member

@pnkfelix pnkfelix Aug 30, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the code inside here might benefit from an ascii-art diagram or something.

My main concern is that even though I have passing familiarity with Floyd's tortoise and hare algorithm, I nonetheless cannot immediately tell whether you are e.g. reporting the minimal period λ, or if you are only guaranteeing that you are reporting some period ν that is a multiple of λ. (See the linked wikipedia page for where those greek letter cames from.)

... or better still: Would it make any sense to factor this out into its own generic routine that can have concrete unit tests, and then instantiate it here?

... anyway, on the other hand, this code has waited long enough to land, and it certainly looks like an improvement. Further improvements like what I am suggesting here can be follow-on PR's.

Copy link
Member Author

@workingjubilee workingjubilee Aug 30, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure! I want rustc to have a SIGILL handler as well, so I'm definitely going to be doing some followups here and imposing better factorization. It just seemed premature to do anything like that, or at least I wasn't entirely sure what would be best to factor.

// Count matching trace slices, else we could miscount "biphasic cycles"
// with the same period + loop entry but a different inner loop
let next_cycle = stack[offset..].chunks_exact(period).skip(1);
let cycles = 1 + next_cycle
.zip(stack[offset..].chunks_exact(period))
.filter(|(next, prev)| next == prev)
.count();
backtrace_stderr(&stack[..offset]);
written += offset;
consumed += offset;
if cycles > 1 {
raw_errln!("\n### cycle encountered after {offset} frames with period {period}");
backtrace_stderr(&stack[consumed..consumed + period]);
raw_errln!("### recursed {cycles} times\n");
written += period + 4;
consumed += period * cycles;
cyclic = true;
};
}
let rem = &stack[consumed..];
backtrace_stderr(rem);
raw_errln!("");
written += rem.len() + 1;

let random_depth = || 8 * 16; // chosen by random diceroll (2d20)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(... I don't understand why this is a closure versus just a let-bound usize.)

if cyclic || stack.len() > random_depth() {
// technically speculation, but assert it with confidence anyway.
// rustc only arrived in this signal handler because bad things happened
// and this message is for explaining it's not the programmer's fault
raw_errln!("note: rustc unexpectedly overflowed its stack! this is a bug");
written += 1;
}
if stack.len() == MAX_FRAMES {
raw_errln!("note: maximum backtrace depth reached, frames may have been lost");
written += 1;
}
raw_errln!("note: we would appreciate a report at https://github.com/rust-lang/rust");
written += 1;
if written > 24 {
// We probably just scrolled the earlier "we got SIGSEGV" message off the terminal
raw_errln!("note: backtrace dumped due to SIGSEGV! resuming signal");
};
}

/// When SIGSEGV is delivered to the process, print a stack trace and then exit.
pub(super) fn install() {
unsafe {
let alt_stack_size: usize = min_sigstack_size() + 64 * 1024;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is 64k still definitely enough stack space for this more complicated handler to run in?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So far on my machine yes, but then again my machine is probably answering getauxval(AT_MINSIGSTKSZ) with a nonzero value.

Copy link
Member Author

@workingjubilee workingjubilee Jul 14, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was a good question to ask because I found a spot where a small change removes an entire panic branch (and simplifies the code in general).

let mut alt_stack: libc::stack_t = mem::zeroed();
alt_stack.ss_sp = alloc(Layout::from_size_align(alt_stack_size, 1).unwrap()).cast();
alt_stack.ss_size = alt_stack_size;
libc::sigaltstack(&alt_stack, ptr::null_mut());

let mut sa: libc::sigaction = mem::zeroed();
sa.sa_sigaction = print_stack_trace as libc::sighandler_t;
sa.sa_flags = libc::SA_NODEFER | libc::SA_RESETHAND | libc::SA_ONSTACK;
libc::sigemptyset(&mut sa.sa_mask);
libc::sigaction(libc::SIGSEGV, &sa, ptr::null_mut());
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should probably backtrace on SIGBUS too, since wild pointers can trigger sigbus just as easily (okay, maybe not just as easily, but, easily).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was going to look into what sort of signals we wanted to extend to after, since what I really started digging into the signal handler for is to set up a SIGILL handler for when the compiler needs a hot pad and some soup LLVM, running in our process space, runs into one of its llvm_unreachables that on x86-64, thanks to the CMake options we build LLVM with, are compiled into ud2 instead of UB (because they're @llvm.trap()).

}
}

/// Modern kernels on modern hardware can have dynamic signal stack sizes.
#[cfg(any(target_os = "linux", target_os = "android"))]
fn min_sigstack_size() -> usize {
const AT_MINSIGSTKSZ: core::ffi::c_ulong = 51;
let dynamic_sigstksz = unsafe { libc::getauxval(AT_MINSIGSTKSZ) };
// If getauxval couldn't find the entry, it returns 0,
// so take the higher of the "constant" and auxval.
// This transparently supports older kernels which don't provide AT_MINSIGSTKSZ
libc::MINSIGSTKSZ.max(dynamic_sigstksz as _)
}

/// Not all OS support hardware where this is needed.
#[cfg(not(any(target_os = "linux", target_os = "android")))]
fn min_sigstack_size() -> usize {
libc::MINSIGSTKSZ
}