Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[crashtracker] Enable client to configure which signals to trap #856

Merged
merged 25 commits into from
Feb 13, 2025
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions bin_tests/src/bin/crashtracker_bin_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ mod unix {
create_alt_stack: true,
use_alt_stack: true,
resolve_frames: crashtracker::StacktraceCollection::WithoutSymbols,
signals: vec![libc::SIGBUS, libc::SIGSEGV],
endpoint,
timeout_ms: TEST_COLLECTOR_TIMEOUT_MS,
unix_socket_path: Some("".to_string()),
Expand Down
4 changes: 4 additions & 0 deletions crashtracker-ffi/src/collector/datatypes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@ pub struct Config<'a> {
/// If None, the crashtracker will infer the agent host from env variables.
pub endpoint: Option<&'a Endpoint>,
pub resolve_frames: StacktraceCollection,
/// The set of signals we should be registered for.
pub signals: Slice<'a, i32>,
/// Timeout in milliseconds before the signal handler starts tearing things down to return.
/// This is given as a uint32_t, but the actual timeout needs to fit inside of an i32 (max
/// 2^31-1). This is a limitation of the various interfaces used to guarantee the timeout.
Expand All @@ -84,6 +86,7 @@ impl<'a> TryFrom<Config<'a>> for datadog_crashtracker::CrashtrackerConfiguration
let use_alt_stack = value.use_alt_stack;
let endpoint = value.endpoint.cloned();
let resolve_frames = value.resolve_frames;
let signals = value.signals.iter().copied().collect();
let timeout_ms = value.timeout_ms;
let unix_socket_path = value.optional_unix_socket_filename.try_to_string_option()?;
Self::new(
Expand All @@ -92,6 +95,7 @@ impl<'a> TryFrom<Config<'a>> for datadog_crashtracker::CrashtrackerConfiguration
use_alt_stack,
endpoint,
resolve_frames,
signals,
timeout_ms,
unix_socket_path,
)
Expand Down
10 changes: 9 additions & 1 deletion crashtracker-ffi/src/collector/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ use anyhow::Context;
pub use counters::*;
use datadog_crashtracker::CrashtrackerReceiverConfig;
pub use datatypes::*;
use ddcommon_ffi::{wrap_with_void_ffi_result, VoidResult};
use ddcommon_ffi::{wrap_with_void_ffi_result, Slice, VoidResult};
use function_name::named;
pub use spans::*;

Expand Down Expand Up @@ -133,3 +133,11 @@ pub unsafe extern "C" fn ddog_crasht_init_without_receiver(
)?
})
}

#[no_mangle]
/// Returns a list of signals suitable for use in a crashtracker config.
pub extern "C" fn ddog_crasht_default_signals<'a>() -> Slice<'a, libc::c_int> {
static DEFAULT_SYMBOLS: [libc::c_int; 4] =
[libc::SIGBUS, libc::SIGABRT, libc::SIGSEGV, libc::SIGILL];
Slice::new(&DEFAULT_SYMBOLS)
}
6 changes: 4 additions & 2 deletions crashtracker/src/collector/api.rs
Original file line number Diff line number Diff line change
Expand Up @@ -80,9 +80,9 @@ pub fn init(
metadata: Metadata,
) -> anyhow::Result<()> {
update_metadata(metadata)?;
update_config(config)?;
update_config(config.clone())?;
configure_receiver(receiver_config);
register_crash_handlers()?;
register_crash_handlers(&config)?;
Ok(())
}

Expand Down Expand Up @@ -128,6 +128,7 @@ fn test_crash() -> anyhow::Result<()> {
use_alt_stack,
endpoint,
resolve_frames,
vec![libc::SIGBUS, libc::SIGSEGV],
timeout_ms,
None,
)?;
Expand Down Expand Up @@ -185,6 +186,7 @@ fn test_altstack_paradox() -> anyhow::Result<()> {
use_alt_stack,
endpoint,
resolve_frames,
vec![libc::SIGBUS, libc::SIGSEGV],
timeout_ms,
None,
);
Expand Down
123 changes: 58 additions & 65 deletions crashtracker/src/collector/crash_handler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ use super::saguard::SaGuard;
use crate::crash_info::Metadata;
use crate::shared::configuration::{CrashtrackerConfiguration, CrashtrackerReceiverConfig};
use crate::shared::constants::*;
use crate::signal_from_signum;
use anyhow::Context;
use libc::{
c_void, execve, mmap, nfds_t, sigaltstack, siginfo_t, ucontext_t, MAP_ANON, MAP_FAILED,
Expand All @@ -19,6 +20,7 @@ use nix::sys::signal::{self, SaFlags, SigAction, SigHandler, SigSet};
use nix::sys::socket;
use nix::sys::wait::{waitpid, WaitPidFlag, WaitStatus};
use nix::unistd::{close, Pid};
use std::collections::HashMap;
use std::ffi::CString;
use std::fs::{File, OpenOptions};
use std::io::Write;
Expand Down Expand Up @@ -54,12 +56,6 @@ use libc::fork as vfork;
#[cfg(target_os = "linux")]
use libc::vfork;

#[derive(Debug)]
struct OldHandlers {
pub sigbus: SigAction,
pub sigsegv: SigAction,
}

struct Receiver {
receiver_uds: RawFd,
receiver_pid: i32,
Expand Down Expand Up @@ -255,7 +251,8 @@ fn wait_for_pollhup(target_fd: RawFd, timeout_ms: i32) -> anyhow::Result<bool> {
// This means that we can always clean up the memory inside one of these using
// `Box::from_raw` to recreate the box, then dropping it.
static ALTSTACK_INIT: AtomicBool = AtomicBool::new(false);
static OLD_HANDLERS: AtomicPtr<OldHandlers> = AtomicPtr::new(ptr::null_mut());
static OLD_HANDLERS: AtomicPtr<HashMap<i32, (signal::Signal, SigAction)>> =
AtomicPtr::new(ptr::null_mut());
static METADATA: AtomicPtr<(Metadata, String)> = AtomicPtr::new(ptr::null_mut());
static CONFIG: AtomicPtr<(CrashtrackerConfiguration, String)> = AtomicPtr::new(ptr::null_mut());
static RECEIVER_CONFIG: AtomicPtr<CrashtrackerReceiverConfig> = AtomicPtr::new(ptr::null_mut());
Expand Down Expand Up @@ -397,46 +394,36 @@ extern "C" fn handle_posix_sigaction(signum: i32, sig_info: *mut siginfo_t, ucon
// instant of time between when the handlers are registered, and the
// `OLD_HANDLERS` are set. This should be very short, but is hard to fully
// eliminate given the existing POSIX APIs.
let old_handlers = unsafe { &*OLD_HANDLERS.load(SeqCst) };
let old_sigaction = if signum == libc::SIGSEGV {
old_handlers.sigsegv
} else if signum == libc::SIGBUS {
old_handlers.sigbus
let old_handlers = unsafe { &*OLD_HANDLERS.swap(std::ptr::null_mut(), SeqCst) };

if let Some((signal, sigaction)) = old_handlers.get(&signum) {
// How we chain depends on what kind of handler we're chaining to.
// https://www.gnu.org/software/libc/manual/html_node/Signal-Handling.html
// https://man7.org/linux/man-pages/man2/sigaction.2.html
// Follow the approach here:
// https://stackoverflow.com/questions/6015498/executing-default-signal-handler
match sigaction.handler() {
SigHandler::SigDfl => {
// In the case of a default handler, we want to invoke it so that
// the core-dump can be generated. Restoring the handler then
// re-raising the signal accomplishes that.
unsafe { signal::sigaction(*signal, sigaction) }
.unwrap_or_else(|_| std::process::abort());
// Signals are only delivered once.
// In the case where we were invoked because of a crash, returning
// is technically UB but in practice re-invokes the crashing instr
// and re-raises the signal. In the case where we were invoked by
// `raise(SIGSEGV)` we need to re-raise the signal, or the default
// handler will never receive it.
unsafe { libc::raise(signum) };
}
SigHandler::SigIgn => (), // Return and ignore the signal.
SigHandler::Handler(f) => f(signum),
SigHandler::SigAction(f) => f(signum, sig_info, ucontext),
};
} else {
unreachable!("The only signals we're registered for are SEGV and BUS")
};

// How we chain depends on what kind of handler we're chaining to.
// https://www.gnu.org/software/libc/manual/html_node/Signal-Handling.html
// https://man7.org/linux/man-pages/man2/sigaction.2.html
// Follow the approach here:
// https://stackoverflow.com/questions/6015498/executing-default-signal-handler
match old_sigaction.handler() {
SigHandler::SigDfl => {
// In the case of a default handler, we want to invoke it so that
// the core-dump can be generated. Restoring the handler then
// re-raising the signal accomplishes that.
let signal = if signum == libc::SIGSEGV {
signal::SIGSEGV
} else if signum == libc::SIGBUS {
signal::SIGBUS
} else {
unreachable!("The only signals we're registered for are SEGV and BUS")
};
unsafe { signal::sigaction(signal, &old_sigaction) }
.unwrap_or_else(|_| std::process::abort());
// Signals are only delivered once.
// In the case where we were invoked because of a crash, returning
// is technically UB but in practice re-invokes the crashing instr
// and re-raises the signal. In the case where we were invoked by
// `raise(SIGSEGV)` we need to re-raise the signal, or the default
// handler will never receive it.
unsafe { libc::raise(signum) };
}
SigHandler::SigIgn => (), // Return and ignore the signal.
SigHandler::Handler(f) => f(signum),
SigHandler::SigAction(f) => f(signum, sig_info, ucontext),
};
eprintln!("Unexpected signal number {signum}, can't chain the handlers");
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What should we do here? abort?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess this could happen if the user code changes the sigaction while we're in the middle of handling a signal? I'm not sure how it might arise.

}
}

fn receiver_from_socket(unix_socket_path: &str) -> anyhow::Result<Receiver> {
Expand Down Expand Up @@ -611,29 +598,33 @@ fn handle_posix_signal_impl(
/// will not yet be stored. This would lead to unexpected behaviour for the
/// user. This should only matter if something crashes concurrently with
/// this function executing.
pub fn register_crash_handlers() -> anyhow::Result<()> {
pub fn register_crash_handlers(config: &CrashtrackerConfiguration) -> anyhow::Result<()> {
if !OLD_HANDLERS.load(SeqCst).is_null() {
return Ok(());
}

let config_ptr = CONFIG.load(SeqCst);
anyhow::ensure!(!config_ptr.is_null(), "No crashtracking config");
let (config, _config_str) = unsafe { config_ptr.as_ref().context("config ptr")? };

unsafe {
if config.create_alt_stack {
create_alt_stack()?;
}
let sigbus = register_signal_handler(signal::SIGBUS, config)?;
let sigsegv = register_signal_handler(signal::SIGSEGV, config)?;
let boxed_ptr = Box::into_raw(Box::new(OldHandlers { sigbus, sigsegv }));

let res = OLD_HANDLERS.compare_exchange(ptr::null_mut(), boxed_ptr, SeqCst, SeqCst);
if config.create_alt_stack {
unsafe { create_alt_stack()? };
}
let mut old_handlers = HashMap::new();
// TODO: if this fails, we end in a situation where handlers have been registered with no chain
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What to do here?

for signum in &config.signals {
let signal = signal_from_signum(*signum)?;
anyhow::ensure!(
res.is_ok(),
"TOCTTOU error in crashtracker::register_crash_handlers"
!old_handlers.contains_key(signum),
"Handler already registered for {signal}"
);
let handler = unsafe { register_signal_handler(signal, config)? };
old_handlers.insert(*signum, (signal, handler));
}
let boxed_ptr = Box::into_raw(Box::new(old_handlers));

let res = OLD_HANDLERS.compare_exchange(ptr::null_mut(), boxed_ptr, SeqCst, SeqCst);
anyhow::ensure!(
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This leaks the old_handlers, if we care?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think so--small one-time leak, right?

res.is_ok(),
"TOCTTOU error in crashtracker::register_crash_handlers"
);

Ok(())
}

Expand Down Expand Up @@ -674,9 +665,11 @@ pub fn restore_old_handlers(inside_signal_handler: bool) -> anyhow::Result<()> {
anyhow::ensure!(!prev.is_null(), "No crashtracking previous signal handlers");
// Safety: The only nonnull pointer stored here comes from Box::into_raw()
let prev = unsafe { Box::from_raw(prev) };
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could flip this around, and only rehydrate into a box when we are ready to drop it

// Safety: The value restored here was returned from a previous sigaction call
unsafe { signal::sigaction(signal::SIGBUS, &prev.sigbus)? };
unsafe { signal::sigaction(signal::SIGSEGV, &prev.sigsegv)? };
for (_signum, (signal, sigaction)) in prev.iter() {
// Safety: The value restored here was returned from a previous sigaction call
unsafe { signal::sigaction(*signal, sigaction)? };
}

// We want to avoid freeing memory inside the handler, so just leak it
// This is fine since we're crashing anyway at this point
if inside_signal_handler {
Expand Down
22 changes: 10 additions & 12 deletions crashtracker/src/collector/emitters.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use crate::collector::counters::emit_counters;
use crate::collector::spans::{emit_spans, emit_traces};
use crate::shared::constants::*;
use crate::CrashtrackerConfiguration;
use crate::SignalNames;
use crate::StacktraceCollection;
use anyhow::Context;
use backtrace::Frame;
Expand Down Expand Up @@ -185,19 +186,16 @@ fn emit_siginfo(w: &mut impl Write, sig_info: *const siginfo_t) -> anyhow::Resul
anyhow::ensure!(!sig_info.is_null());

let si_signo = unsafe { (*sig_info).si_signo };
let si_signo_human_readable = match si_signo {
libc::SIGABRT => "SIGABRT",
libc::SIGBUS => "SIGBUS",
libc::SIGSEGV => "SIGSEGV",
libc::SIGSYS => "SIGSYS",
_ => "UNKNOWN",
};
let si_signo_human_readable: SignalNames = si_signo.into();

// Derive the faulting address from `sig_info`
let si_addr: Option<usize> = if si_signo == libc::SIGSEGV || si_signo == libc::SIGBUS {
unsafe { Some((*sig_info).si_addr() as usize) }
} else {
None
// https://man7.org/linux/man-pages/man2/sigaction.2.html
// SIGILL, SIGFPE, SIGSEGV, SIGBUS, and SIGTRAP fill in si_addr with the address of the fault.
let si_addr: Option<usize> = match si_signo {
libc::SIGILL | libc::SIGFPE | libc::SIGSEGV | libc::SIGBUS | libc::SIGTRAP => {
Some(unsafe { (*sig_info).si_addr() as usize })
}
_ => None,
};

let si_code = unsafe { (*sig_info).si_code };
Expand All @@ -214,7 +212,7 @@ fn emit_siginfo(w: &mut impl Write, sig_info: *const siginfo_t) -> anyhow::Resul
write!(w, ", \"si_signo\": {si_signo}")?;
write!(
w,
", \"si_signo_human_readable\": \"{si_signo_human_readable}\""
", \"si_signo_human_readable\": \"{si_signo_human_readable:?}\""
)?;
if let Some(si_addr) = si_addr {
write!(w, ", \"si_addr\": \"{si_addr:#018x}\"")?;
Expand Down
Loading
Loading