From 9fa7b5ed8a7d24a86b387d77b597ad3ae446e449 Mon Sep 17 00:00:00 2001 From: Sascha Grunert Date: Tue, 31 Jan 2023 12:00:17 +0100 Subject: [PATCH] Fix OOM watcher for cgroupv2 `oom_kill` events It may be possible that the container process already got signaled but is still running, whereas we will not get a `modify` (but `other`) event on the file watcher. Besides that, it will also not update the `oom` entry of `memory.events`, but the `oom_kill`, which provides another indicator for a possible out-of-memory kill. Ref: https://www.kernel.org/doc/Documentation/cgroup-v2.txt Signed-off-by: Sascha Grunert --- conmon-rs/server/src/child_reaper.rs | 2 +- conmon-rs/server/src/oom_watcher.rs | 27 ++++++++++++--------------- 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/conmon-rs/server/src/child_reaper.rs b/conmon-rs/server/src/child_reaper.rs index 903a491151..9b85e8e032 100644 --- a/conmon-rs/server/src/child_reaper.rs +++ b/conmon-rs/server/src/child_reaper.rs @@ -395,7 +395,7 @@ impl ReapableChild { return exit_code; } Ok(WaitStatus::Signaled(_, sig, _)) => { - debug!("Signaled"); + debug!("Signaled: {sig}"); token.cancel(); return (sig as i32) + 128; } diff --git a/conmon-rs/server/src/oom_watcher.rs b/conmon-rs/server/src/oom_watcher.rs index 20735e9e49..7b78264018 100644 --- a/conmon-rs/server/src/oom_watcher.rs +++ b/conmon-rs/server/src/oom_watcher.rs @@ -222,34 +222,29 @@ impl OOMWatcher { tokio::select! 
{ _ = token.cancelled() => { debug!("Loop cancelled"); - match tx.try_send(OOMEvent{ oom: false }) { - Ok(_) => break, - Err(e) => error!("try_send failed: {:#}", e) + if let Err(e) = tx.try_send(OOMEvent{ oom: false }) { + error!("try_send failed: {:#}", e); }; break; } Some(res) = rx.recv() => { match res { Ok(event) => { - if event.kind.is_remove() || event.kind.is_other() { - match tx.try_send(OOMEvent{ oom: false }) { - Ok(_) => break, - Err(e) => error!("try_send failed: {:#}", e) + debug!("Got event OOM file event: {:?}", event); + if event.kind.is_remove() { + if let Err(e) = tx.try_send(OOMEvent{ oom: false }) { + error!("try_send failed: {:#}", e); }; break } - if !event.kind.is_modify() { - continue; - } - debug!("Found modify event"); match Self::check_for_oom(&memory_events_file_path, last_counter).await { Ok((counter, is_oom)) => { if !is_oom { continue; } - debug!(counter, "Found oom event"); + debug!(counter, "Found OOM event"); if let Err(e) = Self::write_oom_files(exit_paths).await { - error!("Writing oom files failed: {:#}", e); + error!("Writing OOM files failed: {:#}", e); } last_counter = counter; match tx.try_send(OOMEvent{ oom: true }) { @@ -258,7 +253,7 @@ impl OOMWatcher { }; } Err(e) => { - error!("Checking for oom failed: {}", e); + error!("Checking for OOM failed: {}", e); match tx.try_send(OOMEvent{ oom: false }) { Ok(_) => break, Err(e) => error!("try_send failed: {:#}", e) @@ -291,6 +286,7 @@ impl OOMWatcher { memory_events_file_path: &Path, last_counter: u64, ) -> Result<(u64, bool)> { + debug!("Checking for possible OOM"); let mut new_counter: u64 = 0; let mut found_oom = false; let fp = File::open(memory_events_file_path).await.context(format!( @@ -300,12 +296,13 @@ impl OOMWatcher { let reader = BufReader::new(fp); let mut lines = reader.lines(); while let Some(line) = lines.next_line().await.context("get next line")? 
{ - if let Some(counter) = line.strip_prefix("oom ") { + if let Some(counter) = line.strip_prefix("oom ").or(line.strip_prefix("oom_kill ")) { let counter = counter .to_string() .parse::<u64>() .context("parse u64 counter")?; if counter != last_counter { + debug!("Updating OOM counter to {counter}"); new_counter = counter; found_oom = true; break;