Skip to content

Commit

Permalink
Merge pull request #1310 from DataDog/ivoanjo/warn-on-missing-cpu-tim…
Browse files Browse the repository at this point in the history
…e-instrumentation

Detect and warn about threads without CPU time instrumentation
  • Loading branch information
ivoanjo authored Feb 11, 2021
2 parents 8aad622 + 3ab06d1 commit 915476e
Show file tree
Hide file tree
Showing 4 changed files with 210 additions and 81 deletions.
33 changes: 32 additions & 1 deletion lib/ddtrace/profiling/collectors/stack.rb
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ def initialize(recorder, options = {})

# Workers::Polling settings
self.enabled = options[:enabled] == true

@warned_about_missing_cpu_time_instrumentation = false
end

def start
Expand Down Expand Up @@ -120,8 +122,16 @@ def collect_thread_event(thread, wall_time_interval_ns)

def get_cpu_time_interval!(thread)
# Return if we can't get the current CPU time
return unless thread.respond_to?(:cpu_time)
unless thread.respond_to?(:cpu_time_instrumentation_installed?) && thread.cpu_time_instrumentation_installed?
warn_about_missing_cpu_time_instrumentation(thread)
return
end

current_cpu_time_ns = thread.cpu_time(:nanosecond)

# Note: This can still be nil even when all of the checks above passed because of a race: there's a bit of
# initialization that needs to be done by the thread itself, and it's possible for us to try to sample
# *before* the thread had time to finish the initialization
return unless current_cpu_time_ns

last_cpu_time_ns = (thread[THREAD_LAST_CPU_TIME_KEY] || current_cpu_time_ns)
Expand Down Expand Up @@ -175,6 +185,27 @@ def build_backtrace_location(_id, base_label, lineno, path)
string_table.fetch_string(path)
)
end

private

def warn_about_missing_cpu_time_instrumentation(thread)
return if @warned_about_missing_cpu_time_instrumentation

# make sure we warn only once
@warned_about_missing_cpu_time_instrumentation = true

# Is the profiler thread instrumented? If it is, then we know instrumentation is available, just missing in
# the thread being sampled
if Thread.current.respond_to?(:cpu_time) && Thread.current.cpu_time
Datadog.logger.warn(
"Detected thread ('#{thread}') with missing CPU profiling instrumentation. " \
'CPU Profiling results will be inaccurate. ' \
'Please report this at https://github.com/DataDog/dd-trace-rb/blob/master/CONTRIBUTING.md#found-a-bug'
)
end

nil
end
end
end
end
Expand Down
12 changes: 11 additions & 1 deletion lib/ddtrace/profiling/ext/cthread.rb
Original file line number Diff line number Diff line change
Expand Up @@ -49,14 +49,24 @@ def self.prepended(base)

def clock_id
update_native_ids if forked?
@clock_id ||= nil
defined?(@clock_id) && @clock_id
end

def cpu_time(unit = :float_second)
return unless clock_id && ::Process.respond_to?(:clock_gettime)
::Process.clock_gettime(clock_id, unit)
end

def cpu_time_instrumentation_installed?
# If this thread was started before this module was added to Thread OR if something caused the initialize
# method above not to be properly called on new threads, this instance variable is never defined (never set to
# any value at all, including nil).
#
# Thus, we can use @clock_id as a canary to detect a thread that has missing instrumentation, because we
# know that in initialize above we always set this variable to nil.
defined?(@clock_id) != nil
end

private

# Retrieves number of classes from runtime
Expand Down
191 changes: 119 additions & 72 deletions spec/ddtrace/profiling/collectors/stack_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -249,42 +249,45 @@
end
end

if Datadog::Profiling.native_cpu_time_supported?
context 'and CPU timing is available' do
let(:current_cpu_time) { last_cpu_time + cpu_interval }
let(:last_cpu_time) { rand(1e4) }
let(:cpu_interval) { 1000 }
context 'and CPU timing is available' do
let(:current_cpu_time) { last_cpu_time + cpu_interval }
let(:last_cpu_time) { rand(1e4) }
let(:cpu_interval) { 1000 }

include_context 'with profiling extensions'

before do
allow(thread)
.to receive(:cpu_time)
.with(:nanosecond)
.and_return(current_cpu_time)

allow(thread)
.to receive(:[])
.with(described_class::THREAD_LAST_CPU_TIME_KEY)
.and_return(last_cpu_time)
include_context 'with profiling extensions'

expect(thread)
.to receive(:[]=)
.with(described_class::THREAD_LAST_CPU_TIME_KEY, current_cpu_time)
end
before do
allow(Thread).to receive(:current).and_return(double('Current thread', cpu_time: true))

allow(thread)
.to receive(:cpu_time_instrumentation_installed?)
.and_return(true)
allow(thread)
.to receive(:cpu_time)
.with(:nanosecond)
.and_return(current_cpu_time)

allow(thread)
.to receive(:[])
.with(described_class::THREAD_LAST_CPU_TIME_KEY)
.and_return(last_cpu_time)

expect(thread)
.to receive(:[]=)
.with(described_class::THREAD_LAST_CPU_TIME_KEY, current_cpu_time)
end

it 'builds an event with CPU time' do
is_expected.to be_a_kind_of(Datadog::Profiling::Events::StackSample)
it 'builds an event with CPU time' do
is_expected.to be_a_kind_of(Datadog::Profiling::Events::StackSample)

is_expected.to have_attributes(
timestamp: kind_of(Float),
frames: array_including(kind_of(Datadog::Profiling::BacktraceLocation)),
total_frame_count: backtrace.length,
thread_id: thread.object_id,
cpu_time_interval_ns: cpu_interval,
wall_time_interval_ns: wall_time_interval_ns
)
end
is_expected.to have_attributes(
timestamp: kind_of(Float),
frames: array_including(kind_of(Datadog::Profiling::BacktraceLocation)),
total_frame_count: backtrace.length,
thread_id: thread.object_id,
cpu_time_interval_ns: cpu_interval,
wall_time_interval_ns: wall_time_interval_ns
)
end
end

Expand Down Expand Up @@ -323,64 +326,108 @@

context 'when CPU timing is not supported' do
it { is_expected.to be nil }

it 'does not log any warnings' do
expect(Datadog).to_not receive(:logger)

get_cpu_time_interval!
end
end

if Datadog::Profiling.native_cpu_time_supported?
context 'when CPU timing is supported' do
include_context 'with profiling extensions'
context 'when CPU timing is supported' do
before do
allow(Thread).to receive(:current).and_return(double('Current thread', cpu_time: true))
end

include_context 'with profiling extensions'

context 'but thread is not properly instrumented' do
before do
allow(thread)
.to receive(:cpu_time_instrumentation_installed?)
.and_return(false)
allow(Datadog.logger).to receive(:warn)
end

context 'but yields nil' do
it { is_expected.to be nil }

it 'logs a warning' do
expect(Datadog.logger).to receive(:warn).with(/missing CPU profiling instrumentation/)

get_cpu_time_interval!
end

it 'logs a warning only once' do
expect(Datadog.logger).to receive(:warn).once

get_cpu_time_interval!
get_cpu_time_interval!
end
end

context 'but yields nil' do
before do
allow(thread)
.to receive(:cpu_time_instrumentation_installed?)
.and_return(true)
allow(thread)
.to receive(:cpu_time)
.and_return(nil)
end

it { is_expected.to be nil }

it 'does not log any warnings' do
expect(Datadog).to_not receive(:logger)

get_cpu_time_interval!
end
end

context 'and returns time' do
let(:current_cpu_time) { last_cpu_time + cpu_interval }
let(:last_cpu_time) { rand(1e4) }
let(:cpu_interval) { 1000 }

before do
allow(thread)
.to receive(:cpu_time_instrumentation_installed?)
.and_return(true)
allow(thread)
.to receive(:cpu_time)
.with(:nanosecond)
.and_return(current_cpu_time)

expect(thread)
.to receive(:[]=)
.with(described_class::THREAD_LAST_CPU_TIME_KEY, current_cpu_time)
end

context 'and the thread CPU time has not been retrieved before' do
before do
allow(thread)
.to receive(:cpu_time)
.to receive(:[])
.with(described_class::THREAD_LAST_CPU_TIME_KEY)
.and_return(nil)
end

it { is_expected.to be nil }
let(:current_cpu_time) { rand(1e4) }
it { is_expected.to eq 0 }
end

context 'and returns time' do
context 'and the thread CPU time has been retrieved before' do
let(:current_cpu_time) { last_cpu_time + cpu_interval }
let(:last_cpu_time) { rand(1e4) }
let(:cpu_interval) { 1000 }

before do
allow(thread)
.to receive(:cpu_time)
.with(:nanosecond)
.and_return(current_cpu_time)

expect(thread)
.to receive(:[]=)
.with(described_class::THREAD_LAST_CPU_TIME_KEY, current_cpu_time)
end

context 'and the thread CPU time has not been retrieved before' do
before do
allow(thread)
.to receive(:[])
.with(described_class::THREAD_LAST_CPU_TIME_KEY)
.and_return(nil)
end

let(:current_cpu_time) { rand(1e4) }
it { is_expected.to eq 0 }
.to receive(:[])
.with(described_class::THREAD_LAST_CPU_TIME_KEY)
.and_return(last_cpu_time)
end

context 'and the thread CPU time has been retrieved before' do
let(:current_cpu_time) { last_cpu_time + cpu_interval }
let(:last_cpu_time) { rand(1e4) }
let(:cpu_interval) { 1000 }

before do
allow(thread)
.to receive(:[])
.with(described_class::THREAD_LAST_CPU_TIME_KEY)
.and_return(last_cpu_time)
end

it { is_expected.to eq(cpu_interval) }
end
it { is_expected.to eq(cpu_interval) }
end
end
end
Expand Down
55 changes: 48 additions & 7 deletions spec/ddtrace/profiling/ext/cthread_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -16,19 +16,39 @@

let(:block) { proc { loop { sleep(1) } } }

let(:thread_class) do
let(:thread_class_with_instrumentation) do
expect(::Thread.ancestors).to_not include(described_class)

klass = ::Thread.dup
klass.send(:prepend, described_class)
klass
end

# Leave Thread class in a clean state before and after tests
before do
stub_const('Thread', thread_class)
let(:thread_class_missing_instrumentation) do
expect(::Thread.ancestors).to_not include(described_class)

klass = ::Thread.dup

# keep copy of original initialize method, useful when we want to simulate a thread where our extension hasn't
# been initialized (e.g. a thread created before the extension was added)
klass.send(:alias_method, :original_initialize, :initialize)

# Add the module under test (Ext::CThread)
klass.send(:prepend, described_class)

# Add a module that skips over the module under test's initialize changes
skip_instrumentation = Module.new do
def initialize(*args, &block)
original_initialize(*args, &block) # directly call original initialize, skipping the one in Ext::CThread
end
end
klass.send(:prepend, skip_instrumentation)

klass
end

let(:thread_class) { thread_class_with_instrumentation }

# Kill any spawned threads
after { thread.kill if instance_variable_defined?(:@thread_started) && @thread_started }

Expand Down Expand Up @@ -134,16 +154,23 @@ def on_main_thread
subject(:cpu_time) { thread.cpu_time }

context 'when clock ID' do
before { allow(thread).to receive(:clock_id).and_return(clock_id) }

context 'is not available' do
let(:clock_id) { nil }
let(:thread_class) { thread_class_missing_instrumentation }

it { is_expected.to be nil }

it 'does not define the @clock_id instance variable' do
cpu_time

expect(thread.instance_variable_defined?(:@clock_id)).to be false
end
end

context 'is available' do
let(:clock_id) { double('clock ID') }

before { allow(thread).to receive(:clock_id).and_return(clock_id) }

if Process.respond_to?(:clock_gettime)
let(:cpu_time_measurement) { double('cpu time measurement') }

Expand Down Expand Up @@ -195,5 +222,19 @@ def on_main_thread
end
end
end

describe '#cpu_time_instrumentation_installed?' do
it do
expect(thread.cpu_time_instrumentation_installed?).to be true
end

context 'when our custom initialize block did not run' do
let(:thread_class) { thread_class_missing_instrumentation }

it do
expect(thread.cpu_time_instrumentation_installed?).to be false
end
end
end
end
end

0 comments on commit 915476e

Please sign in to comment.