Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Log ancestors of current span when anomaly found #796

Merged
merged 1 commit into from
Sep 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,12 @@ Change Log
Unreleased
~~~~~~~~~~

[4.4.0] - 2024-09-10
~~~~~~~~~~~~~~~~~~~~
Changed
-------
* Datadog diagnostics now logs ancestor spans when an anomaly is encountered, up to a limit of 10 (controlled by new Django setting ``DATADOG_DIAGNOSTICS_LOG_SPAN_DEPTH``). Spans are logged in full and on separate lines, so this logging is now much more verbose; consider only enabling this logging for short periods. Log format of first line has also changed slightly.

[4.3.0] - 2024-08-22
~~~~~~~~~~~~~~~~~~~~
Added
Expand Down
2 changes: 1 addition & 1 deletion edx_arch_experiments/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
A plugin to include applications under development by the architecture team at 2U.
"""

__version__ = '4.3.0'
__version__ = '4.4.0'
48 changes: 40 additions & 8 deletions edx_arch_experiments/datadog_diagnostics/middleware.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import logging
import time

from django.conf import settings
from django.core.exceptions import MiddlewareNotUsed
from edx_toggles.toggles import WaffleFlag

Expand Down Expand Up @@ -61,6 +62,13 @@ def __init__(self, get_response):
raise MiddlewareNotUsed # pylint: disable=raise-missing-from

self.worker_start_epoch = time.time()
# .. setting_name: DATADOG_DIAGNOSTICS_LOG_SPAN_DEPTH
# .. setting_default: 10
# .. setting_description: Controls how many ancestors spans are logged
# when anomalous traces are detected. This limits log size when very
# deep span trees are present (such as in anomalous traces, or even
# just when each middleware is given a span).
self.log_span_ancestors_depth = getattr(settings, "DATADOG_DIAGNOSTICS_LOG_SPAN_DEPTH", 10)

def __call__(self, request):
return self.get_response(request)
Expand All @@ -82,25 +90,49 @@ def log_diagnostics(self, request):
"""
Contains all the actual logging logic.
"""
current_span = self.dd_tracer.current_span()
local_root_span = self.dd_tracer.current_root_span()

if DETECT_ANOMALOUS_TRACE.is_enabled():
# For testing, uncomment this line to fake an anomalous span:
# local_root_span.finish()
root_duration_s = local_root_span.duration
if root_duration_s is not None:
worker_run_time_s = time.time() - self.worker_start_epoch
log.warning(
f"Anomalous Datadog local root span (duration already set): "
f"id = {local_root_span.trace_id:x}; "
f"duration = {root_duration_s:0.3f} sec; "
f"worker age = {worker_run_time_s:0.3f} sec"
)
self.log_anomalous_trace(current_span, local_root_span)

if LOG_ROOT_SPAN.is_enabled():
route_pattern = getattr(request.resolver_match, 'route', None)
current_span = self.dd_tracer.current_span()
# pylint: disable=protected-access
log.info(
f"Datadog span diagnostics: Route = {route_pattern}; "
f"local root span = {local_root_span._pprint()}; "
f"current span = {current_span._pprint()}"
)

def log_anomalous_trace(self, current_span, local_root_span):
worker_run_time_s = time.time() - self.worker_start_epoch

# Build up a list of spans from current back towards the root, up to a limit.
ancestors = []
walk_span = current_span
while len(ancestors) < self.log_span_ancestors_depth and walk_span is not None:
ancestors.insert(0, walk_span._pprint()) # pylint: disable=protected-access
walk_span = walk_span._parent # pylint: disable=protected-access

trunc = "(ancestors truncated)\n" if walk_span else ""

if local_root_span.duration:
duration_msg = f"duration={local_root_span.duration:0.3f}"
else:
# Should only occur during local testing of this
# middleware, when forcing this code path to run.
duration_msg = "duration not set"

msg = (
"Anomalous Datadog local root span: "
f"trace_id={local_root_span.trace_id:x}; "
f"{duration_msg}; "
f"worker_age={worker_run_time_s:0.3f}; span ancestry:"
)

log.warning(msg + "\n" + trunc + "\n".join(ancestors)) # pylint: disable=logging-not-lazy
67 changes: 54 additions & 13 deletions edx_arch_experiments/datadog_diagnostics/tests/test_middleware.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

import ddt
import ddtrace
from django.test import TestCase
from django.test import TestCase, override_settings

from ..middleware import DETECT_ANOMALOUS_TRACE, LOG_ROOT_SPAN, DatadogDiagnosticMiddleware

Expand All @@ -20,16 +20,21 @@ def fake_view(_request):
class TestDatadogDiagnosticMiddleware(TestCase):
"""Tests for DatadogDiagnosticMiddleware."""

def setUp(self):
self.middleware = DatadogDiagnosticMiddleware(fake_view)
def make_middleware(self):
"""Make an instance of the middleware with current settings."""
return DatadogDiagnosticMiddleware(fake_view)

def run_middleware(self):
def run_middleware(self, middleware=None):
"""Run the middleware using a fake request."""
if middleware is None:
middleware = self.make_middleware()

resolver = Mock()
resolver.route = "/some/path"
request = Mock()
request.resolver_match = resolver
self.middleware.process_view(request, None, None, None)

middleware.process_view(request, None, None, None)

@patch('edx_arch_experiments.datadog_diagnostics.middleware.log.error')
def test_log_diagnostics_error_only_once(self, mock_log_error):
Expand All @@ -38,12 +43,13 @@ def test_log_diagnostics_error_only_once(self, mock_log_error):
The method should still be called every time in case it is still doing
useful work before the error, though.
"""
middleware = self.make_middleware()

bad_method = Mock(side_effect=lambda request: 1/0)
self.middleware.log_diagnostics = bad_method
middleware.log_diagnostics = bad_method

self.run_middleware()
self.run_middleware()
self.run_middleware(middleware)
self.run_middleware(middleware)

# Called twice
assert len(bad_method.call_args_list) == 2
Expand Down Expand Up @@ -71,7 +77,8 @@ def test_anomalous_trace(self, enabled, cause_anomaly, mock_log_warning):
patch.object(LOG_ROOT_SPAN, 'is_enabled', return_value=False),
# Need at least two levels of spans in order to fake
# an anomaly. (Otherwise current_root_span returns None.)
ddtrace.tracer.trace("test_local_root"),
ddtrace.tracer.trace("local_root"),
ddtrace.tracer.trace("intermediary_span"),
ddtrace.tracer.trace("inner_span"),
):
if cause_anomaly:
Expand All @@ -81,21 +88,55 @@ def test_anomalous_trace(self, enabled, cause_anomaly, mock_log_warning):
if enabled and cause_anomaly:
mock_log_warning.assert_called_once()
log_msg = mock_log_warning.call_args_list[0][0][0] # first arg of first call

assert re.fullmatch(
r"Anomalous Datadog local root span \(duration already set\): "
r"id = [0-9A-Fa-f]+; duration = [0-9]\.[0-9]{3} sec; worker age = [0-9]\.[0-9]{3} sec",
r"Anomalous Datadog local root span: "
r"trace_id=[0-9A-Fa-f]+; duration=[0-9]\.[0-9]{3}; worker_age=[0-9]\.[0-9]{3}; span ancestry:\n"
r"name='local_root'.*duration=[0-9]+.*\n"
r"name='intermediary_span'.*duration=None.*\n"
r"name='inner_span'.*duration=None.*",
log_msg
)
else:
mock_log_warning.assert_not_called()

@override_settings(DATADOG_DIAGNOSTICS_LOG_SPAN_DEPTH=2)
@patch('edx_arch_experiments.datadog_diagnostics.middleware.log.warning')
def test_anomalous_trace_truncation(self, mock_log_warning):
"""
Test that truncation works, returning N most proximal spans.
"""
with (
patch.object(DETECT_ANOMALOUS_TRACE, 'is_enabled', return_value=True),
patch.object(LOG_ROOT_SPAN, 'is_enabled', return_value=False),
# Need at least two levels of spans in order to fake
# an anomaly. (Otherwise current_root_span returns None.)
ddtrace.tracer.trace("local_root"),
ddtrace.tracer.trace("intermediary_span"),
ddtrace.tracer.trace("inner_span"),
):
ddtrace.tracer.current_root_span().finish() # cause anomaly
self.run_middleware()

mock_log_warning.assert_called_once()
log_msg = mock_log_warning.call_args_list[0][0][0] # first arg of first call

assert re.fullmatch(
r"Anomalous Datadog local root span: "
r"trace_id=[0-9A-Fa-f]+; duration=[0-9]\.[0-9]{3}; worker_age=[0-9]\.[0-9]{3}; span ancestry:\n"
r"\(ancestors truncated\)\n" # difference here
r"name='intermediary_span'.*duration=None.*\n"
r"name='inner_span'.*duration=None.*",
log_msg
)

@patch('edx_arch_experiments.datadog_diagnostics.middleware.log.info')
def test_log_root_span(self, mock_log_info):
with (
patch.object(DETECT_ANOMALOUS_TRACE, 'is_enabled', return_value=False),
patch.object(LOG_ROOT_SPAN, 'is_enabled', return_value=True),
# Need at least two levels of spans for interesting logging
ddtrace.tracer.trace("test_local_root"),
ddtrace.tracer.trace("local_root"),
ddtrace.tracer.trace("inner_span"),
):
self.run_middleware()
Expand All @@ -104,7 +145,7 @@ def test_log_root_span(self, mock_log_info):
log_msg = mock_log_info.call_args_list[0][0][0] # first arg of first call
assert re.fullmatch(
r"Datadog span diagnostics: Route = /some/path; "
r"local root span = name='test_local_root' .*; "
r"local root span = name='local_root' .*; "
r"current span = name='inner_span' .*",
log_msg
)
Loading