Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Log all span-starts for the first minute #762

Merged
merged 2 commits into from
Aug 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,12 @@ Change Log
Unreleased
~~~~~~~~~~

[4.1.0] - 2024-08-09
~~~~~~~~~~~~~~~~~~~~
Changed
-------
* Datadog diagnostics will now log all span-starts for the first minute after server startup

[4.0.0] - 2024-08-05
~~~~~~~~~~~~~~~~~~~~
Changed
Expand Down
2 changes: 1 addition & 1 deletion edx_arch_experiments/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
A plugin to include applications under development by the architecture team at 2U.
"""

__version__ = '4.0.0'
__version__ = '4.1.0'
20 changes: 20 additions & 0 deletions edx_arch_experiments/datadog_diagnostics/apps.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""

import logging
import time

from django.apps import AppConfig
from django.conf import settings
Expand All @@ -26,20 +27,39 @@
# avoids logging more data than is actually needed for diagnosis.
DATADOG_DIAGNOSTICS_MAX_SPANS = getattr(settings, 'DATADOG_DIAGNOSTICS_MAX_SPANS', 100)

# .. setting_name: DATADOG_DIAGNOSTICS_LOG_ALL_SPAN_STARTS_PERIOD
# .. setting_default: 60
# .. setting_description: Log all span starts for this many seconds after worker
# startup.
DATADOG_DIAGNOSTICS_LOG_ALL_SPAN_STARTS_PERIOD = getattr(
settings,
'DATADOG_DIAGNOSTICS_LOG_ALL_SPAN_STARTS_PERIOD',
60
)


# pylint: disable=missing-function-docstring
class MissingSpanProcessor:
"""Datadog span processor that logs unfinished spans at shutdown."""

def __init__(self):
self.spans_started = 0
self.spans_finished = 0
self.open_spans = {}
self.log_all_until = time.time() + DATADOG_DIAGNOSTICS_LOG_ALL_SPAN_STARTS_PERIOD

def on_span_start(self, span):
self.spans_started += 1
if len(self.open_spans) < DATADOG_DIAGNOSTICS_MAX_SPANS:
self.open_spans[span.span_id] = span

# We believe that the anomalous traces always come from a
# single span that is created early in the lifetime of a
# gunicorn worker. If we log *every* span-start in this early
# period, we may be able to observe something interesting.
if time.time() <= self.log_all_until:
log.info(f"Early span-start sample: {span._pprint()}") # pylint: disable=protected-access

def on_span_finish(self, span):
self.spans_finished += 1
self.open_spans.pop(span.span_id, None) # "delete if present"
Expand Down
29 changes: 27 additions & 2 deletions edx_arch_experiments/datadog_diagnostics/tests/test_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
Tests for plugin app.
"""

from unittest.mock import patch
from unittest.mock import call, patch

from django.test import TestCase

Expand Down Expand Up @@ -47,5 +47,30 @@ def test_logging(self, mock_log_error, mock_log_info):
proc.on_span_start(FakeSpan(17))
proc.shutdown(0)

mock_log_info.assert_called_once_with("Spans created = 1; spans finished = 0")
assert mock_log_info.call_args_list == [
call("Early span-start sample: span_id=17"),
call("Spans created = 1; spans finished = 0"),
]
mock_log_error.assert_called_once_with("Span created but not finished: span_id=17")

@patch('edx_arch_experiments.datadog_diagnostics.apps.log.info')
def test_early_span_logging_cutoff(self, mock_log_info):
with patch('edx_arch_experiments.datadog_diagnostics.apps.time.time', side_effect=[
# Setup
1700000000,
# Span-start time checks
1700000020,
1700000040,
1700010000,
]):
proc = apps.MissingSpanProcessor()
# Three spans are started
proc.on_span_start(FakeSpan(44))
proc.on_span_start(FakeSpan(45))
proc.on_span_start(FakeSpan(46))

# Just two early span-starts are logged
assert mock_log_info.call_args_list == [
call("Early span-start sample: span_id=44"),
call("Early span-start sample: span_id=45"),
]
Loading