From 6be3a73163cd93650243d93779b3921cdf8c8254 Mon Sep 17 00:00:00 2001 From: Tim McCormack Date: Tue, 13 Aug 2024 15:40:01 +0000 Subject: [PATCH] fix: Remove early span-start logging; fix loading of Django settings (#772) I still don't understand why this setting loading was broken -- maybe there's something unusual about the way plugin apps are loaded? This addresses https://github.com/edx/edx-arch-experiments/issues/771 --- CHANGELOG.rst | 11 ++++ edx_arch_experiments/__init__.py | 2 +- .../datadog_diagnostics/apps.py | 54 +++++----------- .../datadog_diagnostics/tests/test_app.py | 62 ++++++++++--------- 4 files changed, 62 insertions(+), 67 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index c53c86c..12f0cf5 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -14,11 +14,22 @@ Change Log Unreleased ~~~~~~~~~~ +[4.2.0] - 2024-08-13 +~~~~~~~~~~~~~~~~~~~~ +Fixed +----- +* Fixed loading of ``DATADOG_DIAGNOSTICS_ENABLE``, which was previously not loaded properly and therefore was always True. Also fixed loading of ``DATADOG_DIAGNOSTICS_MAX_SPANS``, which was presumably broken as well. + +Removed +------- +* Removed early span-start logging. It never worked properly, possibly because workers are continually being destroyed and created, leading to high log volume. + [4.1.0] - 2024-08-09 ~~~~~~~~~~~~~~~~~~~~ Changed ------- * Datadog diagnostics will now log all span-starts for the first minute after server startup +* **WARNING**: Do not use this version; see 4.2.0 release notes. [4.0.0] - 2024-08-05 ~~~~~~~~~~~~~~~~~~~~ diff --git a/edx_arch_experiments/__init__.py b/edx_arch_experiments/__init__.py index bb3e7ee..6192ccb 100644 --- a/edx_arch_experiments/__init__.py +++ b/edx_arch_experiments/__init__.py @@ -2,4 +2,4 @@ A plugin to include applications under development by the architecture team at 2U. """ -__version__ = '4.1.0' +__version__ = '4.2.0' diff --git a/edx_arch_experiments/datadog_diagnostics/apps.py b/edx_arch_experiments/datadog_diagnostics/apps.py index 3ca9401..e4a22b8 100644 --- a/edx_arch_experiments/datadog_diagnostics/apps.py +++ b/edx_arch_experiments/datadog_diagnostics/apps.py @@ -3,7 +3,6 @@ """ import logging -import time from django.apps import AppConfig from django.conf import settings @@ -11,33 +10,6 @@ log = logging.getLogger(__name__) -# .. toggle_name: DATADOG_DIAGNOSTICS_ENABLE -# .. toggle_implementation: DjangoSetting -# .. toggle_default: True -# .. toggle_description: Enables logging of Datadog diagnostics information. -# .. toggle_use_cases: circuit_breaker -# .. toggle_creation_date: 2024-07-11 -# .. toggle_tickets: https://github.com/edx/edx-arch-experiments/issues/692 -DATADOG_DIAGNOSTICS_ENABLE = getattr(settings, 'DATADOG_DIAGNOSTICS_ENABLE', True) - -# .. setting_name: DATADOG_DIAGNOSTICS_MAX_SPANS -# .. setting_default: 100 -# .. setting_description: Limit of how many spans to hold onto and log -# when diagnosing Datadog tracing issues. This limits memory consumption -# avoids logging more data than is actually needed for diagnosis. -DATADOG_DIAGNOSTICS_MAX_SPANS = getattr(settings, 'DATADOG_DIAGNOSTICS_MAX_SPANS', 100) - -# .. setting_name: DATADOG_DIAGNOSTICS_LOG_ALL_SPAN_STARTS_PERIOD -# .. setting_default: 60 -# .. setting_description: Log all span starts for this many seconds after worker -# startup. -DATADOG_DIAGNOSTICS_LOG_ALL_SPAN_STARTS_PERIOD = getattr( - settings, - 'DATADOG_DIAGNOSTICS_LOG_ALL_SPAN_STARTS_PERIOD', - 60 -) - - # pylint: disable=missing-function-docstring class MissingSpanProcessor: """Datadog span processor that logs unfinished spans at shutdown.""" @@ -46,20 +18,19 @@ def __init__(self): self.spans_started = 0 self.spans_finished = 0 self.open_spans = {} - self.log_all_until = time.time() + DATADOG_DIAGNOSTICS_LOG_ALL_SPAN_STARTS_PERIOD + + # .. setting_name: DATADOG_DIAGNOSTICS_MAX_SPANS + # .. setting_default: 100 + # .. setting_description: Limit of how many spans to hold onto and log + # when diagnosing Datadog tracing issues. This limits memory consumption + # avoids logging more data than is actually needed for diagnosis. + self.DATADOG_DIAGNOSTICS_MAX_SPANS = getattr(settings, 'DATADOG_DIAGNOSTICS_MAX_SPANS', 100) def on_span_start(self, span): self.spans_started += 1 - if len(self.open_spans) < DATADOG_DIAGNOSTICS_MAX_SPANS: + if len(self.open_spans) < self.DATADOG_DIAGNOSTICS_MAX_SPANS: self.open_spans[span.span_id] = span - # We believe that the anomalous traces always come from a - # single span that is created early in the lifetime of a - # gunicorn worker. If we log *every* span-start in this early - # period, we may be able to observe something interesting. - if time.time() <= self.log_all_until: - log.info(f"Early span-start sample: {span._pprint()}") # pylint: disable=protected-access - def on_span_finish(self, span): self.spans_finished += 1 self.open_spans.pop(span.span_id, None) # "delete if present" @@ -80,6 +51,15 @@ class DatadogDiagnostics(AppConfig): plugin_app = {} def ready(self): + # .. toggle_name: DATADOG_DIAGNOSTICS_ENABLE + # .. toggle_implementation: DjangoSetting + # .. toggle_default: True + # .. toggle_description: Enables logging of Datadog diagnostics information. + # .. toggle_use_cases: circuit_breaker + # .. toggle_creation_date: 2024-07-11 + # .. toggle_tickets: https://github.com/edx/edx-arch-experiments/issues/692 + DATADOG_DIAGNOSTICS_ENABLE = getattr(settings, 'DATADOG_DIAGNOSTICS_ENABLE', True) + if not DATADOG_DIAGNOSTICS_ENABLE: return diff --git a/edx_arch_experiments/datadog_diagnostics/tests/test_app.py b/edx_arch_experiments/datadog_diagnostics/tests/test_app.py index b927499..ba89b95 100644 --- a/edx_arch_experiments/datadog_diagnostics/tests/test_app.py +++ b/edx_arch_experiments/datadog_diagnostics/tests/test_app.py @@ -2,9 +2,10 @@ Tests for plugin app. """ -from unittest.mock import call, patch +from unittest.mock import patch -from django.test import TestCase +from ddtrace import tracer +from django.test import TestCase, override_settings from .. import apps @@ -21,7 +22,35 @@ def _pprint(self): class TestMissingSpanProcessor(TestCase): """Tests for MissingSpanProcessor.""" - @patch.object(apps, 'DATADOG_DIAGNOSTICS_MAX_SPANS', new=3) + def test_feature_switch(self): + """ + Regression test -- the use of override_settings ensures that we read + the setting as needed, and not once at module load time (when it's + not guaranteed to be available.) + """ + def initialize(): + apps.DatadogDiagnostics('edx_arch_experiments.datadog_diagnostics', apps).ready() + + def get_processor_list(): + # pylint: disable=protected-access + return [type(sp).__name__ for sp in tracer._span_processors] + + with override_settings(DATADOG_DIAGNOSTICS_ENABLE=False): + initialize() + assert sorted(get_processor_list()) == [ + 'EndpointCallCounterProcessor', 'TopLevelSpanProcessor', + ] + + # The True case needs to come second because the initializer + # appends to the list and there isn't an immediately obvious + # way of resetting it. + with override_settings(DATADOG_DIAGNOSTICS_ENABLE=True): + initialize() + assert sorted(get_processor_list()) == [ + 'EndpointCallCounterProcessor', 'MissingSpanProcessor', 'TopLevelSpanProcessor', + ] + + @override_settings(DATADOG_DIAGNOSTICS_MAX_SPANS=3) def test_metrics(self): proc = apps.MissingSpanProcessor() ids = [2, 4, 6, 8, 10] @@ -47,30 +76,5 @@ def test_logging(self, mock_log_error, mock_log_info): proc.on_span_start(FakeSpan(17)) proc.shutdown(0) - assert mock_log_info.call_args_list == [ - call("Early span-start sample: span_id=17"), - call("Spans created = 1; spans finished = 0"), - ] + mock_log_info.assert_called_once_with("Spans created = 1; spans finished = 0") mock_log_error.assert_called_once_with("Span created but not finished: span_id=17") - - @patch('edx_arch_experiments.datadog_diagnostics.apps.log.info') - def test_early_span_logging_cutoff(self, mock_log_info): - with patch('edx_arch_experiments.datadog_diagnostics.apps.time.time', side_effect=[ - # Setup - 1700000000, - # Span-start time checks - 1700000020, - 1700000040, - 1700010000, - ]): - proc = apps.MissingSpanProcessor() - # Three spans are started - proc.on_span_start(FakeSpan(44)) - proc.on_span_start(FakeSpan(45)) - proc.on_span_start(FakeSpan(46)) - - # Just two early span-starts are logged - assert mock_log_info.call_args_list == [ - call("Early span-start sample: span_id=44"), - call("Early span-start sample: span_id=45"), - ]