From 69480518ed90678f7647c7fef13a765e74cda376 Mon Sep 17 00:00:00 2001 From: Timothy Mothra Date: Mon, 4 Mar 2024 16:57:13 -0800 Subject: [PATCH] [LiveMetrics] report process metrics `CPU Total` and `Committed Memory` (#42213) * implement process counters * use Demo for testing. REVERT THIS BEFORE MERGING * fix typo * changelog * Revert "use Demo for testing. REVERT THIS BEFORE MERGING" This reverts commit 38bf3f3a83eee996dc489643121bd2a472035cfb. * cleanup * update changelog * refactor * typo * rename * changelog * rename * pr feedback --- .../CHANGELOG.md | 5 + .../LiveMetricsExporterEventSource.cs | 9 ++ .../src/Internals/LiveMetricConstants.cs | 2 +- .../src/Internals/Manager.Metrics.cs | 125 ++++++++++++++---- .../src/Internals/Manager.State.cs | 3 + 5 files changed, 119 insertions(+), 25 deletions(-) diff --git a/sdk/monitor/Azure.Monitor.OpenTelemetry.LiveMetrics/CHANGELOG.md b/sdk/monitor/Azure.Monitor.OpenTelemetry.LiveMetrics/CHANGELOG.md index 9efc46dd03f00..6d1c654572c9d 100644 --- a/sdk/monitor/Azure.Monitor.OpenTelemetry.LiveMetrics/CHANGELOG.md +++ b/sdk/monitor/Azure.Monitor.OpenTelemetry.LiveMetrics/CHANGELOG.md @@ -4,6 +4,11 @@ ### Features Added +* Update to report Memory and CPU which are displayed in the Live Metrics UX. + ([#42213](https://github.com/Azure/azure-sdk-for-net/pull/42213)) + * For "Committed Memory", we use [Process.PrivateMemorySize64](https://learn.microsoft.com/dotnet/api/system.diagnostics.process.privatememorysize64). + * For "CPU Total (%)", we use the change in [Process.TotalProcessorTime](https://learn.microsoft.com/dotnet/api/system.diagnostics.process.totalprocessortime) over a period of time. This value is normalized by dividing by the number of processors. The formula is `((change in ticks / period) / number of processors)`. + ### Breaking Changes ### Bugs Fixed diff --git a/sdk/monitor/Azure.Monitor.OpenTelemetry.LiveMetrics/src/Internals/Diagnostics/LiveMetricsExporterEventSource.cs b/sdk/monitor/Azure.Monitor.OpenTelemetry.LiveMetrics/src/Internals/Diagnostics/LiveMetricsExporterEventSource.cs index d23b8b30ac5ab..3455f9b68c98c 100644 --- a/sdk/monitor/Azure.Monitor.OpenTelemetry.LiveMetrics/src/Internals/Diagnostics/LiveMetricsExporterEventSource.cs +++ b/sdk/monitor/Azure.Monitor.OpenTelemetry.LiveMetrics/src/Internals/Diagnostics/LiveMetricsExporterEventSource.cs @@ -195,5 +195,14 @@ public void DroppedDocument(DocumentIngressDocumentType documentType) [Event(12, Message = "Document was dropped. DocumentType: {0}. Not user actionable.", Level = EventLevel.Warning)] public void DroppedDocument(string documentType) => WriteEvent(12, documentType); + + [Event(13, Message = "Failure to calculate CPU Counter. Unexpected negative timespan: PreviousCollectedTime: {0}. RecentCollectedTime: {0}. Not user actionable.", Level = EventLevel.Error)] + public void ProcessCountersUnexpectedNegativeTimeSpan(long previousCollectedTime, long recentCollectedTime) => WriteEvent(13, previousCollectedTime, recentCollectedTime); + + [Event(14, Message = "Failure to calculate CPU Counter. Unexpected negative value: PreviousCollectedValue: {0}. RecentCollectedValue: {0}. Not user actionable.", Level = EventLevel.Error)] + public void ProcessCountersUnexpectedNegativeValue(long previousCollectedValue, long recentCollectedValue) => WriteEvent(14, previousCollectedValue, recentCollectedValue); + + [Event(15, Message = "Calculated Cpu Counter: Period: {0}. DiffValue: {1}. CalculatedValue: {2}. ProcessorCount: {3}. NormalizedValue: {4}", Level = EventLevel.Verbose)] + public void ProcessCountersCpuCounter(long period, long diffValue, double calculatedValue, int processorCount, double normalizedValue) => WriteEvent(15, period, diffValue, calculatedValue, processorCount, normalizedValue); } } diff --git a/sdk/monitor/Azure.Monitor.OpenTelemetry.LiveMetrics/src/Internals/LiveMetricConstants.cs b/sdk/monitor/Azure.Monitor.OpenTelemetry.LiveMetrics/src/Internals/LiveMetricConstants.cs index 04047bf254e3e..2986e08889c70 100644 --- a/sdk/monitor/Azure.Monitor.OpenTelemetry.LiveMetrics/src/Internals/LiveMetricConstants.cs +++ b/sdk/monitor/Azure.Monitor.OpenTelemetry.LiveMetrics/src/Internals/LiveMetricConstants.cs @@ -27,7 +27,7 @@ internal static class MetricId // EXCEPTIONS internal const string ExceptionsPerSecondMetricIdValue = @"\ApplicationInsights\Exceptions/Sec"; - // PERFORMANCE COUNTERS + // PROCESS METRICS internal const string MemoryCommittedBytesMetricIdValue = @"\Memory\Committed Bytes"; internal const string ProcessorTimeMetricIdValue = @"\Processor(_Total)\% Processor Time"; } diff --git a/sdk/monitor/Azure.Monitor.OpenTelemetry.LiveMetrics/src/Internals/Manager.Metrics.cs b/sdk/monitor/Azure.Monitor.OpenTelemetry.LiveMetrics/src/Internals/Manager.Metrics.cs index a8a0445af6da9..fc8343fb6eeb6 100644 --- a/sdk/monitor/Azure.Monitor.OpenTelemetry.LiveMetrics/src/Internals/Manager.Metrics.cs +++ b/sdk/monitor/Azure.Monitor.OpenTelemetry.LiveMetrics/src/Internals/Manager.Metrics.cs @@ -18,8 +18,10 @@ internal partial class Manager internal readonly DoubleBuffer _documentBuffer = new(); internal static bool? s_isAzureWebApp = null; - //private readonly PerformanceCounter _performanceCounter_ProcessorTime = new(categoryName: "Processor", counterName: "% Processor Time", instanceName: "_Total"); - //private readonly PerformanceCounter _performanceCounter_CommittedBytes = new(categoryName: "Memory", counterName: "Committed Bytes"); + private readonly int _processorCount = Environment.ProcessorCount; + private readonly Process _process = Process.GetCurrentProcess(); + private DateTimeOffset _cachedCollectedTime = DateTimeOffset.MinValue; + private long _cachedCollectedValue = 0; public MonitoringDataPoint GetDataPoint() { @@ -91,32 +93,45 @@ public MonitoringDataPoint GetDataPoint() dataPoint.Metrics.Add(metricPoint); } - // TODO: Reenable Perf Counters - //foreach (var metricPoint in CollectPerfCounters()) - //{ - // dataPoint.Metrics.Add(metricPoint); - //} + foreach (var metricPoint in CollectProcessMetrics()) + { + dataPoint.Metrics.Add(metricPoint); + } return dataPoint; } - //public IEnumerable CollectPerfCounters() - //{ - // // PERFORMANCE COUNTERS - // yield return new Models.MetricPoint - // { - // Name = LiveMetricConstants.MetricId.MemoryCommittedBytesMetricIdValue, - // Value = _performanceCounter_CommittedBytes.NextValue(), - // Weight = 1 - // }; - - // yield return new Models.MetricPoint - // { - // Name = LiveMetricConstants.MetricId.ProcessorTimeMetricIdValue, - // Value = _performanceCounter_ProcessorTime.NextValue(), - // Weight = 1 - // }; - //} + /// + /// + /// For Memory: + /// . + /// "The amount of memory, in bytes, allocated for the associated process that cannot be shared with other processes.". + /// + /// + /// For CPU: + /// . + /// "A TimeSpan that indicates the amount of time that the associated process has spent utilizing the CPU. This value is the sum of the UserProcessorTime and the PrivilegedProcessorTime.". + /// + /// + public IEnumerable CollectProcessMetrics() + { + yield return new Models.MetricPoint + { + Name = LiveMetricConstants.MetricId.MemoryCommittedBytesMetricIdValue, + Value = _process.PrivateMemorySize64, + Weight = 1 + }; + + if (TryCalculateCPUCounter(out var processorValue)) + { + yield return new Models.MetricPoint + { + Name = LiveMetricConstants.MetricId.ProcessorTimeMetricIdValue, + Value = Convert.ToSingle(processorValue), + Weight = 1 + }; + } + } /// /// Searches for the environment variable specific to Azure Web App. @@ -149,5 +164,67 @@ public MonitoringDataPoint GetDataPoint() return s_isAzureWebApp; } + + private void ResetCachedValues() + { + _cachedCollectedTime = DateTimeOffset.MinValue; + _cachedCollectedValue = 0; + } + + /// + /// Calcualte the CPU usage as the diff between two ticks divided by the period of time, and then divided by the number of processors. + /// ((change in ticks / period) / number of processors) + /// + private bool TryCalculateCPUCounter(out double normalizedValue) + { + var previousCollectedValue = _cachedCollectedValue; + var previousCollectedTime = _cachedCollectedTime; + + var recentCollectedValue = _cachedCollectedValue = _process.TotalProcessorTime.Ticks; + var recentCollectedTime = _cachedCollectedTime = DateTimeOffset.UtcNow; + + double calculatedValue; + + if (previousCollectedTime == DateTimeOffset.MinValue) + { + Debug.WriteLine($"{nameof(TryCalculateCPUCounter)} DateTimeOffset.MinValue"); + normalizedValue = default; + return false; + } + + var period = recentCollectedTime.Ticks - previousCollectedTime.Ticks; + if (period < 0) + { + // Not likely to happen but being safe here incase of clock issues in multi-core. + LiveMetricsExporterEventSource.Log.ProcessCountersUnexpectedNegativeTimeSpan( + previousCollectedTime: previousCollectedTime.Ticks, + recentCollectedTime: recentCollectedTime.Ticks); + Debug.WriteLine($"{nameof(TryCalculateCPUCounter)} period less than zero"); + normalizedValue = default; + return false; + } + + var diff = recentCollectedValue - previousCollectedValue; + if (diff < 0) + { + LiveMetricsExporterEventSource.Log.ProcessCountersUnexpectedNegativeValue( + previousCollectedValue: previousCollectedValue, + recentCollectedValue: recentCollectedValue); + Debug.WriteLine($"{nameof(TryCalculateCPUCounter)} diff less than zero"); + normalizedValue = default; + return false; + } + + period = period != 0 ? period : 1; + calculatedValue = diff * 100.0 / period; + normalizedValue = calculatedValue / _processorCount; + LiveMetricsExporterEventSource.Log.ProcessCountersCpuCounter( + period: previousCollectedValue, + diffValue: recentCollectedValue, + calculatedValue: calculatedValue, + processorCount: _processorCount, + normalizedValue: normalizedValue); + return true; + } } } diff --git a/sdk/monitor/Azure.Monitor.OpenTelemetry.LiveMetrics/src/Internals/Manager.State.cs b/sdk/monitor/Azure.Monitor.OpenTelemetry.LiveMetrics/src/Internals/Manager.State.cs index 6dbd36ce39c10..ec0068a4d5894 100644 --- a/sdk/monitor/Azure.Monitor.OpenTelemetry.LiveMetrics/src/Internals/Manager.State.cs +++ b/sdk/monitor/Azure.Monitor.OpenTelemetry.LiveMetrics/src/Internals/Manager.State.cs @@ -71,6 +71,9 @@ private void SetPingState() // This is used in determining if we should Backoff. // If we've been in another state for X amount of time, that may exceed our maximum interval and immediately trigger a Backoff. _lastSuccessfulPing = DateTimeOffset.UtcNow; + + // Must reset the metrics cache here. + ResetCachedValues(); } private void SetPostState()