Skip to content

Commit

Permalink
[LiveMetrics] report process metrics CPU Total and `Committed Memor…
Browse files Browse the repository at this point in the history
…y` (#42213)

* implement process counters

* use Demo for testing. REVERT THIS BEFORE MERGING

* fix typo

* changelog

* Revert "use Demo for testing. REVERT THIS BEFORE MERGING"

This reverts commit 38bf3f3.

* cleanup

* update changelog

* refactor

* typo

* rename

* changelog

* rename

* pr feedback
  • Loading branch information
TimothyMothra authored Mar 5, 2024
1 parent d07d15d commit 6948051
Show file tree
Hide file tree
Showing 5 changed files with 119 additions and 25 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,11 @@

### Features Added

* Update to report Memory and CPU which are displayed in the Live Metrics UX.
([#42213](https://github.com/Azure/azure-sdk-for-net/pull/42213))
* For "Committed Memory", we use [Process.PrivateMemorySize64](https://learn.microsoft.com/dotnet/api/system.diagnostics.process.privatememorysize64).
* For "CPU Total (%)", we use the change in [Process.TotalProcessorTime](https://learn.microsoft.com/dotnet/api/system.diagnostics.process.totalprocessortime) over a period of time. This value is normalized by dividing by the number of processors. The formula is `((change in ticks / period) / number of processors)`.

### Breaking Changes

### Bugs Fixed
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -195,5 +195,14 @@ public void DroppedDocument(DocumentIngressDocumentType documentType)

[Event(12, Message = "Document was dropped. DocumentType: {0}. Not user actionable.", Level = EventLevel.Warning)]
public void DroppedDocument(string documentType) => WriteEvent(12, documentType);

[Event(13, Message = "Failure to calculate CPU Counter. Unexpected negative timespan: PreviousCollectedTime: {0}. RecentCollectedTime: {0}. Not user actionable.", Level = EventLevel.Error)]
public void ProcessCountersUnexpectedNegativeTimeSpan(long previousCollectedTime, long recentCollectedTime) => WriteEvent(13, previousCollectedTime, recentCollectedTime);

[Event(14, Message = "Failure to calculate CPU Counter. Unexpected negative value: PreviousCollectedValue: {0}. RecentCollectedValue: {0}. Not user actionable.", Level = EventLevel.Error)]
public void ProcessCountersUnexpectedNegativeValue(long previousCollectedValue, long recentCollectedValue) => WriteEvent(14, previousCollectedValue, recentCollectedValue);

[Event(15, Message = "Calculated Cpu Counter: Period: {0}. DiffValue: {1}. CalculatedValue: {2}. ProcessorCount: {3}. NormalizedValue: {4}", Level = EventLevel.Verbose)]
public void ProcessCountersCpuCounter(long period, long diffValue, double calculatedValue, int processorCount, double normalizedValue) => WriteEvent(15, period, diffValue, calculatedValue, processorCount, normalizedValue);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ internal static class MetricId
// EXCEPTIONS
internal const string ExceptionsPerSecondMetricIdValue = @"\ApplicationInsights\Exceptions/Sec";

// PERFORMANCE COUNTERS
// PROCESS METRICS
internal const string MemoryCommittedBytesMetricIdValue = @"\Memory\Committed Bytes";
internal const string ProcessorTimeMetricIdValue = @"\Processor(_Total)\% Processor Time";
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,10 @@ internal partial class Manager
internal readonly DoubleBuffer _documentBuffer = new();
internal static bool? s_isAzureWebApp = null;

//private readonly PerformanceCounter _performanceCounter_ProcessorTime = new(categoryName: "Processor", counterName: "% Processor Time", instanceName: "_Total");
//private readonly PerformanceCounter _performanceCounter_CommittedBytes = new(categoryName: "Memory", counterName: "Committed Bytes");
private readonly int _processorCount = Environment.ProcessorCount;
private readonly Process _process = Process.GetCurrentProcess();
private DateTimeOffset _cachedCollectedTime = DateTimeOffset.MinValue;
private long _cachedCollectedValue = 0;

public MonitoringDataPoint GetDataPoint()
{
Expand Down Expand Up @@ -91,32 +93,45 @@ public MonitoringDataPoint GetDataPoint()
dataPoint.Metrics.Add(metricPoint);
}

// TODO: Reenable Perf Counters
//foreach (var metricPoint in CollectPerfCounters())
//{
// dataPoint.Metrics.Add(metricPoint);
//}
foreach (var metricPoint in CollectProcessMetrics())
{
dataPoint.Metrics.Add(metricPoint);
}

return dataPoint;
}

//public IEnumerable<Models.MetricPoint> CollectPerfCounters()
//{
// // PERFORMANCE COUNTERS
// yield return new Models.MetricPoint
// {
// Name = LiveMetricConstants.MetricId.MemoryCommittedBytesMetricIdValue,
// Value = _performanceCounter_CommittedBytes.NextValue(),
// Weight = 1
// };

// yield return new Models.MetricPoint
// {
// Name = LiveMetricConstants.MetricId.ProcessorTimeMetricIdValue,
// Value = _performanceCounter_ProcessorTime.NextValue(),
// Weight = 1
// };
//}
/// <remarks>
/// <para>
/// For Memory:
/// <see href="https://learn.microsoft.com/dotnet/api/system.diagnostics.process.privatememorysize64"/>.
/// "The amount of memory, in bytes, allocated for the associated process that cannot be shared with other processes.".
/// </para>
/// <para>
/// For CPU:
/// <see href="https://learn.microsoft.com/dotnet/api/system.diagnostics.process.totalprocessortime"/>.
/// "A TimeSpan that indicates the amount of time that the associated process has spent utilizing the CPU. This value is the sum of the UserProcessorTime and the PrivilegedProcessorTime.".
/// </para>
/// </remarks>
public IEnumerable<Models.MetricPoint> CollectProcessMetrics()
{
yield return new Models.MetricPoint
{
Name = LiveMetricConstants.MetricId.MemoryCommittedBytesMetricIdValue,
Value = _process.PrivateMemorySize64,
Weight = 1
};

if (TryCalculateCPUCounter(out var processorValue))
{
yield return new Models.MetricPoint
{
Name = LiveMetricConstants.MetricId.ProcessorTimeMetricIdValue,
Value = Convert.ToSingle(processorValue),
Weight = 1
};
}
}

/// <summary>
/// Searches for the environment variable specific to Azure Web App.
Expand Down Expand Up @@ -149,5 +164,67 @@ public MonitoringDataPoint GetDataPoint()

return s_isAzureWebApp;
}

private void ResetCachedValues()
{
_cachedCollectedTime = DateTimeOffset.MinValue;
_cachedCollectedValue = 0;
}

/// <summary>
/// Calcualte the CPU usage as the diff between two ticks divided by the period of time, and then divided by the number of processors.
/// <code>((change in ticks / period) / number of processors)</code>
/// </summary>
private bool TryCalculateCPUCounter(out double normalizedValue)
{
var previousCollectedValue = _cachedCollectedValue;
var previousCollectedTime = _cachedCollectedTime;

var recentCollectedValue = _cachedCollectedValue = _process.TotalProcessorTime.Ticks;
var recentCollectedTime = _cachedCollectedTime = DateTimeOffset.UtcNow;

double calculatedValue;

if (previousCollectedTime == DateTimeOffset.MinValue)
{
Debug.WriteLine($"{nameof(TryCalculateCPUCounter)} DateTimeOffset.MinValue");
normalizedValue = default;
return false;
}

var period = recentCollectedTime.Ticks - previousCollectedTime.Ticks;
if (period < 0)
{
// Not likely to happen but being safe here incase of clock issues in multi-core.
LiveMetricsExporterEventSource.Log.ProcessCountersUnexpectedNegativeTimeSpan(
previousCollectedTime: previousCollectedTime.Ticks,
recentCollectedTime: recentCollectedTime.Ticks);
Debug.WriteLine($"{nameof(TryCalculateCPUCounter)} period less than zero");
normalizedValue = default;
return false;
}

var diff = recentCollectedValue - previousCollectedValue;
if (diff < 0)
{
LiveMetricsExporterEventSource.Log.ProcessCountersUnexpectedNegativeValue(
previousCollectedValue: previousCollectedValue,
recentCollectedValue: recentCollectedValue);
Debug.WriteLine($"{nameof(TryCalculateCPUCounter)} diff less than zero");
normalizedValue = default;
return false;
}

period = period != 0 ? period : 1;
calculatedValue = diff * 100.0 / period;
normalizedValue = calculatedValue / _processorCount;
LiveMetricsExporterEventSource.Log.ProcessCountersCpuCounter(
period: previousCollectedValue,
diffValue: recentCollectedValue,
calculatedValue: calculatedValue,
processorCount: _processorCount,
normalizedValue: normalizedValue);
return true;
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,9 @@ private void SetPingState()
// This is used in determining if we should Backoff.
// If we've been in another state for X amount of time, that may exceed our maximum interval and immediately trigger a Backoff.
_lastSuccessfulPing = DateTimeOffset.UtcNow;

// Must reset the metrics cache here.
ResetCachedValues();
}

private void SetPostState()
Expand Down

0 comments on commit 6948051

Please sign in to comment.