Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[LiveMetrics] report process metrics CPU Total and Committed Memory #42213

Merged
merged 13 commits into from
Mar 5, 2024
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,11 @@

### Features Added

* Update to report Memory and CPU which are displayed in the Live Metrics UX.
([#42213](https://github.com/Azure/azure-sdk-for-net/pull/42213))
* For "Committed Memory", we use [Process.PrivateMemorySize64](https://learn.microsoft.com/dotnet/api/system.diagnostics.process.privatememorysize64).
* For "CPU Total (%)", we use the change in [Process.TotalProcessorTime](https://learn.microsoft.com/dotnet/api/system.diagnostics.process.totalprocessortime) over a period of time. This value is expressed as a percentage and normalized by dividing by the number of processors. The formula is `(((change in ticks / period) * 100) / number of processors)`.

### Breaking Changes

### Bugs Fixed
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -195,5 +195,14 @@ public void DroppedDocument(DocumentIngressDocumentType documentType)

/// <summary>
/// Writes event 12, recording that a document of the given type was dropped.
/// </summary>
/// <param name="documentType">Name of the document type that was dropped; substituted into the event message.</param>
[Event(12, Message = "Document was dropped. DocumentType: {0}. Not user actionable.", Level = EventLevel.Warning)]
public void DroppedDocument(string documentType)
{
    WriteEvent(12, documentType);
}

/// <summary>
/// Writes event 13, recording that the CPU counter could not be calculated because the
/// elapsed time between the previous and the most recent collection was negative.
/// </summary>
/// <param name="previousCollectedTime">Timestamp (in ticks) of the previous collection.</param>
/// <param name="recentCollectedTime">Timestamp (in ticks) of the most recent collection.</param>
[Event(13, Message = "Failure to calculate CPU Counter. Unexpected negative timespan: PreviousCollectedTime: {0}. RecentCollectedTime: {1}. Not user actionable.", Level = EventLevel.Error)]
public void ProcessCountersUnexpectedNegativeTimeSpan(long previousCollectedTime, long recentCollectedTime) => WriteEvent(13, previousCollectedTime, recentCollectedTime);

/// <summary>
/// Writes event 14, recording that the CPU counter could not be calculated because the
/// most recent collected value was smaller than the previous one.
/// </summary>
/// <param name="previousCollectedValue">Value captured by the previous collection.</param>
/// <param name="recentCollectedValue">Value captured by the most recent collection.</param>
[Event(14, Message = "Failure to calculate CPU Counter. Unexpected negative value: PreviousCollectedValue: {0}. RecentCollectedValue: {1}. Not user actionable.", Level = EventLevel.Error)]
public void ProcessCountersUnexpectedNegativeValue(long previousCollectedValue, long recentCollectedValue) => WriteEvent(14, previousCollectedValue, recentCollectedValue);

/// <summary>
/// Writes verbose event 15 with the inputs and results of a CPU counter calculation.
/// </summary>
/// <param name="period">Value reported as "Period" in the event message.</param>
/// <param name="diffValue">Value reported as "DiffValue" in the event message.</param>
/// <param name="calculatedValue">Value reported as "CalculatedValue" in the event message.</param>
/// <param name="processorCount">Value reported as "ProcessorCount" in the event message.</param>
/// <param name="normalizedValue">Value reported as "NormalizedValue" in the event message.</param>
[Event(15, Message = "Calculated Cpu Counter: Period: {0}. DiffValue: {1}. CalculatedValue: {2}. ProcessorCount: {3}. NormalizedValue: {4}", Level = EventLevel.Verbose)]
public void ProcessCountersCpuCounter(long period, long diffValue, double calculatedValue, int processorCount, double normalizedValue)
{
    WriteEvent(15, period, diffValue, calculatedValue, processorCount, normalizedValue);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ internal static class MetricId
// EXCEPTIONS
internal const string ExceptionsPerSecondMetricIdValue = @"\ApplicationInsights\Exceptions/Sec";

// PERFORMANCE COUNTERS
// PROCESS METRICS
// Metric id under which the current process's memory usage is reported.
internal const string MemoryCommittedBytesMetricIdValue = @"\Memory\Committed Bytes";
// Metric id under which the current process's CPU usage is reported.
internal const string ProcessorTimeMetricIdValue = @"\Processor(_Total)\% Processor Time";
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,10 @@ internal partial class Manager
internal readonly DoubleBuffer _documentBuffer = new();
internal static bool? s_isAzureWebApp = null;

//private readonly PerformanceCounter _performanceCounter_ProcessorTime = new(categoryName: "Processor", counterName: "% Processor Time", instanceName: "_Total");
//private readonly PerformanceCounter _performanceCounter_CommittedBytes = new(categoryName: "Memory", counterName: "Committed Bytes");
// Number of processors, captured once; used to normalize the calculated CPU value.
private readonly int _processorCount = Environment.ProcessorCount;
// Handle to the current process; source of the memory and processor-time readings.
private readonly Process _process = Process.GetCurrentProcess();
// Timestamp of the previous CPU sample. DateTimeOffset.MinValue means no sample has been cached yet.
private DateTimeOffset _cachedCollectedTime = DateTimeOffset.MinValue;
// Total processor time (in ticks) captured by the previous CPU sample.
private long _cachedCollectedValue = 0;

public MonitoringDataPoint GetDataPoint()
{
Expand Down Expand Up @@ -91,32 +93,45 @@ public MonitoringDataPoint GetDataPoint()
dataPoint.Metrics.Add(metricPoint);
}

// TODO: Reenable Perf Counters
//foreach (var metricPoint in CollectPerfCounters())
//{
// dataPoint.Metrics.Add(metricPoint);
//}
foreach (var metricPoint in CollectProcessMetrics())
{
dataPoint.Metrics.Add(metricPoint);
}

return dataPoint;
}

//public IEnumerable<Models.MetricPoint> CollectPerfCounters()
//{
// // PERFORMANCE COUNTERS
// yield return new Models.MetricPoint
// {
// Name = LiveMetricConstants.MetricId.MemoryCommittedBytesMetricIdValue,
// Value = _performanceCounter_CommittedBytes.NextValue(),
// Weight = 1
// };

// yield return new Models.MetricPoint
// {
// Name = LiveMetricConstants.MetricId.ProcessorTimeMetricIdValue,
// Value = _performanceCounter_ProcessorTime.NextValue(),
// Weight = 1
// };
//}
/// <summary>
/// Collects the process metrics for the current process: the memory metric is always produced,
/// and the CPU metric is produced only when a value could be calculated.
/// </summary>
/// <remarks>
/// <para>
/// For Memory:
/// <see href="https://learn.microsoft.com/dotnet/api/system.diagnostics.process.privatememorysize64"/>.
/// "The amount of memory, in bytes, allocated for the associated process that cannot be shared with other processes.".
/// </para>
/// <para>
/// For CPU:
/// <see href="https://learn.microsoft.com/dotnet/api/system.diagnostics.process.totalprocessortime"/>.
/// "A TimeSpan that indicates the amount of time that the associated process has spent utilizing the CPU. This value is the sum of the UserProcessorTime and the PrivilegedProcessorTime.".
/// </para>
/// </remarks>
/// <returns>A lazily evaluated sequence of metric points, each with a weight of 1.</returns>
public IEnumerable<Models.MetricPoint> CollectProcessMetrics()
{
    // The Process component caches the information it reads about the process.
    // Without a Refresh, a long-lived instance keeps returning the values captured
    // on first access, so the reported memory would never change.
    _process.Refresh();

    yield return new Models.MetricPoint
    {
        Name = LiveMetricConstants.MetricId.MemoryCommittedBytesMetricIdValue,
        Value = _process.PrivateMemorySize64,
        Weight = 1
    };

    // The CPU value needs two samples; nothing is emitted until a previous sample is available.
    if (TryCalculateCPUCounter(out var processorValue))
    {
        yield return new Models.MetricPoint
        {
            Name = LiveMetricConstants.MetricId.ProcessorTimeMetricIdValue,
            Value = Convert.ToSingle(processorValue),
            Weight = 1
        };
    }
}

/// <summary>
/// Searches for the environment variable specific to Azure Web App.
Expand Down Expand Up @@ -149,5 +164,67 @@ public MonitoringDataPoint GetDataPoint()

return s_isAzureWebApp;
}

/// <summary>
/// Clears the cached CPU sample (value and timestamp) back to the initial "no sample" state.
/// </summary>
private void ResetCachedValues()
{
    _cachedCollectedValue = 0;
    _cachedCollectedTime = DateTimeOffset.MinValue;
}

/// <summary>
/// Calculate the CPU usage as the diff between two processor-time samples (in ticks) divided by the
/// elapsed period of time (in ticks), expressed as a percentage, and then divided by the number of processors.
/// <code>(((change in ticks / period) * 100) / number of processors)</code>
/// </summary>
/// <param name="normalizedValue">
/// When this method returns true, the CPU usage percentage normalized by the processor count; otherwise the default value.
/// </param>
/// <returns>
/// True when a value was calculated. False when there is no previous sample yet, or when the elapsed
/// period or the change in processor time is negative.
/// </returns>
private bool TryCalculateCPUCounter(out double normalizedValue)
{
    var previousCollectedValue = _cachedCollectedValue;
    var previousCollectedTime = _cachedCollectedTime;

    // The cache is always updated with the newest sample, even when this call ends up returning false,
    // so the next call has a baseline to diff against.
    var recentCollectedValue = _cachedCollectedValue = _process.TotalProcessorTime.Ticks;
    var recentCollectedTime = _cachedCollectedTime = DateTimeOffset.UtcNow;

    if (previousCollectedTime == DateTimeOffset.MinValue)
    {
        // First sample after startup or after a cache reset: nothing to compare against yet.
        Debug.WriteLine($"{nameof(TryCalculateCPUCounter)} DateTimeOffset.MinValue");
        normalizedValue = default;
        return false;
    }

    var period = recentCollectedTime.Ticks - previousCollectedTime.Ticks;
    if (period < 0)
    {
        // Not likely to happen but being safe here in case of clock issues in multi-core.
        LiveMetricsExporterEventSource.Log.ProcessCountersUnexpectedNegativeTimeSpan(
            previousCollectedTime: previousCollectedTime.Ticks,
            recentCollectedTime: recentCollectedTime.Ticks);
        Debug.WriteLine($"{nameof(TryCalculateCPUCounter)} period less than zero");
        normalizedValue = default;
        return false;
    }

    var diff = recentCollectedValue - previousCollectedValue;
    if (diff < 0)
    {
        LiveMetricsExporterEventSource.Log.ProcessCountersUnexpectedNegativeValue(
            previousCollectedValue: previousCollectedValue,
            recentCollectedValue: recentCollectedValue);
        Debug.WriteLine($"{nameof(TryCalculateCPUCounter)} diff less than zero");
        normalizedValue = default;
        return false;
    }

    // Guard against division by zero when both samples share the same timestamp.
    period = period != 0 ? period : 1;
    var calculatedValue = diff * 100.0 / period;
    normalizedValue = calculatedValue / _processorCount;

    // Log the values actually used in the calculation (the elapsed period and the tick delta).
    LiveMetricsExporterEventSource.Log.ProcessCountersCpuCounter(
        period: period,
        diffValue: diff,
        calculatedValue: calculatedValue,
        processorCount: _processorCount,
        normalizedValue: normalizedValue);
    return true;
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,9 @@ private void SetPingState()
// This is used in determining if we should Backoff.
// If we've been in another state for X amount of time, that may exceed our maximum interval and immediately trigger a Backoff.
_lastSuccessfulPing = DateTimeOffset.UtcNow;

// Must reset the metrics cache here.
ResetCachedValues();
}

private void SetPostState()
Expand Down
Loading