Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(nvidia/remapped-rows): do not check row remapping for 4090 and other unsupported GPUs #351

Merged
merged 3 commits into from
Feb 6, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 30 additions & 22 deletions components/accelerator/nvidia/query/gpu_memory.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,34 +2,42 @@ package query

import "strings"

// GetMemoryErrorManagementCapabilities returns the GPU memory error management capabilities
// Memory error management capability presets and the per-product lookup table.
// ref. https://docs.nvidia.com/deploy/a100-gpu-mem-error-mgmt/index.html#supported-gpus
var (
	// memMgmtCapAllSupported is the preset with error containment,
	// dynamic page offlining and row remapping all enabled.
	memMgmtCapAllSupported = MemoryErrorManagementCapabilities{
		ErrorContainment:     true,
		DynamicPageOfflining: true,
		RowRemapping:         true,
	}

	// memMgmtCapOnlyRowRemappingSupported is the preset where row remapping
	// is the only enabled capability.
	memMgmtCapOnlyRowRemappingSupported = MemoryErrorManagementCapabilities{
		ErrorContainment:     false,
		DynamicPageOfflining: false,
		RowRemapping:         true,
	}

	// gpuProductToMemMgmtCaps maps a lower-case product identifier to its
	// capability preset. Note that "a10" is a prefix of "a100", so any
	// substring-based lookup over these keys has to prefer the longest match.
	gpuProductToMemMgmtCaps = map[string]MemoryErrorManagementCapabilities{
		"a10":  memMgmtCapOnlyRowRemappingSupported,
		"a100": memMgmtCapAllSupported,
		"b100": memMgmtCapAllSupported,
		"b200": memMgmtCapAllSupported,
		"h100": memMgmtCapAllSupported,
		"h200": memMgmtCapAllSupported,
	}
)

// SupportedMemoryMgmtCapsByGPUProduct returns the GPU memory error management capabilities
// based on the GPU product name.
// ref. https://docs.nvidia.com/deploy/a100-gpu-mem-error-mgmt/index.html#supported-gpus
func GetMemoryErrorManagementCapabilities(gpuProductName string) MemoryErrorManagementCapabilities {
func SupportedMemoryMgmtCapsByGPUProduct(gpuProductName string) MemoryErrorManagementCapabilities {
p := strings.ToLower(gpuProductName)
switch {
case strings.Contains(p, "h100"):
return MemoryErrorManagementCapabilities{
ErrorContainment: true,
DynamicPageOfflining: true,
RowRemapping: true,
longestName, memCaps := "", MemoryErrorManagementCapabilities{}
for k, v := range gpuProductToMemMgmtCaps {
if !strings.Contains(p, k) {
continue
}

case strings.Contains(p, "a100"):
return MemoryErrorManagementCapabilities{
ErrorContainment: true,
DynamicPageOfflining: true,
RowRemapping: true,
}

case strings.Contains(p, "a10"):
return MemoryErrorManagementCapabilities{
RowRemapping: true,
if len(longestName) < len(k) {
longestName = k
memCaps = v
}

default:
return MemoryErrorManagementCapabilities{}
}
return memCaps
}

// Contains information about the GPU's memory error management capabilities.
Expand Down
98 changes: 96 additions & 2 deletions components/accelerator/nvidia/query/gpu_memory_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import (
"testing"
)

func TestGetMemoryErrorManagementCapabilities(t *testing.T) {
func TestSupportedMemoryMgmtCapsByGPUProduct(t *testing.T) {
tests := []struct {
name string
gpuProductName string
Expand Down Expand Up @@ -50,11 +50,105 @@ func TestGetMemoryErrorManagementCapabilities(t *testing.T) {
RowRemapping: true,
},
},
{
name: "NVIDIA B100",
gpuProductName: "NVIDIA B100",
expected: MemoryErrorManagementCapabilities{
ErrorContainment: true,
DynamicPageOfflining: true,
RowRemapping: true,
},
},
{
name: "NVIDIA B200",
gpuProductName: "NVIDIA B200",
expected: MemoryErrorManagementCapabilities{
ErrorContainment: true,
DynamicPageOfflining: true,
RowRemapping: true,
},
},
{
name: "Mixed case input",
gpuProductName: "NvIdIa A100 PCIe",
expected: MemoryErrorManagementCapabilities{
ErrorContainment: true,
DynamicPageOfflining: true,
RowRemapping: true,
},
},
{
name: "Empty string",
gpuProductName: "",
expected: MemoryErrorManagementCapabilities{},
},
{
name: "NVIDIA T4",
gpuProductName: "NVIDIA T4",
expected: MemoryErrorManagementCapabilities{},
},
{
name: "NVIDIA V100",
gpuProductName: "NVIDIA V100",
expected: MemoryErrorManagementCapabilities{},
},
{
name: "NVIDIA A10G",
gpuProductName: "NVIDIA A10G",
expected: MemoryErrorManagementCapabilities{
RowRemapping: true,
},
},
{
name: "GPU with SXM suffix",
gpuProductName: "NVIDIA A100-SXM",
expected: MemoryErrorManagementCapabilities{
ErrorContainment: true,
DynamicPageOfflining: true,
RowRemapping: true,
},
},
{
name: "GPU with PCIe suffix",
gpuProductName: "NVIDIA A100 PCIe",
expected: MemoryErrorManagementCapabilities{
ErrorContainment: true,
DynamicPageOfflining: true,
RowRemapping: true,
},
},
{
name: "GPU with memory size suffix",
gpuProductName: "NVIDIA A100 80GB",
expected: MemoryErrorManagementCapabilities{
ErrorContainment: true,
DynamicPageOfflining: true,
RowRemapping: true,
},
},
{
name: "Special characters in name",
gpuProductName: "NVIDIA-A100_80GB",
expected: MemoryErrorManagementCapabilities{
ErrorContainment: true,
DynamicPageOfflining: true,
RowRemapping: true,
},
},
{
name: "Non-NVIDIA prefix",
gpuProductName: "Some A100 GPU",
expected: MemoryErrorManagementCapabilities{
ErrorContainment: true,
DynamicPageOfflining: true,
RowRemapping: true,
},
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := GetMemoryErrorManagementCapabilities(tt.gpuProductName)
result := SupportedMemoryMgmtCapsByGPUProduct(tt.gpuProductName)
if !reflect.DeepEqual(result, tt.expected) {
t.Errorf("GetGPUMemoryErrorManagement(%q) = %v, want %v", tt.gpuProductName, result, tt.expected)
}
Expand Down
4 changes: 4 additions & 0 deletions components/accelerator/nvidia/query/nvml/remapped_rows.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@
RemappingFailed bool `json:"remapping_failed"`

// Supported is true if the remapped rows are supported by the device.
// Even for "NVIDIA GeForce RTX 4090", "GetRemappedRows" returns no error,
// so "Supported" is not a reliable way to check whether row remapping is supported.
Supported bool `json:"supported"`
}

Expand All @@ -48,6 +50,8 @@
// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g055e7c34f7f15b6ae9aac1dabd60870d
corrRows, uncRows, isPending, failureOccurred, ret := dev.GetRemappedRows()
if IsNotSupportError(ret) {
// even for "NVIDIA GeForce RTX 4090", this returns no error
// thus "Supported" is not a reliable way to check if row remapping is supported

Check warning on line 54 in components/accelerator/nvidia/query/nvml/remapped_rows.go

View check run for this annotation

Codecov / codecov/patch

components/accelerator/nvidia/query/nvml/remapped_rows.go#L53-L54

Added lines #L53 - L54 were not covered by tests
remRws.Supported = false
return remRws, nil
}
Expand Down
14 changes: 13 additions & 1 deletion components/accelerator/nvidia/query/query.go
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@

productName := o.GPUProductName()
if productName != "" {
o.MemoryErrorManagementCapabilities = GetMemoryErrorManagementCapabilities(o.GPUProductName())
o.MemoryErrorManagementCapabilities = SupportedMemoryMgmtCapsByGPUProduct(o.GPUProductName())

Check warning on line 255 in components/accelerator/nvidia/query/query.go

View check run for this annotation

Codecov / codecov/patch

components/accelerator/nvidia/query/query.go#L255

Added line #L255 was not covered by tests
} else {
log.Logger.Warnw("no gpu product name found -- skipping evaluating memory error management capabilities")
}
Expand Down Expand Up @@ -537,6 +537,18 @@
}
}

if dev.RemappedRows.Supported {
fmt.Printf("%s NVML remapped rows supported\n", checkMark)
if dev.RemappedRows.RequiresReset() {
fmt.Printf("%s NVML found that the GPU needs a reset\n", warningSign)
}
if dev.RemappedRows.QualifiesForRMA() {
fmt.Printf("%s NVML found that the GPU qualifies for RMA\n", warningSign)
}
} else {
fmt.Printf("%s NVML remapped rows are not supported\n", warningSign)
}

Check warning on line 550 in components/accelerator/nvidia/query/query.go

View check run for this annotation

Codecov / codecov/patch

components/accelerator/nvidia/query/query.go#L540-L550

Added lines #L540 - L550 were not covered by tests

uncorrectedErrs := dev.ECCErrors.Volatile.FindUncorrectedErrs()
if len(uncorrectedErrs) > 0 {
fmt.Printf("%s NVML found %d ecc volatile uncorrected error(s)\n", warningSign, len(uncorrectedErrs))
Expand Down
80 changes: 41 additions & 39 deletions components/accelerator/nvidia/remapped-rows/component_output.go
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,12 @@
return nil, errors.New("no state found")
}

// isRowRemappingSupported reports the RowRemapping flag recorded in
// o.MemoryErrorManagementCapabilities.
//
// The NVML remapped-rows query returns no error even for
// "NVIDIA GeForce RTX 4090", so "RemappedRowsNVML.Supported" is not a
// reliable way to check if row remapping is supported.
func (o *Output) isRowRemappingSupported() bool {
	caps := o.MemoryErrorManagementCapabilities
	return caps.RowRemapping
}

// Returns the output evaluation reason and its healthy-ness.
func (o *Output) Evaluate() (string, bool, error) {
if o == nil {
Expand All @@ -160,53 +166,49 @@
healthy := true
reasons := []string{}

for _, r := range o.RemappedRowsSMI {
rma, err := r.QualifiesForRMA()
if err != nil {
healthy = false
reasons = append(reasons, fmt.Sprintf("nvidia-smi GPU %s failed to determine if it qualifies for RMA: %s", r.ID, err.Error()))
continue
}
if rma {
healthy = false
reasons = append(reasons, fmt.Sprintf("nvidia-smi GPU %s qualifies for RMA (remapping failure occurred %v, remapped due to uncorrectable errors %s)", r.ID, r.RemappingFailed, r.RemappedDueToUncorrectableErrors))
}
if !o.isRowRemappingSupported() {
reasons = append(reasons, fmt.Sprintf("GPU product name %q does not support row remapping (message: %q)", o.GPUProductName, o.MemoryErrorManagementCapabilities.Message))
} else {
for _, r := range o.RemappedRowsSMI {
rma, err := r.QualifiesForRMA()
if err != nil {
healthy = false
reasons = append(reasons, fmt.Sprintf("nvidia-smi GPU %s failed to determine if it qualifies for RMA: %s", r.ID, err.Error()))
continue

Check warning on line 177 in components/accelerator/nvidia/remapped-rows/component_output.go

View check run for this annotation

Codecov / codecov/patch

components/accelerator/nvidia/remapped-rows/component_output.go#L175-L177

Added lines #L175 - L177 were not covered by tests
}
if rma {
healthy = false
reasons = append(reasons, fmt.Sprintf("nvidia-smi GPU %s qualifies for RMA (remapping failure occurred %v, remapped due to uncorrectable errors %s)", r.ID, r.RemappingFailed, r.RemappedDueToUncorrectableErrors))
}

needsReset, err := r.RequiresReset()
if err != nil {
healthy = false
reasons = append(reasons, fmt.Sprintf("nvidia-smi GPU %s failed to determine if it needs reset: %s", r.ID, err.Error()))
continue
}
if needsReset {
healthy = false
reasons = append(reasons, fmt.Sprintf("nvidia-smi GPU %s needs reset (pending remapping %v)", r.ID, needsReset))
needsReset, err := r.RequiresReset()
if err != nil {
reasons = append(reasons, fmt.Sprintf("nvidia-smi GPU %s failed to determine if it needs reset: %s", r.ID, err.Error()))
continue

Check warning on line 187 in components/accelerator/nvidia/remapped-rows/component_output.go

View check run for this annotation

Codecov / codecov/patch

components/accelerator/nvidia/remapped-rows/component_output.go#L186-L187

Added lines #L186 - L187 were not covered by tests
}
if needsReset {
healthy = false
reasons = append(reasons, fmt.Sprintf("nvidia-smi GPU %s needs reset (pending remapping %v)", r.ID, needsReset))
}
}
}

for _, r := range o.RemappedRowsNVML {
if r.QualifiesForRMA() {
healthy = false
reasons = append(reasons, fmt.Sprintf("nvml GPU %s qualifies for RMA (remapping failure occurred %v, remapped due to uncorrectable errors %d)", r.UUID, r.RemappingFailed, r.RemappedDueToUncorrectableErrors))
}
if r.RequiresReset() {
healthy = false
reasons = append(reasons, fmt.Sprintf("nvml GPU %s needs reset (pending remapping %v)", r.UUID, r.RemappingPending))
for _, r := range o.RemappedRowsNVML {
if r.QualifiesForRMA() {
healthy = false
reasons = append(reasons, fmt.Sprintf("nvml GPU %s qualifies for RMA (remapping failure occurred %v, remapped due to uncorrectable errors %d)", r.UUID, r.RemappingFailed, r.RemappedDueToUncorrectableErrors))
}
if r.RequiresReset() {
healthy = false
reasons = append(reasons, fmt.Sprintf("nvml GPU %s needs reset (pending remapping %v)", r.UUID, r.RemappingPending))
}
}
}

if len(reasons) == 0 {
reasons = append(reasons, "no issue detected")
}

// regardless of the healthy-ness, we want to log the product name
// so that we can identify which product name does not support row remapping
if !o.MemoryErrorManagementCapabilities.RowRemapping {
reasons = append(reasons, fmt.Sprintf("GPU product name %q does not support row remapping (message: %q)", o.GPUProductName, o.MemoryErrorManagementCapabilities.Message))
if len(reasons) == 0 {
reasons = append(reasons, "no issue detected")
}
}

reason := strings.Join(reasons, ", ")

return reason, healthy, nil
}

Expand Down
Loading