leptonai · gyuho · Feb 6, 2025 · Feb 3, 2025 · Feb 4, 2025 · Feb 6, 2025
diff --git a/components/accelerator/nvidia/query/gpu_memory.go b/components/accelerator/nvidia/query/gpu_memory.go
@@ -2,34 +2,42 @@ package query
 
 import "strings"
 
-// GetMemoryErrorManagementCapabilities returns the GPU memory error management capabilities
+var (
+	// Shared capability sets used by the product lookup table below; ref. https://docs.nvidia.com/deploy/a100-gpu-mem-error-mgmt/index.html#supported-gpus
+	memMgmtCapAllSupported = MemoryErrorManagementCapabilities{
+		ErrorContainment:     true,
+		DynamicPageOfflining: true,
+		RowRemapping:         true,
+	}
+	memMgmtCapOnlyRowRemappingSupported = MemoryErrorManagementCapabilities{
+		RowRemapping: true,
+	}
+	gpuProductToMemMgmtCaps = map[string]MemoryErrorManagementCapabilities{ // keyed by lowercase product name fragment
+		"a100": memMgmtCapAllSupported,
+		"b100": memMgmtCapAllSupported,
+		"b200": memMgmtCapAllSupported,
+		"h100": memMgmtCapAllSupported,
+		"h200": memMgmtCapAllSupported,
+		"a10":  memMgmtCapOnlyRowRemappingSupported, // also a substring of "a100"; the lookup prefers the longest matching key
+	}
+)
+
+// SupportedMemoryMgmtCapsByGPUProduct returns the GPU memory error management capabilities (zero value if no known product matches)
 // based on the GPU product name.
 // ref. https://docs.nvidia.com/deploy/a100-gpu-mem-error-mgmt/index.html#supported-gpus
-func GetMemoryErrorManagementCapabilities(gpuProductName string) MemoryErrorManagementCapabilities {
+func SupportedMemoryMgmtCapsByGPUProduct(gpuProductName string) MemoryErrorManagementCapabilities {
 	p := strings.ToLower(gpuProductName)
-	switch {
-	case strings.Contains(p, "h100"):
-		return MemoryErrorManagementCapabilities{
-			ErrorContainment:     true,
-			DynamicPageOfflining: true,
-			RowRemapping:         true,
+	longestName, memCaps := "", MemoryErrorManagementCapabilities{} // best match so far; stays zero-valued when nothing matches
+	for k, v := range gpuProductToMemMgmtCaps { // map iteration order is random, hence the longest-key comparison below
+		if !strings.Contains(p, k) { // p is the lower-cased product name, so matching is case-insensitive
+			continue
 		}
-
-	case strings.Contains(p, "a100"):
-		return MemoryErrorManagementCapabilities{
-			ErrorContainment:     true,
-			DynamicPageOfflining: true,
-			RowRemapping:         true,
-		}
-
-	case strings.Contains(p, "a10"):
-		return MemoryErrorManagementCapabilities{
-			RowRemapping: true,
+		if len(longestName) < len(k) { // prefer the most specific key, e.g. "a100" over "a10"
+			longestName = k
+			memCaps = v
 		}
-
-	default:
-		return MemoryErrorManagementCapabilities{}
 	}
+	return memCaps
 }
 
 // Contains information about the GPU's memory error management capabilities.

diff --git a/components/accelerator/nvidia/query/gpu_memory_test.go b/components/accelerator/nvidia/query/gpu_memory_test.go
@@ -5,7 +5,7 @@ import (
 	"testing"
 )
 
-func TestGetMemoryErrorManagementCapabilities(t *testing.T) {
+func TestSupportedMemoryMgmtCapsByGPUProduct(t *testing.T) {
 	tests := []struct {
 		name           string
 		gpuProductName string
@@ -50,11 +50,105 @@ func TestGetMemoryErrorManagementCapabilities(t *testing.T) {
 				RowRemapping:         true,
 			},
 		},
+		{
+			name:           "NVIDIA B100",
+			gpuProductName: "NVIDIA B100",
+			expected: MemoryErrorManagementCapabilities{
+				ErrorContainment:     true,
+				DynamicPageOfflining: true,
+				RowRemapping:         true,
+			},
+		},
+		{
+			name:           "NVIDIA B200",
+			gpuProductName: "NVIDIA B200",
+			expected: MemoryErrorManagementCapabilities{
+				ErrorContainment:     true,
+				DynamicPageOfflining: true,
+				RowRemapping:         true,
+			},
+		},
+		{
+			name:           "Mixed case input",
+			gpuProductName: "NvIdIa A100 PCIe",
+			expected: MemoryErrorManagementCapabilities{
+				ErrorContainment:     true,
+				DynamicPageOfflining: true,
+				RowRemapping:         true,
+			},
+		},
+		{
+			name:           "Empty string",
+			gpuProductName: "",
+			expected:       MemoryErrorManagementCapabilities{},
+		},
+		{
+			name:           "NVIDIA T4",
+			gpuProductName: "NVIDIA T4",
+			expected:       MemoryErrorManagementCapabilities{},
+		},
+		{
+			name:           "NVIDIA V100",
+			gpuProductName: "NVIDIA V100",
+			expected:       MemoryErrorManagementCapabilities{},
+		},
+		{
+			name:           "NVIDIA A10G",
+			gpuProductName: "NVIDIA A10G",
+			expected: MemoryErrorManagementCapabilities{
+				RowRemapping: true,
+			},
+		},
+		{
+			name:           "GPU with SXM suffix",
+			gpuProductName: "NVIDIA A100-SXM",
+			expected: MemoryErrorManagementCapabilities{
+				ErrorContainment:     true,
+				DynamicPageOfflining: true,
+				RowRemapping:         true,
+			},
+		},
+		{
+			name:           "GPU with PCIe suffix",
+			gpuProductName: "NVIDIA A100 PCIe",
+			expected: MemoryErrorManagementCapabilities{
+				ErrorContainment:     true,
+				DynamicPageOfflining: true,
+				RowRemapping:         true,
+			},
+		},
+		{
+			name:           "GPU with memory size suffix",
+			gpuProductName: "NVIDIA A100 80GB",
+			expected: MemoryErrorManagementCapabilities{
+				ErrorContainment:     true,
+				DynamicPageOfflining: true,
+				RowRemapping:         true,
+			},
+		},
+		{
+			name:           "Special characters in name",
+			gpuProductName: "NVIDIA-A100_80GB",
+			expected: MemoryErrorManagementCapabilities{
+				ErrorContainment:     true,
+				DynamicPageOfflining: true,
+				RowRemapping:         true,
+			},
+		},
+		{
+			name:           "Non-NVIDIA prefix",
+			gpuProductName: "Some A100 GPU",
+			expected: MemoryErrorManagementCapabilities{
+				ErrorContainment:     true,
+				DynamicPageOfflining: true,
+				RowRemapping:         true,
+			},
+		},
 	}
 
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
-			result := GetMemoryErrorManagementCapabilities(tt.gpuProductName)
+			result := SupportedMemoryMgmtCapsByGPUProduct(tt.gpuProductName)
 			if !reflect.DeepEqual(result, tt.expected) {
 				t.Errorf("GetGPUMemoryErrorManagement(%q) = %v, want %v", tt.gpuProductName, result, tt.expected)
 			}

diff --git a/components/accelerator/nvidia/query/nvml/remapped_rows.go b/components/accelerator/nvidia/query/nvml/remapped_rows.go
@@ -36,6 +36,8 @@
 	RemappingFailed bool `json:"remapping_failed"`
 
 	// Supported is true if the remapped rows are supported by the device.
+	// NOTE: "GetRemappedRows" returns no error even for "NVIDIA GeForce RTX 4090",
+	// so "Supported" alone is not a reliable way to check if row remapping is supported.
 	Supported bool `json:"supported"`
 }
 
@@ -48,6 +50,8 @@
 	// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g055e7c34f7f15b6ae9aac1dabd60870d
 	corrRows, uncRows, isPending, failureOccurred, ret := dev.GetRemappedRows()
 	if IsNotSupportError(ret) {
+		// "GetRemappedRows" returns no error even for "NVIDIA GeForce RTX 4090",
+		// thus "Supported" is not a reliable way to check if row remapping is supported
 		remRws.Supported = false
 		return remRws, nil
 	}

diff --git a/components/accelerator/nvidia/query/query.go b/components/accelerator/nvidia/query/query.go
@@ -252,7 +252,7 @@
 
 	productName := o.GPUProductName()
 	if productName != "" {
-		o.MemoryErrorManagementCapabilities = GetMemoryErrorManagementCapabilities(o.GPUProductName())
+		o.MemoryErrorManagementCapabilities = SupportedMemoryMgmtCapsByGPUProduct(o.GPUProductName())
 	} else {
 		log.Logger.Warnw("no gpu product name found -- skipping evaluating memory error management capabilities")
 	}
@@ -537,6 +537,18 @@
 				}
 			}
 
+			if dev.RemappedRows.Supported {
+				fmt.Printf("%s NVML remapped rows supported\n", checkMark)
+				if dev.RemappedRows.RequiresReset() {
+					fmt.Printf("%s NVML found that the GPU needs a reset\n", warningSign)
+				}
+				if dev.RemappedRows.QualifiesForRMA() {
+					fmt.Printf("%s NVML found that the GPU qualifies for RMA\n", warningSign)
+				}
+			} else {
+				fmt.Printf("%s NVML remapped rows are not supported\n", warningSign)
+			}
+
 			uncorrectedErrs := dev.ECCErrors.Volatile.FindUncorrectedErrs()
 			if len(uncorrectedErrs) > 0 {
 				fmt.Printf("%s NVML found %d ecc volatile uncorrected error(s)\n", warningSign, len(uncorrectedErrs))

diff --git a/components/accelerator/nvidia/remapped-rows/component_output.go b/components/accelerator/nvidia/remapped-rows/component_output.go
@@ -151,6 +151,12 @@
 	return nil, errors.New("no state found")
 }
 
+func (o *Output) isRowRemappingSupported() bool {
+	// Decide from "MemoryErrorManagementCapabilities" rather than NVML: "GetRemappedRows" returns no error
+	// even for "NVIDIA GeForce RTX 4090", thus "RemappedRowsNVML.Supported" is not a reliable way to check if row remapping is supported
+	return o.MemoryErrorManagementCapabilities.RowRemapping
+}
+
 // Returns the output evaluation reason and its healthy-ness.
 func (o *Output) Evaluate() (string, bool, error) {
 	if o == nil {
@@ -160,53 +166,49 @@
 	healthy := true
 	reasons := []string{}
 
-	for _, r := range o.RemappedRowsSMI {
-		rma, err := r.QualifiesForRMA()
-		if err != nil {
-			healthy = false
-			reasons = append(reasons, fmt.Sprintf("nvidia-smi GPU %s failed to determine if it qualifies for RMA: %s", r.ID, err.Error()))
-			continue
-		}
-		if rma {
-			healthy = false
-			reasons = append(reasons, fmt.Sprintf("nvidia-smi GPU %s qualifies for RMA (remapping failure occurred %v, remapped due to uncorrectable errors %s)", r.ID, r.RemappingFailed, r.RemappedDueToUncorrectableErrors))
-		}
+	if !o.isRowRemappingSupported() {
+		reasons = append(reasons, fmt.Sprintf("GPU product name %q does not support row remapping (message: %q)", o.GPUProductName, o.MemoryErrorManagementCapabilities.Message))
+	} else {
+		for _, r := range o.RemappedRowsSMI {
+			rma, err := r.QualifiesForRMA()
+			if err != nil {
+				healthy = false
+				reasons = append(reasons, fmt.Sprintf("nvidia-smi GPU %s failed to determine if it qualifies for RMA: %s", r.ID, err.Error()))
+				continue
+			}
+			if rma {
+				healthy = false
+				reasons = append(reasons, fmt.Sprintf("nvidia-smi GPU %s qualifies for RMA (remapping failure occurred %v, remapped due to uncorrectable errors %s)", r.ID, r.RemappingFailed, r.RemappedDueToUncorrectableErrors))
+			}
 
-		needsReset, err := r.RequiresReset()
-		if err != nil {
-			healthy = false
-			reasons = append(reasons, fmt.Sprintf("nvidia-smi GPU %s failed to determine if it needs reset: %s", r.ID, err.Error()))
-			continue
-		}
-		if needsReset {
-			healthy = false
-			reasons = append(reasons, fmt.Sprintf("nvidia-smi GPU %s needs reset (pending remapping %v)", r.ID, needsReset))
+			needsReset, err := r.RequiresReset()
+			if err != nil {
+				reasons = append(reasons, fmt.Sprintf("nvidia-smi GPU %s failed to determine if it needs reset: %s", r.ID, err.Error()))
+				continue
+			}
+			if needsReset {
+				healthy = false
+				reasons = append(reasons, fmt.Sprintf("nvidia-smi GPU %s needs reset (pending remapping %v)", r.ID, needsReset))
+			}
 		}
-	}
 
-	for _, r := range o.RemappedRowsNVML {
-		if r.QualifiesForRMA() {
-			healthy = false
-			reasons = append(reasons, fmt.Sprintf("nvml GPU %s qualifies for RMA (remapping failure occurred %v, remapped due to uncorrectable errors %d)", r.UUID, r.RemappingFailed, r.RemappedDueToUncorrectableErrors))
-		}
-		if r.RequiresReset() {
-			healthy = false
-			reasons = append(reasons, fmt.Sprintf("nvml GPU %s needs reset (pending remapping %v)", r.UUID, r.RemappingPending))
+		for _, r := range o.RemappedRowsNVML {
+			if r.QualifiesForRMA() {
+				healthy = false
+				reasons = append(reasons, fmt.Sprintf("nvml GPU %s qualifies for RMA (remapping failure occurred %v, remapped due to uncorrectable errors %d)", r.UUID, r.RemappingFailed, r.RemappedDueToUncorrectableErrors))
+			}
+			if r.RequiresReset() {
+				healthy = false
+				reasons = append(reasons, fmt.Sprintf("nvml GPU %s needs reset (pending remapping %v)", r.UUID, r.RemappingPending))
+			}
 		}
-	}
-
-	if len(reasons) == 0 {
-		reasons = append(reasons, "no issue detected")
-	}
 
-	// regardless of the healthy-ness, we want to log the product name
-	// so that we can identify which product name does not support row remapping
-	if !o.MemoryErrorManagementCapabilities.RowRemapping {
-		reasons = append(reasons, fmt.Sprintf("GPU product name %q does not support row remapping (message: %q)", o.GPUProductName, o.MemoryErrorManagementCapabilities.Message))
+		if len(reasons) == 0 {
+			reasons = append(reasons, "no issue detected")
+		}
 	}
 
 	reason := strings.Join(reasons, ", ")
-
 	return reason, healthy, nil
 }