Skip to content

Commit

Permalink
use longest match
Browse files Browse the repository at this point in the history
Signed-off-by: Gyuho Lee <[email protected]>
  • Loading branch information
gyuho committed Feb 6, 2025
1 parent 43592dd commit f7f807c
Showing 1 changed file with 21 additions and 12 deletions.
33 changes: 21 additions & 12 deletions components/accelerator/nvidia/query/gpu_memory.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,36 +3,45 @@ package query
import "strings"

// Capability presets and GPU product-name lookup tables for memory error
// management.
// ref. https://docs.nvidia.com/deploy/a100-gpu-mem-error-mgmt/index.html#supported-gpus
var (
	// memMgmtCapOnlyRowRemappingSupported is the preset with row remapping
	// enabled and every other capability left at its zero value.
	memMgmtCapOnlyRowRemappingSupported = MemoryErrorManagementCapabilities{RowRemapping: true}

	// memMgmtCapAllSupported is the preset with error containment, dynamic
	// page offlining and row remapping all enabled.
	memMgmtCapAllSupported = MemoryErrorManagementCapabilities{
		RowRemapping:         true,
		DynamicPageOfflining: true,
		ErrorContainment:     true,
	}

	// gpuProductsWithOnlyRowRemappingSupported holds lowercase product-name
	// substrings paired with memMgmtCapOnlyRowRemappingSupported.
	gpuProductsWithOnlyRowRemappingSupported = []string{"a10"}

	// gpuProductsWithAllSupported holds lowercase product-name substrings
	// paired with memMgmtCapAllSupported.
	gpuProductsWithAllSupported = []string{"a100", "b100", "b200", "h100", "h200"}

	// gpuProductToMemMgmtCaps maps a lowercase product-name substring to its
	// capability preset. Note that "a10" is itself a substring of "a100", so
	// a plain first-substring-hit lookup over these keys is ambiguous.
	gpuProductToMemMgmtCaps = map[string]MemoryErrorManagementCapabilities{
		"a10":  memMgmtCapOnlyRowRemappingSupported,
		"a100": memMgmtCapAllSupported,
		"b100": memMgmtCapAllSupported,
		"b200": memMgmtCapAllSupported,
		"h100": memMgmtCapAllSupported,
		"h200": memMgmtCapAllSupported,
	}
)

// SupportedMemoryMgmtCapsByGPUProduct returns the GPU memory error management capabilities
// based on the GPU product name.
// ref. https://docs.nvidia.com/deploy/a100-gpu-mem-error-mgmt/index.html#supported-gpus
func SupportedMemoryMgmtCapsByGPUProduct(gpuProductName string) MemoryErrorManagementCapabilities {
p := strings.ToLower(gpuProductName)
for _, s := range gpuProductsWithAllSupported {
if strings.Contains(p, s) {
return memMgmtCapAllSupported

longestName, memCaps := "", MemoryErrorManagementCapabilities{}
for k, v := range gpuProductToMemMgmtCaps {
if !strings.Contains(p, k) {
continue
}
}
for _, s := range gpuProductsWithOnlyRowRemappingSupported {
if strings.Contains(p, s) {
return memMgmtCapOnlyRowRemappingSupported
if len(longestName) < len(k) {
longestName = k
memCaps = v
}
}
return MemoryErrorManagementCapabilities{}
if longestName == "" {
return MemoryErrorManagementCapabilities{}
}
return memCaps
}

// Contains information about the GPU's memory error management capabilities.
Expand Down

0 comments on commit f7f807c

Please sign in to comment.