Skip to content

Commit

Permalink
use nvml
Browse files Browse the repository at this point in the history
Signed-off-by: Gyuho Lee <[email protected]>
  • Loading branch information
gyuho committed Feb 4, 2025
1 parent af902d8 commit a67ace7
Show file tree
Hide file tree
Showing 3 changed files with 299 additions and 56 deletions.
12 changes: 12 additions & 0 deletions components/accelerator/nvidia/query/query.go
Original file line number Diff line number Diff line change
Expand Up @@ -539,6 +539,18 @@ func (o *Output) PrintInfo(opts ...OpOption) {
}
}

if dev.RemappedRows.Supported {
fmt.Printf("%s NVML remapped rows supported\n", checkMark)
if dev.RemappedRows.RequiresReset() {
fmt.Printf("%s NVML found that the GPU needs a reset\n", warningSign)
}
if dev.RemappedRows.QualifiesForRMA() {
fmt.Printf("%s NVML found that the GPU qualifies for RMA\n", warningSign)
}
} else {
fmt.Printf("%s NVML remapped rows are not supported\n", warningSign)
}

Check warning on line 552 in components/accelerator/nvidia/query/query.go

View check run for this annotation

Codecov / codecov/patch

components/accelerator/nvidia/query/query.go#L542-L552

Added lines #L542 - L552 were not covered by tests

uncorrectedErrs := dev.ECCErrors.Volatile.FindUncorrectedErrs()
if len(uncorrectedErrs) > 0 {
fmt.Printf("%s NVML found %d ecc volatile uncorrected error(s)\n", warningSign, len(uncorrectedErrs))
Expand Down
11 changes: 10 additions & 1 deletion components/accelerator/nvidia/remapped-rows/component_output.go
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,15 @@ func ParseStatesToOutput(states ...components.State) (*Output, error) {
return nil, errors.New("no state found")
}

// isRowRemappingSupported reports whether every entry in
// o.RemappedRowsNVML has its Supported flag set.
//
// An empty RemappedRowsNVML yields true, since no entry reports the
// feature as unsupported.
func (o *Output) isRowRemappingSupported() bool {
	allSupported := true
	for _, row := range o.RemappedRowsNVML {
		if row.Supported {
			continue
		}
		// A single unsupported entry settles the answer; stop scanning.
		allSupported = false
		break
	}
	return allSupported
}

// Evaluate returns the evaluation reason for the output and whether it is healthy.
func (o *Output) Evaluate() (string, bool, error) {
if o == nil {
Expand All @@ -160,7 +169,7 @@ func (o *Output) Evaluate() (string, bool, error) {
healthy := true
reasons := []string{}

if !o.MemoryErrorManagementCapabilities.RowRemapping {
if !o.isRowRemappingSupported() {
reasons = append(reasons, fmt.Sprintf("GPU product name %q does not support row remapping (message: %q)", o.GPUProductName, o.MemoryErrorManagementCapabilities.Message))
} else {
for _, r := range o.RemappedRowsSMI {
Expand Down
Loading

0 comments on commit a67ace7

Please sign in to comment.