-
Notifications
You must be signed in to change notification settings - Fork 35
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #11 from ionet-official/cloud-2694-implement-cuda-…
…self-check-binary Implement CUDA self-check binary
- Loading branch information
Showing
7 changed files
with
157 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
build* |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
# Build image for the CUDA self-check binary: Ubuntu 18.04 with the CUDA 11.8
# toolkit (from NVIDIA's apt repository) and the host C/C++ toolchain.
FROM ubuntu:18.04

# Everything runs in a single layer so the apt caches and the downloaded keyring
# package removed at the end do not persist in the image.
# DEBIAN_FRONTEND is exported only for this RUN so that package configuration can
# never block the unattended build on an interactive prompt.
RUN export DEBIAN_FRONTEND=noninteractive && \
    apt-get update && apt-get install -y wget && \
    wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb -O /tmp/cuda-keyring_1.0-1_all.deb && \
    dpkg -i /tmp/cuda-keyring_1.0-1_all.deb && \
    rm -f /tmp/cuda-keyring_1.0-1_all.deb && \
    apt-get update && \
    apt-get -y install cuda-toolkit-11-8 && \
    apt-get install -y build-essential && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
# CUDA toolkit location; its bin directory is prepended to PATH so that the
# recipes below can invoke nvcc by name.
CUDA_ROOT := /usr/local/cuda-11
export PATH := $(CUDA_ROOT)/bin:$(PATH)

# Output directory. The docker target overrides it with BUILD_DIR=build-docker.
BUILD_DIR := build
# Stamp file; its recipe creates BUILD_DIR before anything is compiled into it.
MARKER := $(BUILD_DIR)/.marker

# Flag sets shared between recipes.
NVCC_ARCH := -arch=sm_61
HOST_CXXFLAGS := -O3 -march=corei7-avx -mtune=corei7-avx -mno-avx -mno-aes
CUDART_LIBS := -L$(CUDA_ROOT)/lib64 -lcudart_static -ldl -lrt -pthread

# Build products.
DEVICE_OBJ := $(BUILD_DIR)/checks.cu.o
DLINK_OBJ := $(BUILD_DIR)/checks.o
HOST_OBJ := $(BUILD_DIR)/main.o
BINARY := $(BUILD_DIR)/self-check

all: $(BINARY)

$(MARKER):
	mkdir $(BUILD_DIR) -p
	touch $@

# Compile checks.cu with nvcc in --device-c mode.
$(DEVICE_OBJ): checks.cu checks.cuh $(MARKER)
	nvcc $(NVCC_ARCH) --device-c -O3 $< -c -o $@

# nvcc --device-link step over the object produced above.
$(DLINK_OBJ): $(DEVICE_OBJ)
	nvcc $(NVCC_ARCH) --device-link -o $@ $^

# Host-side entry point, compiled with g++.
$(HOST_OBJ): main.cpp checks.cuh $(MARKER)
	g++ $(HOST_CXXFLAGS) $< -c -o $@

# Final link against the static CUDA runtime, then strip the binary.
$(BINARY): $(HOST_OBJ) $(DLINK_OBJ) $(DEVICE_OBJ)
	g++ $^ -o $@ $(CUDART_LIBS)
	strip $@

clean:
	rm -rf $(BUILD_DIR)

run: $(BINARY)
	$<

# Build inside the self-check-build image (as the invoking user, into
# build-docker), then run the resulting binary in a separate image with GPUs
# attached.
docker: Dockerfile
	docker build . -t self-check-build
	docker run --volume $(CURDIR):/checker --user $(shell id -u) self-check-build make BUILD_DIR=build-docker -C /checker clean all
	docker run -it --volume $(CURDIR):/checker --gpus all --entrypoint /checker/build-docker/self-check brunneis/python:3.9.0-ubuntu

.PHONY: all clean docker run

#.SILENT:
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
# IO Net CUDA Self-Check Binary | ||
|
||
This is intentionally released in source form for transparency. | ||
|
||
To run the check, get the binary from the Releases and run it on the Linux **host** (if your worker is Linux-based) or in the WSL2 instance used to run our Launcher (if Windows-based).
|
||
It should perform simple CUDA checks and report the results. | ||
|
||
## Example of good output | ||
|
||
``` | ||
Reported 1 CUDA devices | ||
Device #0: name=NVIDIA GeForce RTX 3080: memory alloc test pass | ||
all cards look ok | ||
``` | ||
|
||
## Example of output when some issues are found | ||
``` | ||
Cannot get device count: cuda error=35 - CUDA driver version is insufficient for CUDA runtime version | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
#include "checks.cuh" | ||
#include <stdio.h> | ||
|
||
// Checks a CUDA runtime status from inside a function that returns int.
//
//   err - status expression; evaluated exactly once.
//   msg - C string printed as the prefix of the diagnostic.
//
// When the status differs from cudaSuccess, a line of the form
// "<msg>: cuda error=<code> - <description>" is written to stderr and the
// ENCLOSING function returns -1. Otherwise execution continues.
//
// The do { ... } while (0) wrapper makes the macro behave as one statement, so
// `if (c) CUDA_ERROR_CHECK(e, m); else ...` parses as intended.
#define CUDA_ERROR_CHECK(err, msg)                                          \
    do {                                                                    \
        const auto cuda_status_ = (err);                                    \
        if (cuda_status_ != cudaSuccess) {                                  \
            fprintf(stderr, "%s: cuda error=%d - %s\n", (msg),              \
                    (int)cuda_status_, cudaGetErrorString(cuda_status_));   \
            return -1;                                                      \
        }                                                                   \
    } while (0)
|
||
// Asks the CUDA runtime how many devices it reports.
//
// Returns the reported count on success. If the query fails, the error is
// logged to stderr by CUDA_ERROR_CHECK and -1 is returned.
int get_devices_count() {
    int device_total;
    const auto query_status = cudaGetDeviceCount(&device_total);
    CUDA_ERROR_CHECK(query_status, "Cannot get device count");
    return device_total;
}
|
||
// Looks up the name of CUDA device `device`.
//
//   device - device index passed to cudaGetDeviceProperties.
//   result - out-parameter; on success receives a pointer to the device name.
//
// Returns 0 on success, -2 if `result` is null, and -1 (after logging to stderr
// via CUDA_ERROR_CHECK) if the properties query fails.
//
// The name is the `name` field of a function-static cudaDeviceProp, so the
// returned pointer remains valid after this function returns. It is overwritten
// by the next call to get_device_name, so copy the string if it must outlive
// that. Because the storage is shared, concurrent calls are not safe.
int get_device_name(int device, char** result) {
    if (result == nullptr) return -2;
    // Static storage: a stack-local struct here would leave *result dangling
    // as soon as the function returned.
    static cudaDeviceProp prop;
    auto err = cudaGetDeviceProperties(&prop, device);
    CUDA_ERROR_CHECK(err, "Cannot get device properties");
    *result = prop.name;
    return 0;
}
|
||
// Makes `device` the active CUDA device and allocates a 1024-byte buffer on it
// with cudaMalloc.
//
//   device - device index passed to cudaSetDevice.
//   result - out-parameter; written only on success, with the allocated pointer.
//
// Returns 0 on success, -2 if `result` is null, and -1 (after logging to stderr
// via CUDA_ERROR_CHECK) if selecting the device or allocating fails.
int device_malloc(int device, void** result) {
    if (!result) {
        return -2;
    }

    const auto select_status = cudaSetDevice(device);
    CUDA_ERROR_CHECK(select_status, "Cannot set active device");

    void* buffer = nullptr;
    const auto alloc_status = cudaMalloc(&buffer, 1024);
    CUDA_ERROR_CHECK(alloc_status, "Cannot allocate memory");

    *result = buffer;
    return 0;
}
|
||
// Makes `device` the active CUDA device and releases `ptr` with cudaFree.
//
//   device - device index passed to cudaSetDevice.
//   ptr    - pointer to release; a null pointer is rejected.
//
// Returns 0 on success, -2 if `ptr` is null, and -1 (after logging to stderr
// via CUDA_ERROR_CHECK) if selecting the device or freeing fails.
int device_free(int device, void* ptr) {
    if (!ptr) {
        return -2;
    }

    const auto select_status = cudaSetDevice(device);
    CUDA_ERROR_CHECK(select_status, "Cannot set active device");

    const auto free_status = cudaFree(ptr);
    CUDA_ERROR_CHECK(free_status, "Cannot free memory");

    return 0;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
#pragma once | ||
|
||
int get_devices_count(); | ||
int get_device_name(int device, char** result); | ||
int device_malloc(int device, void** result); | ||
int device_free(int device, void* ptr); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
#include <stdio.h> | ||
#include "checks.cuh" | ||
|
||
int main() { | ||
int devices = get_devices_count(); | ||
if (devices < 1) { | ||
printf("Cannot detect any CUDA devices\n"); | ||
return 2; | ||
} | ||
printf("Reported %d CUDA devices\n", devices); | ||
|
||
bool okay = true; | ||
for (int device = 0; device < devices; device++) { | ||
char* name; | ||
if (get_device_name(device, &name) < 0) { | ||
printf("Cannot get device name for #%d\n", device); | ||
okay = false; | ||
continue; | ||
} | ||
printf("Device #%d: name=%s: ", device, name); | ||
void* ptr; | ||
if (device_malloc(device, &ptr) < 0) { | ||
printf("cannot allocate memory on device #%d\n", device); | ||
okay = false; | ||
continue; | ||
} | ||
if (device_free(device, ptr) < 0) { | ||
printf("cannot free memory on device #%d\n", device); | ||
okay = false; | ||
continue; | ||
} | ||
printf("memory alloc test pass\n"); | ||
} | ||
printf(okay ? "all cards look ok\n" : "some cards failed check\n"); | ||
return okay ? 0 : 1; | ||
} |