Merge pull request #11 from ionet-official/cloud-2694-implement-cuda-…

…self-check-binary Implement CUDA self-check binary
ionet-official · May 29, 2024 · 764c83d · 764c83d
2 parents 9ffb466 + 1deb1a5
commit 764c83d
Show file tree

Hide file tree

Showing 7 changed files with 157 additions and 0 deletions.
diff --git a/self-check/.gitignore b/self-check/.gitignore
@@ -0,0 +1 @@
+build*
diff --git a/self-check/Dockerfile b/self-check/Dockerfile
@@ -0,0 +1,10 @@
+# Build environment for the self-check binary: Ubuntu 18.04 plus the CUDA 11.8
+# toolkit and a host C/C++ toolchain.
+FROM ubuntu:18.04
+
+# Single RUN layer:
+#   1. install wget and use it to download NVIDIA's cuda-keyring package,
+#   2. install that package with dpkg (presumably this is what makes the NVIDIA
+#      apt repository usable - confirm against NVIDIA's install guide),
+#   3. refresh the package index and install cuda-toolkit-11-8 and build-essential,
+#   4. remove apt caches and package lists from the layer.
+RUN apt-get update && apt-get install -y wget && \
+    wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb -O /tmp/cuda-keyring_1.0-1_all.deb && \
+    dpkg -i /tmp/cuda-keyring_1.0-1_all.deb && \
+    apt-get update && \
+    apt-get -y install cuda-toolkit-11-8 && \
+    apt-get install -y build-essential && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
diff --git a/self-check/Makefile b/self-check/Makefile
@@ -0,0 +1,39 @@
+# Build script for the CUDA self-check binary.
+#
+# CUDA_ROOT is the CUDA toolkit prefix; its bin/ directory is prepended to PATH so
+# that the `nvcc` invoked by the recipes below is resolved from it.
+CUDA_ROOT := /usr/local/cuda-11
+export PATH := $(CUDA_ROOT)/bin:$(PATH)
+# Output directory. Can be overridden on the make command line (the `docker`
+# target passes BUILD_DIR=build-docker).
+BUILD_DIR := build
+# Stamp file inside BUILD_DIR; rules depend on it so the directory exists before
+# any object file is written.
+MARKER := $(BUILD_DIR)/.marker
+
+# Default target: the linked self-check executable.
+all: $(BUILD_DIR)/self-check
+
+# Create the build directory and its stamp file.
+$(MARKER):
+	mkdir $(BUILD_DIR) -p
+	touch $@
+
+
+# Compile checks.cu with nvcc for sm_61 as relocatable device code (--device-c).
+$(BUILD_DIR)/checks.cu.o: checks.cu checks.cuh $(MARKER)
+	nvcc -arch=sm_61 --device-c -O3 $< -c -o $@
+
+# Device-link step over the object produced above.
+$(BUILD_DIR)/checks.o: $(BUILD_DIR)/checks.cu.o
+	nvcc -arch=sm_61 --device-link -o $@ $^
+
+# Compile the host-only entry point with g++. AVX and AES code generation are
+# explicitly disabled (presumably so the binary runs on CPUs lacking those
+# extensions - confirm with the authors).
+$(BUILD_DIR)/main.o: main.cpp checks.cuh $(MARKER)
+	g++ -O3 -march=corei7-avx -mtune=corei7-avx -mno-avx -mno-aes $< -c -o $@
+
+# Link host object, device-link object and CUDA object against the static CUDA
+# runtime (cudart_static), then strip the resulting executable.
+$(BUILD_DIR)/self-check: $(BUILD_DIR)/main.o $(BUILD_DIR)/checks.o $(BUILD_DIR)/checks.cu.o
+	g++ $^ -o $@ -L$(CUDA_ROOT)/lib64 -lcudart_static -ldl -lrt -pthread
+	strip $@
+
+# Remove the whole build directory.
+clean:
+	rm -rf $(BUILD_DIR)
+
+# Build (if needed) and execute the self-check binary.
+run: $(BUILD_DIR)/self-check
+	$<
+
+# Containerised build and smoke run:
+#   1. build the image described by Dockerfile as `self-check-build`,
+#   2. inside it, as the invoking user's uid, run `make clean all` on the mounted
+#      source tree with outputs going to build-docker,
+#   3. run the produced binary as the entrypoint of a different image with all
+#      GPUs exposed (presumably to check it works without the toolkit installed).
+docker: Dockerfile
+	docker build . -t self-check-build
+	docker run --volume $(CURDIR):/checker --user $(shell id -u) self-check-build make BUILD_DIR=build-docker -C /checker clean all
+	docker run -it --volume $(CURDIR):/checker --gpus all --entrypoint /checker/build-docker/self-check brunneis/python:3.9.0-ubuntu
+
+# These targets do not name files.
+.PHONY: all clean docker run
+
+#.SILENT:
diff --git a/self-check/README.md b/self-check/README.md
@@ -0,0 +1,20 @@
+# IO Net CUDA Self-Check Binary
+
+This is intentionally released in source form for transparency.
+
+To run the check, get the binary from the Releases page and run it on the Linux **host** (if your worker is Linux-based) or in the WSL2 instance used to run our Launcher (if your worker is Windows-based).
+
+It should perform simple CUDA checks and report the results.
+
+## Example of good output
+
+```
+    Reported 1 CUDA devices
+    Device #0: name=NVIDIA GeForce RTX 3080: memory alloc test pass
+    all cards look ok
+```
+
+## Example of output when some issues are found
+```
+    Cannot get device count: cuda error=35 - CUDA driver version is insufficient for CUDA runtime version
+```
diff --git a/self-check/checks.cu b/self-check/checks.cu
@@ -0,0 +1,45 @@
+#include "checks.cuh"
+#include <stdio.h>
+
+// Checks a CUDA runtime status code.
+//
+// If `err` is anything other than cudaSuccess, prints
+//   "<msg>: cuda error=<numeric code> - <cudaGetErrorString text>"
+// to stderr and makes the ENCLOSING FUNCTION return -1, so the macro may only be
+// used inside functions whose return type is int.
+//
+// `err` is evaluated exactly once, and the do { } while (0) wrapper makes the
+// expansion a single statement, so `CUDA_ERROR_CHECK(...);` is safe inside an
+// unbraced if/else.
+#define CUDA_ERROR_CHECK(err, msg) do {                                            \
+    const cudaError_t cuda_check_status_ = (err);                                  \
+    if (cuda_check_status_ != cudaSuccess) {                                       \
+        fprintf(stderr, "%s: cuda error=%d - %s\n", (msg),                         \
+                (int)cuda_check_status_, cudaGetErrorString(cuda_check_status_));  \
+        return -1;                                                                 \
+    }                                                                              \
+} while (0)
+
+// Asks the CUDA runtime how many devices it reports.
+// Returns that count on success. If cudaGetDeviceCount fails, CUDA_ERROR_CHECK
+// prints the error to stderr and this function returns -1.
+int get_devices_count() {
+    int count;
+    const cudaError_t status = cudaGetDeviceCount(&count);
+    CUDA_ERROR_CHECK(status, "Cannot get device count");
+    return count;
+}
+
+// Looks up the name of CUDA device `device`.
+//
+// On success stores a pointer to a NUL-terminated name string in *result and
+// returns 0. The string lives in static storage owned by this function: the
+// caller must not free it, and it is overwritten by the next call (so this
+// function is not safe to call concurrently from several threads).
+//
+// Returns -2 if `result` is null, and -1 (after printing the CUDA error to
+// stderr) if cudaGetDeviceProperties fails; *result is left untouched then.
+int get_device_name(int device, char** result) {
+    if (result == nullptr) return -2;
+    // Static on purpose: a local cudaDeviceProp would be destroyed on return and
+    // the pointer handed back through *result would dangle.
+    static cudaDeviceProp prop;
+    auto err = cudaGetDeviceProperties(&prop, device);
+    CUDA_ERROR_CHECK(err, "Cannot get device properties");
+    *result = prop.name;
+    return 0;
+}
+
+// Makes `device` the active CUDA device and allocates a 1024-byte buffer on it.
+//
+// On success the buffer address is written to *result and 0 is returned.
+// Returns -2 when `result` is null. Returns -1 (the error is printed to stderr
+// by CUDA_ERROR_CHECK) when cudaSetDevice or cudaMalloc fails; *result is not
+// written in that case.
+int device_malloc(int device, void** result) {
+    if (!result) {
+        return -2;
+    }
+
+    cudaError_t status = cudaSetDevice(device);
+    CUDA_ERROR_CHECK(status, "Cannot set active device");
+
+    void* buffer = nullptr;
+    status = cudaMalloc(&buffer, 1024);
+    CUDA_ERROR_CHECK(status, "Cannot allocate memory");
+
+    *result = buffer;
+    return 0;
+}
+
+// Makes `device` the active CUDA device and releases `ptr` with cudaFree.
+//
+// Returns 0 on success, -2 when `ptr` is null, and -1 (the error is printed to
+// stderr by CUDA_ERROR_CHECK) when cudaSetDevice or cudaFree fails.
+int device_free(int device, void* ptr) {
+    if (!ptr) {
+        return -2;
+    }
+
+    cudaError_t status = cudaSetDevice(device);
+    CUDA_ERROR_CHECK(status, "Cannot set active device");
+
+    status = cudaFree(ptr);
+    CUDA_ERROR_CHECK(status, "Cannot free memory");
+
+    return 0;
+}
diff --git a/self-check/checks.cuh b/self-check/checks.cuh
@@ -0,0 +1,6 @@
+#pragma once
+
+// Host-callable wrappers around CUDA runtime calls, implemented in checks.cu.
+// Return-code convention used by the implementations: 0 means success, -1 means
+// a CUDA runtime call failed (the error is printed to stderr), and -2 means a
+// required pointer argument was null.
+
+// Returns the number of CUDA devices reported by the runtime, or -1 on a CUDA error.
+int get_devices_count();
+// Writes a pointer to the name string of `device` into *result.
+int get_device_name(int device, char** result);
+// Selects `device` and allocates a 1024-byte device buffer, storing its address in *result.
+int device_malloc(int device, void** result);
+// Selects `device` and releases `ptr` with cudaFree; a null `ptr` is rejected with -2.
+int device_free(int device, void* ptr);
diff --git a/self-check/main.cpp b/self-check/main.cpp
@@ -0,0 +1,36 @@
+#include <stdio.h>
+#include "checks.cuh"
+
+// Runs the per-device checks for one device and prints its report line:
+// name lookup, then a device allocation followed by a free.
+// Returns true only when every step succeeded.
+static bool check_device(int index) {
+    char* label;
+    if (get_device_name(index, &label) < 0) {
+        printf("Cannot get device name for #%d\n", index);
+        return false;
+    }
+    // No newline here: the verdict of the memory test is appended to this line.
+    printf("Device #%d: name=%s: ", index, label);
+
+    void* block;
+    if (device_malloc(index, &block) < 0) {
+        printf("cannot allocate memory on device #%d\n", index);
+        return false;
+    }
+    if (device_free(index, block) < 0) {
+        printf("cannot free memory on device #%d\n", index);
+        return false;
+    }
+
+    printf("memory alloc test pass\n");
+    return true;
+}
+
+// Entry point of the self-check tool.
+//
+// Exit status: 2 when no CUDA device can be detected (including the case where
+// the device count query itself fails), 1 when at least one device fails a
+// check, 0 when every device passes.
+int main() {
+    const int total = get_devices_count();
+    if (total < 1) {
+        printf("Cannot detect any CUDA devices\n");
+        return 2;
+    }
+    printf("Reported %d CUDA devices\n", total);
+
+    // Every device is checked even after a failure, so the report is complete.
+    bool all_passed = true;
+    for (int index = 0; index < total; ++index) {
+        if (!check_device(index)) {
+            all_passed = false;
+        }
+    }
+
+    if (all_passed) {
+        printf("all cards look ok\n");
+        return 0;
+    }
+    printf("some cards failed check\n");
+    return 1;
+}