Skip to content

Commit

Permalink
Merge pull request #11 from ionet-official/cloud-2694-implement-cuda-…
Browse files Browse the repository at this point in the history
…self-check-binary

Implement CUDA self-check binary
  • Loading branch information
vnlitvinov authored May 29, 2024
2 parents 9ffb466 + 1deb1a5 commit 764c83d
Show file tree
Hide file tree
Showing 7 changed files with 157 additions and 0 deletions.
1 change: 1 addition & 0 deletions self-check/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
build*
10 changes: 10 additions & 0 deletions self-check/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Build environment for the self-check binary: Ubuntu 18.04 with the
# CUDA 11.8 toolkit (for nvcc) and build-essential (for g++/make).
FROM ubuntu:18.04

# Single layer: register NVIDIA's apt repository through the cuda-keyring
# package, install the toolkit and host compiler, then drop the apt caches.
RUN apt-get update && apt-get install -y wget && \
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb -O /tmp/cuda-keyring_1.0-1_all.deb && \
dpkg -i /tmp/cuda-keyring_1.0-1_all.deb && \
apt-get update && \
apt-get -y install cuda-toolkit-11-8 && \
apt-get install -y build-essential && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
39 changes: 39 additions & 0 deletions self-check/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Build rules for the CUDA self-check binary.
#
# CUDA_ROOT is the toolkit prefix; its bin/ directory is prepended to PATH so
# that the nvcc invoked below comes from it. BUILD_DIR may be overridden on
# the command line (the docker target uses BUILD_DIR=build-docker).
CUDA_ROOT := /usr/local/cuda-11
export PATH := $(CUDA_ROOT)/bin:$(PATH)
BUILD_DIR := build
# Stamp file: depending on it guarantees BUILD_DIR exists before compiling.
MARKER := $(BUILD_DIR)/.marker

all: $(BUILD_DIR)/self-check

$(MARKER):
mkdir $(BUILD_DIR) -p
touch $@


# Compile the CUDA translation unit with --device-c for sm_61.
$(BUILD_DIR)/checks.cu.o: checks.cu checks.cuh $(MARKER)
nvcc -arch=sm_61 --device-c -O3 $< -c -o $@

# Device-link step over the object produced above.
$(BUILD_DIR)/checks.o: $(BUILD_DIR)/checks.cu.o
nvcc -arch=sm_61 --device-link -o $@ $^

# Host entry point, built with g++ (tuned for corei7-avx, with AVX and AES
# instruction generation disabled).
$(BUILD_DIR)/main.o: main.cpp checks.cuh $(MARKER)
g++ -O3 -march=corei7-avx -mtune=corei7-avx -mno-avx -mno-aes $< -c -o $@

# Final link against the static CUDA runtime (-lcudart_static), then strip
# symbols from the result.
$(BUILD_DIR)/self-check: $(BUILD_DIR)/main.o $(BUILD_DIR)/checks.o $(BUILD_DIR)/checks.cu.o
g++ $^ -o $@ -L$(CUDA_ROOT)/lib64 -lcudart_static -ldl -lrt -pthread
strip $@

clean:
rm -rf $(BUILD_DIR)

# Build if needed, then execute the binary.
run: $(BUILD_DIR)/self-check
$<

# Build the toolchain image, compile inside it into build-docker/ as the
# current user, then run the resulting binary in a separate image with all
# GPUs exposed.
docker: Dockerfile
docker build . -t self-check-build
docker run --volume $(CURDIR):/checker --user $(shell id -u) self-check-build make BUILD_DIR=build-docker -C /checker clean all
docker run -it --volume $(CURDIR):/checker --gpus all --entrypoint /checker/build-docker/self-check brunneis/python:3.9.0-ubuntu

.PHONY: all clean docker run

#.SILENT:
20 changes: 20 additions & 0 deletions self-check/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# IO Net CUDA Self-Check Binary

This is intentionally released in source form for transparency.

To run the check, download the binary from the Releases page and run it on the Linux **host** (if your worker is Linux-based) or in the WSL2 instance used to run our Launcher (if your worker is Windows-based).

It should perform simple CUDA checks and report the results.

## Example of good output

```
Reported 1 CUDA devices
Device #0: name=NVIDIA GeForce RTX 3080: memory alloc test pass
all cards look ok
```

## Example of output when some issues are found
```
Cannot get device count: cuda error=35 - CUDA driver version is insufficient for CUDA runtime version
```
45 changes: 45 additions & 0 deletions self-check/checks.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#include "checks.cuh"
#include <stdio.h>

// Checks a CUDA runtime status code. If it is not cudaSuccess, prints
// "<msg>: cuda error=<code> - <description>" to stderr and makes the
// ENCLOSING function return -1, so it may only be used inside functions
// returning int.
//
//   err - expression of type cudaError_t; evaluated exactly once.
//   msg - C string describing the operation that was attempted.
//
// Wrapped in do { ... } while (0) so that `CUDA_ERROR_CHECK(e, m);` behaves
// as a single statement, including inside an unbraced if/else.
#define CUDA_ERROR_CHECK(err, msg) do { \
    const cudaError_t cuda_error_check_status_ = (err); \
    if (cuda_error_check_status_ != cudaSuccess) { \
        fprintf(stderr, "%s: cuda error=%d - %s\n", (msg), (int)cuda_error_check_status_, cudaGetErrorString(cuda_error_check_status_)); \
        return -1; \
    } \
} while (0)

// Queries the CUDA runtime for the number of devices.
// Returns the reported count, or -1 (after logging to stderr via
// CUDA_ERROR_CHECK) when cudaGetDeviceCount fails.
int get_devices_count() {
    int count;
    const cudaError_t status = cudaGetDeviceCount(&count);
    CUDA_ERROR_CHECK(status, "Cannot get device count");
    return count;
}

// Looks up the name of CUDA device `device`.
//
//   device - device ordinal passed to cudaGetDeviceProperties.
//   result - out-parameter; on success receives a pointer to a
//            NUL-terminated device name.
//
// Returns 0 on success, -2 if `result` is null, and -1 (after logging to
// stderr via CUDA_ERROR_CHECK) if the properties query fails.
//
// The returned pointer refers to storage owned by this function: the caller
// must not free it, and its contents are overwritten by the next call to
// get_device_name. Not safe to call concurrently from multiple threads.
int get_device_name(int device, char** result) {
    if (result == nullptr) return -2;
    // Static storage duration: the name must outlive this call. Handing out
    // the address of a stack-local cudaDeviceProp would leave the caller
    // with a dangling pointer as soon as we return.
    static cudaDeviceProp prop;
    auto err = cudaGetDeviceProperties(&prop, device);
    CUDA_ERROR_CHECK(err, "Cannot get device properties");
    *result = prop.name;
    return 0;
}

// Makes `device` the active CUDA device and allocates a 1024-byte buffer
// on it.
//
// Returns 0 and stores the allocation in *result on success; returns -2 if
// `result` is null, or -1 (after logging to stderr via CUDA_ERROR_CHECK) if
// selecting the device or allocating fails. *result is left untouched on
// failure.
int device_malloc(int device, void** result) {
    if (!result) return -2;

    cudaError_t status = cudaSetDevice(device);
    CUDA_ERROR_CHECK(status, "Cannot set active device");

    // Allocate into a temporary so *result is only written on success.
    void* block = nullptr;
    status = cudaMalloc(&block, 1024);
    CUDA_ERROR_CHECK(status, "Cannot allocate memory");

    *result = block;
    return 0;
}

// Makes `device` the active CUDA device and releases `ptr` with cudaFree.
//
// Returns 0 on success, -2 if `ptr` is null, or -1 (after logging to stderr
// via CUDA_ERROR_CHECK) if selecting the device or freeing fails.
int device_free(int device, void* ptr) {
    if (!ptr) return -2;

    cudaError_t status = cudaSetDevice(device);
    CUDA_ERROR_CHECK(status, "Cannot set active device");

    status = cudaFree(ptr);
    CUDA_ERROR_CHECK(status, "Cannot free memory");

    return 0;
}
6 changes: 6 additions & 0 deletions self-check/checks.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
// Host-callable CUDA self-check helpers, implemented in checks.cu.
// Unless noted otherwise, each function returns 0 on success, -1 when a CUDA
// runtime call fails (the error is logged to stderr), and -2 when a required
// pointer argument is null.
#pragma once

// Returns the number of CUDA devices reported by the runtime, or -1 on error.
int get_devices_count();
// Stores a pointer to the name of device `device` in *result.
int get_device_name(int device, char** result);
// Selects `device` and allocates a small test buffer on it; the allocation
// is stored in *result.
int device_malloc(int device, void** result);
// Selects `device` and frees `ptr`.
int device_free(int device, void* ptr);
36 changes: 36 additions & 0 deletions self-check/main.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#include <stdio.h>
#include "checks.cuh"

// Runs the per-device checks (name lookup, allocation, free) for `device`,
// printing progress and failures to stdout. Returns true when every step
// succeeded.
static bool check_device(int device) {
    char* name;
    if (get_device_name(device, &name) < 0) {
        printf("Cannot get device name for #%d\n", device);
        return false;
    }
    // No trailing newline: the verdict for this device is appended below.
    printf("Device #%d: name=%s: ", device, name);

    void* ptr;
    if (device_malloc(device, &ptr) < 0) {
        printf("cannot allocate memory on device #%d\n", device);
        return false;
    }
    if (device_free(device, ptr) < 0) {
        printf("cannot free memory on device #%d\n", device);
        return false;
    }

    printf("memory alloc test pass\n");
    return true;
}

// Entry point: enumerates CUDA devices and checks each one.
// Exit status: 0 if every device passed, 1 if at least one device failed,
// 2 if no devices were reported (or the device count could not be obtained).
int main() {
    const int devices = get_devices_count();
    if (devices < 1) {
        printf("Cannot detect any CUDA devices\n");
        return 2;
    }
    printf("Reported %d CUDA devices\n", devices);

    bool okay = true;
    for (int device = 0; device < devices; ++device) {
        // Keep going after a failure so every device gets reported.
        if (!check_device(device)) {
            okay = false;
        }
    }

    printf(okay ? "all cards look ok\n" : "some cards failed check\n");
    return okay ? 0 : 1;
}

0 comments on commit 764c83d

Please sign in to comment.