v0.5.0, performance improved, unified memory mode
PawelGorny committed Mar 30, 2022
1 parent 8ccf387 commit b93dd90
Showing 4 changed files with 272 additions and 17 deletions.
16 changes: 8 additions & 8 deletions README.md
@@ -29,6 +29,7 @@ Usage:
-decode wifToDecode: decodes given WIF
-restore statusFile: restore work configuration
-listDevices: shows available devices
+-disable-um: disable unified memory mode
-h : shows help

@@ -81,22 +82,21 @@ Performance
-----------
Users should adjust the number of blocks and the number of threads per block to find the values that work best for their card. The number of tests performed by each thread can also affect overall performance and latency.
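As a rough aid for that tuning, the three knobs simply multiply; a minimal sketch (not project code; the variable names are illustrative) of how many candidate keys one kernel launch covers:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    // Illustrative values only; they mirror the defaults quoted below.
    const uint64_t blocks = 224;           // grid size (BLOCKS)
    const uint64_t blockThreads = 512;     // threads per block (BLOCK_THREADS)
    const uint64_t stepsPerThread = 20000; // checks done by each thread
    // One launch scans blocks * blockThreads * stepsPerThread candidates.
    const uint64_t keysPerLaunch = blocks * blockThreads * stepsPerThread;
    printf("keys per launch: %llu\n", (unsigned long long)keysPerLaunch); // ~2.29e9
    return 0;
}
```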

Test card: RTX3060 (eGPU!) with 224 BLOCKS & 640 BLOCK_THREADS (program default values) checks around 10000 MKey/s for compressed address with missing characters in the middle (collision with checksum) and around 1300-1400 Mkey/s for other cases; other results (using default values of blocks, threads and steps per thread):
Test card: RTX3060 (eGPU!) with 224 BLOCKS & 512 BLOCK_THREADS (program default values) checks around 10000 MKey/s for compressed address with missing characters in the middle (collision with checksum) and around 1400-1540 Mkey/s for other cases (20000steps/thread); other results (using default values of blocks, threads and steps per thread):

-| card          | compressed with collision | all other cases |
-|---------------|---------------------------|-----------------|
-| RTX 3060 eGPU | 10000                     | 1400            |
-| RTX 3090      | 29500                     | 3650            |
-| GTX 1080TI    | 6000                      | 650             |
+| card          | compressed with collision [MKey/s] | all other cases [MKey/s] (blocks/threads/steps) |
+|---------------|------------------------------------|-------------------------------------------------|
+| RTX 3060 eGPU | 10000                              | 1520 (224/512/20000)                            |
+| RTX 3090      | 29500                              | 3950 (656/640/5000)                             |
+| GTX 1080TI    | 6000                               | 750                                             |

-Please consult the official Nvidia Occupancy Calculator (https://docs.nvidia.com/cuda/cuda-occupancy-calculator/index.html) to see how to select the desired number of threads (shared memory = 0, registers per thread = 48).
+Please consult the official Nvidia Occupancy Calculator (https://docs.nvidia.com/cuda/cuda-occupancy-calculator/index.html) to see how to select the desired number of threads per block (shared memory = 0, registers per thread = 48). Adjust the number of steps per thread to obtain optimal performance.
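For a quick runtime cross-check of the calculator's numbers, CUDA also exposes an occupancy API; a minimal sketch (not part of this commit; the stub kernel and block size are assumptions):

```cpp
#include <cuda_runtime.h>
#include <cstdio>

// Stand-in kernel; in practice pass one of the project's kernels so the
// compiler-known register usage (~48 per thread here) is taken into account.
__global__ void kernelStub() {}

int main() {
    const int blockThreads = 512;  // candidate block size, an assumption
    int maxActiveBlocksPerSM = 0;
    // dynamic shared memory = 0 bytes, matching the calculator inputs above
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &maxActiveBlocksPerSM, kernelStub, blockThreads, 0);
    int smCount = 0;
    cudaDeviceGetAttribute(&smCount, cudaDevAttrMultiProcessorCount, 0);
    printf("grid size to saturate the GPU: %d blocks\n",
           maxActiveBlocksPerSM * smCount);
    return 0;
}
```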

TODO
----
* code cleanup, review of hash functions
* predefined custom step (using a list of possible characters)
* auto-processing (preparing the configuration) based on WIF
* support for partially known checksum

Contact
-------
8 changes: 8 additions & 0 deletions WifSolverCuda/Worker.cuh
@@ -13,6 +13,11 @@ __global__ void kernelUncompressed(bool* buffResult, bool* buffCollectorWork, ui
__global__ void kernelCompressed(bool* buffResult, bool* buffCollectorWork, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks, const uint32_t checksum);
__global__ void resultCollector(bool* buffResult, uint64_t* buffCombinedResult, const uint64_t threadsInBlockNumberOfChecks);

__global__ void kernelCompressed(uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks);
__global__ void kernelCompressed(uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks, const uint32_t checksum);
__global__ void kernelUncompressed(uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks);
__global__ void kernelUncompressed(uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks, const uint32_t checksum);

__device__ bool _checksumDoubleSha256CheckUncompressed(unsigned int checksum, beu32* d_hash, uint64_t* _start);
__device__ bool _checksumDoubleSha256CheckCompressed(unsigned int checksum, beu32* d_hash, uint64_t* _start);

@@ -23,4 +28,7 @@ __device__ void _load(uint64_t* C, uint64_t* A);

__device__ void IMult(uint64_t* r, uint64_t* a, int64_t b);

__device__ void initShared();
__device__ void summaryShared(uint32_t* unifiedResult, bool* isResultFlag);

cudaError_t loadStride(uint64_t* stride);
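The overloads added above report hits through one shared buffer and a single flag instead of the old per-thread bool array. Below is a minimal host-side sketch of how such buffers could be allocated and read back with unified (managed) memory; the helper name, sizes, and launch shape are assumptions, not this repository's actual host code.

```cpp
#include <cuda_runtime.h>
#include <cstdint>
#include "Worker.cuh"  // declares the kernelCompressed overload used below

// Hypothetical host-side flow; names, sizes and launch shape are assumptions.
void searchOnce(uint64_t* d_rangeStart, int stepsPerThread, uint32_t checksum,
                int blocks, int blockThreads) {
    uint32_t* unifiedResult = nullptr;
    bool* isResultFlag = nullptr;
    // summaryShared stages up to 4 hits per thread, strided by gridDim.x.
    cudaMallocManaged(&unifiedResult,
                      (size_t)blocks * blockThreads * 4 * sizeof(uint32_t));
    cudaMallocManaged(&isResultFlag, sizeof(bool));
    *isResultFlag = false;

    kernelCompressed<<<blocks, blockThreads>>>(unifiedResult, isResultFlag,
                                               d_rangeStart, stepsPerThread,
                                               checksum);
    cudaDeviceSynchronize();  // required before the host touches managed memory

    if (*isResultFlag) {
        // Each stored value is a check index: candidate = rangeStart + index * stride.
        // A real implementation would also reset unifiedResult between launches.
    }
    cudaFree(unifiedResult);
    cudaFree(isResultFlag);
}
```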
126 changes: 125 additions & 1 deletion WifSolverCuda/Worker1.cu
@@ -1,6 +1,8 @@
#include "Worker.cuh"

__device__ __constant__ uint64_t _stride[5];
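// Per-block staging for unified memory mode: 4 result slots per thread
// (4096 = 4 * 1024 max threads per block) plus one block-wide "hit" flag.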
__device__ __shared__ uint32_t _blockResults[4096];
__device__ __shared__ bool _blockResultFlag[1];

__global__ void kernelUncompressed(bool* buffResult, bool* buffCollectorWork, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks, const uint32_t checksum) {
uint64_t _start[5];
@@ -70,7 +72,6 @@ __global__ void kernelCompressed(bool* buffResult, bool* buffCollectorWork, uint
_add(_start, _stride);
}
}

__global__ void resultCollector(bool* buffResult, uint64_t* buffCombinedResult, const uint64_t threadsInBlockNumberOfChecks) {
if (buffCombinedResult[blockIdx.x] == 0xffffffffffff) {
return;
@@ -90,6 +91,129 @@ __global__ void resultCollector
buffCombinedResult[blockIdx.x] = 0xffffffffffff;
}

__global__ void kernelUncompressed(uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks, const uint32_t checksum) {
uint64_t _start[5];
beu32 d_hash[8];

int64_t resIx = threadIdx.x;
int64_t tIx = (threadIdx.x + blockIdx.x * blockDim.x) * threadNumberOfChecks;
IMult(_start, _stride, tIx);
_add(_start, buffRangeStart);
bool wasResult = false;
initShared();
for (uint64_t i = 0, resultIx = tIx; i < threadNumberOfChecks; i++, resultIx++) {
if (_checksumDoubleSha256CheckUncompressed(checksum, d_hash, _start)) {
_blockResults[resIx] = resultIx;
if (!wasResult) {
_blockResultFlag[0] = true;
}
wasResult = true;
resIx += blockDim.x;
}
_add(_start, _stride);
}
summaryShared(unifiedResult, isResultFlag);
}
__global__ void kernelUncompressed(uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks) {
uint64_t _start[5];
beu32 d_hash[8];

int64_t resIx = threadIdx.x;
int64_t tIx = (threadIdx.x + blockIdx.x * blockDim.x) * threadNumberOfChecks;
IMult(_start, _stride, tIx);
_add(_start, buffRangeStart);
bool wasResult = false;
initShared();
for (uint64_t i = 0, resultIx = tIx; i < threadNumberOfChecks; i++, resultIx++) {
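// No checksum was supplied: take it from the low 32 bits of the candidate itself.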
if (_checksumDoubleSha256CheckUncompressed(_start[0] & 0xffffffff, d_hash, _start)) {
_blockResults[resIx] = resultIx;
if (!wasResult) {
_blockResultFlag[0] = true;
}
wasResult = true;
resIx += blockDim.x;
}
_add(_start, _stride);
}
summaryShared(unifiedResult, isResultFlag);
}
__global__ void kernelCompressed(uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks) {
uint64_t _start[5];
beu32 d_hash[8];

int64_t resIx = threadIdx.x;
int64_t tIx = (threadIdx.x + blockIdx.x * blockDim.x) * threadNumberOfChecks;
IMult(_start, _stride, tIx);
_add(_start, buffRangeStart);
bool wasResult = false;
initShared();
for (uint64_t i = 0, resultIx = tIx; i < threadNumberOfChecks; i++, resultIx++) {
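// Skip candidates whose compression-suffix byte (just above the 32-bit checksum) is not 0x01.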
if (((_start[0] & 0xff00000000) >> 32) != 0x01) {
_add(_start, _stride);
continue;
}
if (_checksumDoubleSha256CheckCompressed(_start[0] & 0xffffffff, d_hash, _start)) {
_blockResults[resIx] = resultIx;
if (!wasResult) {
_blockResultFlag[0] = true;
}
wasResult = true;
resIx += blockDim.x;
}
_add(_start, _stride);
}
summaryShared(unifiedResult, isResultFlag);
}
__global__ void kernelCompressed(uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks, const uint32_t checksum) {
uint64_t _start[5];
beu32 d_hash[8];

int64_t resIx = threadIdx.x;
int64_t tIx = (threadIdx.x + blockIdx.x * blockDim.x) * threadNumberOfChecks;
IMult(_start, _stride, tIx);
_add(_start, buffRangeStart);
bool wasResult = false;
initShared();
for (uint64_t i = 0, resultIx = tIx; i < threadNumberOfChecks; i++, resultIx++) {
if (((_start[0] & 0xff00000000) >> 32) != 0x01) {
_add(_start, _stride);
continue;
}
if (_checksumDoubleSha256CheckCompressed(checksum, d_hash, _start)) {
_blockResults[resIx] = resultIx;
if (!wasResult) {
_blockResultFlag[0] = true;
}
wasResult = true;
resIx += blockDim.x;
}
_add(_start, _stride);
}
summaryShared(unifiedResult, isResultFlag);
}

__device__ void initShared() {
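// Thread 0 clears the block's result slots and flag; the barrier below
// publishes the cleared state to all threads in the block.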
if (threadIdx.x == 0) {
_blockResultFlag[0] = false;
for (int i = 0; i < blockDim.x * 4; i++) {
_blockResults[i] = UINT32_MAX;
}
}
__syncthreads();
}
__device__ void summaryShared(uint32_t* unifiedResult, bool* isResultFlag) {
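// After the barrier, thread 0 copies this block's hits into the global
// buffer, strided by gridDim.x so blocks never write to the same slot.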
__syncthreads();
if (_blockResultFlag[0] && threadIdx.x == 0) {
isResultFlag[0] = true;
for (int i = 0, rIx = blockIdx.x; i < blockDim.x * 4; i++) {
if (_blockResults[i] != UINT32_MAX) {
unifiedResult[rIx] = _blockResults[i];
rIx += gridDim.x;
}
}
}
}

__device__ bool _checksumDoubleSha256CheckCompressed(unsigned int checksum, beu32* d_hash, uint64_t* _start) {
sha256Kernel(d_hash,
_start[4] >> 16,