v0.5.0, performance improved, unified memory mode
PawelGorny committed Mar 30, 2022
1 parent 8ccf387 commit b93dd90
Showing 4 changed files with 272 additions and 17 deletions.
16 changes: 8 additions & 8 deletions README.md
@@ -29,6 +29,7 @@ Usage:
-decode wifToDecode: decodes given WIF
-restore statusFile: restore work configuration
-listDevices: shows available devices
+-disable-um: disable unified memory mode
-h : shows help

@@ -81,22 +82,21 @@ Performance
-----------
Users should adjust the number of blocks and the number of threads per block to find the values that work best for their card. The number of tests performed by each thread can also affect overall performance and latency.
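As a rough aid for that tuning, the three knobs simply multiply; a minimal sketch (not project code; the variable names are illustrative) of how many candidate keys one kernel launch covers:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    // Illustrative values only; they mirror the defaults quoted below.
    const uint64_t blocks = 224;           // grid size (BLOCKS)
    const uint64_t blockThreads = 512;     // threads per block (BLOCK_THREADS)
    const uint64_t stepsPerThread = 20000; // checks done by each thread
    // One launch scans blocks * blockThreads * stepsPerThread candidates.
    const uint64_t keysPerLaunch = blocks * blockThreads * stepsPerThread;
    printf("keys per launch: %llu\n", (unsigned long long)keysPerLaunch); // ~2.29e9
    return 0;
}
```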

Test card: RTX3060 (eGPU!) with 224 BLOCKS & 640 BLOCK_THREADS (program default values) checks around 10000 MKey/s for compressed address with missing characters in the middle (collision with checksum) and around 1300-1400 Mkey/s for other cases; other results (using default values of blocks, threads and steps per thread):
Test card: RTX3060 (eGPU!) with 224 BLOCKS & 512 BLOCK_THREADS (program default values) checks around 10000 MKey/s for compressed address with missing characters in the middle (collision with checksum) and around 1400-1540 Mkey/s for other cases (20000steps/thread); other results (using default values of blocks, threads and steps per thread):

-| card          | compressed with collision | all other cases |
-|---------------|---------------------------|-----------------|
-| RTX 3060 eGPU | 10000                     | 1400            |
-| RTX 3090      | 29500                     | 3650            |
-| GTX 1080TI    | 6000                      | 650             |
+| card          | compressed with collision [MKey/s] | all other cases [MKey/s] (blocks/threads/steps) |
+|---------------|------------------------------------|-------------------------------------------------|
+| RTX 3060 eGPU | 10000                              | 1520 (224/512/20000)                            |
+| RTX 3090      | 29500                              | 3950 (656/640/5000)                             |
+| GTX 1080TI    | 6000                               | 750                                             |

-Please consult the official Nvidia Occupancy Calculator (https://docs.nvidia.com/cuda/cuda-occupancy-calculator/index.html) to see how to select the desired number of threads (shared memory = 0, registers per thread = 48).
+Please consult the official Nvidia Occupancy Calculator (https://docs.nvidia.com/cuda/cuda-occupancy-calculator/index.html) to see how to select the desired number of threads per block (shared memory = 0, registers per thread = 48). Adjust the number of steps per thread to obtain optimal performance.
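For a quick runtime cross-check of the calculator's numbers, CUDA also exposes an occupancy API; a minimal sketch (not part of this commit; the stub kernel and block size are assumptions):

```cpp
#include <cuda_runtime.h>
#include <cstdio>

// Stand-in kernel; in practice pass one of the project's kernels so the
// compiler-known register usage (~48 per thread here) is taken into account.
__global__ void kernelStub() {}

int main() {
    const int blockThreads = 512;  // candidate block size, an assumption
    int maxActiveBlocksPerSM = 0;
    // dynamic shared memory = 0 bytes, matching the calculator inputs above
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &maxActiveBlocksPerSM, kernelStub, blockThreads, 0);
    int smCount = 0;
    cudaDeviceGetAttribute(&smCount, cudaDevAttrMultiProcessorCount, 0);
    printf("grid size to saturate the GPU: %d blocks\n",
           maxActiveBlocksPerSM * smCount);
    return 0;
}
```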

TODO
----
* code cleanup, review of hash functions
* predefined custom step (using a list of possible characters)
* auto-processing (preparing the configuration) based on WIF
* support for partially known checksum

Contact
-------
8 changes: 8 additions & 0 deletions WifSolverCuda/Worker.cuh
@@ -13,6 +13,11 @@ __global__ void kernelUncompressed(bool* buffResult, bool* buffCollectorWork, ui
__global__ void kernelCompressed(bool* buffResult, bool* buffCollectorWork, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks, const uint32_t checksum);
__global__ void resultCollector(bool* buffResult, uint64_t* buffCombinedResult, const uint64_t threadsInBlockNumberOfChecks);

__global__ void kernelCompressed(uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks);
__global__ void kernelCompressed(uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks, const uint32_t checksum);
__global__ void kernelUncompressed(uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks);
__global__ void kernelUncompressed(uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks, const uint32_t checksum);

__device__ bool _checksumDoubleSha256CheckUncompressed(unsigned int checksum, beu32* d_hash, uint64_t* _start);
__device__ bool _checksumDoubleSha256CheckCompressed(unsigned int checksum, beu32* d_hash, uint64_t* _start);

@@ -23,4 +28,7 @@ __device__ void _load(uint64_t* C, uint64_t* A);

__device__ void IMult(uint64_t* r, uint64_t* a, int64_t b);

__device__ void initShared();
__device__ void summaryShared(uint32_t* unifiedResult, bool* isResultFlag);

cudaError_t loadStride(uint64_t* stride);
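The overloads added above report hits through one shared buffer and a single flag instead of the old per-thread bool array. Below is a minimal host-side sketch of how such buffers could be allocated and read back with unified (managed) memory; the helper name, sizes, and launch shape are assumptions, not this repository's actual host code.

```cpp
#include <cuda_runtime.h>
#include <cstdint>
#include "Worker.cuh"  // declares the kernelCompressed overload used below

// Hypothetical host-side flow; names, sizes and launch shape are assumptions.
void searchOnce(uint64_t* d_rangeStart, int stepsPerThread, uint32_t checksum,
                int blocks, int blockThreads) {
    uint32_t* unifiedResult = nullptr;
    bool* isResultFlag = nullptr;
    // summaryShared stages up to 4 hits per thread, strided by gridDim.x.
    cudaMallocManaged(&unifiedResult,
                      (size_t)blocks * blockThreads * 4 * sizeof(uint32_t));
    cudaMallocManaged(&isResultFlag, sizeof(bool));
    *isResultFlag = false;

    kernelCompressed<<<blocks, blockThreads>>>(unifiedResult, isResultFlag,
                                               d_rangeStart, stepsPerThread,
                                               checksum);
    cudaDeviceSynchronize();  // required before the host touches managed memory

    if (*isResultFlag) {
        // Each stored value is a check index: candidate = rangeStart + index * stride.
        // A real implementation would also reset unifiedResult between launches.
    }
    cudaFree(unifiedResult);
    cudaFree(isResultFlag);
}
```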
126 changes: 125 additions & 1 deletion WifSolverCuda/Worker1.cu
@@ -1,6 +1,8 @@
#include "Worker.cuh"

__device__ __constant__ uint64_t _stride[5];
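// Per-block staging for unified memory mode: 4 result slots per thread
// (4096 = 4 * 1024 max threads per block) plus one block-wide "hit" flag.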
__device__ __shared__ uint32_t _blockResults[4096];
__device__ __shared__ bool _blockResultFlag[1];

__global__ void kernelUncompressed(bool* buffResult, bool* buffCollectorWork, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks, const uint32_t checksum) {
uint64_t _start[5];
@@ -70,7 +72,6 @@ __global__ void kernelCompressed(bool* buffResult, bool* buffCollectorWork, uint
_add(_start, _stride);
}
}

__global__ void resultCollector(bool* buffResult, uint64_t* buffCombinedResult, const uint64_t threadsInBlockNumberOfChecks) {
if (buffCombinedResult[blockIdx.x] == 0xffffffffffff) {
return;
@@ -90,6 +91,129 @@ __global__ void resultCollector
buffCombinedResult[blockIdx.x] = 0xffffffffffff;
}

__global__ void kernelUncompressed(uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks, const uint32_t checksum) {
uint64_t _start[5];
beu32 d_hash[8];

int64_t resIx = threadIdx.x;
int64_t tIx = (threadIdx.x + blockIdx.x * blockDim.x) * threadNumberOfChecks;
IMult(_start, _stride, tIx);
_add(_start, buffRangeStart);
bool wasResult = false;
initShared();
for (uint64_t i = 0, resultIx = tIx; i < threadNumberOfChecks; i++, resultIx++) {
if (_checksumDoubleSha256CheckUncompressed(checksum, d_hash, _start)) {
_blockResults[resIx] = resultIx;
if (!wasResult) {
_blockResultFlag[0] = true;
}
wasResult = true;
resIx += blockDim.x;
}
_add(_start, _stride);
}
summaryShared(unifiedResult, isResultFlag);
}
__global__ void kernelUncompressed(uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks) {
uint64_t _start[5];
beu32 d_hash[8];

int64_t resIx = threadIdx.x;
int64_t tIx = (threadIdx.x + blockIdx.x * blockDim.x) * threadNumberOfChecks;
IMult(_start, _stride, tIx);
_add(_start, buffRangeStart);
bool wasResult = false;
initShared();
for (uint64_t i = 0, resultIx = tIx; i < threadNumberOfChecks; i++, resultIx++) {
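// No checksum was supplied: take it from the low 32 bits of the candidate itself.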
if (_checksumDoubleSha256CheckUncompressed(_start[0] & 0xffffffff, d_hash, _start)) {
_blockResults[resIx] = resultIx;
if (!wasResult) {
_blockResultFlag[0] = true;
}
wasResult = true;
resIx += blockDim.x;
}
_add(_start, _stride);
}
summaryShared(unifiedResult, isResultFlag);
}
__global__ void kernelCompressed(uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks) {
uint64_t _start[5];
beu32 d_hash[8];

int64_t resIx = threadIdx.x;
int64_t tIx = (threadIdx.x + blockIdx.x * blockDim.x) * threadNumberOfChecks;
IMult(_start, _stride, tIx);
_add(_start, buffRangeStart);
bool wasResult = false;
initShared();
for (uint64_t i = 0, resultIx = tIx; i < threadNumberOfChecks; i++, resultIx++) {
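// Skip candidates whose compression-suffix byte (just above the 32-bit checksum) is not 0x01.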
if (((_start[0] & 0xff00000000) >> 32) != 0x01) {
_add(_start, _stride);
continue;
}
if (_checksumDoubleSha256CheckCompressed(_start[0] & 0xffffffff, d_hash, _start)) {
_blockResults[resIx] = resultIx;
if (!wasResult) {
_blockResultFlag[0] = true;
}
wasResult = true;
resIx += blockDim.x;
}
_add(_start, _stride);
}
summaryShared(unifiedResult, isResultFlag);
}
__global__ void kernelCompressed(uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks, const uint32_t checksum) {
uint64_t _start[5];
beu32 d_hash[8];

int64_t resIx = threadIdx.x;
int64_t tIx = (threadIdx.x + blockIdx.x * blockDim.x) * threadNumberOfChecks;
IMult(_start, _stride, tIx);
_add(_start, buffRangeStart);
bool wasResult = false;
initShared();
for (uint64_t i = 0, resultIx = tIx; i < threadNumberOfChecks; i++, resultIx++) {
if (((_start[0] & 0xff00000000) >> 32) != 0x01) {
_add(_start, _stride);
continue;
}
if (_checksumDoubleSha256CheckCompressed(checksum, d_hash, _start)) {
_blockResults[resIx] = resultIx;
if (!wasResult) {
_blockResultFlag[0] = true;
}
wasResult = true;
resIx += blockDim.x;
}
_add(_start, _stride);
}
summaryShared(unifiedResult, isResultFlag);
}

__device__ void initShared() {
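// Thread 0 clears the block's result slots and flag; the barrier below
// publishes the cleared state to all threads in the block.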
if (threadIdx.x == 0) {
_blockResultFlag[0] = false;
for (int i = 0; i < blockDim.x * 4; i++) {
_blockResults[i] = UINT32_MAX;
}
}
__syncthreads();
}
__device__ void summaryShared(uint32_t* unifiedResult, bool* isResultFlag) {
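// After the barrier, thread 0 copies this block's hits into the global
// buffer, strided by gridDim.x so blocks never write to the same slot.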
__syncthreads();
if (_blockResultFlag[0] && threadIdx.x == 0) {
isResultFlag[0] = true;
for (int i = 0, rIx = blockIdx.x; i < blockDim.x * 4; i++) {
if (_blockResults[i] != UINT32_MAX) {
unifiedResult[rIx] = _blockResults[i];
rIx += gridDim.x;
}
}
}
}

__device__ bool _checksumDoubleSha256CheckCompressed(unsigned int checksum, beu32* d_hash, uint64_t* _start) {
sha256Kernel(d_hash,
_start[4] >> 16,