diff --git a/README.md b/README.md
index a8d8fe1..5a69250 100644
--- a/README.md
+++ b/README.md
@@ -29,6 +29,7 @@ Usage:
 -decode wifToDecode: decodes given WIF
 -restore statusFile: restore work configuration
 -listDevices: shows available devices
+-disable-um: disable unified memory mode
 -h : shows help
 
@@ -81,22 +82,21 @@ Performance
 -----------
 Users should adjust the number of blocks and the number of threads per block to find the values that work best for their card. The number of tests performed by each thread can also affect overall performance/latency.
-Test card: RTX3060 (eGPU!) with 224 BLOCKS & 640 BLOCK_THREADS (program default values) checks around 10000 MKey/s for compressed address with missing characters in the middle (collision with checksum) and around 1300-1400 Mkey/s for other cases; other results (using default values of blocks, threads and steps per thread):
+Test card: RTX3060 (eGPU!) with 224 BLOCKS & 512 BLOCK_THREADS (program default values) checks around 10000 MKey/s for a compressed address with missing characters in the middle (collision with checksum) and around 1400-1540 MKey/s for other cases (20000 steps/thread); other results (using default values of blocks, threads and steps per thread, unless noted as blocks/threads/steps):
 
-| card          | compressed with collision | all other cases |
-|---------------|---------------------------|-----------------|
-| RTX 3060 eGPU | 10000                     | 1400            |
-| RTX 3090      | 29500                     | 3650            |
-| GTX 1080TI    | 6000                      | 650             |
+| card          | compressed with collision [MKey/s] | all other cases [MKey/s] |
+|---------------|-------------------------------------|--------------------------|
+| RTX 3060 eGPU | 10000                               | 1520 (224/512/20000)     |
+| RTX 3090      | 29500                               | 3950 (656/640/5000)      |
+| GTX 1080TI    | 6000                                | 750                      |
 
-Please consult official Nvidia Occupancy Calculator (https://docs.nvidia.com/cuda/cuda-occupancy-calculator/index.html) to see how to select desired amount of threads (shared memory=0, registers per thread = 48).
+Please consult the official Nvidia Occupancy Calculator (https://docs.nvidia.com/cuda/cuda-occupancy-calculator/index.html) to see how to select the desired number of threads per block (shared memory = 0, registers per thread = 48). Adjust the number of steps per thread to obtain optimal performance.
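The CUDA runtime's occupancy API can produce the same block-size suggestion programmatically. A minimal sketch, not part of this patch (`searchKernel` is a placeholder for any of the solver kernels):

```cpp
#include <cstdio>
#include <cuda_runtime.h>

__global__ void searchKernel(int* data) { /* placeholder for a solver kernel */ }

int main() {
    int minGridSize = 0, blockSize = 0;
    // Suggests the block size with the highest occupancy for this kernel;
    // register usage comes from the compiled kernel, dynamic shared memory = 0.
    cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, searchKernel, 0, 0);
    printf("suggested block size: %d (minimum grid size: %d)\n", blockSize, minGridSize);
    return 0;
}
```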
 
 TODO
 ----
 * code cleaning, review of hash functions
 * predefined custom step (using list of possible characters)
 * auto-processing (preparing configuration) based on WIF
-* support for partially known checksum
 
 Contact
 -------
diff --git a/WifSolverCuda/Worker.cuh b/WifSolverCuda/Worker.cuh
index c9fe6fd..195ba2c 100644
--- a/WifSolverCuda/Worker.cuh
+++ b/WifSolverCuda/Worker.cuh
@@ -13,6 +13,11 @@ __global__ void kernelUncompressed(bool* buffResult, bool* buffCollectorWork, ui
 __global__ void kernelCompressed(bool* buffResult, bool* buffCollectorWork, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks, const uint32_t checksum);
 __global__ void resultCollector(bool* buffResult, uint64_t* buffCombinedResult, const uint64_t threadsInBlockNumberOfChecks);
 
+__global__ void kernelCompressed(uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks);
+__global__ void kernelCompressed(uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks, const uint32_t checksum);
+__global__ void kernelUncompressed(uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks);
+__global__ void kernelUncompressed(uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks, const uint32_t checksum);
+
 __device__ bool _checksumDoubleSha256CheckUncompressed(unsigned int checksum, beu32* d_hash, uint64_t* _start);
 __device__ bool _checksumDoubleSha256CheckCompressed(unsigned int checksum, beu32* d_hash, uint64_t* _start);
 
@@ -23,4 +28,7 @@ __device__ void _load(uint64_t* C, uint64_t* A);
 __device__ void IMult(uint64_t* r, uint64_t* a, int64_t b);
 
+__device__ void initShared();
+__device__ void summaryShared(uint32_t* unifiedResult, bool* isResultFlag);
+
 cudaError_t loadStride(uint64_t* stride);
\ No newline at end of file
diff --git a/WifSolverCuda/Worker1.cu b/WifSolverCuda/Worker1.cu
index dce6729..549a55a 100644
--- a/WifSolverCuda/Worker1.cu
+++ b/WifSolverCuda/Worker1.cu
@@ -1,6 +1,8 @@
 #include "Worker.cuh"
 
 __device__ __constant__ uint64_t _stride[5];
+__device__ __shared__ uint32_t _blockResults[4096];
+__device__ __shared__ bool _blockResultFlag[1];
 
 __global__ void kernelUncompressed(bool* buffResult, bool* buffCollectorWork, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks, const uint32_t checksum) {
     uint64_t _start[5];
@@ -70,7 +72,6 @@ __global__ void kernelCompressed(bool* buffResult, bool* buffCollectorWork, uint
         _add(_start, _stride);
     }
 }
-
 __global__ void resultCollector(bool* buffResult, uint64_t* buffCombinedResult, const uint64_t threadsInBlockNumberOfChecks) {
     if (buffCombinedResult[blockIdx.x] == 0xffffffffffff) {
         return;
@@ -90,6 +91,129 @@ __global__ void resultCollector(bool* buffResult, uint64_t* buffCombinedResult,
     buffCombinedResult[blockIdx.x] = 0xffffffffffff;
 }
 
+__global__ void kernelUncompressed(uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks, const uint32_t checksum) {
+    uint64_t _start[5];
+    beu32 d_hash[8];
+
+    int64_t resIx = threadIdx.x;
+    int64_t tIx = (threadIdx.x + blockIdx.x * blockDim.x) * threadNumberOfChecks;
+    IMult(_start, _stride, tIx);
+    _add(_start, buffRangeStart);
+    bool wasResult = false;
+    initShared();
+    for (uint64_t i = 0, resultIx = tIx; i < threadNumberOfChecks; i++, resultIx++) {
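+        // each candidate that passes the checksum test is recorded in this
+        // thread's private slot (resIx); the slot then advances by blockDim.x,
+        // so no two threads in the block write the same shared-memory cell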
+        if (_checksumDoubleSha256CheckUncompressed(checksum, d_hash, _start)) {
+            _blockResults[resIx] = resultIx;
+            if (!wasResult) {
+                _blockResultFlag[0] = true;
+            }
+            wasResult = true;
+            resIx += blockDim.x;
+        }
+        _add(_start, _stride);
+    }
+    summaryShared(unifiedResult, isResultFlag);
+}
+__global__ void kernelUncompressed(uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks) {
+    uint64_t _start[5];
+    beu32 d_hash[8];
+
+    int64_t resIx = threadIdx.x;
+    int64_t tIx = (threadIdx.x + blockIdx.x * blockDim.x) * threadNumberOfChecks;
+    IMult(_start, _stride, tIx);
+    _add(_start, buffRangeStart);
+    bool wasResult = false;
+    initShared();
+    for (uint64_t i = 0, resultIx = tIx; i < threadNumberOfChecks; i++, resultIx++) {
+        if (_checksumDoubleSha256CheckUncompressed(_start[0] & 0xffffffff, d_hash, _start)) {
+            _blockResults[resIx] = resultIx;
+            if (!wasResult) {
+                _blockResultFlag[0] = true;
+            }
+            wasResult = true;
+            resIx += blockDim.x;
+        }
+        _add(_start, _stride);
+    }
+    summaryShared(unifiedResult, isResultFlag);
+}
+__global__ void kernelCompressed(uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks) {
+    uint64_t _start[5];
+    beu32 d_hash[8];
+
+    int64_t resIx = threadIdx.x;
+    int64_t tIx = (threadIdx.x + blockIdx.x * blockDim.x) * threadNumberOfChecks;
+    IMult(_start, _stride, tIx);
+    _add(_start, buffRangeStart);
+    bool wasResult = false;
+    initShared();
+    for (uint64_t i = 0, resultIx = tIx; i < threadNumberOfChecks; i++, resultIx++) {
+        if (((_start[0] & 0xff00000000) >> 32) != 0x01) {
+            _add(_start, _stride);
+            continue;
+        }
+        if (_checksumDoubleSha256CheckCompressed(_start[0] & 0xffffffff, d_hash, _start)) {
+            _blockResults[resIx] = resultIx;
+            if (!wasResult) {
+                _blockResultFlag[0] = true;
+            }
+            wasResult = true;
+            resIx += blockDim.x;
+        }
+        _add(_start, _stride);
+    }
+    summaryShared(unifiedResult, isResultFlag);
+}
+__global__ void kernelCompressed(uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks, const uint32_t checksum) {
+    uint64_t _start[5];
+    beu32 d_hash[8];
+
+    int64_t resIx = threadIdx.x;
+    int64_t tIx = (threadIdx.x + blockIdx.x * blockDim.x) * threadNumberOfChecks;
+    IMult(_start, _stride, tIx);
+    _add(_start, buffRangeStart);
+    bool wasResult = false;
+    initShared();
+    for (uint64_t i = 0, resultIx = tIx; i < threadNumberOfChecks; i++, resultIx++) {
+        if (((_start[0] & 0xff00000000) >> 32) != 0x01) {
+            _add(_start, _stride);
+            continue;
+        }
+        if (_checksumDoubleSha256CheckCompressed(checksum, d_hash, _start)) {
+            _blockResults[resIx] = resultIx;
+            if (!wasResult) {
+                _blockResultFlag[0] = true;
+            }
+            wasResult = true;
+            resIx += blockDim.x;
+        }
+        _add(_start, _stride);
+    }
+    summaryShared(unifiedResult, isResultFlag);
+}
+
+__device__ void initShared() {
+    if (threadIdx.x == 0) {
+        _blockResultFlag[0] = false;
+        for (int i = 0; i < blockDim.x * 4; i++) {
+            _blockResults[i] = UINT32_MAX;
+        }
+    }
+    __syncthreads();
+}
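+// after every thread in the block has finished its range, thread 0 publishes
+// the block's hits to the managed result buffer; slot rIx starts at blockIdx.x
+// and advances by gridDim.x, so different blocks write disjoint slots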
+__device__ void summaryShared(uint32_t* unifiedResult, bool* isResultFlag) {
+    __syncthreads();
+    if (_blockResultFlag[0] && threadIdx.x == 0) {
+        isResultFlag[0] = true;
+        for (int i = 0, rIx = blockIdx.x; i < blockDim.x * 4; i++) {
+            if (_blockResults[i] != UINT32_MAX) {
+                unifiedResult[rIx] = _blockResults[i];
+                rIx += gridDim.x;
+            }
+        }
+    }
+}
+
 __device__ bool _checksumDoubleSha256CheckCompressed(unsigned int checksum, beu32* d_hash, uint64_t* _start) {
     sha256Kernel(d_hash, _start[4] >> 16,
diff --git a/WifSolverCuda/main.cu b/WifSolverCuda/main.cu
index 8719102..2068e6a 100644
--- a/WifSolverCuda/main.cu
+++ b/WifSolverCuda/main.cu
@@ -30,11 +30,14 @@ void saveStatus();
 void restoreSettings(string fileStatusRestore);
 
 cudaError_t processCuda();
+cudaError_t processCudaUnified();
+
+bool unifiedMemory = true;
 
 int DEVICE_NR = 0;
 unsigned int BLOCK_THREADS = 0;
 unsigned int BLOCK_NUMBER = 0;
-unsigned int THREAD_STEPS = 1682;
+unsigned int THREAD_STEPS = 5000;
 
 size_t wifLen = 53;
 int dataLen = 37;
@@ -52,8 +55,6 @@ bool DECODE = false;
 string WIF_TO_DECODE;
 
 bool RESULT = false;
-bool useCollector = false;
-uint64_t collectorLimit = 100;
 
 uint64_t outputSize;
 
@@ -72,7 +73,7 @@ Secp256K1* secp;
 
 int main(int argc, char** argv)
 {
-    printf("WifSolver 0.4.9\n\n");
+    printf("WifSolver 0.5.0\n\n");
     printf("Use parameter '-h' for help and list of available parameters\n\n");
 
     if (readArgs(argc, argv)) {
@@ -112,7 +113,13 @@ int main(int argc, char** argv)
     std::time_t s_time = std::chrono::system_clock::to_time_t(time);
     std::cout << "Work started at " << std::ctime(&s_time);
 
-    cudaError_t cudaStatus = processCuda();
+    cudaError_t cudaStatus;
+    if (unifiedMemory) {
+        cudaStatus = processCudaUnified();
+    }
+    else {
+        cudaStatus = processCuda();
+    }
 
     time = std::chrono::system_clock::now();
     s_time = std::chrono::system_clock::to_time_t(time);
@@ -128,6 +135,112 @@ int main(int argc, char** argv)
     return 0;
 }
 
+cudaError_t processCudaUnified() {
+    cudaError_t cudaStatus;
+    uint64_t* buffRangeStart = new uint64_t[NB64BLOCK];
+    uint64_t* dev_buffRangeStart = nullptr;
+    uint64_t* buffStride = new uint64_t[NB64BLOCK];
+
+    const size_t RANGE_TRANSFER_SIZE = NB64BLOCK * sizeof(uint64_t);
+    const int COLLECTOR_SIZE_MM = 4 * BLOCK_NUMBER * BLOCK_THREADS;
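+    // the factor of 4 above mirrors the four shared-memory slots each thread
+    // owns: a thread can report at most four hits per kernel launch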
+    const uint32_t expectedChecksum = IS_CHECKSUM ? CHECKSUM.GetInt32() : 0;
+    uint64_t counter = 0;
+
+    __Load(buffStride, STRIDE.bits64);
+    loadStride(buffStride);
+    delete[] buffStride;
+
+    uint32_t* buffResultManaged = nullptr;
+    cudaStatus = cudaMallocManaged(&buffResultManaged, COLLECTOR_SIZE_MM * sizeof(uint32_t));
+
+    for (int i = 0; i < COLLECTOR_SIZE_MM; i++) {
+        buffResultManaged[i] = UINT32_MAX;
+    }
+
+    bool* buffCollectorWork = new bool[1];
+    buffCollectorWork[0] = false;
+    bool* dev_buffCollectorWork = nullptr;
+    cudaStatus = cudaMalloc((void**)&dev_buffCollectorWork, 1 * sizeof(bool));
+    cudaStatus = cudaMemcpy(dev_buffCollectorWork, buffCollectorWork, 1 * sizeof(bool), cudaMemcpyHostToDevice);
+
+    cudaStatus = cudaMalloc((void**)&dev_buffRangeStart, NB64BLOCK * sizeof(uint64_t));
+
+    bool* buffIsResultManaged = nullptr;
+    cudaStatus = cudaMallocManaged(&buffIsResultManaged, 1 * sizeof(bool));
+    buffIsResultManaged[0] = false;
+
+    std::chrono::steady_clock::time_point beginCountHashrate = std::chrono::steady_clock::now();
+    std::chrono::steady_clock::time_point beginCountStatus = std::chrono::steady_clock::now();
+
+    while (!RESULT && RANGE_START.IsLower(&RANGE_END)) {
+        //prepare launch
+        __Load(buffRangeStart, RANGE_START.bits64);
+        cudaStatus = cudaMemcpy(dev_buffRangeStart, buffRangeStart, RANGE_TRANSFER_SIZE, cudaMemcpyHostToDevice);
+        //launch work
+        std::chrono::steady_clock::time_point beginKernel = std::chrono::steady_clock::now();
+        if (COMPRESSED) {
+            if (IS_CHECKSUM) {
+                kernelCompressed <<<BLOCK_NUMBER, BLOCK_THREADS>>> (buffResultManaged, buffIsResultManaged, dev_buffRangeStart, THREAD_STEPS, expectedChecksum);
+            }
+            else {
+                kernelCompressed <<<BLOCK_NUMBER, BLOCK_THREADS>>> (buffResultManaged, buffIsResultManaged, dev_buffRangeStart, THREAD_STEPS);
+            }
+        }
+        else {
+            if (IS_CHECKSUM) {
+                kernelUncompressed <<<BLOCK_NUMBER, BLOCK_THREADS>>> (buffResultManaged, buffIsResultManaged, dev_buffRangeStart, THREAD_STEPS, expectedChecksum);
+            }
+            else {
+                kernelUncompressed <<<BLOCK_NUMBER, BLOCK_THREADS>>> (buffResultManaged, buffIsResultManaged, dev_buffRangeStart, THREAD_STEPS);
+            }
+        }
+        cudaStatus = cudaGetLastError();
+        if (cudaStatus != cudaSuccess) {
+            fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
+            goto Error;
+        }
+        cudaStatus = cudaDeviceSynchronize();
+        if (cudaStatus != cudaSuccess) {
+            fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching kernel!\n", cudaStatus);
+            goto Error;
+        }
+        int64_t tKernel = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - beginKernel).count();
+        if (buffIsResultManaged[0]) {
+            buffIsResultManaged[0] = false;
+            for (int i = 0; i < COLLECTOR_SIZE_MM && !RESULT; i++) {
+                if (buffResultManaged[i] != UINT32_MAX) {
+                    Int toTest(&RANGE_START);
+                    Int diff(&STRIDE);
+                    diff.Mult(buffResultManaged[i]);
+                    toTest.Add(&diff);
+                    processCandidate(toTest);
+                    buffResultManaged[i] = UINT32_MAX;
+                }
+            }
+        }//test
+
+        RANGE_START.Add(&loopStride);
+        counter += outputSize;
+        int64_t tHash = std::chrono::duration_cast<std::chrono::seconds>(std::chrono::steady_clock::now() - beginCountHashrate).count();
+        if (tHash > 5) {
+            double speed = (double)((double)counter / tHash) / 1000000.0;
+            printSpeed(speed);
+            counter = 0;
+            beginCountHashrate = std::chrono::steady_clock::now();
+        }
+        if (std::chrono::duration_cast<std::chrono::seconds>(std::chrono::steady_clock::now() - beginCountStatus).count() > fileStatusInterval) {
+            saveStatus();
+            beginCountStatus = std::chrono::steady_clock::now();
+        }
+    }//while
+
+Error:
+    delete[] buffRangeStart;
+    delete[] buffCollectorWork;
+    cudaFree(dev_buffRangeStart);
+    cudaFree(dev_buffCollectorWork);
+    cudaFree(buffResultManaged);
+    cudaFree(buffIsResultManaged);
+    return cudaStatus;
+}
+
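The loop above works because `cudaMallocManaged` returns a single pointer that both host and device dereference; the host only touches it after `cudaDeviceSynchronize`, which removes the explicit result `cudaMemcpy` of the non-unified path. A minimal self-contained sketch of that pattern (toy kernel and names are illustrative, not from this patch):

```cpp
#include <cstdio>
#include <cuda_runtime.h>

// toy kernel: flags positions whose value passes a cheap test
__global__ void markHits(const int* data, bool* hit, int n) {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < n && data[i] % 7 == 0) {   // stand-in for the checksum test
        hit[i] = true;
    }
}

int main() {
    const int n = 32;
    int* data = nullptr;
    bool* hit = nullptr;
    cudaMallocManaged(&data, n * sizeof(int));   // one pointer, visible to host and device
    cudaMallocManaged(&hit, n * sizeof(bool));
    for (int i = 0; i < n; i++) { data[i] = i; hit[i] = false; }

    markHits<<<1, n>>>(data, hit, n);
    cudaDeviceSynchronize();                     // host must sync before reading

    for (int i = 0; i < n; i++)
        if (hit[i]) printf("hit at %d\n", i);
    cudaFree(data);
    cudaFree(hit);
    return 0;
}
```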
 cudaError_t processCuda() {
     cudaError_t cudaStatus;
     uint64_t* buffRangeStart = new uint64_t[NB64BLOCK];
@@ -513,11 +626,14 @@ bool checkDevice() {
     cudaDeviceProp props;
     cudaStatus = cudaGetDeviceProperties(&props, DEVICE_NR);
     printf("Using GPU nr %d:\n", DEVICE_NR);
+    if (props.managedMemory == 0) {
+        printf("unified memory not supported\n");
+        unifiedMemory = false;
+    }
     printf("%s (%2d procs)\n", props.name, props.multiProcessorCount);
-    printf("maxThreadsPerBlock: %2d\n\n", props.maxThreadsPerBlock);
+    printf("maxThreadsPerBlock: %2d\n\n", props.maxThreadsPerBlock);
     if (BLOCK_NUMBER == 0) {
         BLOCK_NUMBER = props.multiProcessorCount * 8;
-    }
     if (BLOCK_THREADS == 0) {
         BLOCK_THREADS = (props.maxThreadsPerBlock / 8) * 5;
@@ -525,7 +641,6 @@ bool checkDevice() {
     outputSize = BLOCK_NUMBER * BLOCK_THREADS * THREAD_STEPS;
     loopStride = new Int(&STRIDE);
     loopStride.Mult(outputSize);
-    useCollector = outputSize >= collectorLimit;
     }
     return true;
 }
@@ -556,6 +671,7 @@ void showHelp() {
     printf("-decode wifToDecode: decodes given WIF\n");
     printf("-restore statusFile: restore work configuration\n");
     printf("-listDevices: shows available devices\n");
+    printf("-disable-um: disable unified memory mode\n");
     printf("-h : shows help\n");
 }
 
@@ -653,6 +769,10 @@ bool readArgs(int argc, char** argv) {
             CHECKSUM.SetBase16((char*)string(argv[a]).c_str());
             IS_CHECKSUM = true;
         }
+        else if (strcmp(argv[a], "-disable-um") == 0) {
+            unifiedMemory = false;
+            printf("unified memory mode disabled\n");
+        }
         a++;
     }
 
@@ -705,6 +825,9 @@ void listDevices() {
         cudaGetDeviceProperties(&prop, i);
         printf("Device Number: %d\n", i);
         printf(" %s\n", prop.name);
+        if (prop.managedMemory == 0) {
+            printf(" unified memory not supported\n");
+        }
         printf(" %2d procs\n", prop.multiProcessorCount);
         printf(" maxThreadsPerBlock: %2d\n", prop.maxThreadsPerBlock);
         printf(" version major.minor: %d.%d\n\n", prop.major, prop.minor);
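Managed-memory support can also be queried per attribute, without filling an entire `cudaDeviceProp`; a minimal sketch (illustrative, not part of this patch):

```cpp
#include <cstdio>
#include <cuda_runtime.h>

int main() {
    int deviceCount = 0;
    cudaGetDeviceCount(&deviceCount);
    for (int d = 0; d < deviceCount; d++) {
        int managed = 0;
        // 1 if the device supports allocations made with cudaMallocManaged
        cudaDeviceGetAttribute(&managed, cudaDevAttrManagedMemory, d);
        printf("Device %d: unified memory %ssupported\n", d, managed ? "" : "not ");
    }
    return 0;
}
```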