From 46221a2c3e1a53c0a7b888589fd72a91cf26b88e Mon Sep 17 00:00:00 2001
From: PawelGorny <pawel@pawelgorny.com>
Date: Thu, 20 Jan 2022 15:05:47 +0100
Subject: [PATCH] v 0.3, checksum, speed improvements

---
 README.md                |  8 ++--
 WifSolverCuda/Worker.cuh |  7 +--
 WifSolverCuda/Worker1.cu | 93 +++++++++++++++++++++++++++++++++-------
 WifSolverCuda/main.cu    | 69 +++++++++++++++++++++--------
 4 files changed, 136 insertions(+), 41 deletions(-)

diff --git a/README.md b/README.md
index e957d6d..83b2bbb 100644
--- a/README.md
+++ b/README.md
@@ -64,15 +64,13 @@ Program was prepared using CUDA 11.6 - for other version manual change in VS pro
 Performance
 -----------
 One's must modify number of blocks and number of threads in each block to find the ones which are the best for his card. Number of test performed by each thread also could have impact of global performance/latency.  
-Test card: RTX3060 (-b 224 -t 512 -s 3364) checks around 1000Mkey/s for uncompressed address and around 2100 MKey/s for compressed address. 
-Example above extended to 7 missing characters was solved in 12 minutes (uncompressed starting key: 80c59cb0997ad73f7bf8621b1955caf80b304ded0a48e5b8f28c31b30a90d68ffcabd9b283); work done = 19 * 6 missing characters.
-With the same configuration of the card but for compressed address, the full range of 7 missing characters was solved in 18 minutes (compressed starting key: 8070cfa0d40309798a5bd144a396478b5b5ae3305b7413601b18758c81b73fb371a1c9d1823a).
+Test card: RTX3060 (-b 224 -t 512 -s 3364) checks around 3600 MKey/s for compressed address with missing characters in the middle and around 1300Mkey/s for other cases.
 
        
 TODO
 ----
 * code cleaning, review of hash functions
 * build configuration for Linux
-* solver for missing characters at the left side (with a known expected checksum)
 * predefined custom step (using list of possible characters)
-* reading configuration from file
\ No newline at end of file
+* reading configuration from file
+* build-in stride calculcater
\ No newline at end of file
diff --git a/WifSolverCuda/Worker.cuh b/WifSolverCuda/Worker.cuh
index 0d97057..932ba39 100644
--- a/WifSolverCuda/Worker.cuh
+++ b/WifSolverCuda/Worker.cuh
@@ -7,9 +7,10 @@
 #include "lib/Math.cuh"
 
 
-__global__ void kernelUncompressed(bool* buffResult, uint64_t* buffRangeStart, uint64_t* buffStride, const int threadNumberOfChecks);
-__global__ void kernelCompressed(bool* buffResult, uint64_t* buffRangeStart, uint64_t* buffStride, const int threadNumberOfChecks);
-
+__global__ void kernelUncompressed(bool* buffResult, bool* buffCollectorWork, uint64_t* buffRangeStart, uint64_t* buffStride, const int threadNumberOfChecks);
+__global__ void kernelCompressed(bool* buffResult, bool* buffCollectorWork, uint64_t* buffRangeStart, uint64_t* buffStride, const int threadNumberOfChecks);
+__global__ void kernelUncompressed(bool* buffResult, bool* buffCollectorWork, uint64_t* buffRangeStart, uint64_t* buffStride, const int threadNumberOfChecks, const uint32_t checksum);
+__global__ void kernelCompressed(bool* buffResult, bool* buffCollectorWork, uint64_t* buffRangeStart, uint64_t* buffStride, const int threadNumberOfChecks, const uint32_t checksum);
 __global__ void resultCollector(bool* buffResult, uint64_t* buffCombinedResult, const uint64_t threadNumberOfChecks);
 
 __device__ bool _checksumDoubleSha256CheckUncompressed(unsigned int checksum, beu32* d_hash, uint64_t* _start);
diff --git a/WifSolverCuda/Worker1.cu b/WifSolverCuda/Worker1.cu
index 9ee43c9..b7a7c39 100644
--- a/WifSolverCuda/Worker1.cu
+++ b/WifSolverCuda/Worker1.cu
@@ -1,7 +1,62 @@
 #include "Worker.cuh"
 
-//todo collector preparer _shared
-__global__ void kernelUncompressed(bool* buffResult, uint64_t* buffRangeStart, uint64_t* buffStride, const int threadNumberOfChecks) {
+__global__ void kernelUncompressed(bool* buffResult, bool* buffCollectorWork, uint64_t* buffRangeStart, uint64_t* buffStride, const int threadNumberOfChecks, const uint32_t checksum) {
+
+    uint64_t _stride[5];
+    uint64_t _start[5];
+    uint64_t _startStride[5];
+    _load(_start, buffRangeStart);
+    _load(_stride, buffStride);
+
+    int64_t tIx = (threadIdx.x + blockIdx.x * blockDim.x) * threadNumberOfChecks;
+    IMult(_startStride, _stride, tIx);
+    _add(_start, _startStride);
+    beu32 d_hash[8];
+    bool tempResult = false;
+    bool setHit = false;
+    for (uint64_t i = 0, resultIx = tIx; i < threadNumberOfChecks; i++, resultIx++) {
+        tempResult = _checksumDoubleSha256CheckUncompressed(checksum, d_hash, _start);
+        buffResult[resultIx] = tempResult;
+        _add(_start, _stride);
+        if (tempResult && !setHit) {
+            setHit = true;
+        }
+    }
+    if (setHit) {
+        buffCollectorWork[0] = true;
+    }
+}
+__global__ void kernelCompressed(bool* buffResult, bool* buffCollectorWork, uint64_t* buffRangeStart, uint64_t* buffStride, const int threadNumberOfChecks, const uint32_t checksum) {
+    uint64_t _stride[5];
+    uint64_t _start[5];
+    uint64_t _startStride[5];
+    _load(_start, buffRangeStart);
+    _load(_stride, buffStride);
+
+    int64_t tIx = (threadIdx.x + blockIdx.x * blockDim.x) * threadNumberOfChecks;
+    IMult(_startStride, _stride, tIx);
+    _add(_start, _startStride);
+    beu32 d_hash[8];
+    bool tempResult = false;
+    bool setHit = false;
+    for (uint64_t i = 0, resultIx = tIx; i < threadNumberOfChecks; i++, resultIx++) {
+        if (((_start[0] & 0xff00000000) >> 32) != 0x01) {
+            _add(_start, _stride);
+            buffResult[resultIx] = false;
+            continue;
+        }
+        tempResult = _checksumDoubleSha256CheckCompressed(checksum, d_hash, _start);
+        buffResult[resultIx] = tempResult;
+        _add(_start, _stride);
+        if (tempResult && !setHit) {
+            setHit = true;
+        }
+    }
+    if (setHit) {
+        buffCollectorWork[0] = true;
+    }
+}
+__global__ void kernelUncompressed(bool* buffResult, bool* buffCollectorWork, uint64_t* buffRangeStart, uint64_t* buffStride, const int threadNumberOfChecks) {
 	uint64_t _stride[5];
 	uint64_t _start[5];
     uint64_t _startStride[5];
@@ -12,17 +67,21 @@ __global__ void kernelUncompressed(bool* buffResult, uint64_t* buffRangeStart, u
     IMult(_startStride, _stride, tIx);
     _add(_start, _startStride);
     beu32 d_hash[8];
-
+    bool tempResult = false;
+    bool setHit = false;
 	for (uint64_t i = 0, resultIx = tIx ; i < threadNumberOfChecks; i++, resultIx++) {
-        unsigned int checksum = _start[0] & 0xffffffff;
-        buffResult[resultIx] = false;
-        if (_checksumDoubleSha256CheckUncompressed(checksum, d_hash, _start)) {
-            buffResult[resultIx] = true;
-        }		
+        tempResult = _checksumDoubleSha256CheckUncompressed(_start[0] & 0xffffffff, d_hash, _start);
+        buffResult[resultIx] = tempResult;
 		_add(_start, _stride);
+        if (tempResult && !setHit) {
+            setHit = true;
+        }
 	}
+    if (setHit) {
+        buffCollectorWork[0] = true;
+    }
 }
-__global__ void kernelCompressed(bool* buffResult, uint64_t* buffRangeStart, uint64_t* buffStride, const int threadNumberOfChecks) {
+__global__ void kernelCompressed(bool* buffResult, bool* buffCollectorWork, uint64_t* buffRangeStart, uint64_t* buffStride, const int threadNumberOfChecks) {
     uint64_t _stride[5];
     uint64_t _start[5];
     uint64_t _startStride[5];
@@ -33,19 +92,23 @@ __global__ void kernelCompressed(bool* buffResult, uint64_t* buffRangeStart, uin
     IMult(_startStride, _stride, tIx);
     _add(_start, _startStride);
     beu32 d_hash[8];
-
+    bool tempResult = false;
+    bool setHit = false;
     for (uint64_t i = 0, resultIx = tIx; i < threadNumberOfChecks; i++, resultIx++) {
         if (((_start[0] & 0xff00000000) >> 32) != 0x01) {
             _add(_start, _stride);
             buffResult[resultIx] = false;
             continue;
         }
-        unsigned int checksum = _start[0] & 0xffffffff;    
-        buffResult[resultIx] = false;
-        if (_checksumDoubleSha256CheckCompressed(checksum, d_hash, _start)) {
-            buffResult[resultIx] = true;
-        }   
+        tempResult = _checksumDoubleSha256CheckCompressed(_start[0] & 0xffffffff, d_hash, _start);
+        buffResult[resultIx] = tempResult;
         _add(_start, _stride);
+        if (tempResult && !setHit) {
+            setHit = true;
+        }
+    }
+    if (setHit) {
+        buffCollectorWork[0] = true;
     }
 }
 
diff --git a/WifSolverCuda/main.cu b/WifSolverCuda/main.cu
index 928c8d9..453b266 100644
--- a/WifSolverCuda/main.cu
+++ b/WifSolverCuda/main.cu
@@ -38,6 +38,9 @@ Int STRIDE, RANGE_START, RANGE_END;
 Int loopStride;
 Int counter;
 string TARGET_ADDRESS = "";
+Int CHECKSUM;
+bool IS_CHECKSUM = false;
+
 
 bool RESULT = false;
 bool useCollector = false;
@@ -54,7 +57,7 @@ Secp256K1* secp;
 
 int main(int argc, char** argv)
 {    
-    printf("WifSolver 0.2\n\n");
+    printf("WifSolver 0.3\n\n");
 
     if (readArgs(argc, argv)) {
         showHelp(); 
@@ -101,8 +104,8 @@ cudaError_t processCuda() {
 
     cudaStatus = cudaMalloc((void**)&dev_buffRangeStart, NB64BLOCK * sizeof(uint64_t));
     cudaStatus = cudaMalloc((void**)&dev_buffStride, NB64BLOCK * sizeof(uint64_t));
-    cudaStatus = cudaMemcpy(dev_buffStride, buffStride, NB64BLOCK * sizeof(uint64_t), cudaMemcpyHostToDevice);
-    
+    cudaStatus = cudaMemcpy(dev_buffStride, buffStride, NB64BLOCK * sizeof(uint64_t), cudaMemcpyHostToDevice);      
+
     bool* buffDeviceResult = new bool[outputSize];
     bool* dev_buffDeviceResult = new bool[outputSize];
     cudaStatus = cudaMalloc((void**)&dev_buffDeviceResult, outputSize * sizeof(bool));
@@ -114,11 +117,19 @@ cudaError_t processCuda() {
     cudaStatus = cudaMalloc((void**)&dev_buffResult, COLLECTOR_SIZE * sizeof(uint64_t));
     cudaStatus = cudaMemcpy(dev_buffResult, buffResult, COLLECTOR_SIZE * sizeof(uint64_t), cudaMemcpyHostToDevice);
 
+    bool* buffCollectorWork = new bool[1];
+    buffCollectorWork[0] = false;
+    bool* dev_buffCollectorWork = new bool[1];
+    cudaStatus = cudaMalloc((void**)&dev_buffCollectorWork, 1 * sizeof(bool));
+    cudaStatus = cudaMemcpy(dev_buffCollectorWork, buffCollectorWork, 1 * sizeof(bool), cudaMemcpyHostToDevice);
+
+    const uint32_t expectedChecksum = IS_CHECKSUM ? CHECKSUM.GetInt32() : 0;
+
     uint64_t counter = 0;
     int counterSaveFile = 0;
         
     std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();
-
+    
     while (!RESULT && RANGE_START.IsLower(&RANGE_END)) {
 
         //prepare launch
@@ -126,10 +137,19 @@ cudaError_t processCuda() {
         cudaStatus = cudaMemcpy(dev_buffRangeStart, buffRangeStart, NB64BLOCK * sizeof(uint64_t), cudaMemcpyHostToDevice);
         //launch work
         if (COMPRESSED) {
-            kernelCompressed << <BLOCK_NUMBER, BLOCK_THREADS >> > (dev_buffDeviceResult, dev_buffRangeStart, dev_buffStride, THREAD_STEPS);
+            if (IS_CHECKSUM) {
+                kernelCompressed << <BLOCK_NUMBER, BLOCK_THREADS >> > (dev_buffDeviceResult, dev_buffCollectorWork, dev_buffRangeStart, dev_buffStride, THREAD_STEPS, expectedChecksum);
+            }else{
+                kernelCompressed << <BLOCK_NUMBER, BLOCK_THREADS >> > (dev_buffDeviceResult, dev_buffCollectorWork, dev_buffRangeStart, dev_buffStride, THREAD_STEPS);
+            }            
         }
         else {            
-            kernelUncompressed << <BLOCK_NUMBER, BLOCK_THREADS >> > (dev_buffDeviceResult, dev_buffRangeStart, dev_buffStride, THREAD_STEPS);
+            if (IS_CHECKSUM) {
+                kernelUncompressed << <BLOCK_NUMBER, BLOCK_THREADS >> > (dev_buffDeviceResult, dev_buffCollectorWork, dev_buffRangeStart, dev_buffStride, THREAD_STEPS, expectedChecksum);
+            }else{
+                kernelUncompressed << <BLOCK_NUMBER, BLOCK_THREADS >> > (dev_buffDeviceResult, dev_buffCollectorWork, dev_buffRangeStart, dev_buffStride, THREAD_STEPS);
+            }
+            
         }        
         cudaStatus = cudaGetLastError();
         if (cudaStatus != cudaSuccess) {
@@ -144,8 +164,11 @@ cudaError_t processCuda() {
         //std::cout << "Time difference = " << std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - begin).count() << "[ms]" << std::endl;
 
         if (useCollector) {
-            //summarize results
-            bool anyResult = true;
+            //summarize results            
+            cudaStatus = cudaMemcpy(buffCollectorWork, dev_buffCollectorWork, 1 * sizeof(bool), cudaMemcpyDeviceToHost);
+            bool anyResult = buffCollectorWork[0];
+            buffCollectorWork[0] = false;
+            cudaStatus = cudaMemcpy(dev_buffCollectorWork, buffCollectorWork, 1 * sizeof(bool), cudaMemcpyHostToDevice);
             while (anyResult) {
                 resultCollector << <BLOCK_NUMBER, 1 >> > (dev_buffDeviceResult, dev_buffResult, THREAD_STEPS * BLOCK_THREADS);
                 cudaStatus = cudaGetLastError();
@@ -196,7 +219,7 @@ cudaError_t processCuda() {
         //std::cout << "Time difference = " << std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - begin).count() << "[ms]" << std::endl;        
         RANGE_START.Add(&loopStride);
         counter += outputSize;
-        int32_t t = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - begin).count();
+        uint64_t t = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - begin).count();
         if ( t > 5000) {
             double speed = (double)((double)counter / (t/1000) ) / 1000000.0;            
             std::string speedStr;
@@ -229,6 +252,7 @@ Error:
     cudaFree(dev_buffDeviceResult);
     cudaFree(dev_buffRangeStart);
     cudaFree(dev_buffStride);
+    cudaFree(dev_buffCollectorWork);
     return cudaStatus;
 }
 
@@ -268,8 +292,8 @@ void processCandidate(Int &toTest) {
 
 void printConfig() {
     printf("Range start: %s\n", RANGE_START.GetBase16().c_str());
-    printf( "Range end  : %s\n", RANGE_END.GetBase16().c_str());
-    printf( "Stride     : %s\n", STRIDE.GetBase16().c_str());
+    printf("Range end  : %s\n", RANGE_END.GetBase16().c_str());
+    printf("Stride     : %s\n", STRIDE.GetBase16().c_str());
     if (!TARGET_ADDRESS.empty()) {
         printf( "Target     : %s\n", TARGET_ADDRESS.c_str());
     }
@@ -277,6 +301,9 @@ void printConfig() {
         printf("Target COMPRESSED\n");
     }    else    {
         printf("Target UNCOMPRESSED\n");
+    }   
+    if (IS_CHECKSUM) {
+        printf("Checksum   : %s\n", CHECKSUM.GetBase16().c_str());
     }
     printf( "\n");
     printf( "number of blocks: %d\n", BLOCK_NUMBER);
@@ -320,16 +347,17 @@ bool checkDevice() {
 void showHelp() {    
     printf("WifSolverCuda [-d deviceId] [-b NbBlocks] [-t NbThreads] [-s NbThreadChecks]\n");
     printf("    [-fresultp reportFile] [-fresult resultFile] [-fstatus statusFile] [-a targetAddress]\n");
-    printf("    -rangeStart hexKeyStart -rangeEnd hexKeyEnd -stride hexKeyStride\n\n");
+    printf("    -stride hexKeyStride -rangeStart hexKeyStart [-rangeEnd hexKeyEnd] [-checksum hexChecksum] \n\n");
     printf("-rangeStart hexKeyStart: decoded initial key with compression flag and checksum \n");
     printf("-rangeEnd hexKeyEnd: decoded end key with compression flag and checksum \n");
+    printf("-checksum hexChecksum: decoded checksum, cannot be modified with a stride  \n");
     printf("-stride hexKeyStride: full stride calculated as 58^(missing char index) \n");
     printf("-fresult resultFile: file for final result (default: %s)\n", fileResult.c_str());
     printf("-fresultp reportFile: file for each WIF with correct checksum (default: %s)\n", fileResultPartial.c_str());
     printf("-fstatus statusFile: file for periodically saved status (default: %s) \n", fileStatus.c_str());
     printf("-d deviceId: default 0\n");
-    printf("-c search for compressed address\n");
-    printf("-u search for uncompressed address (default)\n");
+    printf("-c : search for compressed address\n");
+    printf("-u : search for uncompressed address (default)\n");
     printf("-b NbBlocks: default processorCount * 12\n");
     printf("-t NbThreads: default deviceMax / 4\n");
     printf("-s NbThreadChecks: default 3364\n");
@@ -344,12 +372,12 @@ bool readArgs(int argc, char** argv) {
     while (a < argc) {
         if (strcmp(argv[a], "-h") == 0) {
             return true;
-        }else
-        if (strcmp(argv[a], "-d") == 0) {
+        }
+        else if (strcmp(argv[a], "-d") == 0) {
             a++;
             DEVICE_NR = strtol(argv[a], NULL, 10);
-        }else
-        if (strcmp(argv[a], "-c") == 0) {
+        }
+        else if (strcmp(argv[a], "-c") == 0) {
             COMPRESSED = true;
         }
         else if (strcmp(argv[a], "-u") == 0) {
@@ -398,6 +426,11 @@ bool readArgs(int argc, char** argv) {
             a++;
             TARGET_ADDRESS = string(argv[a]);
         }
+        else if (strcmp(argv[a], "-checksum") == 0) {
+            a++;
+            CHECKSUM.SetBase16((char*)string(argv[a]).c_str());
+            IS_CHECKSUM = true;
+        }
         a++;
     }