From 713b76822b6e646b9c28b06d93447c7aa1913acb Mon Sep 17 00:00:00 2001
From: PawelGorny
Date: Fri, 6 May 2022 10:19:47 +0200
Subject: [PATCH] v0.5.5, minor changes, doc update

---
 README.md                | 17 +++++++++++++----
 WifSolverCuda/Worker1.cu | 33 +++++++++++++++++----------------
 WifSolverCuda/main.cu    | 12 ++++++------
 docs/examples.txt        |  7 +++++++
 4 files changed, 43 insertions(+), 26 deletions(-)

diff --git a/README.md b/README.md
index c51062c..6fc5199 100644
--- a/README.md
+++ b/README.md
@@ -71,6 +71,15 @@ Similar test for compressed WIF (target _Kzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz
     -stride 7479027ea100 -c -rangeStart 8070cfa0d40309798a5bd144a396478b5b5ae3305b7413601b18767654f1108a02787692623a -a 1PzaLTZS3J3HqGfsa8Z2jfkCT1QpSMVunD
 
 For other of examples please see the file /docs/examples.txt.
 
+It is also possible to use the parameters -wifStart and -wifEnd to define the range.
+
+    -stride 7479027ea100 -u -wifStart 5KKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKK11111KKKSmnqY -a 19NzcPZvZMSNQk8sDbSiyjeKpEVpaS1212
+
+or a test for a compressed WIF, with start and end of the range:
+
+    -c -wifStart L5KKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKK1111111KKKJsczi8wg -stride 15ac264554f032800 -wifEnd L5KKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKzzzzzzzKKKJsczi8wg -a 1M9JqAMbCBwtvNLedwNG4TjiPsxATJrvxq
+
+
 Program supports legacy addresses (1...), native Segwit/bench32 P2WPKH (bc1...) and P2WPKH-P2SH addresses (3...).
@@ -89,15 +98,15 @@ Performance
 -----------
 User should modify number of blocks and number of threads in each block to find values which are the best for his card. Number of tests performed by each thread also could have impact of global performance/latency.
-Test card: RTX3060 (eGPU!) with 224 BLOCKS & 512 BLOCK_THREADS (program default values) checks around 10000 MKey/s for compressed address with missing characters in the middle (collision with checksum) and around 1400-1540 Mkey/s for missing beginning (20000steps/thread); other results (using default values of blocks, threads and steps per thread):
+Test card: RTX3060 (eGPU!) with 224 BLOCKS & 640 BLOCK_THREADS (program default values) checks around 10000 MKey/s for a compressed address with missing characters in the middle (collision with checksum) and around 1400 MKey/s for a missing beginning (20000 steps/thread); other results (using default values of blocks, threads and steps per thread):
 
 | card          | perf Mkey/s, missing beginning
 |---------------|---------------------|
-| RTX 3060 eGPU | 1520 (224/512/20000)|
+| RTX 3060 eGPU | 1400 (224/512/20000)|
 | RTX 3070      | 2200 (414/640/5000) |
 | RTX 3090      | 3950 (656/640/5000) |
-| RTX 3080TI    | 4090 (640/640/5000) |
-| RTX A6000     | 4070 (588/640/5000) |
+| RTX 3080TI    | 4000 (640/640/5000) |
+| RTX A6000     | 4000 (588/640/5000) |
 | GTX 1070      | 950 (135/768/5000)  |
 
 Please consult official Nvidia Occupancy Calculator (https://docs.nvidia.com/cuda/cuda-occupancy-calculator/index.html) to see how to select desired amount of threads/block (shared memory=0, registers per thread = 48). Adjust number of steps per thread to obtain the optimal performance.
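The README's tuning advice above (pick BLOCKS and BLOCK_THREADS per card, then adjust steps per thread) can also be approached programmatically. Below is a minimal CUDA sketch, not part of the patch, that asks the runtime's occupancy API how many blocks of a candidate size fit on one SM and scales that by the SM count; dummyKernel is a hypothetical stand-in for the project's kernels, and the printed values are only starting points for the manual search the README describes.

#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical stand-in for a real kernel such as kernelCompressed; occupancy
// depends on the compiled kernel's registers and shared memory, so a real
// measurement should query the actual kernel symbol instead.
__global__ void dummyKernel() { }

int main() {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);

    const int blockThreads = 640;   // candidate BLOCK_THREADS value
    int blocksPerSm = 0;
    // How many resident blocks of this size fit on a single SM (0 bytes of
    // dynamic shared memory, registers taken from the compiled kernel).
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSm, dummyKernel, blockThreads, 0);

    // Enough blocks to keep every SM busy - a starting point for BLOCKS.
    int blocks = blocksPerSm * prop.multiProcessorCount;
    printf("SMs=%d, blocks/SM=%d -> try BLOCKS=%d, BLOCK_THREADS=%d\n",
           prop.multiProcessorCount, blocksPerSm, blocks, blockThreads);
    return 0;
}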
diff --git a/WifSolverCuda/Worker1.cu b/WifSolverCuda/Worker1.cu
index 549a55a..5431213 100644
--- a/WifSolverCuda/Worker1.cu
+++ b/WifSolverCuda/Worker1.cu
@@ -156,8 +156,8 @@ __global__ void kernelCompressed(uint32_t* unifiedResult, bool* isResultFlag, ui
                 _blockResults[resIx] = resultIx;
                 if (!wasResult) {
                     _blockResultFlag[0] = true;
-                }
-                wasResult = true;
+                    wasResult = true;
+                }
                 resIx += blockDim.x;
             }
         _add(_start, _stride);
@@ -183,8 +183,8 @@ __global__ void kernelCompressed(uint32_t* unifiedResult, bool* isResultFlag, ui
                 _blockResults[resIx] = resultIx;
                 if (!wasResult) {
                     _blockResultFlag[0] = true;
+                    wasResult = true;
                 }
-                wasResult = true;
                 resIx += blockDim.x;
             }
         _add(_start, _stride);
@@ -192,16 +192,17 @@ __global__ void kernelCompressed(uint32_t* unifiedResult, bool* isResultFlag, ui
     summaryShared(unifiedResult, isResultFlag);
 }
 
-__device__ void initShared() {
+__device__ __inline__ void initShared() {
+    for (int i = threadIdx.x; i < blockDim.x * 4;) {
+        _blockResults[i] = UINT32_MAX;
+        i += blockDim.x;
+    }
     if (threadIdx.x == 0) {
-        _blockResultFlag[0] = false;
-        for (int i = 0; i < blockDim.x * 4; i++) {
-            _blockResults[i] = UINT32_MAX;
-        }
+        _blockResultFlag[0] = false;
     }
     __syncthreads();
 }
-__device__ void summaryShared(uint32_t* unifiedResult, bool* isResultFlag) {
+__device__ __inline__ void summaryShared(uint32_t* unifiedResult, bool* isResultFlag) {
     __syncthreads();
     if (_blockResultFlag[0] && threadIdx.x == 0) {
         isResultFlag[0] = true;
@@ -214,7 +215,7 @@ __device__ void summaryShared(uint32_t* unifiedResult, bool* isResultFlag) {
     }
 }
 
-__device__ bool _checksumDoubleSha256CheckCompressed(unsigned int checksum, beu32* d_hash, uint64_t* _start) {
+__device__ __inline__ bool _checksumDoubleSha256CheckCompressed(unsigned int checksum, beu32* d_hash, uint64_t* _start) {
     sha256Kernel(d_hash,
         _start[4] >> 16,
         (_start[4] & 0x0000ffff) << 16 | _start[3] >> 48,
@@ -236,7 +237,7 @@ __device__ bool _checksumDoubleSha256CheckCompressed(unsigned int checksum, beu3
     return _checksumDoubleSha256(checksum, d_hash);
 }
 
-__device__ bool _checksumDoubleSha256CheckUncompressed(unsigned int checksum, beu32* d_hash, uint64_t* _start) {
+__device__ __inline__ bool _checksumDoubleSha256CheckUncompressed(unsigned int checksum, beu32* d_hash, uint64_t* _start) {
     sha256Kernel(d_hash,
         _start[4] >> 8,
         (_start[4] & 0x000000ff) << 24 | _start[3] >> 40,
@@ -258,14 +259,14 @@ __device__ bool _checksumDoubleSha256CheckUncompressed(unsigned int checksum, be
     return _checksumDoubleSha256(checksum, d_hash);
 }
 
-__device__ bool _checksumDoubleSha256(unsigned int checksum, beu32* d_hash) {
+__device__ __inline__ bool _checksumDoubleSha256(unsigned int checksum, beu32* d_hash) {
     sha256Kernel(d_hash,
         d_hash[0], d_hash[1], d_hash[2], d_hash[3],
         d_hash[4], d_hash[5], d_hash[6], d_hash[7],
         0x80000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x100);
     return (checksum == d_hash[0]);
 }
 
-__device__ void sha256Kernel(beu32* const hash, C16(COMMA, EMPTY)) {
+__device__ __inline__ void sha256Kernel(beu32* const hash, C16(COMMA, EMPTY)) {
 #undef H
 #define H(i,alpha,magic) beu32 hout##i;
@@ -288,16 +289,16 @@ __device__ void sha256Kernel(beu32* const hash, C16(COMMA, EMPTY)) {
     H8(EMPTY, EMPTY);
 }
 
-__device__ void _add(uint64_t* C, uint64_t* A) {
+__device__ __inline__ void _add(uint64_t* C, uint64_t* A) {
     __Add1(C, A);
 }
 
-__device__ void _load(uint64_t* C, uint64_t* A) {
+__device__ __inline__ void _load(uint64_t* C, uint64_t* A) {
     __Load(C, A);
 }
 
-__device__ void IMult(uint64_t* r, uint64_t* a, int64_t b) {
+__device__ __inline__ void IMult(uint64_t* r, uint64_t* a, int64_t b) {
     uint64_t t[NBBLOCK];
     // Make b positive
     int64_t msk = b >> 63;
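The initShared rewrite above turns a serial clear done by thread 0 into a block-stride loop in which every thread clears a slice of the shared collector before the __syncthreads(). A standalone sketch of that pattern, with hypothetical names and sizes rather than the project's own:

#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

#define DEMO_THREADS 64   // hypothetical block size, 4 result slots per thread

__global__ void blockStrideInitDemo(uint32_t* out) {
    __shared__ uint32_t results[DEMO_THREADS * 4];
    __shared__ bool flag;

    // Every thread clears the slots i, i+blockDim.x, i+2*blockDim.x, ...,
    // so the whole array is initialised cooperatively instead of by thread 0.
    for (int i = threadIdx.x; i < DEMO_THREADS * 4; i += blockDim.x) {
        results[i] = UINT32_MAX;
    }
    if (threadIdx.x == 0) {
        flag = false;     // the single per-block flag stays with one thread
    }
    __syncthreads();      // all threads now see the cleared collector

    for (int i = threadIdx.x; i < DEMO_THREADS * 4; i += blockDim.x) {
        out[i] = flag ? 0 : results[i];   // copy back so the host can verify
    }
}

int main() {
    const int n = DEMO_THREADS * 4;
    uint32_t* d_out = nullptr;
    cudaMalloc(&d_out, n * sizeof(uint32_t));
    blockStrideInitDemo<<<1, DEMO_THREADS>>>(d_out);

    uint32_t h[n];
    cudaMemcpy(h, d_out, sizeof(h), cudaMemcpyDeviceToHost);
    printf("first=%u last=%u\n", h[0], h[n - 1]);   // both print 4294967295
    cudaFree(d_out);
    return 0;
}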
diff --git a/WifSolverCuda/main.cu b/WifSolverCuda/main.cu
index ebdbc9b..9880980 100644
--- a/WifSolverCuda/main.cu
+++ b/WifSolverCuda/main.cu
@@ -81,7 +81,7 @@ Secp256K1* secp;
 
 int main(int argc, char** argv)
 {
-    printf("WifSolver 0.5.4\n\n");
+    printf("WifSolver 0.5.5\n\n");
     printf("Use parameter '-h' for help and list of available parameters\n\n");
 
     if (argc <=1 || readArgs(argc, argv)) {
@@ -218,18 +218,18 @@ cudaError_t processCudaUnified() {
         cudaStatus = cudaMemcpyAsync(dev_buffRangeStart, buffRangeStart, RANGE_TRANSFER_SIZE, cudaMemcpyHostToDevice);
 
         //verify the last results
-        if (buffIsResultManaged[0]) {
+        if (buffIsResultManaged[0]) {
             buffIsResultManaged[0] = false;
             for (int i = 0; i < COLLECTOR_SIZE_MM && !RESULT; i++) {
-                if (buffResultManaged[i] != UINT32_MAX) {
+                if (buffResultManaged[i] != UINT32_MAX) {
                     Int toTest = new Int(&rangeTestStart);
                     Int diff = new Int(&STRIDE);
                     diff.Mult(buffResultManaged[i]);
                     toTest.Add(&diff);
                     processCandidate(toTest);
-                    buffResultManaged[i] = UINT32_MAX;
-                }
-            }
+                    buffResultManaged[i] = UINT32_MAX;
+                }
+            }
         }//test
     }//while loop
diff --git a/docs/examples.txt b/docs/examples.txt
index 80f0142..1d23257 100644
--- a/docs/examples.txt
+++ b/docs/examples.txt
@@ -10,6 +10,9 @@
 5KKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKzzzzzzKKKSmnqY 80 c59cb0997ad73f7bf8621b1955caf80b304ded0a48e5b8f28c7e51edc840934a bae9d183
 
 -u -rangeStart 80c59cb0997ad73f7bf8621b1955caf80b304ded0a48e5b8f28c7a4990fb3b6d3fffda3283 -stride 7479027ea100 -rangeEnd 80c59cb0997ad73f7bf8621b1955caf80b304ded0a48e5b8f28c7e51edc840934abae9d183 -a 19NzcPZvZMSNQk8sDbSiyjeKpEVpaS1212
+
+-u -wifStart 5KKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKK111111KKKSmnqY -stride 7479027ea100 -wifEnd 5KKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKzzzzzzKKKSmnqY -a 19NzcPZvZMSNQk8sDbSiyjeKpEVpaS1212
+
 -------------------------------------------------------------------------------
 5K______KKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKSmnqY
 ^ 58^43 = edbafda67ca37188cf28263571f03b9716879e4acc9c514ab67280000000000
@@ -29,6 +32,8 @@
 L5KKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKK111111KKKJsczi8wg 80 f1953d849c8450c9d7c9a6beaf9d565366137e487e5c45d407385c0106810981 b2 70f18f37
 
 -c -rangeStart 80f1953d849c8450c9d7c9a6beaf9d565366137e487e5c45d407385c0106810981b270f18f37 -stride 15ac264554f032800 -a 1M9JqAMbCBwtvNLedwNG4TjiPsxATJrvxq
+
+-c -wifStart L5KKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKK111111KKKJsczi8wg -stride 15ac264554f032800 -a 1M9JqAMbCBwtvNLedwNG4TjiPsxATJrvxq
 -------------------------------------------------------------------------------
 L5KKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKK_______KKKJsczi8wg
 L5KKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKK1111111KKKJsczi8wg
@@ -36,6 +41,8 @@ L5KKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKK1111111KKKJsczi8wg
 L5KKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKzzzzzzzKKKJsczi8wg 80 f1953d849c8450c9d7c9a6beaf9d565366137e487e5c45d5e775395f71b6322d b6 7efe6737
 
 -c -rangeStart 80f1953d849c8450c9d7c9a6beaf9d565366137e487e5c45d32f1cf86355a6ac8723d3dd8f37 -stride 15ac264554f032800 -rangeEnd 80f1953d849c8450c9d7c9a6beaf9d565366137e487e5c45d5e775395f71b6322db67efe6737 -a 1M9JqAMbCBwtvNLedwNG4TjiPsxATJrvxq
+
+-c -wifStart L5KKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKK1111111KKKJsczi8wg -stride 15ac264554f032800 -wifEnd L5KKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKzzzzzzzKKKJsczi8wg -a 1M9JqAMbCBwtvNLedwNG4TjiPsxATJrvxq
 -------------------------------------------------------------------------------
 L5K______KKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKJsczi8wg
 ^ 58^43 = edbafda67ca37188cf28263571f03b9716879e4acc9c514ab67280000000000
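The new examples pair each -wifStart/-wifEnd string with the -rangeStart/-rangeEnd hex used earlier in the same file: plain Base58 decoding of the WIF yields the 0x80-prefixed key bytes followed by the 4 checksum bytes, and the hex of those bytes is the range boundary. A small C++ sketch of that decoding, not part of the patch (the Base58 routine here is a generic schoolbook implementation, not the project's):

#include <cstdio>
#include <cstring>
#include <string>
#include <vector>

static const char* B58 = "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz";

// Decode a Base58 string into big-endian bytes (multiply-by-58, add digit).
std::vector<unsigned char> base58Decode(const std::string& s) {
    std::vector<unsigned char> out;
    for (char c : s) {
        const char* p = strchr(B58, c);
        if (!p) return {};                            // invalid character
        int carry = static_cast<int>(p - B58);
        for (int i = static_cast<int>(out.size()) - 1; i >= 0; --i) {
            carry += 58 * out[i];
            out[i] = carry & 0xff;
            carry >>= 8;
        }
        while (carry) { out.insert(out.begin(), carry & 0xff); carry >>= 8; }
    }
    // leading '1' characters represent leading zero bytes
    for (size_t i = 0; i < s.size() && s[i] == '1'; ++i) out.insert(out.begin(), 0);
    return out;
}

int main() {
    // WIF from the first uncompressed example in docs/examples.txt
    std::string wif = "5KKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKzzzzzzKKKSmnqY";
    for (unsigned char b : base58Decode(wif)) printf("%02x", b);
    printf("\n");   // should print the 0x80-prefixed hex listed next to this WIF,
                    // i.e. the value used as -rangeEnd in the first example
    return 0;
}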