Skip to content

Commit

Permalink
v0.5.5, minor changes, doc update
Browse files Browse the repository at this point in the history
  • Loading branch information
PawelGorny committed May 6, 2022
1 parent 232232d commit 713b768
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 26 deletions.
17 changes: 13 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,15 @@ Similar test for compressed WIF (target _Kzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz
-stride 7479027ea100 -c -rangeStart 8070cfa0d40309798a5bd144a396478b5b5ae3305b7413601b18767654f1108a02787692623a -a 1PzaLTZS3J3HqGfsa8Z2jfkCT1QpSMVunD

For other examples, please see the file /docs/examples.txt.
It is also possible to use parameters -wifStart and -wifEnd for defining ranges.

-stride 7479027ea100 -u -wifStart 5KKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKK11111KKKSmnqY -a 19NzcPZvZMSNQk8sDbSiyjeKpEVpaS1212

or a test for a compressed WIF, with both the start and the end of the range:

-c -wifStart L5KKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKK1111111KKKJsczi8wg -stride 15ac264554f032800 -wifEnd L5KKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKzzzzzzzKKKJsczi8wg -a 1M9JqAMbCBwtvNLedwNG4TjiPsxATJrvxq



Program supports legacy addresses (1...), native Segwit/bech32 P2WPKH (bc1...) and P2WPKH-P2SH addresses (3...).

Expand All @@ -89,15 +98,15 @@ Performance
-----------
Users should adjust the number of blocks and the number of threads per block to find the values that work best for their card. The number of tests performed by each thread can also have an impact on overall performance/latency.

Test card: RTX3060 (eGPU!) with 224 BLOCKS & 512 BLOCK_THREADS (program default values) checks around 10000 MKey/s for compressed address with missing characters in the middle (collision with checksum) and around 1400-1540 Mkey/s for missing beginning (20000steps/thread); other results (using default values of blocks, threads and steps per thread):
Test card: RTX3060 (eGPU!) with 224 BLOCKS & 640 BLOCK_THREADS (program default values) checks around 10000 MKey/s for compressed address with missing characters in the middle (collision with checksum) and around 1400 Mkey/s for missing beginning (20000steps/thread); other results (using default values of blocks, threads and steps per thread):

| card | perf Mkey/s, missing beginning
|---------------|---------------------|
| RTX 3060 eGPU | 1520 (224/512/20000)|
| RTX 3060 eGPU | 1400 (224/512/20000)|
| RTX 3070 | 2200 (414/640/5000) |
| RTX 3090 | 3950 (656/640/5000) |
| RTX 3080TI | 4090 (640/640/5000) |
| RTX A6000 | 4070 (588/640/5000) |
| RTX 3080TI | 4000 (640/640/5000) |
| RTX A6000 | 4000 (588/640/5000) |
| GTX 1070 | 950 (135/768/5000) |

Please consult the official Nvidia Occupancy Calculator (https://docs.nvidia.com/cuda/cuda-occupancy-calculator/index.html) to see how to select the desired number of threads per block (shared memory=0, registers per thread = 48). Adjust the number of steps per thread to obtain optimal performance.
Expand Down
33 changes: 17 additions & 16 deletions WifSolverCuda/Worker1.cu
Original file line number Diff line number Diff line change
Expand Up @@ -156,8 +156,8 @@ __global__ void kernelCompressed(uint32_t* unifiedResult, bool* isResultFlag, ui
_blockResults[resIx] = resultIx;
if (!wasResult) {
_blockResultFlag[0] = true;
}
wasResult = true;
wasResult = true;
}
resIx += blockDim.x;
}
_add(_start, _stride);
Expand All @@ -183,25 +183,26 @@ __global__ void kernelCompressed(uint32_t* unifiedResult, bool* isResultFlag, ui
_blockResults[resIx] = resultIx;
if (!wasResult) {
_blockResultFlag[0] = true;
wasResult = true;
}
wasResult = true;
resIx += blockDim.x;
}
_add(_start, _stride);
}
summaryShared(unifiedResult, isResultFlag);
}

__device__ void initShared() {
__device__ __inline__ void initShared() {
for (int i = threadIdx.x; i < blockDim.x * 4;) {
_blockResults[i] = UINT32_MAX;
i += blockDim.x;
}
if (threadIdx.x == 0) {
_blockResultFlag[0] = false;
for (int i = 0; i < blockDim.x * 4; i++) {
_blockResults[i] = UINT32_MAX;
}
_blockResultFlag[0] = false;
}
__syncthreads();
}
__device__ void summaryShared(uint32_t* unifiedResult, bool* isResultFlag) {
__device__ __inline__ void summaryShared(uint32_t* unifiedResult, bool* isResultFlag) {
__syncthreads();
if (_blockResultFlag[0] && threadIdx.x == 0) {
isResultFlag[0] = true;
Expand All @@ -214,7 +215,7 @@ __device__ void summaryShared(uint32_t* unifiedResult, bool* isResultFlag) {
}
}

__device__ bool _checksumDoubleSha256CheckCompressed(unsigned int checksum, beu32* d_hash, uint64_t* _start) {
__device__ __inline__ bool _checksumDoubleSha256CheckCompressed(unsigned int checksum, beu32* d_hash, uint64_t* _start) {
sha256Kernel(d_hash,
_start[4] >> 16,
(_start[4] & 0x0000ffff) << 16 | _start[3] >> 48,
Expand All @@ -236,7 +237,7 @@ __device__ bool _checksumDoubleSha256CheckCompressed(unsigned int checksum, beu3
return _checksumDoubleSha256(checksum, d_hash);
}

__device__ bool _checksumDoubleSha256CheckUncompressed(unsigned int checksum, beu32* d_hash, uint64_t* _start) {
__device__ __inline__ bool _checksumDoubleSha256CheckUncompressed(unsigned int checksum, beu32* d_hash, uint64_t* _start) {
sha256Kernel(d_hash,
_start[4] >> 8,
(_start[4] & 0x000000ff) << 24 | _start[3] >> 40,
Expand All @@ -258,14 +259,14 @@ __device__ bool _checksumDoubleSha256CheckUncompressed(unsigned int checksum, be
return _checksumDoubleSha256(checksum, d_hash);
}

__device__ bool _checksumDoubleSha256(unsigned int checksum, beu32* d_hash) {
__device__ __inline__ bool _checksumDoubleSha256(unsigned int checksum, beu32* d_hash) {
sha256Kernel(d_hash, d_hash[0], d_hash[1], d_hash[2], d_hash[3], d_hash[4], d_hash[5],
d_hash[6], d_hash[7], 0x80000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x100);
return (checksum == d_hash[0]);
}

__device__ void sha256Kernel(beu32* const hash, C16(COMMA, EMPTY)) {
__device__ __inline__ void sha256Kernel(beu32* const hash, C16(COMMA, EMPTY)) {
#undef H
#define H(i,alpha,magic) beu32 hout##i;

Expand All @@ -288,16 +289,16 @@ __device__ void sha256Kernel(beu32* const hash, C16(COMMA, EMPTY)) {
H8(EMPTY, EMPTY);
}

__device__ void _add(uint64_t* C, uint64_t* A) {
__device__ __inline__ void _add(uint64_t* C, uint64_t* A) {
__Add1(C, A);
}

__device__ void _load(uint64_t* C, uint64_t* A) {
__device__ __inline__ void _load(uint64_t* C, uint64_t* A) {
__Load(C, A);
}


__device__ void IMult(uint64_t* r, uint64_t* a, int64_t b) {
__device__ __inline__ void IMult(uint64_t* r, uint64_t* a, int64_t b) {
uint64_t t[NBBLOCK];
// Make b positive
int64_t msk = b >> 63;
Expand Down
12 changes: 6 additions & 6 deletions WifSolverCuda/main.cu
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ Secp256K1* secp;

int main(int argc, char** argv)
{
printf("WifSolver 0.5.4\n\n");
printf("WifSolver 0.5.5\n\n");
printf("Use parameter '-h' for help and list of available parameters\n\n");

if (argc <=1 || readArgs(argc, argv)) {
Expand Down Expand Up @@ -218,18 +218,18 @@ cudaError_t processCudaUnified() {
cudaStatus = cudaMemcpyAsync(dev_buffRangeStart, buffRangeStart, RANGE_TRANSFER_SIZE, cudaMemcpyHostToDevice);

//verify the last results
if (buffIsResultManaged[0]) {
if (buffIsResultManaged[0]) {
buffIsResultManaged[0] = false;
for (int i = 0; i < COLLECTOR_SIZE_MM && !RESULT; i++) {
if (buffResultManaged[i] != UINT32_MAX) {
if (buffResultManaged[i] != UINT32_MAX) {
Int toTest = new Int(&rangeTestStart);
Int diff = new Int(&STRIDE);
diff.Mult(buffResultManaged[i]);
toTest.Add(&diff);
processCandidate(toTest);
buffResultManaged[i] = UINT32_MAX;
}
}
buffResultManaged[i] = UINT32_MAX;
}
}
}//test
}//while loop

Expand Down
7 changes: 7 additions & 0 deletions docs/examples.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@
5KKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKzzzzzzKKKSmnqY
80 c59cb0997ad73f7bf8621b1955caf80b304ded0a48e5b8f28c7e51edc840934a bae9d183
-u -rangeStart 80c59cb0997ad73f7bf8621b1955caf80b304ded0a48e5b8f28c7a4990fb3b6d3fffda3283 -stride 7479027ea100 -rangeEnd 80c59cb0997ad73f7bf8621b1955caf80b304ded0a48e5b8f28c7e51edc840934abae9d183 -a 19NzcPZvZMSNQk8sDbSiyjeKpEVpaS1212

-u -wifStart 5KKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKK111111KKKSmnqY -stride 7479027ea100 -wifEnd 5KKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKzzzzzzKKKSmnqY -a 19NzcPZvZMSNQk8sDbSiyjeKpEVpaS1212

-------------------------------------------------------------------------------
5K______KKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKSmnqY
^ 58^43 = edbafda67ca37188cf28263571f03b9716879e4acc9c514ab67280000000000
Expand All @@ -29,13 +32,17 @@ L5KKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKK111111KKKJsczi8wg
80 f1953d849c8450c9d7c9a6beaf9d565366137e487e5c45d407385c0106810981 b2 70f18f37

-c -rangeStart 80f1953d849c8450c9d7c9a6beaf9d565366137e487e5c45d407385c0106810981b270f18f37 -stride 15ac264554f032800 -a 1M9JqAMbCBwtvNLedwNG4TjiPsxATJrvxq

-c -wifStart L5KKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKK111111KKKJsczi8wg -stride 15ac264554f032800 -a 1M9JqAMbCBwtvNLedwNG4TjiPsxATJrvxq
-------------------------------------------------------------------------------
L5KKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKK_______KKKJsczi8wg
L5KKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKK1111111KKKJsczi8wg
80f1953d849c8450c9d7c9a6beaf9d565366137e487e5c45d32f1cf86355a6ac8723d3dd8f37
L5KKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKzzzzzzzKKKJsczi8wg
80 f1953d849c8450c9d7c9a6beaf9d565366137e487e5c45d5e775395f71b6322d b6 7efe6737
-c -rangeStart 80f1953d849c8450c9d7c9a6beaf9d565366137e487e5c45d32f1cf86355a6ac8723d3dd8f37 -stride 15ac264554f032800 -rangeEnd 80f1953d849c8450c9d7c9a6beaf9d565366137e487e5c45d5e775395f71b6322db67efe6737 -a 1M9JqAMbCBwtvNLedwNG4TjiPsxATJrvxq

-c -wifStart L5KKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKK1111111KKKJsczi8wg -stride 15ac264554f032800 -wifEnd L5KKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKzzzzzzzKKKJsczi8wg -a 1M9JqAMbCBwtvNLedwNG4TjiPsxATJrvxq
-------------------------------------------------------------------------------
L5K______KKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKJsczi8wg
^ 58^43 = edbafda67ca37188cf28263571f03b9716879e4acc9c514ab67280000000000
Expand Down

0 comments on commit 713b768

Please sign in to comment.