Improved loading of large list

JeanLucPons · Mar 20, 2019 · e1d8a9c · e1d8a9c
1 parent 06acb37
commit e1d8a9c
Show file tree

Hide file tree

Showing 7 changed files with 81 additions and 92 deletions.
diff --git a/Base58.cpp b/Base58.cpp
@@ -46,7 +46,7 @@ bool DecodeBase58(const char* psz, std::vector<uint8_t> &vch)
       psz++;
     }
 
-    int length = strlen(psz);
+    int length = (int)strlen(psz);
 
     // Process the characters	
     int digitslen = 1;

diff --git a/GPU/GPUCompute.h b/GPU/GPUCompute.h
@@ -23,7 +23,8 @@
 // 
 // We use affine coordinates for elliptic curve point (ie Z=1)
 
-__device__ __noinline__ void CheckPoint(uint32_t *_h, int32_t incr, int32_t endo, int32_t mode,prefix_t *prefix, uint32_t tid, uint32_t *lookup32, uint32_t *out) {
+__device__ __noinline__ void CheckPoint(uint32_t *_h, int32_t incr, int32_t endo, int32_t mode,prefix_t *prefix, 
+                                        uint32_t tid, uint32_t *lookup32, uint32_t maxFound, uint32_t *out) {
 
   uint32_t   off;
   prefixl_t  l32;
@@ -41,32 +42,29 @@ __device__ __noinline__ void CheckPoint(uint32_t *_h, int32_t incr, int32_t endo
   if (hit) {
 
     if (lookup32) {
-
       off = lookup32[pr0];
       l32 = _h[0];
       st=  off;
       ed = off + hit - 1;
       while(st<=ed) {
         mi = (st+ed)/2;
         lmi = lookup32[mi];
-	    if(l32<lmi) {
-		  ed = mi - 1;
-	    } else if(l32==lmi) {
-		  // found
-		  goto addItem;
-	    } else {
-	      st = mi + 1;
-	    }	    
-      }
-
-	  return;
-
+	      if(l32<lmi) {
+		      ed = mi - 1;
+	      } else if(l32==lmi) {
+		      // found
+		      goto addItem;
+	      } else {
+	        st = mi + 1;
+	      }	    
+      }	   
+	    return;	  
     }
 
 addItem:
 
     pos = atomicAdd(out, 1);
-    if (pos < MAX_FOUND) {
+    if (pos < maxFound) {
       out[pos*ITEM_SIZE32 + 1] = tid;
       out[pos*ITEM_SIZE32 + 2] = (uint32_t)(incr << 16) | (uint32_t)(mode << 15) | (uint32_t)(endo);
       out[pos*ITEM_SIZE32 + 3] = _h[0];
@@ -80,10 +78,10 @@ __device__ __noinline__ void CheckPoint(uint32_t *_h, int32_t incr, int32_t endo
 
 }
 
-#define CHECK_POINT(_h,incr,endo,mode)  CheckPoint(_h,incr,endo,mode,prefix,tid,lookup32,out)
+#define CHECK_POINT(_h,incr,endo,mode)  CheckPoint(_h,incr,endo,mode,prefix,tid,lookup32,maxFound,out)
 
-__device__ __noinline__ void CheckHashComp(prefix_t *prefix, uint64_t *px, uint64_t *py,
-  int32_t incr, uint32_t tid, uint32_t *lookup32, uint32_t *out) {
+__device__ __noinline__ void CheckHashComp(prefix_t *prefix, uint64_t *px, uint64_t *py, int32_t incr, 
+                                           uint32_t tid, uint32_t *lookup32, uint32_t maxFound, uint32_t *out) {
 
   uint32_t   h[20];
   uint64_t   pe1x[4];
@@ -110,8 +108,8 @@ __device__ __noinline__ void CheckHashComp(prefix_t *prefix, uint64_t *px, uint6
 
 }
 
-__device__ __noinline__ void CheckHashUncomp(prefix_t *prefix, uint64_t *px, uint64_t *py,
-  int32_t incr, uint32_t tid, uint32_t *lookup32, uint32_t *out) {
+__device__ __noinline__ void CheckHashUncomp(prefix_t *prefix, uint64_t *px, uint64_t *py, int32_t incr, 
+                                             uint32_t tid, uint32_t *lookup32, uint32_t maxFound, uint32_t *out) {
 
   uint32_t   h[5];
   uint64_t   pe1x[4];
@@ -138,28 +136,28 @@ __device__ __noinline__ void CheckHashUncomp(prefix_t *prefix, uint64_t *px, uin
 
 }
 
-__device__ __noinline__ void CheckHash(uint32_t mode, prefix_t *prefix, uint64_t *px, uint64_t *py,
-  int32_t incr, uint32_t tid, uint32_t *lookup32, uint32_t *out) {
+__device__ __noinline__ void CheckHash(uint32_t mode, prefix_t *prefix, uint64_t *px, uint64_t *py, int32_t incr, 
+                                       uint32_t tid, uint32_t *lookup32, uint32_t maxFound, uint32_t *out) {
 
   switch (mode) {
   case SEARCH_COMPRESSED:
-    CheckHashComp(prefix, px, py, incr, tid, lookup32, out);
+    CheckHashComp(prefix, px, py, incr, tid, lookup32, maxFound, out);
     break;
   case SEARCH_UNCOMPRESSED:
-    CheckHashUncomp(prefix, px, py, incr, tid, lookup32, out);
+    CheckHashUncomp(prefix, px, py, incr, tid, lookup32, maxFound, out);
     break;
   case SEARCH_BOTH:
-    CheckHashComp(prefix, px, py, incr, tid, lookup32, out);
-    CheckHashUncomp(prefix, px, py, incr, tid, lookup32, out);
+    CheckHashComp(prefix, px, py, incr, tid, lookup32, maxFound, out);
+    CheckHashUncomp(prefix, px, py, incr, tid, lookup32, maxFound, out);
     break;
   }
 
 }
 
-#define CHECK_PREFIX(incr) CheckHash(mode, sPrefix, px, py, j*GRP_SIZE + (incr), tid, lookup32, out)
+#define CHECK_PREFIX(incr) CheckHash(mode, sPrefix, px, py, j*GRP_SIZE + (incr), tid, lookup32, maxFound, out)
 
 __device__ void ComputeKeys(uint32_t mode, uint64_t *startx, uint64_t *starty, 
-                             prefix_t *sPrefix, uint32_t *lookup32, uint32_t *out) {
+                             prefix_t *sPrefix, uint32_t *lookup32, uint32_t maxFound, uint32_t *out) {
 
   uint64_t dx[GRP_SIZE/2+1][4];
   uint64_t px[4];

diff --git a/GPU/GPUEngine.cu b/GPU/GPUEngine.cu
@@ -1152,11 +1152,11 @@ __device__ __noinline__ void _GetHash160(uint64_t *x, uint64_t *y, uint8_t *hash
 
 // ---------------------------------------------------------------------------------------
 
-__global__ void comp_keys(uint32_t mode,prefix_t *prefix, uint32_t *lookup32, uint64_t *keys, uint32_t *found) {
+__global__ void comp_keys(uint32_t mode,prefix_t *prefix, uint32_t *lookup32, uint64_t *keys, uint32_t maxFound, uint32_t *found) {
 
   int xPtr = (blockIdx.x*blockDim.x) * 8;
   int yPtr = xPtr + 4 * NB_TRHEAD_PER_GROUP;
-  ComputeKeys(mode, keys + xPtr, keys + yPtr, prefix, lookup32, found);
+  ComputeKeys(mode, keys + xPtr, keys + yPtr, prefix, lookup32, maxFound, found);
 
 }
 
@@ -1278,7 +1278,7 @@ int _ConvertSMVer2Cores(int major, int minor) {
 
 }
 
-GPUEngine::GPUEngine(int nbThreadGroup, int gpuId) {
+GPUEngine::GPUEngine(int nbThreadGroup, int gpuId, uint32_t maxFound) {
 
   // Initialise CUDA
   initialised = false;
@@ -1310,8 +1310,10 @@ GPUEngine::GPUEngine(int nbThreadGroup, int gpuId) {
   if (nbThreadGroup == -1)
     nbThreadGroup = deviceProp.multiProcessorCount * 8;
 
-  nbThread = nbThreadGroup * NB_TRHEAD_PER_GROUP;
-
+  this->nbThread = nbThreadGroup * NB_TRHEAD_PER_GROUP;
+  this->maxFound = maxFound;
+  this->outputSize = (maxFound*ITEM_SIZE + 4);
+
   char tmp[256];
   sprintf(tmp,"GPU #%d %s (%dx%d cores) Grid(%dx%d)",
   gpuId,deviceProp.name,deviceProp.multiProcessorCount,
@@ -1364,21 +1366,17 @@ GPUEngine::GPUEngine(int nbThreadGroup, int gpuId) {
     printf("GPUEngine: Allocate input pinned memory: %s\n", cudaGetErrorString(err));
     return;
   }
-  err = cudaMalloc((void **)&outputPrefix, OUTPUT_SIZE);
+  err = cudaMalloc((void **)&outputPrefix, outputSize);
   if (err != cudaSuccess) {
     printf("GPUEngine: Allocate output memory: %s\n", cudaGetErrorString(err));
     return;
   }
-  err = cudaHostAlloc(&outputPrefixPinned, OUTPUT_SIZE, cudaHostAllocWriteCombined | cudaHostAllocMapped);
+  err = cudaHostAlloc(&outputPrefixPinned, outputSize, cudaHostAllocWriteCombined | cudaHostAllocMapped);
   if (err != cudaSuccess) {
     printf("GPUEngine: Allocate output pinned memory: %s\n", cudaGetErrorString(err));
     return;
   }
 
-  //double P = 1/65536.0;
-  //double Plost = Psk(STEP_SIZE,MAX_FOUND,P);
-  //printf("Plost=%g\n",Plost);
-
   searchMode = SEARCH_COMPRESSED;
   initialised = true;
   inputPrefixLookUp = NULL;
@@ -1532,7 +1530,7 @@ bool GPUEngine::callKernel() {
 
   // Call the kernel (Perform STEP_SIZE keys per thread)
   comp_keys<<< nbThread / NB_TRHEAD_PER_GROUP, NB_TRHEAD_PER_GROUP >>>
-      (searchMode, inputPrefix, inputPrefixLookUp, inputKey, outputPrefix);
+      (searchMode, inputPrefix, inputPrefixLookUp, inputKey, maxFound, outputPrefix);
 
   cudaError_t err = cudaGetLastError();
   if (err != cudaSuccess) {
@@ -1587,8 +1585,7 @@ bool GPUEngine::Launch(std::vector<ITEM> &prefixFound,bool spinWait) {
 
   if(spinWait) {
 
-    cudaMemcpy(outputPrefixPinned, outputPrefix, OUTPUT_SIZE,
-      cudaMemcpyDeviceToHost);
+    cudaMemcpy(outputPrefixPinned, outputPrefix, outputSize, cudaMemcpyDeviceToHost);
 
   } else {
 
@@ -1613,13 +1610,13 @@ bool GPUEngine::Launch(std::vector<ITEM> &prefixFound,bool spinWait) {
 
   // Look for prefix found
   uint32_t nbFound = outputPrefixPinned[0];
-  if (nbFound > MAX_FOUND) {
+  if (nbFound > maxFound) {
     // prefix has been lost
     if (!lostWarning) {
-      printf("\nWarning, %d items lost (try to search with less prefixes or less thread (use -g))\n", (nbFound - MAX_FOUND));
+      printf("\nWarning, %d items lost\nHint: Search with less prefixes, less threads (-g) or increase maxFound (-m)\n", (nbFound - maxFound));
       lostWarning = true;
     }
-    nbFound = MAX_FOUND;
+    nbFound = maxFound;
   }
 
   // We can perform a standard copy, the kernel has ended

diff --git a/GPU/GPUEngine.h b/GPU/GPUEngine.h
@@ -32,13 +32,8 @@ static const char *searchModes[] = {"Compressed","Uncompressed","Compressed or U
 
 // Number of thread per block
 #define NB_TRHEAD_PER_GROUP 128
-
-// Maximum number of 16bit prefix found per kernel 
-// Avg = (nbThread*STEP_SIZE*nbPrefix16)/65536
-#define MAX_FOUND 131072
 #define ITEM_SIZE 28
 #define ITEM_SIZE32 (ITEM_SIZE/4)
-#define OUTPUT_SIZE (MAX_FOUND*ITEM_SIZE+4)
 #define _64K 65536
 
 typedef uint16_t prefix_t;
@@ -62,12 +57,13 @@ class GPUEngine {
 
 public:
 
-  GPUEngine(int nbThreadGroup,int gpuId); 
+  GPUEngine(int nbThreadGroup,int gpuId,uint32_t maxFound); 
   ~GPUEngine();
   void SetPrefix(std::vector<prefix_t> prefixes);
   void SetPrefix(std::vector<LPREFIX> prefixes,uint32_t totalPrefix);
   bool SetKeys(Point *p);
   void SetSearchMode(int serachMode);
+  void SetMaxFound(uint32_t max);
   bool Launch(std::vector<ITEM> &prefixFound,bool spinWait=false);
   int GetNbThread();
   int GetGroupSize();
@@ -98,6 +94,9 @@ class GPUEngine {
   uint32_t searchMode;
   bool littleEndian;
   bool lostWarning;
+  uint32_t maxFound;
+  uint32_t outputSize;
+
 };
 
 #endif // GPUENGINEH
diff --git a/Vanity.cpp b/Vanity.cpp
@@ -37,7 +37,7 @@ Point _2Gn;
 // ----------------------------------------------------------------------------
 
 VanitySearch::VanitySearch(Secp256K1 &secp, vector<std::string> &inputPrefixes,string seed,int searchMode, 
-                           bool useGpu, bool stop, string outputFile, bool useSSE) {
+                           bool useGpu, bool stop, string outputFile, bool useSSE, uint32_t maxFound) {
 
   this->secp = secp;
   this->searchMode = searchMode;
@@ -46,6 +46,7 @@ VanitySearch::VanitySearch(Secp256K1 &secp, vector<std::string> &inputPrefixes,s
   this->outputFile = outputFile;
   this->useSSE = useSSE;
   this->nbGPUThread = 0;
+  this->maxFound = maxFound;
   prefixes.clear();
 
   // Create a 65536 items lookup table
@@ -105,8 +106,8 @@ VanitySearch::VanitySearch(Secp256K1 &secp, vector<std::string> &inputPrefixes,s
         lit.lPrefixes.push_back((*items)[j].lPrefix);
       sort(lit.lPrefixes.begin(), lit.lPrefixes.end());
       usedPrefixL.push_back(lit);
-      if( lit.lPrefixes.size()>maxI ) maxI = lit.lPrefixes.size();
-      if( lit.lPrefixes.size()<minI ) minI = lit.lPrefixes.size();
+      if( (uint32_t)lit.lPrefixes.size()>maxI ) maxI = (uint32_t)lit.lPrefixes.size();
+      if( (uint32_t)lit.lPrefixes.size()<minI ) minI = (uint32_t)lit.lPrefixes.size();
       unique_sPrefix++;
     }
     if (loadingProgress)
@@ -227,7 +228,7 @@ bool VanitySearch::initPrefix(std::string &prefix,PREFIX_ITEM *it) {
     it->sPrefix = *(prefix_t *)(it->hash160);
     it->lPrefix = *(prefixl_t *)(it->hash160);
     it->prefix = (char *)prefix.c_str();
-    it->prefixLength = prefix.length();
+    it->prefixLength = (int)prefix.length();
     it->found = false;
     return true;
 
@@ -248,7 +249,7 @@ bool VanitySearch::initPrefix(std::string &prefix,PREFIX_ITEM *it) {
     it->sPrefix = 0;
     it->lPrefix = 0;
     it->prefix = (char *)prefix.c_str();
-    it->prefixLength = prefix.length();
+    it->prefixLength = (int)prefix.length();
     it->found = false;
     return true;
 
@@ -291,7 +292,7 @@ bool VanitySearch::initPrefix(std::string &prefix,PREFIX_ITEM *it) {
   it->isFull = false;
   it->lPrefix = 0;
   it->prefix = (char *)prefix.c_str();
-  it->prefixLength = prefix.length();
+  it->prefixLength = (int)prefix.length();
   it->found = false;
 
   return true;
@@ -1006,7 +1007,7 @@ void VanitySearch::FindKeyGPU(TH_PARAM *ph) {
 
   // Global init
   int thId = ph->threadId;
-  GPUEngine g(ph->gridSize, ph->gpuId);
+  GPUEngine g(ph->gridSize, ph->gpuId, maxFound);
   int nbThread = g.GetNbThread();
   Point *p = new Point[nbThread];
   Int *keys = new Int[nbThread];

diff --git a/Vanity.h b/Vanity.h
@@ -66,7 +66,7 @@ class VanitySearch {
 public:
 
   VanitySearch(Secp256K1 &secp, std::vector<std::string> &prefix, std::string seed, int searchMode, 
-               bool useGpu,bool stop,std::string outputFile, bool useSSE);
+               bool useGpu,bool stop,std::string outputFile, bool useSSE,uint32_t maxFound);
   void Search(int nbThread,std::vector<int> gpuId,std::vector<int> gridSize);
   void FindKeyCPU(TH_PARAM *p);
   void FindKeyGPU(TH_PARAM *p);
@@ -104,6 +104,7 @@ class VanitySearch {
   std::string outputFile;
   bool useSSE;
   bool onlyFull;
+  uint32_t maxFound;
   double _difficulty;
   std::vector<PREFIX_TABLE_ITEM> prefixes;
   std::vector<prefix_t> usedPrefix;