Cuda fix (#1595)

* tpb arg mismatch messing with CUDA, works now * update readme to include cubit install instructions for PoW * fix tests
opentensor · Nov 28, 2023 · 8941ae0 · 8941ae0
1 parent 2de7bb5
commit 8941ae0
Show file tree

Hide file tree

Showing 8 changed files with 50 additions and 42 deletions.
diff --git a/README.md b/README.md
@@ -50,6 +50,14 @@ or using python
 import bittensor
 ```
 
+#### CUDA
+If you anticipate using PoW registration for subnets or the faucet (only available on staging), please also install `cubit` for your version of Python. You can find the Opentensor cubit implementation and installation instructions [here](https://github.com/opentensor/cubit).
+
+For example, with Python 3.10:
+```bash
+pip install https://github.com/opentensor/cubit/releases/download/v1.1.2/cubit-1.1.2-cp310-cp310-linux_x86_64.whl
+```
+
 # Wallets 
 
 Wallets are the core ownership and identity technology around which all functions on Bittensor are carried out. Bittensor wallets consist of a coldkey and a hotkey, where the coldkey may contain many hotkeys, while each hotkey can only belong to a single coldkey. Coldkeys store funds securely, and operate functions such as transfers and staking, while hotkeys are used for all online operations such as signing queries, running miners and validating. 

diff --git a/bittensor/commands/__init__.py b/bittensor/commands/__init__.py
@@ -26,7 +26,7 @@
             "update_interval": 50000,
             "output_in_place": True,
             "verbose": False,
-            "cuda": {"dev_id": [0], "use_cuda": False, "TPB": 256},
+            "cuda": {"dev_id": [0], "use_cuda": False, "tpb": 256},
         },
         "axon": {
             "port": 8091,

diff --git a/bittensor/commands/register.py b/bittensor/commands/register.py
@@ -157,7 +157,7 @@ class PowRegisterCommand:
     - --pow_register.cuda.use_cuda (bool): Enables the use of CUDA for GPU-accelerated PoW calculations. Requires a CUDA-compatible GPU.
     - --pow_register.cuda.no_cuda (bool): Disables the use of CUDA, defaulting to CPU-based calculations.
     - --pow_register.cuda.dev_id (int): Specifies the CUDA device ID, useful for systems with multiple CUDA-compatible GPUs.
-    - --pow_register.cuda.TPB (int): Sets the number of Threads Per Block for CUDA operations, affecting the GPU calculation dynamics.
+    - --pow_register.cuda.tpb (int): Sets the number of Threads Per Block for CUDA operations, affecting the GPU calculation dynamics.
 
     The command also supports additional wallet and subtensor arguments, enabling further customization of the registration process.
 
@@ -189,7 +189,7 @@ def run(cli):
             wallet=wallet,
             netuid=cli.config.netuid,
             prompt=not cli.config.no_prompt,
-            TPB=cli.config.pow_register.cuda.get("TPB", None),
+            tpb=cli.config.pow_register.cuda.get("tpb", None),
             update_interval=cli.config.pow_register.get("update_interval", None),
             num_processes=cli.config.pow_register.get("num_processes", None),
             cuda=cli.config.pow_register.cuda.get(
@@ -281,10 +281,10 @@ def add_args(parser: argparse.ArgumentParser):
             required=False,
         )
         register_parser.add_argument(
-            "--pow_register.cuda.TPB",
-            "--cuda.TPB",
+            "--pow_register.cuda.tpb",
+            "--cuda.tpb",
             type=int,
-            default=defaults.pow_register.cuda.TPB,
+            default=defaults.pow_register.cuda.tpb,
             help="""Set the number of Threads Per Block for CUDA.""",
             required=False,
         )
@@ -342,7 +342,7 @@ class RunFaucetCommand:
     - --faucet.cuda.use_cuda (bool): Activates the use of CUDA for GPU acceleration in the PoW process, suitable for CUDA-compatible GPUs.
     - --faucet.cuda.no_cuda (bool): Disables the use of CUDA, opting for CPU-based calculations.
     - --faucet.cuda.dev_id (int[]): Allows selection of specific CUDA device IDs for the operation, useful in multi-GPU setups.
-    - --faucet.cuda.TPB (int): Determines the number of Threads Per Block for CUDA operations, affecting GPU calculation efficiency.
+    - --faucet.cuda.tpb (int): Determines the number of Threads Per Block for CUDA operations, affecting GPU calculation efficiency.
 
     These options provide flexibility in configuring the PoW process according to the user's hardware capabilities and preferences.
 
@@ -364,7 +364,7 @@ def run(cli):
         subtensor.run_faucet(
             wallet=wallet,
             prompt=not cli.config.no_prompt,
-            TPB=cli.config.pow_register.cuda.get("TPB", None),
+            tpb=cli.config.pow_register.cuda.get("tpb", None),
             update_interval=cli.config.pow_register.get("update_interval", None),
             num_processes=cli.config.pow_register.get("num_processes", None),
             cuda=cli.config.pow_register.cuda.get(
@@ -449,10 +449,10 @@ def add_args(parser: argparse.ArgumentParser):
             required=False,
         )
         run_faucet_parser.add_argument(
-            "--faucet.cuda.TPB",
-            "--cuda.TPB",
+            "--faucet.cuda.tpb",
+            "--cuda.tpb",
             type=int,
-            default=defaults.pow_register.cuda.TPB,
+            default=defaults.pow_register.cuda.tpb,
             help="""Set the number of Threads Per Block for CUDA.""",
             required=False,
         )

diff --git a/bittensor/extrinsics/registration.py b/bittensor/extrinsics/registration.py
@@ -36,7 +36,7 @@ def register_extrinsic(
     output_in_place: bool = True,
     cuda: bool = False,
     dev_id: Union[List[int], int] = 0,
-    TPB: int = 256,
+    tpb: int = 256,
     num_processes: Optional[int] = None,
     update_interval: Optional[int] = None,
     log_verbose: bool = False,
@@ -61,7 +61,7 @@ def register_extrinsic(
             If true, the wallet should be registered using CUDA device(s).
         dev_id (Union[List[int], int]):
             The CUDA device id to use, or a list of device ids.
-        TPB (int):
+        tpb (int):
             The number of threads per block (CUDA).
         num_processes (int):
             The number of processes to use to register.
@@ -123,7 +123,7 @@ def register_extrinsic(
                 output_in_place,
                 cuda=cuda,
                 dev_id=dev_id,
-                TPB=TPB,
+                tpb=tpb,
                 num_processes=num_processes,
                 update_interval=update_interval,
                 log_verbose=log_verbose,
@@ -339,7 +339,7 @@ def run_faucet_extrinsic(
     output_in_place: bool = True,
     cuda: bool = False,
     dev_id: Union[List[int], int] = 0,
-    TPB: int = 256,
+    tpb: int = 256,
     num_processes: Optional[int] = None,
     update_interval: Optional[int] = None,
     log_verbose: bool = False,
@@ -362,7 +362,7 @@ def run_faucet_extrinsic(
             If true, the wallet should be registered using CUDA device(s).
         dev_id (Union[List[int], int]):
             The CUDA device id to use, or a list of device ids.
-        TPB (int):
+        tpb (int):
             The number of threads per block (CUDA).
         num_processes (int):
             The number of processes to use to register.
@@ -410,7 +410,7 @@ def run_faucet_extrinsic(
                         output_in_place,
                         cuda=cuda,
                         dev_id=dev_id,
-                        TPB=TPB,
+                        tpb=tpb,
                         num_processes=num_processes,
                         update_interval=update_interval,
                         log_verbose=log_verbose,

diff --git a/bittensor/subtensor.py b/bittensor/subtensor.py
@@ -461,7 +461,7 @@ def register(
         output_in_place: bool = True,
         cuda: bool = False,
         dev_id: Union[List[int], int] = 0,
-        TPB: int = 256,
+        tpb: int = 256,
         num_processes: Optional[int] = None,
         update_interval: Optional[int] = None,
         log_verbose: bool = False,
@@ -478,7 +478,7 @@ def register(
             output_in_place=output_in_place,
             cuda=cuda,
             dev_id=dev_id,
-            TPB=TPB,
+            tpb=tpb,
             num_processes=num_processes,
             update_interval=update_interval,
             log_verbose=log_verbose,
@@ -512,7 +512,7 @@ def run_faucet(
         output_in_place: bool = True,
         cuda: bool = False,
         dev_id: Union[List[int], int] = 0,
-        TPB: int = 256,
+        tpb: int = 256,
         num_processes: Optional[int] = None,
         update_interval: Optional[int] = None,
         log_verbose: bool = False,
@@ -528,7 +528,7 @@ def run_faucet(
             output_in_place=output_in_place,
             cuda=cuda,
             dev_id=dev_id,
-            TPB=TPB,
+            tpb=tpb,
             num_processes=num_processes,
             update_interval=update_interval,
             log_verbose=log_verbose,

diff --git a/bittensor/utils/_register_cuda.py b/bittensor/utils/_register_cuda.py
@@ -13,7 +13,7 @@
 def solve_cuda(
     nonce_start: np.int64,
     update_interval: np.int64,
-    TPB: int,
+    tpb: int,
     block_and_hotkey_hash_bytes: bytes,
     difficulty: int,
     limit: int,
@@ -26,7 +26,7 @@ def solve_cuda(
             Starting nonce.
         update_interval: int64
             Number of nonces to solve before updating block information.
-        TPB: int
+        tpb: int
             Threads per block.
         block_and_hotkey_hash_bytes: bytes
             Keccak(Bytes of the block hash + bytes of the hotkey) 64 bytes.
@@ -78,7 +78,7 @@ def _seal_meets_difficulty(seal: bytes, difficulty: int):
     block_and_hotkey_hash_hex = binascii.hexlify(block_and_hotkey_hash_bytes)[:64]
 
     solution = cubit.solve_cuda(
-        TPB,
+        tpb,
         nonce_start,
         update_interval,
         upper_bytes,

diff --git a/bittensor/utils/registration.py b/bittensor/utils/registration.py
@@ -209,7 +209,7 @@ def run(self):
 
 class _CUDASolver(_SolverBase):
     dev_id: int
-    TPB: int
+    tpb: int
 
     def __init__(
         self,
@@ -225,7 +225,7 @@ def __init__(
         check_block,
         limit,
         dev_id: int,
-        TPB: int,
+        tpb: int,
     ):
         super().__init__(
             proc_num,
@@ -241,7 +241,7 @@ def __init__(
             limit,
         )
         self.dev_id = dev_id
-        self.TPB = TPB
+        self.tpb = tpb
 
     def run(self):
         block_number: int = 0  # dummy value
@@ -269,7 +269,7 @@ def run(self):
                 self.limit,
                 block_number,
                 self.dev_id,
-                self.TPB,
+                self.tpb,
             )
             if solution is not None:
                 self.solution_queue.put(solution)
@@ -282,7 +282,7 @@ def run(self):
                 pass
 
             # increase nonce by number of nonces processed
-            nonce_start += self.update_interval * self.TPB
+            nonce_start += self.update_interval * self.tpb
             nonce_start = nonce_start % nonce_limit
 
 
@@ -294,13 +294,13 @@ def _solve_for_nonce_block_cuda(
     limit: int,
     block_number: int,
     dev_id: int,
-    TPB: int,
+    tpb: int,
 ) -> Optional[POWSolution]:
-    """Tries to solve the POW on a CUDA device for a block of nonces (nonce_start, nonce_start + update_interval * TPB"""
+    """Tries to solve the POW on a CUDA device for a block of nonces (nonce_start, nonce_start + update_interval * tpb"""
     solution, seal = solve_cuda(
         nonce_start,
         update_interval,
-        TPB,
+        tpb,
         block_and_hotkey_hash_bytes,
         difficulty,
         limit,
@@ -794,7 +794,7 @@ def _solve_for_difficulty_fast_cuda(
     netuid: int,
     output_in_place: bool = True,
     update_interval: int = 50_000,
-    TPB: int = 512,
+    tpb: int = 512,
     dev_id: Union[List[int], int] = 0,
     n_samples: int = 10,
     alpha_: float = 0.80,
@@ -813,7 +813,7 @@ def _solve_for_difficulty_fast_cuda(
             If true, prints the output in place, otherwise prints to new lines
         update_interval: int
             The number of nonces to try before checking for more blocks
-        TPB: int
+        tpb: int
             The number of threads per block. CUDA param that should match the GPU capability
         dev_id: Union[List[int], int]
             The CUDA device IDs to execute the registration on, either a single device or a list of devices
@@ -868,7 +868,7 @@ def _solve_for_difficulty_fast_cuda(
                 check_block,
                 limit,
                 dev_id[i],
-                TPB,
+                tpb,
             )
             for i in range(num_processes)
         ]
@@ -967,7 +967,7 @@ def _solve_for_difficulty_fast_cuda(
             if num_time > 0 and time_since_last > 0.0:
                 # create EWMA of the hash_rate to make measure more robust
 
-                hash_rate_ = (num_time * TPB * update_interval) / time_since_last
+                hash_rate_ = (num_time * tpb * update_interval) / time_since_last
                 hash_rates.append(hash_rate_)
                 hash_rates.pop(0)  # remove the 0th data point
                 curr_stats.hash_rate = sum(
@@ -987,7 +987,7 @@ def _solve_for_difficulty_fast_cuda(
             curr_stats.time_spent = time_since_last
             new_time_spent_total = time_now - start_time_perpetual
             curr_stats.hash_rate_perpetual = (
-                curr_stats.rounds_total * (TPB * update_interval)
+                curr_stats.rounds_total * (tpb * update_interval)
             ) / new_time_spent_total
             curr_stats.time_spent_total = new_time_spent_total
 
@@ -1071,7 +1071,7 @@ def create_pow(
             netuid=netuid,
             output_in_place=output_in_place,
             dev_id=dev_id,
-            TPB=tpb,
+            tpb=tpb,
             update_interval=update_interval,
             log_verbose=log_verbose,
         )

diff --git a/tests/unit_tests/utils/test_utils.py b/tests/unit_tests/utils/test_utils.py
@@ -639,13 +639,13 @@ def test_multi_cuda_run_updates_nonce_start(self):
         class MockException(Exception):
             pass
 
-        TPB: int = 512
+        tpb: int = 512
         update_interval: int = 70_000
         nonce_limit: int = int(math.pow(2, 64)) - 1
 
         mock_solver_self = MagicMock(
             spec=_CUDASolver,
-            TPB=TPB,
+            tpb=tpb,
             dev_id=0,
             update_interval=update_interval,
             stopEvent=MagicMock(is_set=MagicMock(return_value=False)),
@@ -690,10 +690,10 @@ class MockException(Exception):
                 initial_nonce_start,
                 "nonce_start was not updated after iteration",
             )
-            ## Should incerase by the number of nonces tried == TPB * update_interval
+            ## Should increase by the number of nonces tried == tpb * update_interval
             self.assertEqual(
                 nonce_start_after_iteration,
-                (initial_nonce_start + update_interval * TPB) % nonce_limit,
+                (initial_nonce_start + update_interval * tpb) % nonce_limit,
                 "nonce_start was not updated by the correct amount",
             )