From 7b4f6f62ae3764311e5bc405988682eea433a843 Mon Sep 17 00:00:00 2001 From: nautilus Date: Fri, 20 Dec 2024 12:52:53 +0100 Subject: [PATCH 1/4] fix: allocation bonus: multiplier instead of addition --- README.md | 32 ++++++++++++------------ neurons/Validator/calculate_pow_score.py | 11 ++++---- 2 files changed, 22 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 10d4231d..71b4c773 100644 --- a/README.md +++ b/README.md @@ -210,43 +210,43 @@ The score calculation function now determines a miner's performance primarily ba - NVIDIA RTX A5000: 0.36 - NVIDIA RTX A4500: 0.34 -**Scaling Factor**: Determine the highest GPU base score, multiply it by 8 (the maximum number of GPUs), and set this scenario as the 100-point baseline. A scaling factor is derived so that using eight of the top GPU models equals 100 points. +**Scaling Factor**: Determine the highest GPU base score, multiply it by 8 (the maximum number of GPUs), and set this scenario as the 100-point baseline. A scaling factor is derived so that using eight of the top GPU models equals 50 points. -**GPU Score**: Multiply the chosen GPU’s base score by the number of GPUs (up to 8) and by the scaling factor to find the miner’s GPU score (0–100). +**GPU Score**: Multiply the chosen GPU’s base score by the number of GPUs (up to 8) and by the scaling factor to find the miner’s GPU score (0–50). -**Allocation Bonus**: If a miner has allocated machine resources, add 100 points to the GPU score, allowing a maximum score of up to 200. +**Allocation Bonus**: If a miner has allocated machine resources, the GPU score is multiplied by 2, allowing a maximum score of up to 100. 
**Total Score**:

-- Score (not allocated) = GPU Score (0–100)
-- Score (allocated) = GPU Score + 100 (up to 200)
+- Score (not allocated) = GPU Score (0–50)
+- Score (allocated) = GPU Score * 2 (up to 100)
 
 ### Example 1: Miner A's Total Score
 
 - **GPU**: NVIDIA H200 (Base Score: 3.90)
 - **Number of GPUs**: 8
-- **Allocation**: True
+- **Allocation**: False
 
 Step-by-step calculation:
 
-1. Highest scenario: 3.90 * 8 = 31.2
-2. Scaling factor: 100 / 31.2 ≈ 3.2051
-3. GPU Score: 3.90 * 8 * 3.2051 ≈ 100
-4. Allocation Bonus: 100 + 100 = 200
+1. Highest scenario: 3.90 * 8 = 31.2
+2. Scaling factor: 50 / 31.2 ≈ 1.6026
+3. GPU Score: 3.90 * 8 * 1.6026 ≈ 50
+4. Allocation Bonus: 0
 
-Total Score = 200
+Total Score = 50
 
 ### Example 2: Miner B's Total Score
 
 - **GPU**: NVIDIA RTX 4090 (Base Score: 0.69)
 - **Number of GPUs**: 2
-- **Allocation**: False
+- **Allocation**: True
 
 Step-by-step calculation:
 
-1. Scaling factor (same as above): 3.2051
-2. GPU Score: 0.69 * 2 * 3.2051 ≈ 4.42
-3. No allocation bonus applied.
+1. Scaling factor (same as above): 1.6026
+2. GPU Score: 0.69 * 2 * 1.6026 ≈ 2.21
+3. Allocation Bonus: 2.21 * 2 = 4.42
 
-Total Score = 4.42
+Total Score = 4.42
 
 ## Resource Allocation Mechanism
 
diff --git a/neurons/Validator/calculate_pow_score.py b/neurons/Validator/calculate_pow_score.py
index 667e8f01..7f7b2a84 100644
--- a/neurons/Validator/calculate_pow_score.py
+++ b/neurons/Validator/calculate_pow_score.py
@@ -39,7 +39,7 @@ def calc_score_pog(gpu_specs, hotkey, allocated_hotkeys, config_data, mock=False
         # Get the GPU with the maximum score
         max_gpu = max(gpu_scores, key=gpu_scores.get)
         max_score = gpu_scores[max_gpu]*8
-        score_factor = 100/max_score
+        score_factor = 50/max_score
 
         gpu_name = gpu_specs.get("gpu_name")
         num_gpus = min(gpu_specs.get("num_gpus"), 8)
@@ -47,15 +47,16 @@ def calc_score_pog(gpu_specs, hotkey, allocated_hotkeys, config_data, mock=False
         # Get GPU score
         score = gpu_scores.get(gpu_name) * num_gpus * score_factor
 
-        # Add allocation score, i.e. 
max un-allocated score = 100 + # Add allocation score, multiplier = 2 if hotkey in allocated_hotkeys: - score += 100 + score = score * 2 # Logging score - bt.logging.info(f"Score - {hotkey}: {score:.2f}/200") + bt.logging.info(f"Score - {hotkey}: {score:.2f}/100") # Normalize the score - normalized_score = normalize(score, 0, 200) + normalized_score = normalize(score, 0, 100) + return normalized_score except Exception as e: bt.logging.error(f"An error occurred while calculating score for the following hotkey - {hotkey}: {e}") From 249ef9ba2a581f2304119d203b0a079890750841 Mon Sep 17 00:00:00 2001 From: nautilus Date: Fri, 20 Dec 2024 13:00:05 +0100 Subject: [PATCH 2/4] chore: bump version --- compute/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compute/__init__.py b/compute/__init__.py index f883f7d4..81c060bb 100644 --- a/compute/__init__.py +++ b/compute/__init__.py @@ -18,9 +18,9 @@ import string # Define the version of the template module. -__version__ = "1.6.0" +__version__ = "1.6.1" __minimal_miner_version__ = "1.6.0" -__minimal_validator_version__ = "1.6.0" +__minimal_validator_version__ = "1.6.1" version_split = __version__.split(".") __version_as_int__ = (100 * int(version_split[0])) + (10 * int(version_split[1])) + (1 * int(version_split[2])) From a3b3ba9308c455ee948d8d733bb5fd62d1d48e5a Mon Sep 17 00:00:00 2001 From: nautilus Date: Mon, 23 Dec 2024 16:02:17 +0100 Subject: [PATCH 3/4] fix: different name for test-allocation container --- compute/__init__.py | 2 +- neurons/Miner/container.py | 34 ++++++++++++++++++++++++++++++++-- neurons/miner.py | 2 ++ 3 files changed, 35 insertions(+), 3 deletions(-) diff --git a/compute/__init__.py b/compute/__init__.py index 81c060bb..27537ede 100644 --- a/compute/__init__.py +++ b/compute/__init__.py @@ -20,7 +20,7 @@ # Define the version of the template module. 
__version__ = "1.6.1" __minimal_miner_version__ = "1.6.0" -__minimal_validator_version__ = "1.6.1" +__minimal_validator_version__ = "1.6.0" version_split = __version__.split(".") __version_as_int__ = (100 * int(version_split[0])) + (10 * int(version_split[1])) + (1 * int(version_split[2])) diff --git a/neurons/Miner/container.py b/neurons/Miner/container.py index 52ff3d86..930b779b 100644 --- a/neurons/Miner/container.py +++ b/neurons/Miner/container.py @@ -38,6 +38,7 @@ image_name = "ssh-image" # Docker image name container_name = "ssh-container" # Docker container name +container_name_test = "ssh-test-container" volume_name = "ssh-volume" # Docker volumne name volume_path = "/tmp" # Path inside the container where the volume will be mounted ssh_port = 4444 # Port to map SSH service on the host @@ -56,7 +57,7 @@ def kill_container(): client, containers = get_docker() running_container = None for container in containers: - if container_name in container.name: + if container.name == container_name: running_container = container break if running_container: @@ -76,6 +77,31 @@ def kill_container(): bt.logging.info(f"Error killing container {e}") return False +# Kill the currently running test container +def kill_test_container(): + try: + client, containers = get_docker() + running_container = None + for container in containers: + if container.name == container_name_test: + running_container = container + break + if running_container: + # stop and remove the container by using the SIGTERM signal to PID 1 (init) process in the container + if running_container.status == "running": + running_container.exec_run(cmd="kill -15 1") + running_container.wait() + # running_container.stop() + running_container.remove() + # Remove all dangling images + client.images.prune(filters={"dangling": True}) + bt.logging.info("Test container was killed successfully") + else: + bt.logging.info("No running container.") + return True + except Exception as e: + bt.logging.info(f"Error killing 
container {e}") + return False # Run a new docker container with the given docker_name, image_name and device information def run_container(cpu_usage, ram_usage, hard_disk_usage, gpu_usage, public_key, docker_requirement: dict): @@ -150,13 +176,17 @@ def run_container(cpu_usage, ram_usage, hard_disk_usage, gpu_usage, public_key, # Create the Docker volume with the specified size # client.volumes.create(volume_name, driver = 'local', driver_opts={'size': hard_disk_capacity}) + # Determine container name based on ssh key + container_to_run = container_name if docker_ssh_key else container_name_test + + # Step 2: Run the Docker container device_requests = [DeviceRequest(count=-1, capabilities=[["gpu"]])] # if gpu_usage["capacity"] == 0: # device_requests = [] container = client.containers.run( image=image_name, - name=container_name, + name=container_to_run, detach=True, device_requests=device_requests, environment=["NVIDIA_VISIBLE_DEVICES=all"], diff --git a/neurons/miner.py b/neurons/miner.py index 57b0656a..a8986797 100644 --- a/neurons/miner.py +++ b/neurons/miner.py @@ -61,6 +61,7 @@ build_sample_container, check_container, kill_container, + kill_test_container, restart_container, exchange_key_container, pause_container, @@ -218,6 +219,7 @@ def __check_alloaction_errors(self): bt.logging.info( "Container is already running without allocated. Killing the container." ) + kill_test_container() def init_axon(self): # Step 6: Build and link miner functions to the axon. 
From 061cf2799a7a899f2221aeadcc319973a07a69de Mon Sep 17 00:00:00 2001 From: nautilus Date: Mon, 23 Dec 2024 16:03:48 +0100 Subject: [PATCH 4/4] fix: asyncio timeout for pog worker --- neurons/validator.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/neurons/validator.py b/neurons/validator.py index 2057569d..feced9ad 100644 --- a/neurons/validator.py +++ b/neurons/validator.py @@ -643,8 +643,13 @@ async def worker(): break hotkey = axon.hotkey try: - result = await asyncio.get_event_loop().run_in_executor( - self.executor, self.test_miner_gpu, axon, self.config_data + # Set a timeout for the GPU test + timeout = 300 # e.g., 5 minutes + result = await asyncio.wait_for( + asyncio.get_event_loop().run_in_executor( + self.executor, self.test_miner_gpu, axon, self.config_data + ), + timeout=timeout ) if result[1] is not None and result[2] > 0: async with results_lock: @@ -655,6 +660,16 @@ async def worker(): update_pog_stats(self.db, hotkey, result[1], result[2]) else: raise RuntimeError("GPU test failed") + except asyncio.TimeoutError: + bt.logging.warning(f"⏳ Timeout while testing {hotkey}. Retrying...") + retry_counts[hotkey] += 1 + if retry_counts[hotkey] < retry_limit: + bt.logging.info(f"🔄 {hotkey}: Retrying miner -> (Attempt {retry_counts[hotkey]})") + await asyncio.sleep(retry_interval) + await queue.put(axon) + else: + bt.logging.info(f"❌ {hotkey}: Miner failed after {retry_limit} attempts (Timeout).") + update_pog_stats(self.db, hotkey, None, None) except Exception as e: bt.logging.trace(f"Exception in worker for {hotkey}: {e}") retry_counts[hotkey] += 1 @@ -668,7 +683,6 @@ async def worker(): finally: queue.task_done() - # Number of concurrent workers # Determine a safe default number of workers cpu_cores = os.cpu_count() or 1