Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(BA-460): Cache gpu_alloc_map in Redis, and Add RescanGPUAllocMaps mutation #3293

Open
wants to merge 28 commits into
base: topic/06-13-feat_support_scanning_gpu_allocation
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
f7e8a13
feat: Cache gpu_alloc_map, and Add ScanGPUAllocMap mutation
jopemachine Dec 24, 2024
23fb522
chore: Add news fragment
jopemachine Dec 24, 2024
0731359
chore: Add news fragment
jopemachine Dec 24, 2024
528f97a
chore: fix typo
jopemachine Dec 24, 2024
3936c2c
chore: Improve news fragment
jopemachine Dec 24, 2024
d7e6f4a
fix: Add milestone comment
jopemachine Dec 24, 2024
dd69cb1
fix: Wrong impl of AgentRegistry.scan_gpu_alloc_map
jopemachine Dec 24, 2024
68c7e84
fix: Add `extra_fixtures`
jopemachine Dec 24, 2024
535f138
feat: Add `test_scan_gpu_alloc_maps` test case
jopemachine Dec 24, 2024
2a59d72
feat: Add update call count check
jopemachine Dec 24, 2024
341de3c
fix: Improve `test_scan_gpu_alloc_maps`
jopemachine Dec 26, 2024
83498fc
fix: Improve `test_scan_gpu_alloc_maps`
jopemachine Dec 26, 2024
803ccae
chore: Rename variables
jopemachine Dec 26, 2024
aa4049e
fix: `ScanGPUAllocMaps` -> `RescanGPUAllocMaps`
jopemachine Dec 26, 2024
47b1f7d
fix: Broken test
jopemachine Dec 26, 2024
4a5bdd3
fix: Remove useless `_default_host`
jopemachine Dec 26, 2024
7ee3fb1
chore: Rename news fragment
jopemachine Dec 26, 2024
e2d48c6
feat: Improve error handling
jopemachine Dec 26, 2024
f44ee8e
fix: Improve exception handling and test case
jopemachine Dec 26, 2024
b62eeec
fix: Replace useless `mock_agent_registry_ctx` with local_config's `r…
jopemachine Dec 26, 2024
59a983d
fix: Wrong reference to `redis_stat`
jopemachine Dec 26, 2024
dc1056f
docs: Add description about agent_id
jopemachine Dec 26, 2024
678bcb6
chore: update GraphQL schema dump
jopemachine Dec 26, 2024
05ea9d9
feat: Call agent rpc call in parallel
jopemachine Dec 26, 2024
caed91c
fix: Update milestone
jopemachine Jan 8, 2025
57a80f4
fix: lint
jopemachine Jan 8, 2025
0455b74
fix: Update milestone
jopemachine Jan 8, 2025
c9630bf
fix: Update milestone
jopemachine Jan 24, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changes/3293.feature.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Cache `gpu_alloc_map` in Redis, and add the `RescanGPUAllocMaps` mutation for updating the cached `gpu_alloc_map`s.
13 changes: 13 additions & 0 deletions docs/manager/graphql-reference/schema.graphql
Original file line number Diff line number Diff line change
Expand Up @@ -1783,6 +1783,12 @@
This action cannot be undone.
"""
purge_user(email: String!, props: PurgeUserInput!): PurgeUser

"""Added in 25.2.0."""
rescan_gpu_alloc_maps(

Check notice on line 1788 in docs/manager/graphql-reference/schema.graphql

View workflow job for this annotation

GitHub Actions / GraphQL Inspector

Field 'rescan_gpu_alloc_maps' was added to object type 'Mutations'

Field 'rescan_gpu_alloc_maps' was added to object type 'Mutations'
"""Agent ID to rescan GPU alloc map, Pass None to rescan all agents"""
agent_id: String
): RescanGPUAllocMaps
create_keypair(props: KeyPairInput!, user_id: String!): CreateKeyPair
modify_keypair(access_key: String!, props: ModifyKeyPairInput!): ModifyKeyPair
delete_keypair(access_key: String!): DeleteKeyPair
Expand Down Expand Up @@ -2196,6 +2202,13 @@
purge_shared_vfolders: Boolean
}

"""Added in 25.2.0."""
type RescanGPUAllocMaps {

Check notice on line 2206 in docs/manager/graphql-reference/schema.graphql

View workflow job for this annotation

GitHub Actions / GraphQL Inspector

Type 'RescanGPUAllocMaps' was added

Type 'RescanGPUAllocMaps' was added
ok: Boolean
msg: String
task_id: UUID
}

type CreateKeyPair {
ok: Boolean
msg: String
Expand Down
2 changes: 2 additions & 0 deletions src/ai/backend/manager/models/gql.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@
AgentSummary,
AgentSummaryList,
ModifyAgent,
RescanGPUAllocMaps,
)
from .gql_models.domain import (
CreateDomainNode,
Expand Down Expand Up @@ -257,6 +258,7 @@ class Mutations(graphene.ObjectType):
modify_user = ModifyUser.Field()
delete_user = DeleteUser.Field()
purge_user = PurgeUser.Field()
rescan_gpu_alloc_maps = RescanGPUAllocMaps.Field(description="Added in 25.2.0.")

# admin only
create_keypair = CreateKeyPair.Field()
Expand Down
93 changes: 91 additions & 2 deletions src/ai/backend/manager/models/gql_models/agent.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
from __future__ import annotations

import asyncio
import json
import logging
import uuid
from collections.abc import Iterable, Mapping, Sequence
from typing import (
Expand All @@ -18,12 +21,14 @@
from sqlalchemy.ext.asyncio import AsyncConnection as SAConnection

from ai.backend.common import msgpack, redis_helper
from ai.backend.common.bgtask import ProgressReporter
from ai.backend.common.types import (
AccessKey,
AgentId,
BinarySize,
HardwareMetadata,
)
from ai.backend.logging.utils import BraceStyleAdapter

from ..agent import (
AgentRow,
Expand Down Expand Up @@ -61,6 +66,8 @@
if TYPE_CHECKING:
from ..gql import GraphQueryContext

log = BraceStyleAdapter(logging.getLogger(__spec__.name))

__all__ = (
"Agent",
"AgentNode",
Expand Down Expand Up @@ -181,7 +188,13 @@ async def resolve_live_stat(self, info: graphene.ResolveInfo) -> Any:

async def resolve_gpu_alloc_map(self, info: graphene.ResolveInfo) -> Mapping[str, int]:
    """Return the GPU allocation map cached in Redis for this agent.

    The value is read from the ``gpu_alloc_map.<agent id>`` key of the
    ``redis_stat`` connection held by the GraphQL context and decoded from
    JSON.  The agent itself is not contacted here; the cached entry is
    written by the ``RescanGPUAllocMaps`` mutation.

    Returns:
        The decoded allocation map, or an empty mapping when no value is
        stored under the key.
    """
    ctx: GraphQueryContext = info.context
    # The previous implementation returned ``ctx.registry.scan_gpu_alloc_map(self.id)``
    # directly; keeping that return in place would make the cache lookup
    # below unreachable, so only the cached read remains.
    raw_alloc_map = await redis_helper.execute(
        ctx.redis_stat, lambda r: r.get(f"gpu_alloc_map.{self.id}")
    )
    if not raw_alloc_map:
        # Nothing cached for this agent.
        return {}
    return json.loads(raw_alloc_map)

async def resolve_hardware_metadata(
self,
Expand Down Expand Up @@ -435,7 +448,13 @@ async def resolve_container_count(self, info: graphene.ResolveInfo) -> int:

async def resolve_gpu_alloc_map(self, info: graphene.ResolveInfo) -> Mapping[str, int]:
    """Look up this agent's GPU allocation map in the Redis cache.

    Reads the JSON document stored under ``gpu_alloc_map.<agent id>`` through
    the context's ``redis_stat`` connection.  No agent call is made by this
    resolver; the cached entry is written by the ``RescanGPUAllocMaps``
    mutation.

    Returns:
        The JSON-decoded allocation map, or ``{}`` if the key holds no value.
    """
    ctx: GraphQueryContext = info.context
    # A leftover ``return await ctx.registry.scan_gpu_alloc_map(self.id)``
    # ahead of this lookup would short-circuit the method and leave the
    # cache read dead, so it is dropped.
    raw_alloc_map = await redis_helper.execute(
        ctx.redis_stat, lambda r: r.get(f"gpu_alloc_map.{self.id}")
    )
    if not raw_alloc_map:
        # No cached entry for this agent.
        return {}
    return json.loads(raw_alloc_map)

_queryfilter_fieldspec: Mapping[str, FieldSpecItem] = {
"id": ("id", None),
Expand Down Expand Up @@ -878,3 +897,73 @@ async def mutate(

update_query = sa.update(agents).values(data).where(agents.c.id == id)
return await simple_db_mutate(cls, graph_ctx, update_query)


class RescanGPUAllocMaps(graphene.Mutation):
    # Mutation that refreshes the GPU allocation maps cached in Redis.
    #
    # For every target agent it asks the agent registry for the current GPU
    # allocation map and stores the JSON-encoded result under the Redis key
    # "gpu_alloc_map.<agent_id>".  The scan runs as a background task; the
    # mutation itself returns immediately with that task's ID.
    #
    # (Kept as comments rather than a class docstring so that the GraphQL
    # description stays exactly the one given in ``Meta``.)

    allowed_roles = (UserRole.SUPERADMIN,)

    class Meta:
        description = "Added in 25.2.0."

    class Arguments:
        agent_id = graphene.String(
            description="Agent ID to rescan GPU alloc map, Pass None to rescan all agents",
            required=False,
        )

    ok = graphene.Boolean()
    msg = graphene.String()
    task_id = graphene.UUID()

    # NOTE(review): the target-resolving lambda names its parameter ``id``
    # while this mutation's argument is ``agent_id`` — confirm that
    # ``privileged_mutation`` never invokes it for the SUPERADMIN role.
    @classmethod
    @privileged_mutation(
        UserRole.SUPERADMIN,
        lambda id, **kwargs: (None, id),
    )
    async def mutate(
        cls,
        root,
        info: graphene.ResolveInfo,
        agent_id: Optional[str] = None,
    ) -> RescanGPUAllocMaps:
        """Start a background task that rescans and caches GPU allocation maps.

        Args:
            root: GraphQL root value (unused).
            info: Resolve info whose ``context`` is the ``GraphQueryContext``.
            agent_id: Agent to rescan.  When omitted (or otherwise falsy),
                every agent yielded by ``registry.enumerate_instances()`` is
                rescanned.

        Returns:
            A payload with ``ok=True``, an empty ``msg`` and the ``task_id``
            of the started background task.
        """
        log.info("rescanning GPU alloc maps")
        graph_ctx: GraphQueryContext = info.context

        if agent_id:
            agent_ids = [agent_id]
        else:
            agent_ids = [agent.id async for agent in graph_ctx.registry.enumerate_instances()]

        async def _scan_single_agent(agent_id: str, reporter: ProgressReporter) -> None:
            # Scan one agent and cache its map; a failure is reported and
            # logged instead of raised so the sibling scans keep running.
            await reporter.update(message=f"Agent {agent_id} GPU alloc map scanning...")

            reporter_msg = ""
            try:
                alloc_map: Mapping[str, Any] = await graph_ctx.registry.scan_gpu_alloc_map(
                    AgentId(agent_id)
                )
                key = f"gpu_alloc_map.{agent_id}"
                await redis_helper.execute(
                    graph_ctx.registry.redis_stat,
                    lambda r: r.set(name=key, value=json.dumps(alloc_map)),
                )
            except Exception as e:
                reporter_msg = f"Failed to scan GPU alloc map for agent {agent_id}: {e}"
                # Hand the finished text over as an argument rather than as
                # the message template: ``log`` is a brace-style adapter, so
                # any "{" / "}" contained in the exception text would
                # otherwise be treated as format fields.
                log.error("{}", reporter_msg)
            else:
                reporter_msg = f"Agent {agent_id} GPU alloc map scanned."

            await reporter.update(
                increment=1,
                message=reporter_msg,
            )

        async def _rescan_alloc_map_task(reporter: ProgressReporter) -> None:
            # Fan the per-agent scans out concurrently; the task group waits
            # for all of them before the final progress message is sent.
            async with asyncio.TaskGroup() as tg:
                for agent_id in agent_ids:
                    tg.create_task(_scan_single_agent(agent_id, reporter))

            await reporter.update(message="GPU alloc map scanning completed")

        task_id = await graph_ctx.background_task_manager.start(_rescan_alloc_map_task)
        return RescanGPUAllocMaps(ok=True, msg="", task_id=task_id)
13 changes: 12 additions & 1 deletion tests/manager/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,8 @@ def local_config(
redis_addr = redis_container[1]
postgres_addr = postgres_container[1]

build_root = Path(os.environ["BACKEND_BUILD_ROOT"])

# Establish a self-contained config.
cfg = LocalConfig({
**etcd_config_iv.check({
Expand Down Expand Up @@ -208,6 +210,7 @@ def local_config(
"service-addr": HostPortPair("127.0.0.1", 29100 + get_parallel_slot() * 10),
"allowed-plugins": set(),
"disabled-plugins": set(),
"rpc-auth-manager-keypair": f"{build_root}/fixtures/manager/manager.key_secret",
},
"pyroscope": {
"enabled": False,
Expand Down Expand Up @@ -265,7 +268,15 @@ def etcd_fixture(
"volumes": {
"_mount": str(vfolder_mount),
"_fsprefix": str(vfolder_fsprefix),
"_default_host": str(vfolder_host),
"default_host": str(vfolder_host),
"proxies": {
"local": {
"client_api": "http://127.0.0.1:6021",
"manager_api": "https://127.0.0.1:6022",
"secret": "some-secret-shared-with-storage-proxy",
"ssl_verify": "false",
}
},
},
"nodes": {},
"config": {
Expand Down
1 change: 1 addition & 0 deletions tests/manager/models/gql_models/BUILD
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
python_tests(name="tests")
Loading
Loading