Skip to content

Commit 02251f6

Browse files
committed
feat(BA-4144): Refactor device discovery with GlobalDeviceInfo
Introduce GlobalDeviceInfo dataclass to separate device discovery from allocation map creation in ResourceAllocator. This enables cleaner separation of concerns and more flexible device-based allocation strategies in the future. Key changes include splitting __ainit__() into three distinct phases: device discovery from plugins, allocation map creation, and slot calculation. The _calculate_total_slots() method now uses plugin.available_slots() directly instead of reading from allocation maps, providing cleaner abstraction boundaries. Added comprehensive unit tests covering GlobalDeviceInfo initialization, _create_global_devices() with single and multiple plugins, empty device handling, and slot calculation with aggregation.
1 parent a288c31 commit 02251f6

3 files changed

Lines changed: 278 additions & 11 deletions

File tree

changes/8440.feature.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Introduce GlobalDeviceInfo and device discovery infrastructure to separate device discovery from allocation in ResourceAllocator

src/ai/backend/agent/resources.py

Lines changed: 75 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,23 @@ class ComputerContext:
101101
alloc_map: AbstractAllocMap
102102

103103

104+
@attrs.define(auto_attribs=True, slots=True)
105+
class GlobalDeviceInfo:
106+
"""
107+
Represents discovered devices from a compute plugin.
108+
109+
This class separates device discovery from allocation. It contains
110+
only the plugin reference and discovered devices, without any
111+
allocation map. Allocation maps are created separately when needed.
112+
"""
113+
114+
plugin: AbstractComputePlugin
115+
devices: Sequence[AbstractComputeDevice]
116+
117+
118+
type GlobalDeviceMap = Mapping[DeviceName, GlobalDeviceInfo]
119+
120+
104121
@dataclass
105122
class DeviceView:
106123
device: DeviceName
@@ -543,15 +560,24 @@ def __init__(self, local_config: AgentUnifiedConfig, etcd: AsyncEtcd) -> None:
543560

544561
async def __ainit__(self) -> None:
545562
alloc_map_mod.log_alloc_map = self.local_config.debug.log_alloc_map
546-
computers = await self._load_resources()
563+
plugins = await self._load_resources()
547564

565+
# Phase 1: Discover devices from all plugins (separation of concerns)
566+
global_device_map = await self._create_global_devices(plugins)
567+
568+
# Phase 2: Create allocation maps and computer contexts
548569
computer_contexts: dict[DeviceName, ComputerContext] = {}
549-
for name, computer in computers.items():
550-
devices = await computer.list_devices()
551-
alloc_map = await computer.create_alloc_map()
552-
computer_contexts[name] = ComputerContext(computer, devices, alloc_map)
570+
for device_name, device_info in global_device_map.items():
571+
alloc_map = await device_info.plugin.create_alloc_map()
572+
computer_contexts[device_name] = ComputerContext(
573+
device_info.plugin,
574+
device_info.devices,
575+
alloc_map,
576+
)
553577
self.computers = computer_contexts
554-
total_slots = self._calculate_total_slots()
578+
579+
# Phase 3: Calculate slots and configure agents
580+
total_slots = await self._calculate_total_slots()
555581
self.available_total_slots = self._calculate_available_total_slots(total_slots)
556582

557583
agent_computers = {}
@@ -621,11 +647,19 @@ def get_resource_scaling_factor(self, agent_id: AgentId) -> SlotsMap:
621647
raise AgentIdNotFoundError(f"Agent ID {agent_id} not in computers")
622648
return self.agent_resource_scaling_factor[agent_id]
623649

624-
def _calculate_total_slots(self) -> SlotsMap:
650+
async def _calculate_total_slots(self) -> SlotsMap:
651+
"""
652+
Calculate total available slots by querying each plugin directly.
653+
654+
This method uses the plugin's available_slots() method rather than
655+
reading from allocation maps, providing a cleaner separation between
656+
device discovery and allocation tracking.
657+
"""
625658
total_slots: dict[SlotName, Decimal] = defaultdict(lambda: Decimal("0"))
626-
for device in self.computers.values():
627-
for slot_info in device.alloc_map.device_slots.values():
628-
total_slots[slot_info.slot_name] += slot_info.amount
659+
for ctx in self.computers.values():
660+
plugin_slots = await ctx.instance.available_slots()
661+
for slot_name, amount in plugin_slots.items():
662+
total_slots[slot_name] += amount
629663
return total_slots
630664

631665
def _calculate_available_total_slots(self, total_slots: SlotsMap) -> SlotsMap:
@@ -691,6 +725,37 @@ async def _load_resources(self) -> Mapping[DeviceName, AbstractComputePlugin]:
691725
self.local_config.model_dump(by_alias=True),
692726
)
693727

728+
async def _create_global_devices(
729+
self,
730+
plugins: Mapping[DeviceName, AbstractComputePlugin],
731+
) -> GlobalDeviceMap:
732+
"""
733+
Discover available devices from all compute plugins.
734+
735+
This method iterates through all registered compute plugins and
736+
discovers the physical devices available from each. The result is
737+
a mapping of device names to GlobalDeviceInfo, which contains the
738+
plugin reference and the discovered devices.
739+
740+
This separation allows device discovery to be performed independently
741+
of allocation map creation, enabling more flexible device-based
742+
allocation strategies in the future.
743+
744+
Args:
745+
plugins: Mapping of device names to compute plugins
746+
747+
Returns:
748+
GlobalDeviceMap containing discovered devices from all plugins
749+
"""
750+
global_devices: dict[DeviceName, GlobalDeviceInfo] = {}
751+
for device_name, plugin in plugins.items():
752+
devices = await plugin.list_devices()
753+
global_devices[device_name] = GlobalDeviceInfo(
754+
plugin=plugin,
755+
devices=list(devices),
756+
)
757+
return global_devices
758+
694759
async def _scan_available_resources(self) -> Mapping[SlotName, Decimal]:
695760
return await self._agent_discovery.scan_available_resources({
696761
name: cctx.instance for name, cctx in self.computers.items()

tests/unit/agent/test_resources.py

Lines changed: 202 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,11 @@
33
import textwrap
44
import unittest.mock
55
import uuid
6+
from collections.abc import Sequence
67
from decimal import Decimal
78
from pathlib import Path
89
from unittest import mock
10+
from unittest.mock import AsyncMock, Mock
911

1012
import pytest
1113
from aioresponses import aioresponses
@@ -15,7 +17,15 @@
1517
from ai.backend.agent.affinity_map import AffinityMap, AffinityPolicy
1618
from ai.backend.agent.dummy.intrinsic import CPUPlugin, MemoryPlugin
1719
from ai.backend.agent.exception import FractionalResourceFragmented, InsufficientResource
18-
from ai.backend.agent.resources import ComputerContext, align_memory, scan_resource_usage_per_slot
20+
from ai.backend.agent.resources import (
21+
AbstractComputeDevice,
22+
AbstractComputePlugin,
23+
ComputerContext,
24+
GlobalDeviceInfo,
25+
ResourceAllocator,
26+
align_memory,
27+
scan_resource_usage_per_slot,
28+
)
1929
from ai.backend.agent.vendor import linux
2030
from ai.backend.common.types import DeviceId, DeviceName, KernelId, ResourceSlot, SlotName
2131

@@ -510,3 +520,194 @@ def test_align_memory():
510520
assert usable % align == 0
511521
assert usable + actual_reserved == orig
512522
assert 990 <= actual_reserved <= 1010
523+
524+
525+
class TestGlobalDeviceInfo:
526+
"""Tests for GlobalDeviceInfo dataclass."""
527+
528+
def test_initialization_with_devices(self) -> None:
529+
"""Verify GlobalDeviceInfo correctly stores plugin and devices."""
530+
mock_plugin = Mock(spec=AbstractComputePlugin)
531+
mock_device = Mock(spec=AbstractComputeDevice)
532+
mock_device.device_id = DeviceId("0")
533+
534+
info = GlobalDeviceInfo(plugin=mock_plugin, devices=[mock_device])
535+
536+
assert info.plugin is mock_plugin
537+
assert len(info.devices) == 1
538+
assert info.devices[0] is mock_device
539+
540+
def test_initialization_with_empty_devices(self) -> None:
541+
"""Verify GlobalDeviceInfo handles empty device list."""
542+
mock_plugin = Mock(spec=AbstractComputePlugin)
543+
544+
info = GlobalDeviceInfo(plugin=mock_plugin, devices=[])
545+
546+
assert info.plugin is mock_plugin
547+
assert len(info.devices) == 0
548+
assert isinstance(info.devices, Sequence)
549+
550+
def test_no_alloc_map_attribute(self) -> None:
551+
"""Verify GlobalDeviceInfo does not have alloc_map (separation of concerns)."""
552+
mock_plugin = Mock(spec=AbstractComputePlugin)
553+
554+
info = GlobalDeviceInfo(plugin=mock_plugin, devices=[])
555+
556+
assert not hasattr(info, "alloc_map")
557+
558+
559+
@pytest.mark.asyncio
560+
class TestCreateGlobalDevices:
561+
"""Tests for _create_global_devices method."""
562+
563+
async def test_discovers_devices_from_single_plugin(self) -> None:
564+
"""Verify device discovery works with a single plugin."""
565+
mock_device = Mock(spec=AbstractComputeDevice)
566+
mock_device.device_id = DeviceId("gpu-0")
567+
568+
mock_plugin = AsyncMock(spec=AbstractComputePlugin)
569+
mock_plugin.list_devices.return_value = [mock_device]
570+
571+
plugins = {DeviceName("cuda"): mock_plugin}
572+
573+
# Create a minimal ResourceAllocator mock to test the method
574+
allocator = Mock(spec=ResourceAllocator)
575+
allocator._create_global_devices = ResourceAllocator._create_global_devices.__get__(
576+
allocator, ResourceAllocator
577+
)
578+
579+
result = await allocator._create_global_devices(plugins)
580+
581+
assert DeviceName("cuda") in result
582+
assert result[DeviceName("cuda")].plugin is mock_plugin
583+
assert len(result[DeviceName("cuda")].devices) == 1
584+
mock_plugin.list_devices.assert_called_once()
585+
586+
async def test_discovers_devices_from_multiple_plugins(self) -> None:
587+
"""Verify correct aggregation of devices from CPU, memory, and accelerator plugins."""
588+
cpu_device = Mock(spec=AbstractComputeDevice)
589+
cpu_device.device_id = DeviceId("0")
590+
mem_device = Mock(spec=AbstractComputeDevice)
591+
mem_device.device_id = DeviceId("root")
592+
gpu_device = Mock(spec=AbstractComputeDevice)
593+
gpu_device.device_id = DeviceId("gpu-0")
594+
595+
cpu_plugin = AsyncMock(spec=AbstractComputePlugin)
596+
cpu_plugin.list_devices.return_value = [cpu_device]
597+
mem_plugin = AsyncMock(spec=AbstractComputePlugin)
598+
mem_plugin.list_devices.return_value = [mem_device]
599+
gpu_plugin = AsyncMock(spec=AbstractComputePlugin)
600+
gpu_plugin.list_devices.return_value = [gpu_device]
601+
602+
plugins = {
603+
DeviceName("cpu"): cpu_plugin,
604+
DeviceName("mem"): mem_plugin,
605+
DeviceName("cuda"): gpu_plugin,
606+
}
607+
608+
allocator = Mock(spec=ResourceAllocator)
609+
allocator._create_global_devices = ResourceAllocator._create_global_devices.__get__(
610+
allocator, ResourceAllocator
611+
)
612+
613+
result = await allocator._create_global_devices(plugins)
614+
615+
assert len(result) == 3
616+
assert DeviceName("cpu") in result
617+
assert DeviceName("mem") in result
618+
assert DeviceName("cuda") in result
619+
620+
# Verify each plugin's devices are correctly mapped
621+
assert result[DeviceName("cpu")].devices[0].device_id == DeviceId("0")
622+
assert result[DeviceName("mem")].devices[0].device_id == DeviceId("root")
623+
assert result[DeviceName("cuda")].devices[0].device_id == DeviceId("gpu-0")
624+
625+
626+
@pytest.mark.asyncio
627+
class TestEmptyPluginHandling:
628+
"""Tests for behavior when a plugin reports no devices."""
629+
630+
async def test_handles_plugin_with_no_devices(self) -> None:
631+
"""Verify behavior when a plugin reports no devices."""
632+
mock_plugin = AsyncMock(spec=AbstractComputePlugin)
633+
mock_plugin.list_devices.return_value = []
634+
635+
plugins = {DeviceName("mock"): mock_plugin}
636+
637+
allocator = Mock(spec=ResourceAllocator)
638+
allocator._create_global_devices = ResourceAllocator._create_global_devices.__get__(
639+
allocator, ResourceAllocator
640+
)
641+
642+
result = await allocator._create_global_devices(plugins)
643+
644+
assert DeviceName("mock") in result
645+
assert len(result[DeviceName("mock")].devices) == 0
646+
assert result[DeviceName("mock")].plugin is mock_plugin
647+
648+
649+
@pytest.mark.asyncio
650+
class TestCalculateTotalSlots:
651+
"""Tests for _calculate_total_slots method."""
652+
653+
async def test_calculate_total_slots_with_plugins(self) -> None:
654+
"""Verify _calculate_total_slots returns correct values using plugin.available_slots()."""
655+
# Create mock plugins that return specific slot amounts
656+
cpu_plugin = AsyncMock()
657+
cpu_plugin.available_slots.return_value = {SlotName("cpu"): Decimal(4)}
658+
659+
mem_plugin = AsyncMock()
660+
mem_plugin.available_slots.return_value = {SlotName("mem"): Decimal(8192)}
661+
662+
# Create mock computer contexts
663+
cpu_ctx = Mock(spec=ComputerContext)
664+
cpu_ctx.instance = cpu_plugin
665+
666+
mem_ctx = Mock(spec=ComputerContext)
667+
mem_ctx.instance = mem_plugin
668+
669+
# Create allocator mock with computers attribute
670+
allocator = Mock(spec=ResourceAllocator)
671+
allocator.computers = {
672+
DeviceName("cpu"): cpu_ctx,
673+
DeviceName("mem"): mem_ctx,
674+
}
675+
allocator._calculate_total_slots = ResourceAllocator._calculate_total_slots.__get__(
676+
allocator, ResourceAllocator
677+
)
678+
679+
total_slots = await allocator._calculate_total_slots()
680+
681+
assert total_slots[SlotName("cpu")] == Decimal(4)
682+
assert total_slots[SlotName("mem")] == Decimal(8192)
683+
cpu_plugin.available_slots.assert_called_once()
684+
mem_plugin.available_slots.assert_called_once()
685+
686+
async def test_calculate_total_slots_aggregates_same_slot_names(self) -> None:
687+
"""Verify _calculate_total_slots aggregates slots with the same name from multiple plugins."""
688+
# Create two plugins that both report the same "cuda.shares" slot
689+
gpu1_plugin = AsyncMock()
690+
gpu1_plugin.available_slots.return_value = {SlotName("cuda.shares"): Decimal("2.0")}
691+
692+
gpu2_plugin = AsyncMock()
693+
gpu2_plugin.available_slots.return_value = {SlotName("cuda.shares"): Decimal("3.0")}
694+
695+
gpu1_ctx = Mock(spec=ComputerContext)
696+
gpu1_ctx.instance = gpu1_plugin
697+
698+
gpu2_ctx = Mock(spec=ComputerContext)
699+
gpu2_ctx.instance = gpu2_plugin
700+
701+
allocator = Mock(spec=ResourceAllocator)
702+
allocator.computers = {
703+
DeviceName("cuda1"): gpu1_ctx,
704+
DeviceName("cuda2"): gpu2_ctx,
705+
}
706+
allocator._calculate_total_slots = ResourceAllocator._calculate_total_slots.__get__(
707+
allocator, ResourceAllocator
708+
)
709+
710+
total_slots = await allocator._calculate_total_slots()
711+
712+
# Slots with the same name should be aggregated
713+
assert total_slots[SlotName("cuda.shares")] == Decimal("5.0")

0 commit comments

Comments
 (0)