"""GPU and vGPU management for VergeOS.
This module provides access to GPU passthrough and NVIDIA vGPU functionality,
enabling AI/ML workloads and graphics-intensive applications.
Example:
>>> # List all vGPU profiles available in the system
>>> for profile in client.vgpu_profiles.list():
... print(f"{profile.name}: {profile.framebuffer} RAM")
>>> # List GPUs configured on a node
>>> node = client.nodes.get(name="node2")
>>> for gpu in node.gpus.list():
... print(f"{gpu.name}: {gpu.mode_display}")
>>> # Configure a GPU for passthrough
>>> gpu = node.gpus.get(name="GPU_1")
>>> gpu = node.gpus.update(gpu.key, mode="gpu")
>>> # Get GPU stats
>>> stats = gpu.stats.get()
>>> print(f"vGPUs in use: {stats.vgpus}/{stats.vgpus_total}")
"""
from __future__ import annotations
import builtins
from datetime import datetime, timezone
from typing import TYPE_CHECKING, Any, Literal
from pyvergeos.exceptions import NotFoundError
from pyvergeos.filters import build_filter
from pyvergeos.resources.base import ResourceManager, ResourceObject
if TYPE_CHECKING:
from pyvergeos.client import VergeClient
# Human-readable labels for the raw ``mode`` value of a node GPU.
GPU_MODE_DISPLAY: dict[str, str] = {
    "none": "None",
    "gpu": "PCI Passthrough",
    "nvidia_vgpu": "NVIDIA vGPU",
}

# Human-readable labels for the single-letter vGPU profile type codes.
PROFILE_TYPE_DISPLAY: dict[str, str] = {
    "A": "Virtual Applications (vApps)",
    "B": "Virtual Desktops (vPC)",
    "C": "AI/Machine Learning/Training (vCS or vWS)",
    "Q": "Virtual Workstations (vWS)",
}
# =============================================================================
# NVIDIA vGPU Profiles (Global)
# =============================================================================
[docs]
class NvidiaVgpuProfile(ResourceObject):
    """A single NVIDIA vGPU profile record.

    A profile describes the characteristics of virtual GPUs that can be
    created from it. Each accessor reads one field through ``self.get``
    (with ``""``, ``0`` or ``False`` as the default) and coerces the value
    to the advertised Python type.
    """

    def _str_field(self, field: str) -> str:
        """Read ``field`` with ``""`` as the default and coerce via ``str``."""
        return str(self.get(field, ""))

    def _int_field(self, field: str) -> int:
        """Read ``field`` with ``0`` as the default and coerce via ``int``."""
        return int(self.get(field, 0))

    @property
    def name(self) -> str:
        """Name of the profile, e.g. 'nvidia-256' or 'grid_p40-1q'."""
        return self._str_field("name")

    @property
    def type_id(self) -> int:
        """NVIDIA type ID of the profile."""
        return self._int_field("type_id")

    @property
    def device_hex(self) -> str:
        """Hexadecimal vendor:device ID, e.g. '10de:1eb8'."""
        return self._str_field("device_hex")

    @property
    def num_heads(self) -> int:
        """How many display heads the profile supports."""
        return self._int_field("num_heads")

    @property
    def frl_config(self) -> int:
        """Frame rate limiter configuration value."""
        return self._int_field("frl_config")

    @property
    def framebuffer(self) -> str:
        """Framebuffer (VRAM) size, e.g. '256M' or '1G'."""
        return self._str_field("framebuffer")

    @property
    def max_resolution(self) -> str:
        """Largest supported resolution, e.g. '4096x2160'."""
        return self._str_field("max_resolution")

    @property
    def max_instance(self) -> int:
        """Maximum number of instances per physical GPU."""
        return self._int_field("max_instance")

    @property
    def max_instances_per_vm(self) -> int:
        """Maximum number of vGPU instances per VM."""
        return self._int_field("max_instances_per_vm")

    @property
    def placement_ids(self) -> str:
        """Placement IDs of the profile."""
        return self._str_field("placement_ids")

    @property
    def location(self) -> str:
        """Location/path of the profile."""
        return self._str_field("location")

    @property
    def profile_type(self) -> str:
        """Profile type code (A, B, C, Q)."""
        return self._str_field("profile_type")

    @property
    def profile_type_display(self) -> str:
        """Label for ``profile_type``; the raw code when no label is mapped."""
        code = self.profile_type
        return PROFILE_TYPE_DISPLAY.get(code, code)

    @property
    def grid_license(self) -> str:
        """GRID license type the profile requires."""
        return self._str_field("grid_license")

    @property
    def is_virtual_function(self) -> bool:
        """Truthiness of the ``virtual_function`` field."""
        flag = self.get("virtual_function", False)
        return bool(flag)

    @property
    def profile_folder(self) -> str:
        """Folder path of the profile."""
        return self._str_field("profile_folder")

    def __repr__(self) -> str:
        key = self.get("$key", "?")
        return f"<NvidiaVgpuProfile key={key} name={self.name!r} fb={self.framebuffer}>"
[docs]
class NvidiaVgpuProfileManager(ResourceManager[NvidiaVgpuProfile]):
    """Manager for NVIDIA vGPU profiles.

    Exposes ``list`` and ``get`` against the ``nvidia_vgpu_profiles``
    endpoint. This class defines no create, update or delete operations.

    Example:
        >>> # List all profiles
        >>> for profile in client.vgpu_profiles.list():
        ...     print(f"{profile.name}: {profile.framebuffer} ({profile.profile_type_display})")
        >>> # Get profiles for AI/ML workloads
        >>> ml_profiles = client.vgpu_profiles.list(profile_type="C")
    """

    _endpoint = "nvidia_vgpu_profiles"
    _default_fields = [
        "$key",
        "name",
        "type_id",
        "device_hex",
        "num_heads",
        "frl_config",
        "framebuffer",
        "max_resolution",
        "max_instance",
        "max_instances_per_vm",
        "placement_ids",
        "location",
        "profile_type",
        "grid_license",
        "virtual_function",
        "profile_folder",
    ]

    def _to_model(self, data: dict[str, Any]) -> NvidiaVgpuProfile:
        """Wrap a raw API record in a ``NvidiaVgpuProfile`` bound to this manager."""
        return NvidiaVgpuProfile(data, self)

    def list(
        self,
        filter: str | None = None,  # noqa: A002
        fields: builtins.list[str] | None = None,
        limit: int | None = None,
        offset: int | None = None,
        *,
        profile_type: Literal["A", "B", "C", "Q"] | None = None,
        **filter_kwargs: Any,
    ) -> builtins.list[NvidiaVgpuProfile]:
        """List NVIDIA vGPU profiles.

        Args:
            filter: OData filter string. When it is combined with other
                clauses it is wrapped in parentheses so that an ``or``
                inside it cannot override those clauses.
            fields: List of fields to return (defaults to ``_default_fields``).
            limit: Maximum number of results.
            offset: Skip this many results.
            profile_type: Filter by profile type:
                - A: Virtual Applications (vApps)
                - B: Virtual Desktops (vPC)
                - C: AI/ML/Training (vCS or vWS)
                - Q: Virtual Workstations (vWS)
            **filter_kwargs: Additional filter arguments passed to
                ``build_filter``.

        Returns:
            List of NvidiaVgpuProfile objects (empty when the API returns
            nothing).

        Example:
            >>> # List all profiles
            >>> profiles = client.vgpu_profiles.list()
            >>> # List AI/ML profiles only
            >>> ml_profiles = client.vgpu_profiles.list(profile_type="C")
        """
        if fields is None:
            fields = self._default_fields

        # Clauses generated by this method, as opposed to the caller's filter.
        own_clauses = []
        if profile_type is not None:
            own_clauses.append(f"profile_type eq '{profile_type}'")
        if filter_kwargs:
            own_clauses.append(build_filter(**filter_kwargs))

        clauses = []
        if filter:
            # "and" binds tighter than "or": without grouping, a caller
            # filter such as "a or b" would turn "a or b and c" into
            # "a or (b and c)". A lone filter is sent unchanged.
            clauses.append(f"({filter})" if own_clauses else filter)
        clauses.extend(own_clauses)

        params: dict[str, Any] = {"fields": ",".join(fields)}
        if clauses:
            params["filter"] = " and ".join(clauses)
        if limit is not None:
            params["limit"] = limit
        if offset is not None:
            params["offset"] = offset

        response = self._client._request("GET", self._endpoint, params=params)
        if response is None:
            return []
        # The API may answer with a single record instead of a list.
        records = response if isinstance(response, list) else [response]
        return [self._to_model(record) for record in records]

    def get(
        self,
        key: int | None = None,
        *,
        name: str | None = None,
        fields: builtins.list[str] | None = None,
    ) -> NvidiaVgpuProfile:
        """Get a vGPU profile by key or name.

        ``key`` takes precedence when both ``key`` and ``name`` are given.

        Args:
            key: Profile $key (ID).
            name: Profile name.
            fields: List of fields to return (defaults to ``_default_fields``).

        Returns:
            NvidiaVgpuProfile object.

        Raises:
            NotFoundError: If the profile is not found or the key lookup
                returns something other than a single record.
            ValueError: If neither key nor name provided.
        """
        if fields is None:
            fields = self._default_fields

        if key is not None:
            params: dict[str, Any] = {"fields": ",".join(fields)}
            response = self._client._request("GET", f"{self._endpoint}/{key}", params=params)
            if response is None:
                raise NotFoundError(f"vGPU profile with key {key} not found")
            if not isinstance(response, dict):
                raise NotFoundError(f"vGPU profile with key {key} returned invalid response")
            return self._to_model(response)

        if name is not None:
            # Double embedded single quotes so the name stays one string literal.
            escaped = name.replace("'", "''")
            results = self.list(filter=f"name eq '{escaped}'", fields=fields, limit=1)
            if not results:
                raise NotFoundError(f"vGPU profile with name '{name}' not found")
            return results[0]

        raise ValueError("Either key or name must be provided")
# =============================================================================
# Node GPUs
# =============================================================================
[docs]
class NodeGpu(ResourceObject):
    """A physical GPU configured on a VergeOS node.

    The GPU can be set up for PCI passthrough or for NVIDIA vGPU mode.

    Example:
        >>> gpu = node.gpus.get(name="GPU_1")
        >>> print(f"Mode: {gpu.mode_display}")
        >>> print(f"Instances: {gpu.instances_count}/{gpu.max_instances}")
    """

    def _gpu_manager(self) -> NodeGpuManager:
        """Return the owning manager, typed as ``NodeGpuManager``."""
        from typing import cast

        return cast("NodeGpuManager", self._manager)

    def _optional_key(self, field: str) -> int | None:
        """Return ``field`` as an int, or ``None`` when it is missing or falsy."""
        raw = self.get(field)
        if not raw:
            return None
        return int(raw)

    @property
    def name(self) -> str:
        """Name of the GPU, e.g. 'GPU_1'."""
        return str(self.get("name", ""))

    @property
    def description(self) -> str:
        """Description of the GPU."""
        return str(self.get("description", ""))

    @property
    def pci_device_key(self) -> int | None:
        """Key of the associated PCI device, if any."""
        return self._optional_key("pci_device")

    @property
    def pci_device_name(self) -> str:
        """Name/description of the PCI device."""
        return str(self.get("pci_device_name", ""))

    @property
    def node_key(self) -> int | None:
        """Key of the parent node, if any."""
        return self._optional_key("node")

    @property
    def node_name(self) -> str:
        """Name of the parent node."""
        return str(self.get("node_name", ""))

    @property
    def mode(self) -> str:
        """Operating mode (none, gpu, nvidia_vgpu); 'none' when unset."""
        return str(self.get("mode", "none"))

    @property
    def mode_display(self) -> str:
        """Label for ``mode``; the raw mode when no label is mapped."""
        current = self.mode
        return GPU_MODE_DISPLAY.get(current, current)

    @property
    def nvidia_vgpu_profile_key(self) -> int | None:
        """Key of the assigned vGPU profile (for nvidia_vgpu mode), if any."""
        return self._optional_key("nvidia_vgpu_profile")

    @property
    def nvidia_vgpu_profile_display(self) -> str:
        """Display name of the assigned vGPU profile."""
        return str(self.get("nvidia_vgpu_profile_disp", ""))

    @property
    def max_instances(self) -> int:
        """Maximum GPU/vGPU instances this GPU can provide."""
        return int(self.get("max_instances", 0))

    @property
    def instances_count(self) -> int:
        """Number of instances currently assigned."""
        return int(self.get("instances_count", 0))

    @property
    def modified_at(self) -> datetime | None:
        """UTC time of the last modification, or ``None`` when not recorded."""
        epoch = self.get("modified")
        if not epoch:
            return None
        return datetime.fromtimestamp(int(epoch), tz=timezone.utc)

    @property
    def is_passthrough(self) -> bool:
        """True when the mode is 'gpu' (PCI passthrough)."""
        return self.mode == "gpu"

    @property
    def is_vgpu(self) -> bool:
        """True when the mode is 'nvidia_vgpu'."""
        return self.mode == "nvidia_vgpu"

    @property
    def is_disabled(self) -> bool:
        """True when the mode is 'none' (no mode set)."""
        return self.mode == "none"

    @property
    def stats(self) -> NodeGpuStatsManager:
        """Utilization stats for this GPU.

        Returns:
            NodeGpuStatsManager scoped to this GPU's key.

        Example:
            >>> stats = gpu.stats.get()
            >>> print(f"vGPUs: {stats.vgpus}/{stats.vgpus_total}")
        """
        client = self._gpu_manager()._client
        return NodeGpuStatsManager(client, self.key)

    @property
    def instances(self) -> NodeGpuInstanceManager:
        """Instances of this GPU that are assigned to VMs.

        Returns:
            NodeGpuInstanceManager scoped to this GPU's key.

        Example:
            >>> for inst in gpu.instances.list():
            ...     print(f"VM: {inst.machine_name}")
        """
        client = self._gpu_manager()._client
        return NodeGpuInstanceManager(client, self.key)

    def refresh(self) -> NodeGpu:
        """Fetch this GPU again through its manager.

        Returns:
            A newly fetched NodeGpu object.
        """
        return self._gpu_manager().get(key=self.key)

    def save(self, **kwargs: Any) -> NodeGpu:
        """Send the given field values to the manager's ``update``.

        Args:
            **kwargs: Fields to update.

        Returns:
            Updated NodeGpu object.
        """
        return self._gpu_manager().update(self.key, **kwargs)

    def __repr__(self) -> str:
        key = self.get("$key", "?")
        label = self.name
        mode = self.mode
        usage = f"{self.instances_count}/{self.max_instances}"
        return f"<NodeGpu key={key} name={label!r} mode={mode} instances={usage}>"
[docs]
class NodeGpuManager(ResourceManager[NodeGpu]):
    """Manager for node GPU operations.

    Provides list, get and update against the ``node_gpus`` endpoint.
    When constructed with a ``node_key``, ``list`` (and therefore name
    lookups) is restricted to that node; otherwise it covers all nodes.

    Example:
        >>> # List all GPUs on a node
        >>> for gpu in node.gpus.list():
        ...     print(f"{gpu.name}: {gpu.mode_display}")
        >>> # Configure a GPU for passthrough
        >>> gpu = node.gpus.update(gpu.key, mode="gpu")
        >>> # Configure for vGPU with a specific profile
        >>> gpu = node.gpus.update(
        ...     gpu.key,
        ...     mode="nvidia_vgpu",
        ...     nvidia_vgpu_profile=profile.key
        ... )
    """

    _endpoint = "node_gpus"
    _default_fields = [
        "$key",
        "name",
        "description",
        "pci_device",
        "pci_device#name as pci_device_name",
        "node",
        "node#name as node_name",
        "mode",
        "nvidia_vgpu_profile",
        "display(nvidia_vgpu_profile) as nvidia_vgpu_profile_disp",
        "max_instances",
        "count(instances) as instances_count",
        "modified",
    ]

    def __init__(self, client: VergeClient, node_key: int | None = None) -> None:
        """Create the manager.

        Args:
            client: Client used for API requests.
            node_key: Optional node $key that ``list`` is restricted to.
        """
        super().__init__(client)
        self._node_key = node_key

    def _to_model(self, data: dict[str, Any]) -> NodeGpu:
        """Wrap a raw API record in a ``NodeGpu`` bound to this manager."""
        return NodeGpu(data, self)

    def list(
        self,
        filter: str | None = None,  # noqa: A002
        fields: builtins.list[str] | None = None,
        limit: int | None = None,
        offset: int | None = None,
        *,
        mode: Literal["none", "gpu", "nvidia_vgpu"] | None = None,
        enabled_only: bool = False,
        **filter_kwargs: Any,
    ) -> builtins.list[NodeGpu]:
        """List node GPUs.

        Args:
            filter: OData filter string. When it is combined with other
                clauses (node scope, mode, keyword filters) it is wrapped
                in parentheses so that an ``or`` inside it cannot escape
                those clauses.
            fields: List of fields to return (defaults to ``_default_fields``).
            limit: Maximum number of results.
            offset: Skip this many results.
            mode: Filter by GPU mode.
            enabled_only: Only return GPUs with a mode set (not 'none').
                Ignored when ``mode`` is given.
            **filter_kwargs: Additional filter arguments passed to
                ``build_filter``.

        Returns:
            List of NodeGpu objects (empty when the API returns nothing).

        Example:
            >>> # List all GPUs on a node
            >>> gpus = node.gpus.list()
            >>> # List only passthrough GPUs
            >>> passthrough_gpus = node.gpus.list(mode="gpu")
            >>> # List enabled GPUs
            >>> enabled = node.gpus.list(enabled_only=True)
        """
        if fields is None:
            fields = self._default_fields

        # Clauses generated by this method, as opposed to the caller's filter.
        own_clauses = []
        if self._node_key is not None:
            own_clauses.append(f"node eq {self._node_key}")
        if mode is not None:
            own_clauses.append(f"mode eq '{mode}'")
        elif enabled_only:
            own_clauses.append("mode ne 'none'")
        if filter_kwargs:
            own_clauses.append(build_filter(**filter_kwargs))

        clauses = []
        if filter:
            # "and" binds tighter than "or": an ungrouped "a or b" joined
            # with "node eq N" would match "a" on every node. A lone
            # filter is sent unchanged.
            clauses.append(f"({filter})" if own_clauses else filter)
        clauses.extend(own_clauses)

        params: dict[str, Any] = {"fields": ",".join(fields)}
        if clauses:
            params["filter"] = " and ".join(clauses)
        if limit is not None:
            params["limit"] = limit
        if offset is not None:
            params["offset"] = offset

        response = self._client._request("GET", self._endpoint, params=params)
        if response is None:
            return []
        # The API may answer with a single record instead of a list.
        records = response if isinstance(response, list) else [response]
        return [self._to_model(record) for record in records]

    def get(
        self,
        key: int | None = None,
        *,
        name: str | None = None,
        fields: builtins.list[str] | None = None,
    ) -> NodeGpu:
        """Get a GPU by key or name.

        ``key`` takes precedence when both are given. A key lookup hits
        ``node_gpus/<key>`` directly and does not apply the node scope; a
        name lookup goes through ``list`` and does.

        Args:
            key: GPU $key (ID).
            name: GPU name.
            fields: List of fields to return (defaults to ``_default_fields``).

        Returns:
            NodeGpu object.

        Raises:
            NotFoundError: If the GPU is not found or the key lookup
                returns something other than a single record.
            ValueError: If neither key nor name provided.
        """
        if fields is None:
            fields = self._default_fields

        if key is not None:
            params: dict[str, Any] = {"fields": ",".join(fields)}
            response = self._client._request("GET", f"{self._endpoint}/{key}", params=params)
            if response is None:
                raise NotFoundError(f"GPU with key {key} not found")
            if not isinstance(response, dict):
                raise NotFoundError(f"GPU with key {key} returned invalid response")
            return self._to_model(response)

        if name is not None:
            # Double embedded single quotes so the name stays one string literal.
            escaped = name.replace("'", "''")
            results = self.list(filter=f"name eq '{escaped}'", fields=fields, limit=1)
            if not results:
                raise NotFoundError(f"GPU with name '{name}' not found")
            return results[0]

        raise ValueError("Either key or name must be provided")

    def update(self, key: int, **kwargs: Any) -> NodeGpu:
        """Update a GPU configuration.

        Sends ``kwargs`` as the JSON body of a PUT to ``node_gpus/<key>``.
        When the response is not a single record, the GPU is re-fetched
        with ``get``.

        Args:
            key: GPU $key (ID).
            **kwargs: Fields to update. Common fields:
                - name: GPU name
                - description: Description
                - mode: Operating mode ('none', 'gpu', 'nvidia_vgpu')
                - nvidia_vgpu_profile: vGPU profile key (for nvidia_vgpu mode)

        Returns:
            Updated NodeGpu object.

        Example:
            >>> # Enable PCI passthrough
            >>> gpu = client.nodes.gpus(node_key).update(gpu.key, mode="gpu")
            >>> # Enable vGPU mode with a profile
            >>> gpu = client.nodes.gpus(node_key).update(
            ...     gpu.key,
            ...     mode="nvidia_vgpu",
            ...     nvidia_vgpu_profile=profile.key
            ... )
            >>> # Disable GPU
            >>> gpu = client.nodes.gpus(node_key).update(gpu.key, mode="none")
        """
        response = self._client._request("PUT", f"{self._endpoint}/{key}", json_data=kwargs)
        if isinstance(response, dict):
            return self._to_model(response)
        # No usable record in the PUT response (None or a non-dict).
        return self.get(key)
# =============================================================================
# Node GPU Stats
# =============================================================================
[docs]
class NodeGpuStats(ResourceObject):
    """Current utilization metrics of one node GPU.

    Counters are read through ``self.get`` with ``0`` as the default and
    coerced with ``int``.
    """

    def _counter(self, field: str) -> int:
        """Read ``field`` with ``0`` as the default and coerce via ``int``."""
        return int(self.get(field, 0))

    @property
    def gpu_key(self) -> int:
        """Key of the parent GPU."""
        return self._counter("node_gpu")

    @property
    def gpus_total(self) -> int:
        """Total number of GPU slots available."""
        return self._counter("gpus_total")

    @property
    def gpus(self) -> int:
        """Number of GPUs in use."""
        return self._counter("gpus")

    @property
    def gpus_idle(self) -> int:
        """Number of idle GPU slots."""
        return self._counter("gpus_idle")

    @property
    def vgpus_total(self) -> int:
        """Total number of vGPU slots available."""
        return self._counter("vgpus_total")

    @property
    def vgpus(self) -> int:
        """Number of vGPUs in use."""
        return self._counter("vgpus")

    @property
    def vgpus_idle(self) -> int:
        """Number of idle vGPU slots."""
        return self._counter("vgpus_idle")

    @property
    def timestamp(self) -> datetime | None:
        """UTC time of the stats sample, or ``None`` when not recorded."""
        epoch = self.get("timestamp")
        if not epoch:
            return None
        return datetime.fromtimestamp(int(epoch), tz=timezone.utc)

    def __repr__(self) -> str:
        owner = self.gpu_key
        physical = f"{self.gpus}/{self.gpus_total}"
        virtual = f"{self.vgpus}/{self.vgpus_total}"
        return f"<NodeGpuStats gpu={owner} gpus={physical} vgpus={virtual}>"
[docs]
class NodeGpuStatsHistory(ResourceObject):
    """One time point in the stats history of a node GPU.

    Counters are read through ``self.get`` with ``0`` as the default and
    coerced with ``int``.
    """

    def _counter(self, field: str) -> int:
        """Read ``field`` with ``0`` as the default and coerce via ``int``."""
        return int(self.get(field, 0))

    @property
    def gpu_key(self) -> int:
        """Key of the parent GPU."""
        return self._counter("node_gpu")

    @property
    def timestamp(self) -> datetime | None:
        """UTC time of this history point, or ``None`` when not recorded."""
        epoch = self.get("timestamp")
        if not epoch:
            return None
        return datetime.fromtimestamp(int(epoch), tz=timezone.utc)

    @property
    def timestamp_epoch(self) -> int:
        """The same timestamp as a Unix epoch integer (``0`` when absent)."""
        return self._counter("timestamp")

    @property
    def gpus_total(self) -> int:
        """Total number of GPU slots available."""
        return self._counter("gpus_total")

    @property
    def gpus(self) -> int:
        """Number of GPUs in use."""
        return self._counter("gpus")

    @property
    def gpus_idle(self) -> int:
        """Number of idle GPU slots."""
        return self._counter("gpus_idle")

    @property
    def vgpus_total(self) -> int:
        """Total number of vGPU slots available."""
        return self._counter("vgpus_total")

    @property
    def vgpus(self) -> int:
        """Number of vGPUs in use."""
        return self._counter("vgpus")

    @property
    def vgpus_idle(self) -> int:
        """Number of idle vGPU slots."""
        return self._counter("vgpus_idle")

    def __repr__(self) -> str:
        moment = self.timestamp
        ts = moment.isoformat() if moment else "?"
        return f"<NodeGpuStatsHistory ts={ts} vgpus={self.vgpus}/{self.vgpus_total}>"
[docs]
class NodeGpuStatsManager(ResourceManager[NodeGpuStats]):
    """Manager for the stats of one node GPU.

    Gives access to the current utilization record and to the short-term
    and long-term history. Every request is filtered to the GPU key given
    at construction.

    Example:
        >>> # Get current stats
        >>> stats = gpu.stats.get()
        >>> print(f"vGPUs: {stats.vgpus}/{stats.vgpus_total}")
        >>> # Get stats history
        >>> history = gpu.stats.history_short(limit=100)
    """

    _endpoint = "node_gpu_stats"
    _default_fields = [
        "$key",
        "node_gpu",
        "gpus_total",
        "gpus",
        "gpus_idle",
        "vgpus_total",
        "vgpus",
        "vgpus_idle",
        "timestamp",
    ]
    _history_fields = [
        "$key",
        "node_gpu",
        "timestamp",
        "gpus_total",
        "gpus",
        "gpus_idle",
        "vgpus_total",
        "vgpus",
        "vgpus_idle",
    ]

    def __init__(self, client: VergeClient, gpu_key: int) -> None:
        """Create the manager.

        Args:
            client: Client used for API requests.
            gpu_key: $key of the GPU whose stats are queried.
        """
        super().__init__(client)
        self._gpu_key = gpu_key

    def _to_model(self, data: dict[str, Any]) -> NodeGpuStats:
        """Wrap a raw record in a ``NodeGpuStats`` bound to this manager."""
        return NodeGpuStats(data, self)

    def _to_history_model(self, data: dict[str, Any]) -> NodeGpuStatsHistory:
        """Wrap a raw record in a ``NodeGpuStatsHistory`` bound to this manager."""
        return NodeGpuStatsHistory(data, self)

    def get(self, fields: builtins.list[str] | None = None) -> NodeGpuStats:  # type: ignore[override]
        """Get the current stats record of this GPU.

        Args:
            fields: List of fields to return (defaults to ``_default_fields``).

        Returns:
            NodeGpuStats object.

        Raises:
            NotFoundError: If the API returns nothing or an empty list.
        """
        selected = self._default_fields if fields is None else fields
        params: dict[str, Any] = {
            "filter": f"node_gpu eq {self._gpu_key}",
            "fields": ",".join(selected),
            "limit": 1,
        }
        response = self._client._request("GET", self._endpoint, params=params)

        is_list = isinstance(response, list)
        if response is None or (is_list and not response):
            raise NotFoundError(f"Stats not found for GPU {self._gpu_key}")
        # A list answer carries the record as its first element.
        record = response[0] if is_list else response
        return self._to_model(record)

    def history_short(
        self,
        limit: int | None = None,
        offset: int | None = None,
        since: datetime | int | None = None,
        until: datetime | int | None = None,
        fields: builtins.list[str] | None = None,
    ) -> builtins.list[NodeGpuStatsHistory]:
        """Get short-term GPU stats history (high resolution).

        Args:
            limit: Maximum number of records to return.
            offset: Skip this many records.
            since: Return records at or after this time (datetime or epoch).
            until: Return records at or before this time (datetime or epoch).
            fields: List of fields to return.

        Returns:
            List of NodeGpuStatsHistory objects, requested sorted by
            timestamp descending.
        """
        endpoint = "node_gpu_stats_history_short"
        return self._get_history(
            endpoint, limit=limit, offset=offset, since=since, until=until, fields=fields
        )

    def history_long(
        self,
        limit: int | None = None,
        offset: int | None = None,
        since: datetime | int | None = None,
        until: datetime | int | None = None,
        fields: builtins.list[str] | None = None,
    ) -> builtins.list[NodeGpuStatsHistory]:
        """Get long-term GPU stats history (lower resolution, longer retention).

        Args:
            limit: Maximum number of records to return.
            offset: Skip this many records.
            since: Return records at or after this time (datetime or epoch).
            until: Return records at or before this time (datetime or epoch).
            fields: List of fields to return.

        Returns:
            List of NodeGpuStatsHistory objects, requested sorted by
            timestamp descending.
        """
        endpoint = "node_gpu_stats_history_long"
        return self._get_history(
            endpoint, limit=limit, offset=offset, since=since, until=until, fields=fields
        )

    def _get_history(
        self,
        endpoint: str,
        limit: int | None = None,
        offset: int | None = None,
        since: datetime | int | None = None,
        until: datetime | int | None = None,
        fields: builtins.list[str] | None = None,
    ) -> builtins.list[NodeGpuStatsHistory]:
        """Query ``endpoint`` for this GPU's history records.

        Shared implementation behind ``history_short`` and ``history_long``.
        """
        selected = self._history_fields if fields is None else fields

        clauses = [f"node_gpu eq {self._gpu_key}"]
        # Both bounds are inclusive; datetimes are reduced to whole epoch seconds.
        for operator, bound in (("ge", since), ("le", until)):
            if bound is None:
                continue
            epoch = int(bound.timestamp()) if isinstance(bound, datetime) else int(bound)
            clauses.append(f"timestamp {operator} {epoch}")

        params: dict[str, Any] = {
            "filter": " and ".join(clauses),
            "fields": ",".join(selected),
            "sort": "-timestamp",
        }
        if limit is not None:
            params["limit"] = limit
        if offset is not None:
            params["offset"] = offset

        response = self._client._request("GET", endpoint, params=params)
        if response is None:
            return []
        # The API may answer with a single record instead of a list.
        records = response if isinstance(response, list) else [response]
        return [self._to_history_model(record) for record in records]
# =============================================================================
# Node GPU Instances
# =============================================================================
[docs]
class NodeGpuInstance(ResourceObject):
    """A GPU or vGPU instance assigned to a VM.

    Text fields are read through ``self.get`` with ``""`` as the default;
    key fields yield ``None`` when missing or falsy.
    """

    def _str_field(self, field: str) -> str:
        """Read ``field`` with ``""`` as the default and coerce via ``str``."""
        return str(self.get(field, ""))

    def _optional_key(self, field: str) -> int | None:
        """Return ``field`` as an int, or ``None`` when it is missing or falsy."""
        raw = self.get(field)
        if not raw:
            return None
        return int(raw)

    @property
    def gpu_key(self) -> int:
        """Key of the parent GPU (``0`` when absent)."""
        return int(self.get("gpu_key", 0))

    @property
    def gpu_name(self) -> str:
        """Name of the parent GPU."""
        return self._str_field("gpu_name")

    @property
    def node_key(self) -> int | None:
        """Key of the node, if any."""
        return self._optional_key("node_key")

    @property
    def node_name(self) -> str:
        """Name of the node (read from the ``node_display`` field)."""
        return self._str_field("node_display")

    @property
    def machine_key(self) -> int | None:
        """Key of the machine (VM), if any."""
        return self._optional_key("machine_key")

    @property
    def machine_name(self) -> str:
        """Name of the machine (VM)."""
        return self._str_field("machine_name")

    @property
    def machine_type(self) -> str:
        """Type of the machine, e.g. 'vm'."""
        return self._str_field("machine_type")

    @property
    def machine_type_display(self) -> str:
        """Display name of the machine type."""
        return self._str_field("machine_type_display")

    @property
    def machine_device_key(self) -> int | None:
        """Key of the machine device, if any."""
        return self._optional_key("machine_device_key")

    @property
    def machine_device_name(self) -> str:
        """Name of the machine device."""
        return self._str_field("machine_device_name")

    @property
    def machine_device_status(self) -> str:
        """Status of the machine device."""
        return self._str_field("machine_device_status")

    @property
    def pci_device_key(self) -> int | None:
        """Key of the PCI device, if any."""
        return self._optional_key("pci_device_key")

    @property
    def pci_device_name(self) -> str:
        """Name of the PCI device."""
        return self._str_field("pci_device_name")

    @property
    def mode(self) -> str:
        """GPU mode (gpu, nvidia_vgpu)."""
        return self._str_field("mode")

    @property
    def mode_display(self) -> str:
        """Display name of the GPU mode."""
        return self._str_field("mode_display")

    @property
    def description(self) -> str:
        """Description of the instance."""
        return self._str_field("description")

    @property
    def modified_at(self) -> datetime | None:
        """UTC time of the last modification, or ``None`` when not recorded."""
        epoch = self.get("modified")
        if not epoch:
            return None
        return datetime.fromtimestamp(int(epoch), tz=timezone.utc)

    def __repr__(self) -> str:
        key = self.get("$key", "?")
        return f"<NodeGpuInstance key={key} gpu={self.gpu_name!r} machine={self.machine_name!r}>"
[docs]
class NodeGpuInstanceManager(ResourceManager[NodeGpuInstance]):
    """Manager for node GPU instances.

    Lists the instances of one GPU that are assigned to VMs. Every
    ``list`` request is restricted to the GPU key given at construction.
    This class defines no write operations.

    Example:
        >>> # List instances for a GPU
        >>> for inst in gpu.instances.list():
        ...     print(f"VM: {inst.machine_name} ({inst.machine_device_status})")
    """

    _endpoint = "node_gpu_instances"
    _default_fields = [
        "$key",
        "gpu#$key as gpu_key",
        "gpu#name as gpu_name",
        "gpu#node#$key as node_key",
        "gpu#node#$display as node_display",
        "gpu#mode as mode",
        "gpu#display(mode) as mode_display",
        "gpu#pci_device#$key as pci_device_key",
        "gpu#pci_device#name as pci_device_name",
        "machine_device#$key as machine_device_key",
        "machine_device#name as machine_device_name",
        "machine_device#machine#$key as machine_key",
        "machine_device#machine#name as machine_name",
        "machine_device#machine#type as machine_type",
        "machine_device#machine#display(type) as machine_type_display",
        "machine_device#status#status as machine_device_status",
        "description",
        "modified",
    ]

    def __init__(self, client: VergeClient, gpu_key: int) -> None:
        """Create the manager.

        Args:
            client: Client used for API requests.
            gpu_key: $key of the GPU whose instances are listed.
        """
        super().__init__(client)
        self._gpu_key = gpu_key

    def _to_model(self, data: dict[str, Any]) -> NodeGpuInstance:
        """Wrap a raw API record in a ``NodeGpuInstance`` bound to this manager."""
        return NodeGpuInstance(data, self)

    def list(
        self,
        filter: str | None = None,  # noqa: A002
        fields: builtins.list[str] | None = None,
        limit: int | None = None,
        offset: int | None = None,
        **filter_kwargs: Any,
    ) -> builtins.list[NodeGpuInstance]:
        """List GPU instances of this manager's GPU.

        Args:
            filter: OData filter string. It is wrapped in parentheses
                before being combined with the GPU scope, so that an
                ``or`` inside it cannot match instances of other GPUs.
            fields: List of fields to return (defaults to ``_default_fields``).
            limit: Maximum number of results.
            offset: Skip this many results.
            **filter_kwargs: Additional filter arguments passed to
                ``build_filter``.

        Returns:
            List of NodeGpuInstance objects (empty when the API returns
            nothing).
        """
        if fields is None:
            fields = self._default_fields

        # The GPU scope is always present, so a caller filter is always
        # combined with it and must be grouped: "and" binds tighter than
        # "or", and "gpu eq N and a or b" would match "b" on any GPU.
        clauses = [f"gpu eq {self._gpu_key}"]
        if filter:
            clauses.append(f"({filter})")
        if filter_kwargs:
            clauses.append(build_filter(**filter_kwargs))

        params: dict[str, Any] = {
            "filter": " and ".join(clauses),
            "fields": ",".join(fields),
        }
        if limit is not None:
            params["limit"] = limit
        if offset is not None:
            params["offset"] = offset

        response = self._client._request("GET", self._endpoint, params=params)
        if response is None:
            return []
        # The API may answer with a single record instead of a list.
        records = response if isinstance(response, list) else [response]
        return [self._to_model(record) for record in records]
# =============================================================================
# Node vGPU Devices (Physical vGPU-capable devices)
# =============================================================================
[docs]
class NodeVgpuDevice(ResourceObject):
    """A physical NVIDIA vGPU-capable device on a node.

    Text fields are read through ``self.get`` with ``""`` as the default;
    key fields yield ``None`` when missing or falsy.
    """

    def _str_field(self, field: str) -> str:
        """Read ``field`` with ``""`` as the default and coerce via ``str``."""
        return str(self.get(field, ""))

    def _optional_key(self, field: str) -> int | None:
        """Return ``field`` as an int, or ``None`` when it is missing or falsy."""
        raw = self.get(field)
        if not raw:
            return None
        return int(raw)

    def _utc_time(self, field: str) -> datetime | None:
        """Return ``field`` (epoch seconds) as a UTC datetime, or ``None``."""
        epoch = self.get(field)
        if not epoch:
            return None
        return datetime.fromtimestamp(int(epoch), tz=timezone.utc)

    @property
    def node_key(self) -> int | None:
        """Key of the parent node, if any."""
        return self._optional_key("node")

    @property
    def node_name(self) -> str:
        """Name of the parent node."""
        return self._str_field("node_name")

    @property
    def pci_device_key(self) -> int | None:
        """Key of the associated PCI device, if any."""
        return self._optional_key("pci_device")

    @property
    def name(self) -> str:
        """Name of the device."""
        return self._str_field("name")

    @property
    def slot(self) -> str:
        """PCI slot of the device."""
        return self._str_field("slot")

    @property
    def vendor(self) -> str:
        """Vendor name."""
        return self._str_field("vendor")

    @property
    def device(self) -> str:
        """Device description."""
        return self._str_field("device")

    @property
    def vendor_device_hex(self) -> str:
        """Hexadecimal vendor:device ID."""
        return self._str_field("vendor_device_hex")

    @property
    def driver(self) -> str:
        """Driver currently in use."""
        return self._str_field("driver")

    @property
    def module(self) -> str:
        """Kernel module."""
        return self._str_field("module")

    @property
    def numa_node(self) -> str:
        """NUMA node (read from the ``numa`` field)."""
        return self._str_field("numa")

    @property
    def iommu_group(self) -> str:
        """IOMMU group."""
        return self._str_field("iommu_group")

    @property
    def type_id(self) -> int:
        """Device type ID (``0`` when absent)."""
        return int(self.get("type_id", 0))

    @property
    def max_instances(self) -> int:
        """Maximum number of vGPU instances (``1`` when absent)."""
        return int(self.get("max_instances", 1))

    @property
    def physical_function(self) -> str:
        """Physical function (for SR-IOV)."""
        return self._str_field("physical_function")

    @property
    def virtual_function(self) -> str:
        """Virtual function identifier (read from the ``virtfn`` field)."""
        return self._str_field("virtfn")

    @property
    def fingerprint(self) -> str:
        """Device fingerprint for live migration."""
        return self._str_field("fingerprint")

    @property
    def created_at(self) -> datetime | None:
        """UTC time the device was detected, or ``None`` when not recorded."""
        return self._utc_time("created")

    @property
    def modified_at(self) -> datetime | None:
        """UTC time the device was last updated, or ``None`` when not recorded."""
        return self._utc_time("modified")

    def __repr__(self) -> str:
        key = self.get("$key", "?")
        return f"<NodeVgpuDevice key={key} name={self.name!r} slot={self.slot!r}>"
[docs]
class NodeVgpuDeviceManager(ResourceManager[NodeVgpuDevice]):
    """Manager for node vGPU device operations.

    Provides list and get against the ``node_nvidia_vgpu_devices``
    endpoint. When constructed with a ``node_key``, ``list`` is restricted
    to that node; otherwise it covers all nodes. This class defines no
    write operations.

    Example:
        >>> # List all vGPU devices on a node
        >>> for device in node.vgpu_devices.list():
        ...     print(f"{device.name}: {device.vendor} {device.device}")
        >>> # List all vGPU devices across all nodes
        >>> for device in client.nodes.all_vgpu_devices.list():
        ...     print(f"{device.node_name}: {device.name}")
    """

    _endpoint = "node_nvidia_vgpu_devices"
    _default_fields = [
        "$key",
        "node",
        "node#name as node_name",
        "pci_device",
        "name",
        "slot",
        "vendor",
        "device",
        "vendor_device_hex",
        "driver",
        "module",
        "numa",
        "iommu_group",
        "type_id",
        "max_instances",
        "physical_function",
        "virtfn",
        "fingerprint",
        "created",
        "modified",
    ]

    def __init__(self, client: VergeClient, node_key: int | None = None) -> None:
        """Create the manager.

        Args:
            client: Client used for API requests.
            node_key: Optional node $key that ``list`` is restricted to.
        """
        super().__init__(client)
        self._node_key = node_key

    def _to_model(self, data: dict[str, Any]) -> NodeVgpuDevice:
        """Wrap a raw API record in a ``NodeVgpuDevice`` bound to this manager."""
        return NodeVgpuDevice(data, self)

    def list(
        self,
        filter: str | None = None,  # noqa: A002
        fields: builtins.list[str] | None = None,
        limit: int | None = None,
        offset: int | None = None,
        *,
        vendor: str | None = None,
        **filter_kwargs: Any,
    ) -> builtins.list[NodeVgpuDevice]:
        """List vGPU-capable devices.

        Args:
            filter: OData filter string. When it is combined with other
                clauses (node scope, vendor, keyword filters) it is wrapped
                in parentheses so that an ``or`` inside it cannot escape
                those clauses.
            fields: List of fields to return (defaults to ``_default_fields``).
            limit: Maximum number of results.
            offset: Skip this many results.
            vendor: Filter by vendor name (contains).
            **filter_kwargs: Additional filter arguments passed to
                ``build_filter``.

        Returns:
            List of NodeVgpuDevice objects (empty when the API returns
            nothing).
        """
        if fields is None:
            fields = self._default_fields

        # Clauses generated by this method, as opposed to the caller's filter.
        own_clauses = []
        if self._node_key is not None:
            own_clauses.append(f"node eq {self._node_key}")
        if vendor is not None:
            # Double embedded single quotes so the value stays one string literal.
            escaped = vendor.replace("'", "''")
            own_clauses.append(f"vendor ct '{escaped}'")
        if filter_kwargs:
            own_clauses.append(build_filter(**filter_kwargs))

        clauses = []
        if filter:
            # "and" binds tighter than "or": an ungrouped "a or b" joined
            # with "node eq N" would match "a" on every node. A lone
            # filter is sent unchanged.
            clauses.append(f"({filter})" if own_clauses else filter)
        clauses.extend(own_clauses)

        params: dict[str, Any] = {"fields": ",".join(fields)}
        if clauses:
            params["filter"] = " and ".join(clauses)
        if limit is not None:
            params["limit"] = limit
        if offset is not None:
            params["offset"] = offset

        response = self._client._request("GET", self._endpoint, params=params)
        if response is None:
            return []
        # The API may answer with a single record instead of a list.
        records = response if isinstance(response, list) else [response]
        return [self._to_model(record) for record in records]

    def get(  # type: ignore[override]
        self,
        key: int,
        *,
        fields: builtins.list[str] | None = None,
    ) -> NodeVgpuDevice:
        """Get a vGPU device by key.

        The lookup hits ``node_nvidia_vgpu_devices/<key>`` directly and
        does not apply the node scope.

        Args:
            key: Device $key (ID).
            fields: List of fields to return (defaults to ``_default_fields``).

        Returns:
            NodeVgpuDevice object.

        Raises:
            NotFoundError: If the device is not found or the response is
                not a single record.
        """
        if fields is None:
            fields = self._default_fields

        params: dict[str, Any] = {"fields": ",".join(fields)}
        response = self._client._request("GET", f"{self._endpoint}/{key}", params=params)
        if response is None:
            raise NotFoundError(f"vGPU device with key {key} not found")
        if not isinstance(response, dict):
            raise NotFoundError(f"vGPU device with key {key} returned invalid response")
        return self._to_model(response)
# =============================================================================
# Node Host GPU Devices (Physical GPUs for passthrough)
# =============================================================================
[docs]
class NodeHostGpuDevice(ResourceObject):
    """Node host GPU device resource object.

    Represents a physical GPU device available for host GPU passthrough.
    These are detected automatically by the system.

    Text properties return ``""`` and integer properties return their
    fallback value when the underlying field is missing or null.
    """

    def _str_field(self, field: str) -> str:
        """Return ``field`` as text, or ``""`` when it is missing or null."""
        value = self.get(field, "")
        # The ``get`` default does not cover a present-but-null field, and
        # ``str(None)`` would otherwise surface as the text "None".
        return "" if value is None else str(value)

    def _int_field(self, field: str, fallback: int) -> int:
        """Return ``field`` as an int, or ``fallback`` when missing or null."""
        value = self.get(field, fallback)
        # ``int(None)`` raises TypeError, so treat null like a missing field.
        return fallback if value is None else int(value)

    @property
    def node_key(self) -> int | None:
        """Parent node key, or ``None`` when the field is missing or falsy."""
        node = self.get("node")
        return int(node) if node else None

    @property
    def node_name(self) -> str:
        """Parent node name."""
        return self._str_field("node_name")

    @property
    def pci_device_key(self) -> int | None:
        """Associated PCI device key, or ``None`` when missing or falsy."""
        pci = self.get("pci_device")
        return int(pci) if pci else None

    @property
    def name(self) -> str:
        """Device name."""
        return self._str_field("name")

    @property
    def slot(self) -> str:
        """PCI slot."""
        return self._str_field("slot")

    @property
    def vendor(self) -> str:
        """Vendor name."""
        return self._str_field("vendor")

    @property
    def device(self) -> str:
        """Device description."""
        return self._str_field("device")

    @property
    def vendor_device_hex(self) -> str:
        """Vendor:device ID in hexadecimal."""
        return self._str_field("vendor_device_hex")

    @property
    def driver(self) -> str:
        """Current driver."""
        return self._str_field("driver")

    @property
    def module(self) -> str:
        """Kernel module."""
        return self._str_field("module")

    @property
    def numa_node(self) -> str:
        """NUMA node (read from the ``numa`` field)."""
        return self._str_field("numa")

    @property
    def iommu_group(self) -> str:
        """IOMMU group."""
        return self._str_field("iommu_group")

    @property
    def type_id(self) -> int:
        """Device type ID (0 when missing or null)."""
        return self._int_field("type_id", 0)

    @property
    def device_index(self) -> int:
        """Device index (0 when missing or null)."""
        return self._int_field("device_index", 0)

    @property
    def max_instances(self) -> int:
        """Maximum instances (typically 1 for passthrough; 1 when missing or null)."""
        return self._int_field("max_instances", 1)

    @property
    def fingerprint(self) -> str:
        """Device fingerprint for live migration."""
        return self._str_field("fingerprint")

    @property
    def created_at(self) -> datetime | None:
        """Timestamp when device was detected.

        Returns:
            UTC ``datetime`` built from the ``created`` field, or ``None``
            when the field is missing or falsy.
        """
        ts = self.get("created")
        if ts:
            return datetime.fromtimestamp(int(ts), tz=timezone.utc)
        return None

    @property
    def modified_at(self) -> datetime | None:
        """Timestamp when device was last updated.

        Returns:
            UTC ``datetime`` built from the ``modified`` field, or ``None``
            when the field is missing or falsy.
        """
        ts = self.get("modified")
        if ts:
            return datetime.fromtimestamp(int(ts), tz=timezone.utc)
        return None

    def __repr__(self) -> str:
        """Return a short debug string showing the record key, name and slot."""
        return (
            f"<NodeHostGpuDevice key={self.get('$key', '?')} name={self.name!r} slot={self.slot!r}>"
        )
[docs]
class NodeHostGpuDeviceManager(ResourceManager[NodeHostGpuDevice]):
    """Manager for node host GPU device operations.

    Provides read-only access to GPUs available for host passthrough on nodes.
    Can be used globally or scoped to a specific node.

    Example:
        >>> # List all host GPUs on a node
        >>> for device in node.host_gpu_devices.list():
        ...     print(f"{device.name}: {device.vendor} {device.device}")
        >>> # List all host GPUs across all nodes
        >>> for device in client.nodes.all_host_gpu_devices.list():
        ...     print(f"{device.node_name}: {device.name}")
    """

    _endpoint = "node_host_gpu_devices"

    # Fields requested whenever the caller does not pass ``fields``.
    _default_fields = [
        "$key",
        "node",
        "node#name as node_name",
        "pci_device",
        "name",
        "slot",
        "vendor",
        "device",
        "vendor_device_hex",
        "driver",
        "module",
        "numa",
        "iommu_group",
        "type_id",
        "device_index",
        "max_instances",
        "fingerprint",
        "created",
        "modified",
    ]

    def __init__(self, client: VergeClient, node_key: int | None = None) -> None:
        """Create the manager.

        Args:
            client: Client used to issue requests.
            node_key: Optional node key; when set, ``list`` adds a
                ``node eq <key>`` clause to every query.
        """
        super().__init__(client)
        self._node_key = node_key

    def _to_model(self, data: dict[str, Any]) -> NodeHostGpuDevice:
        """Wrap one raw record in a ``NodeHostGpuDevice`` bound to this manager."""
        return NodeHostGpuDevice(data, self)

    def list(
        self,
        filter: str | None = None,  # noqa: A002
        fields: builtins.list[str] | None = None,
        limit: int | None = None,
        offset: int | None = None,
        *,
        vendor: str | None = None,
        **filter_kwargs: Any,
    ) -> builtins.list[NodeHostGpuDevice]:
        """List host GPU devices.

        Args:
            filter: OData filter string.
            fields: List of fields to return (defaults to ``_default_fields``).
            limit: Maximum number of results.
            offset: Skip this many results.
            vendor: Filter by vendor name (contains).
            **filter_kwargs: Additional filter arguments, passed to
                ``build_filter``.

        Returns:
            List of NodeHostGpuDevice objects; empty when the request
            yields no response.
        """
        wanted = fields if fields is not None else self._default_fields
        request_params: dict[str, Any] = {"fields": ",".join(wanted)}

        # Conditions are AND-ed in this order: caller filter, node scope,
        # vendor match, keyword filters.
        conditions = []
        if filter:
            conditions.append(filter)
        if self._node_key is not None:
            conditions.append(f"node eq {self._node_key}")
        if vendor is not None:
            # Single quotes in the value are doubled before it is embedded
            # in the quoted literal.
            conditions.append("vendor ct '{}'".format(vendor.replace("'", "''")))
        if filter_kwargs:
            conditions.append(build_filter(**filter_kwargs))
        if conditions:
            request_params["filter"] = " and ".join(conditions)

        # Paging parameters are only sent when explicitly provided.
        for param_name, param_value in (("limit", limit), ("offset", offset)):
            if param_value is not None:
                request_params[param_name] = param_value

        result = self._client._request("GET", self._endpoint, params=request_params)
        if result is None:
            return []
        if not isinstance(result, list):
            # A non-list payload is treated as a single record.
            return [self._to_model(result)]
        return [self._to_model(entry) for entry in result]

    def get(  # type: ignore[override]
        self,
        key: int,
        *,
        fields: builtins.list[str] | None = None,
    ) -> NodeHostGpuDevice:
        """Get a host GPU device by key.

        Args:
            key: Device $key (ID).
            fields: List of fields to return (defaults to ``_default_fields``).

        Returns:
            NodeHostGpuDevice object.

        Raises:
            NotFoundError: If the request yields no response, or a response
                that is not a single record (dict).
        """
        wanted = fields if fields is not None else self._default_fields
        request_params: dict[str, Any] = {"fields": ",".join(wanted)}
        result = self._client._request(
            "GET", f"{self._endpoint}/{key}", params=request_params
        )
        if result is None:
            raise NotFoundError(f"Host GPU device with key {key} not found")
        if isinstance(result, dict):
            return self._to_model(result)
        raise NotFoundError(f"Host GPU device with key {key} returned invalid response")
# =============================================================================
# Node vGPU Profiles (Per-node available profiles)
# =============================================================================
[docs]
class NodeVgpuProfile(ResourceObject):
    """Node vGPU profile resource object.

    Represents a vGPU profile available on a specific node's physical GPU.
    These are determined by the physical hardware and NVIDIA drivers.

    Text properties return ``""`` and integer properties return ``0`` when
    the underlying field is missing or null, so ``repr()`` never fails on a
    partially populated record.
    """

    def _str_field(self, field: str) -> str:
        """Return ``field`` as text, or ``""`` when it is missing or null."""
        value = self.get(field, "")
        # The ``get`` default does not cover a present-but-null field, and
        # ``str(None)`` would otherwise surface as the text "None".
        return "" if value is None else str(value)

    def _int_field(self, field: str, fallback: int = 0) -> int:
        """Return ``field`` as an int, or ``fallback`` when missing or null."""
        value = self.get(field, fallback)
        # ``int(None)`` raises TypeError, so treat null like a missing field.
        return fallback if value is None else int(value)

    @property
    def physical_gpu_key(self) -> int | None:
        """Associated physical GPU (PCI device) key, or ``None`` when missing or falsy."""
        pci = self.get("physical_gpu")
        return int(pci) if pci else None

    @property
    def name(self) -> str:
        """Profile name (e.g., 'nvidia-256')."""
        return self._str_field("name")

    @property
    def num_heads(self) -> int:
        """Number of display heads."""
        return self._int_field("num_heads")

    @property
    def frl_config(self) -> int:
        """Frame rate limiter configuration."""
        return self._int_field("frl_config")

    @property
    def framebuffer(self) -> str:
        """Framebuffer (VRAM) size."""
        return self._str_field("framebuffer")

    @property
    def max_resolution(self) -> str:
        """Maximum resolution."""
        return self._str_field("max_resolution")

    @property
    def max_instance(self) -> int:
        """Maximum instances per GPU."""
        return self._int_field("max_instance")

    @property
    def available_instances(self) -> int:
        """Currently available instances."""
        return self._int_field("available_instances")

    @property
    def device_api(self) -> str:
        """Device API version."""
        return self._str_field("device_api")

    @property
    def profile_type(self) -> str:
        """Profile type code (A, B, C, Q)."""
        return self._str_field("profile_type")

    @property
    def profile_type_display(self) -> str:
        """Human-readable profile type.

        Falls back to the raw type code when it has no entry in
        ``PROFILE_TYPE_DISPLAY``.
        """
        return PROFILE_TYPE_DISPLAY.get(self.profile_type, self.profile_type)

    @property
    def is_virtual_function(self) -> bool:
        """Whether this is a virtual function profile."""
        return bool(self.get("virtual_function", False))

    @property
    def profile_folder(self) -> str:
        """Profile folder path."""
        return self._str_field("profile_folder")

    def __repr__(self) -> str:
        """Return a short debug string with key, name and available/max instances."""
        return (
            f"<NodeVgpuProfile key={self.get('$key', '?')} "
            f"name={self.name!r} avail={self.available_instances}/{self.max_instance}>"
        )
[docs]
class NodeVgpuProfileManager(ResourceManager[NodeVgpuProfile]):
    """Manager for node vGPU profile operations.

    Provides read-only access to vGPU profiles available on a specific node.
    These are determined by the physical hardware.

    Example:
        >>> # List profiles available on a node's GPU
        >>> for profile in node.vgpu_profiles.list():
        ...     print(f"{profile.name}: {profile.available_instances} available")
    """

    _endpoint = "node_nvidia_vgpu_profiles"

    # Fields requested whenever the caller does not pass ``fields``.
    _default_fields = [
        "$key",
        "physical_gpu",
        "name",
        "num_heads",
        "frl_config",
        "framebuffer",
        "max_resolution",
        "max_instance",
        "available_instances",
        "device_api",
        "profile_type",
        "virtual_function",
        "profile_folder",
    ]

    def __init__(self, client: VergeClient, physical_gpu_key: int | None = None) -> None:
        """Create the manager.

        Args:
            client: Client used to issue requests.
            physical_gpu_key: Optional physical GPU key; when set, ``list``
                adds a ``physical_gpu eq <key>`` clause to every query.
        """
        super().__init__(client)
        self._physical_gpu_key = physical_gpu_key

    def _to_model(self, data: dict[str, Any]) -> NodeVgpuProfile:
        """Wrap one raw record in a ``NodeVgpuProfile`` bound to this manager."""
        return NodeVgpuProfile(data, self)

    def list(
        self,
        filter: str | None = None,  # noqa: A002
        fields: builtins.list[str] | None = None,
        limit: int | None = None,
        offset: int | None = None,
        *,
        profile_type: Literal["A", "B", "C", "Q"] | None = None,
        **filter_kwargs: Any,
    ) -> builtins.list[NodeVgpuProfile]:
        """List vGPU profiles.

        Args:
            filter: OData filter string.
            fields: List of fields to return (defaults to ``_default_fields``).
            limit: Maximum number of results.
            offset: Skip this many results.
            profile_type: Filter by profile type.
            **filter_kwargs: Additional filter arguments, passed to
                ``build_filter``.

        Returns:
            List of NodeVgpuProfile objects; empty when the request yields
            no response.
        """
        if fields is None:
            fields = self._default_fields

        # Clauses are AND-ed in this order: caller filter, GPU scope,
        # profile type, keyword filters.
        filters = []
        if filter:
            filters.append(filter)
        if self._physical_gpu_key is not None:
            filters.append(f"physical_gpu eq {self._physical_gpu_key}")
        if profile_type is not None:
            # Double single quotes, as is done for the other quoted filter
            # values (``name`` in ``get``), so an out-of-contract value
            # cannot break out of the quoted literal. This is a no-op for
            # the documented codes A/B/C/Q.
            escaped_type = str(profile_type).replace("'", "''")
            filters.append(f"profile_type eq '{escaped_type}'")
        if filter_kwargs:
            filters.append(build_filter(**filter_kwargs))

        params: dict[str, Any] = {"fields": ",".join(fields)}
        if filters:
            params["filter"] = " and ".join(filters)
        if limit is not None:
            params["limit"] = limit
        if offset is not None:
            params["offset"] = offset

        response = self._client._request("GET", self._endpoint, params=params)
        if response is None:
            return []
        if isinstance(response, list):
            return [self._to_model(item) for item in response]
        # A non-list payload is treated as a single record.
        return [self._to_model(response)]

    def get(
        self,
        key: int | None = None,
        *,
        name: str | None = None,
        fields: builtins.list[str] | None = None,
    ) -> NodeVgpuProfile:
        """Get a vGPU profile by key or name.

        When both ``key`` and ``name`` are given, ``key`` takes precedence.
        A lookup by name goes through ``list`` (so it honours the manager's
        physical GPU scope) and returns the first match.

        Args:
            key: Profile $key (ID).
            name: Profile name.
            fields: List of fields to return (defaults to ``_default_fields``).

        Returns:
            NodeVgpuProfile object.

        Raises:
            NotFoundError: If profile not found, or the by-key response is
                not a single record (dict).
            ValueError: If neither key nor name provided.
        """
        if fields is None:
            fields = self._default_fields

        if key is not None:
            params: dict[str, Any] = {"fields": ",".join(fields)}
            response = self._client._request("GET", f"{self._endpoint}/{key}", params=params)
            if response is None:
                raise NotFoundError(f"vGPU profile with key {key} not found")
            if not isinstance(response, dict):
                raise NotFoundError(f"vGPU profile with key {key} returned invalid response")
            return self._to_model(response)

        if name is not None:
            # Single quotes in the name are doubled before it is embedded
            # in the quoted literal.
            escaped = name.replace("'", "''")
            results = self.list(filter=f"name eq '{escaped}'", fields=fields, limit=1)
            if not results:
                raise NotFoundError(f"vGPU profile with name '{name}' not found")
            return results[0]

        raise ValueError("Either key or name must be provided")