diff --git a/internal/controller/device/vpci/doc.go b/internal/controller/device/vpci/doc.go new file mode 100644 index 0000000000..e7aa197c50 --- /dev/null +++ b/internal/controller/device/vpci/doc.go @@ -0,0 +1,30 @@ +//go:build windows + +// Package vpci provides a controller for managing virtual PCI (vPCI) device +// assignments on a Utility VM (UVM). It handles assigning and removing +// PCI devices from the UVM via HCS modify calls. +// +// # Lifecycle +// +// [Manager] tracks active device assignments by VMBus GUID (device identifier +// within UVM) in an internal map. Each assignment is reference-counted to +// support shared access by multiple callers. +// +// - [Controller.AddToVM] assigns a device and records it in the map. +// If the same device is already assigned, the reference count is incremented. +// - [Controller.RemoveFromVM] decrements the reference count for the device +// identified by VMBus GUID. When it reaches zero, the device is removed +// from the VM. +// +// # Invalid Devices +// +// If the host-side assignment succeeds but the guest-side notification fails, +// the device is marked invalid. It remains tracked so that the caller can call +// [Controller.RemoveFromVM] to perform host-side cleanup. +// +// # Guest Requests +// +// On LCOW, assigning a vPCI device requires a guest-side notification so the +// GCS can wait for the required device paths to become available. +// WCOW does not require a guest request as part of device assignment. 
+package vpci
diff --git a/internal/controller/device/vpci/interface.go b/internal/controller/device/vpci/interface.go
new file mode 100644
index 0000000000..347143b486
--- /dev/null
+++ b/internal/controller/device/vpci/interface.go
@@ -0,0 +1,82 @@
+//go:build windows
+
+package vpci
+
+import (
+	"context"
+
+	hcsschema "github.com/Microsoft/hcsshim/internal/hcs/schema2"
+	"github.com/Microsoft/hcsshim/internal/protocol/guestresource"
+)
+
+// Controller manages the lifecycle of vPCI devices assigned to a UVM.
+type Controller interface {
+	// AddToVM assigns the vPCI device described by opts to the VM. If that device is
+	// already assigned under the same VMBus GUID, its reference count is incremented.
+	AddToVM(ctx context.Context, opts *AddOptions) error
+
+	// RemoveFromVM removes a vPCI device identified by vmBusGUID from the VM.
+	// If the device is shared (reference count > 1), the reference count is
+	// decremented without actually removing the device.
+	RemoveFromVM(ctx context.Context, vmBusGUID string) error
+}
+
+// AddOptions holds the configuration required to assign a vPCI device to the VM.
+type AddOptions struct {
+	// DeviceInstanceID is the host device instance path of the vPCI device.
+	DeviceInstanceID string
+
+	// VirtualFunctionIndex is the SR-IOV virtual function index to assign.
+	VirtualFunctionIndex uint16
+
+	// VMBusGUID identifies the VirtualPci device (backed by a VMBus channel)
+	// inside the UVM. It is required; AddToVM rejects an empty value.
+	VMBusGUID string
+}
+
+// vmVPCI manages adding and removing vPCI devices for a Utility VM.
+// Implemented by [vmmanager.UtilityVM].
+type vmVPCI interface {
+	// AddDevice adds a vPCI device identified by `vmBusGUID` to the Utility VM with the provided settings.
+	AddDevice(ctx context.Context, vmBusGUID string, settings hcsschema.VirtualPciDevice) error
+
+	// RemoveDevice removes the vPCI device identified by `vmBusGUID` from the Utility VM.
+	RemoveDevice(ctx context.Context, vmBusGUID string) error
+}
+
+// linuxGuestVPCI exposes vPCI device operations in the LCOW guest.
+// Implemented by [guestmanager.Guest].
+type linuxGuestVPCI interface {
+	// AddVPCIDevice adds a vPCI device to the guest.
+	AddVPCIDevice(ctx context.Context, settings guestresource.LCOWMappedVPCIDevice) error
+}
+
+// ==============================================================================
+// INTERNAL DATA STRUCTURES
+// ==============================================================================
+
+// deviceKey uniquely identifies a host vPCI device by its instance ID and
+// virtual function index. It is comparable, so it can be used as a map key.
+type deviceKey struct {
+	deviceInstanceID     string
+	virtualFunctionIndex uint16
+}
+
+// deviceInfo records one vPCI device's assignment state and reference count.
+type deviceInfo struct {
+	// key is the immutable host device identifier used to detect duplicate
+	// assignment requests.
+	key deviceKey
+
+	// vmBusGUID identifies the VirtualPci device (backed by a VMBus channel)
+	// inside the UVM.
+	vmBusGUID string
+
+	// refCount is the number of active callers sharing this device.
+	// Access must be guarded by [Manager.mu].
+	refCount uint32
+
+	// invalid indicates the host-side assignment succeeded but the guest-side
+	// assignment failed. Access must be guarded by [Manager.mu].
+	invalid bool
+}
diff --git a/internal/controller/device/vpci/utils.go b/internal/controller/device/vpci/utils.go
index 044764057d..3d8395362a 100644
--- a/internal/controller/device/vpci/utils.go
+++ b/internal/controller/device/vpci/utils.go
@@ -3,6 +3,7 @@
 package vpci
 
 import (
+	"fmt"
 	"path/filepath"
 	"strconv"
 )
@@ -17,6 +18,16 @@ const (
 	DeviceIDType = "vpci-instance-id"
 )
 
+const (
+	// vmBusChannelTypeGUIDFormatted is the well-known channel type GUID defined by
+	// VMBus for all assigned devices. The surrounding braces are part of the value.
+	vmBusChannelTypeGUIDFormatted = "{44c4f61d-4444-4400-9d52-802e27ede19f}"
+
+	// assignedDeviceEnumerator is the VMBus enumerator prefix used in device
+	// instance IDs for assigned devices.
+	assignedDeviceEnumerator = "VMBUS"
+)
+
 // IsValidDeviceType returns true if the device type is valid i.e. supported by the runtime.
 func IsValidDeviceType(deviceType string) bool {
 	return (deviceType == DeviceIDType) ||
@@ -36,3 +47,22 @@ func GetDeviceInfoFromPath(rawDevicePath string) (string, uint16) {
 	// otherwise, just use default index and full device ID given
 	return rawDevicePath, 0
 }
+
+// GetAssignedDeviceVMBUSInstanceID returns the instance ID of the VMBus channel
+// device node created when a device is assigned to a UVM via vPCI.
+//
+// When a device is assigned to a UVM via VPCI support in HCS, a new VMBUS channel device node is
+// created in the UVM. The device that was actually assigned is exposed as a child of this VMBUS
+// channel device node.
+//
+// A device node's instance ID is an identifier that distinguishes that device from other devices
+// on the system. The GUID of a VMBUS channel device node refers to that channel's unique
+// identifier used internally by VMBUS and can be used to determine the VMBUS channel
+// device node's instance ID.
+//
+// A VMBus channel device node's instance ID is in the form (vmBusChannelGUID is passed without braces):
+//
+//	"VMBUS\{channelTypeGUID}\{vmBusChannelGUID}"
+func GetAssignedDeviceVMBUSInstanceID(vmBusChannelGUID string) string {
+	return fmt.Sprintf("%s\\%s\\{%s}", assignedDeviceEnumerator, vmBusChannelTypeGUIDFormatted, vmBusChannelGUID)
+}
diff --git a/internal/controller/device/vpci/vpci.go b/internal/controller/device/vpci/vpci.go
new file mode 100644
index 0000000000..ffcafe0735
--- /dev/null
+++ b/internal/controller/device/vpci/vpci.go
@@ -0,0 +1,199 @@
+//go:build windows
+
+package vpci
+
+import (
+	"context"
+	"fmt"
+	"sync"
+
+	"github.com/sirupsen/logrus"
+
+	hcsschema "github.com/Microsoft/hcsshim/internal/hcs/schema2"
+	"github.com/Microsoft/hcsshim/internal/log"
+)
+
+// Manager is the concrete implementation of [Controller]; AddToVM and RemoveFromVM serialize on mu.
+type Manager struct {
+	mu sync.Mutex
+
+	// devices tracks currently assigned vPCI devices, keyed by VMBus GUID.
+	// Guarded by mu.
+	devices map[string]*deviceInfo
+
+	// keyToGUID maps a [deviceKey] to its VMBus GUID for duplicate detection
+	// during [Manager.AddToVM]. Guarded by mu.
+	keyToGUID map[deviceKey]string
+
+	// vmVPCI performs host-side vPCI device add/remove on the VM.
+	vmVPCI vmVPCI
+
+	// linuxGuestVPCI performs guest-side vPCI device setup for LCOW.
+	linuxGuestVPCI linuxGuestVPCI
+}
+
+var _ Controller = (*Manager)(nil)
+
+// New creates a ready-to-use [Manager] with empty tracking maps and the given host/guest vPCI backends.
+func New(
+	vmVPCI vmVPCI,
+	linuxGuestVPCI linuxGuestVPCI,
+) *Manager {
+	return &Manager{
+		vmVPCI:         vmVPCI,
+		linuxGuestVPCI: linuxGuestVPCI,
+		devices:        make(map[string]*deviceInfo),
+		keyToGUID:      make(map[deviceKey]string),
+	}
+}
+
+// AddToVM assigns a vPCI device to the VM.
+// If the same device is already assigned under opts.VMBusGUID, the existing assignment is reused.
+// A nil opts or an empty opts.VMBusGUID is rejected with an error rather than a panic.
+func (m *Manager) AddToVM(ctx context.Context, opts *AddOptions) error {
+	if opts == nil || opts.VMBusGUID == "" {
+		return fmt.Errorf("vmbus guid is required in add options")
+	}
+
+	key := deviceKey{
+		deviceInstanceID:     opts.DeviceInstanceID,
+		virtualFunctionIndex: opts.VirtualFunctionIndex,
+	}
+
+	// Set vmBusGUID in logging context.
+	ctx, _ = log.WithContext(ctx, logrus.WithField("vmBusGUID", opts.VMBusGUID))
+
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	// Check if the caller-provided GUID is already tracked.
+	if existingDev, ok := m.devices[opts.VMBusGUID]; ok {
+		// The GUID exists — verify the device settings match what was originally assigned.
+		// A mismatch means the caller is trying to reuse a GUID for a different device,
+		// which is a configuration error.
+		if existingDev.key != key {
+			return fmt.Errorf(
+				"vmBusGUID %s is already assigned to device (instanceID=%s, vfIndex=%d), but caller provided different settings (instanceID=%s, vfIndex=%d)",
+				opts.VMBusGUID,
+				existingDev.key.deviceInstanceID, existingDev.key.virtualFunctionIndex,
+				key.deviceInstanceID, key.virtualFunctionIndex,
+			)
+		}
+
+		// If a previous assignment left the device in an invalid state,
+		// reject new callers until the existing assignment is cleaned up.
+		if existingDev.invalid {
+			return fmt.Errorf("vpci device with vmBusGUID %s is in an invalid state", opts.VMBusGUID)
+		}
+
+		// Same GUID, same device — reuse the existing assignment.
+		existingDev.refCount++
+
+		log.G(ctx).WithFields(logrus.Fields{
+			"deviceInstanceID":     key.deviceInstanceID,
+			"virtualFunctionIndex": key.virtualFunctionIndex,
+			"refCount":             existingDev.refCount,
+		}).Debug("vPCI device already assigned, reusing existing assignment")
+
+		return nil
+	}
+
+	// The GUID is new — check whether the same device key is already assigned
+	// under a different GUID. This means the caller provided an inconsistent GUID.
+	if existingGUID, ok := m.keyToGUID[key]; ok {
+		return fmt.Errorf(
+			"vpci device (instanceID=%s, vfIndex=%d) is already assigned with vmBusGUID %s, but caller provided %s",
+			key.deviceInstanceID, key.virtualFunctionIndex,
+			existingGUID, opts.VMBusGUID,
+		)
+	}
+
+	// The device is not attached to the VM yet: log the request, then build the
+	// VirtualPciDevice settings for the HCS call.
+	log.G(ctx).WithFields(logrus.Fields{
+		"deviceInstanceID":     key.deviceInstanceID,
+		"virtualFunctionIndex": key.virtualFunctionIndex,
+	}).Debug("assigning vPCI device to VM")
+
+	// NUMA affinity is always propagated for assigned devices.
+	// This feature is available on WS2025 and later.
+	// Since the V2 shims only support WS2025 and later, this is set as true.
+	propagateAffinity := true
+
+	settings := hcsschema.VirtualPciDevice{
+		Functions: []hcsschema.VirtualPciFunction{
+			{
+				DeviceInstancePath: opts.DeviceInstanceID,
+				VirtualFunction:    opts.VirtualFunctionIndex,
+			},
+		},
+		PropagateNumaAffinity: &propagateAffinity,
+	}
+
+	// Host-side: add the vPCI device to the VM.
+	if err := m.vmVPCI.AddDevice(ctx, opts.VMBusGUID, settings); err != nil {
+		return fmt.Errorf("add vpci device %s to vm: %w", opts.DeviceInstanceID, err)
+	}
+
+	// Track early so RemoveFromVM can clean up even if the guest-side call fails.
+	dev := &deviceInfo{
+		key:       key,
+		vmBusGUID: opts.VMBusGUID,
+		refCount:  1,
+	}
+	m.devices[opts.VMBusGUID] = dev
+	m.keyToGUID[key] = opts.VMBusGUID
+
+	// Guest-side: device attach notification.
+	if err := m.waitGuestDeviceReady(ctx, opts.VMBusGUID); err != nil {
+		// Mark the device as invalid so the caller can call RemoveFromVM
+		// to clean up the host-side assignment.
+		dev.invalid = true
+		log.G(ctx).WithError(err).Error("guest-side vpci device setup failed, device marked invalid")
+		return fmt.Errorf("add guest vpci device with vmBusGUID %s to vm: %w", opts.VMBusGUID, err)
+	}
+
+	log.G(ctx).Info("vPCI device assigned to VM")
+
+	return nil
+}
+
+// RemoveFromVM releases one reference to the vPCI device identified by vmBusGUID and removes
+// the device from the VM once no references remain. It returns an error if the GUID is not
+// tracked; if the host-side removal fails, the reference is restored so the call can be retried.
+func (m *Manager) RemoveFromVM(ctx context.Context, vmBusGUID string) error {
+	ctx, _ = log.WithContext(ctx, logrus.WithField("vmBusGUID", vmBusGUID))
+
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	dev, ok := m.devices[vmBusGUID]
+	if !ok {
+		return fmt.Errorf("no vpci device with vmBusGUID %s is assigned to the vm", vmBusGUID)
+	}
+
+	dev.refCount--
+	if dev.refCount > 0 {
+		log.G(ctx).WithField("refCount", dev.refCount).Debug("vPCI device still in use, decremented ref count")
+		return nil
+	}
+
+	// refCount reached zero: either the last sharer released the device, or it was marked
+	// invalid during AddToVM (invalid devices never gain extra references).
+
+	log.G(ctx).Debug("removing vPCI device from VM")
+
+	// Host-side: remove the vPCI device from the VM.
+	if err := m.vmVPCI.RemoveDevice(ctx, vmBusGUID); err != nil {
+		// Restore the ref count so the device stays tracked and removal can be retried.
+		dev.refCount++
+		return fmt.Errorf("remove vpci device %s from vm: %w", vmBusGUID, err)
+	}
+
+	delete(m.devices, vmBusGUID)
+	delete(m.keyToGUID, dev.key)
+
+	log.G(ctx).Info("vPCI device removed from VM")
+
+	return nil
+}
diff --git a/internal/controller/device/vpci/vpci_lcow.go b/internal/controller/device/vpci/vpci_lcow.go
new file mode 100644
index 0000000000..f2391e91ee
--- /dev/null
+++ b/internal/controller/device/vpci/vpci_lcow.go
@@ -0,0 +1,17 @@
+//go:build windows && !wcow
+
+package vpci
+
+import (
+	"context"
+
+	"github.com/Microsoft/hcsshim/internal/protocol/guestresource"
+)
+
+// waitGuestDeviceReady notifies the guest about the new device and blocks until
+// the required sysfs/device paths are available before workloads use them.
+func (m *Manager) waitGuestDeviceReady(ctx context.Context, vmBusGUID string) error {
+	return m.linuxGuestVPCI.AddVPCIDevice(ctx, guestresource.LCOWMappedVPCIDevice{
+		VMBusGUID: vmBusGUID,
+	})
+}
diff --git a/internal/controller/device/vpci/vpci_wcow.go b/internal/controller/device/vpci/vpci_wcow.go
new file mode 100644
index 0000000000..96a72516ed
--- /dev/null
+++ b/internal/controller/device/vpci/vpci_wcow.go
@@ -0,0 +1,11 @@
+//go:build windows && wcow
+
+package vpci
+
+import "context"
+
+// waitGuestDeviceReady is a no-op for Windows guests. WCOW does not require a
+// guest-side notification as part of vPCI device assignment.
+func (m *Manager) waitGuestDeviceReady(_ context.Context, _ string) error { + return nil +} diff --git a/internal/devices/assigned_devices.go b/internal/devices/assigned_devices.go index 97db77e0d6..3b751ad57a 100644 --- a/internal/devices/assigned_devices.go +++ b/internal/devices/assigned_devices.go @@ -10,6 +10,7 @@ import ( "strconv" "github.com/Microsoft/hcsshim/internal/cmd" + vpciCtrl "github.com/Microsoft/hcsshim/internal/controller/device/vpci" "github.com/Microsoft/hcsshim/internal/log" "github.com/Microsoft/hcsshim/internal/uvm" "github.com/pkg/errors" @@ -45,7 +46,7 @@ func AddDevice(ctx context.Context, vm *uvm.UtilityVM, idType, deviceID string, if err != nil { return vpci, nil, errors.Wrapf(err, "failed to assign device %s of type %s to pod %s", deviceID, idType, vm.ID()) } - vmBusInstanceID := vm.GetAssignedDeviceVMBUSInstanceID(vpci.VMBusGUID) + vmBusInstanceID := vpciCtrl.GetAssignedDeviceVMBUSInstanceID(vpci.VMBusGUID) log.G(ctx).WithField("vmbus id", vmBusInstanceID).Info("vmbus instance ID") locationPaths, err = getChildrenDeviceLocationPaths(ctx, vm, vmBusInstanceID, deviceUtilPath) diff --git a/internal/uvm/virtual_device.go b/internal/uvm/virtual_device.go index 7a0518f926..1fc4078397 100644 --- a/internal/uvm/virtual_device.go +++ b/internal/uvm/virtual_device.go @@ -24,10 +24,6 @@ const ( VPCIDeviceIDType = "vpci-instance-id" ) -// this is the well known channel type GUID defined by VMBUS for all assigned devices -const vmbusChannelTypeGUIDFormatted = "{44c4f61d-4444-4400-9d52-802e27ede19f}" -const assignedDeviceEnumerator = "VMBUS" - type VPCIDeviceID struct { deviceInstanceID string virtualFunctionIndex uint16 @@ -55,23 +51,6 @@ type VPCIDevice struct { refCount uint32 } -// GetAssignedDeviceVMBUSInstanceID returns the instance ID of the VMBUS channel device node created. -// -// When a device is assigned to a UVM via VPCI support in HCS, a new VMBUS channel device node is -// created in the UVM. 
The actual device that was assigned in is exposed as a child on this VMBUS -// channel device node. -// -// A device node's instance ID is an identifier that distinguishes that device from other devices -// on the system. The GUID of a VMBUS channel device node refers to that channel's unique -// identifier used internally by VMBUS and can be used to determine the VMBUS channel -// device node's instance ID. -// -// A VMBUS channel device node's instance ID is in the form: -// "VMBUS\vmbusChannelTypeGUIDFormatted\{vmBusChannelGUID}" -func (uvm *UtilityVM) GetAssignedDeviceVMBUSInstanceID(vmBusChannelGUID string) string { - return fmt.Sprintf("%s\\%s\\{%s}", assignedDeviceEnumerator, vmbusChannelTypeGUIDFormatted, vmBusChannelGUID) -} - // Release frees the resources of the corresponding vpci device func (vpci *VPCIDevice) Release(ctx context.Context) error { if err := vpci.vm.RemoveDevice(ctx, vpci.deviceInstanceID, vpci.virtualFunctionIndex); err != nil { diff --git a/internal/vm/guestmanager/device_lcow.go b/internal/vm/guestmanager/device_lcow.go index c6cece1fa0..61524a0ccc 100644 --- a/internal/vm/guestmanager/device_lcow.go +++ b/internal/vm/guestmanager/device_lcow.go @@ -11,18 +11,6 @@ import ( "github.com/Microsoft/hcsshim/internal/protocol/guestresource" ) -// LCOWDeviceManager exposes VPCI and VPMem device operations in the LCOW guest. -type LCOWDeviceManager interface { - // AddVPCIDevice adds a VPCI device to the guest. - AddVPCIDevice(ctx context.Context, settings guestresource.LCOWMappedVPCIDevice) error - // AddVPMemDevice adds a VPMem device to the guest. - AddVPMemDevice(ctx context.Context, settings guestresource.LCOWMappedVPMemDevice) error - // RemoveVPMemDevice removes a VPMem device from the guest. - RemoveVPMemDevice(ctx context.Context, settings guestresource.LCOWMappedVPMemDevice) error -} - -var _ LCOWDeviceManager = (*Guest)(nil) - // AddVPCIDevice adds a VPCI device in the guest. 
func (gm *Guest) AddVPCIDevice(ctx context.Context, settings guestresource.LCOWMappedVPCIDevice) error { request := &hcsschema.ModifySettingRequest{ diff --git a/internal/vm/vmmanager/pci.go b/internal/vm/vmmanager/pci.go index fbb72c4c04..fe5fc291e2 100644 --- a/internal/vm/vmmanager/pci.go +++ b/internal/vm/vmmanager/pci.go @@ -11,17 +11,6 @@ import ( "github.com/Microsoft/hcsshim/internal/protocol/guestrequest" ) -// PCIManager manages assiging pci devices to a Utility VM. This is Windows specific at the moment. -type PCIManager interface { - // AddDevice adds a pci device identified by `vmbusGUID` to the Utility VM with the provided settings. - AddDevice(ctx context.Context, vmbusGUID string, settings hcsschema.VirtualPciDevice) error - - // RemoveDevice removes the pci device identified by `vmbusGUID` from the Utility VM. - RemoveDevice(ctx context.Context, vmbusGUID string) error -} - -var _ PCIManager = (*UtilityVM)(nil) - func (uvm *UtilityVM) AddDevice(ctx context.Context, vmbusGUID string, settings hcsschema.VirtualPciDevice) error { request := &hcsschema.ModifySettingRequest{ ResourcePath: fmt.Sprintf(resourcepaths.VirtualPCIResourceFormat, vmbusGUID),