From 5fe6f7a9765174920802536c4cf787cbd1e2c5d1 Mon Sep 17 00:00:00 2001 From: Niclas Schad Date: Thu, 7 May 2026 14:04:59 +0200 Subject: [PATCH 1/5] Add support for MutableCSINodeAllocatableCount The CSI driver lists all PCIe devices that are not of type VIRTIO_BLOCK_DEVICE and subtracts them from the theoretical maximum, so that Kubernetes can report a correct, dynamic maximum number of volumes that can be attached to each node. Signed-off-by: Niclas Schad --- pkg/csi/blockstorage/controllerserver.go | 4 ++ pkg/csi/blockstorage/nodeserver.go | 12 +++- pkg/csi/blockstorage/utils.go | 2 +- pkg/csi/blockstorage/utils_test.go | 10 +-- pkg/csi/util/mount/mount_darwin.go | 5 ++ pkg/csi/util/mount/mount_linux.go | 80 ++++++++++++++++++++++++ pkg/stackit/stackiterrors/errors.go | 14 ++++- 7 files changed, 118 insertions(+), 9 deletions(-) diff --git a/pkg/csi/blockstorage/controllerserver.go b/pkg/csi/blockstorage/controllerserver.go index 8de6237e..ef962edd 100644 --- a/pkg/csi/blockstorage/controllerserver.go +++ b/pkg/csi/blockstorage/controllerserver.go @@ -370,6 +370,10 @@ func (cs *controllerServer) ControllerPublishVolume(ctx context.Context, req *cs _, err = cloud.AttachVolume(ctx, instanceID, volumeID) if err != nil { + // Triggers an immediate `NodeGetInfo` RPC call when MutableCSINodeAllocatableCount is enabled + if stackiterrors.IsTooManyDevicesError(err) { + return nil, status.Errorf(codes.ResourceExhausted, "[ControllerPublishVolume] Node can't accept any more volumes %v. 
All PCIe lanes are exhausted!", err) + } klog.Errorf("Failed to AttachVolume: %v", err) return nil, status.Errorf(codes.Internal, "[ControllerPublishVolume] Attach Volume failed with error %v", err) } diff --git a/pkg/csi/blockstorage/nodeserver.go b/pkg/csi/blockstorage/nodeserver.go index 648e5df3..0b390d2b 100644 --- a/pkg/csi/blockstorage/nodeserver.go +++ b/pkg/csi/blockstorage/nodeserver.go @@ -308,8 +308,16 @@ func (ns *nodeServer) NodeGetInfo(ctx context.Context, _ *csi.NodeGetInfoRequest } maxVolumesPerNode := DetermineMaxVolumesByFlavor(flavor) - // Subtract 1 for root disk and another for configDrive/spare - maxVolumesPerNode -= 2 + + // Subtract already mounted Volumes + emptyPCIeRootPorts, err := mount.CountNonVirtioBlockDevices() + if err != nil { + klog.Errorf("[NodeGetInfo] unable to retrieve PCIe root ports %v", err) + emptyPCIeRootPorts = 0 + } + + maxVolumesPerNode -= emptyPCIeRootPorts + klog.V(4).Infof("Determined %d PCIe ports occupied by non virtio block devices", emptyPCIeRootPorts) klog.V(4).Infof("Determined node to support %d volumes", maxVolumesPerNode) nodeInfo := &csi.NodeGetInfoResponse{ diff --git a/pkg/csi/blockstorage/utils.go b/pkg/csi/blockstorage/utils.go index aaafc864..eacb77f7 100644 --- a/pkg/csi/blockstorage/utils.go +++ b/pkg/csi/blockstorage/utils.go @@ -97,7 +97,7 @@ func DetermineMaxVolumesByFlavor(flavor string) int64 { return 159 default: // All other flavors can mount 28 volumes - return 25 + return 28 } } diff --git a/pkg/csi/blockstorage/utils_test.go b/pkg/csi/blockstorage/utils_test.go index f9261de4..9d505950 100644 --- a/pkg/csi/blockstorage/utils_test.go +++ b/pkg/csi/blockstorage/utils_test.go @@ -12,14 +12,14 @@ var _ = Describe("Util Test", func() { maxVolumes := DetermineMaxVolumesByFlavor(flavor) Expect(maxVolumes).To(Equal(int64(expectedMaxVolumes))) }, - Entry("Intel 3rd Gen", "c3i.2", 25), - Entry("Intel 2rd Gen", "c2i.2", 25), - Entry("Intel 1st Gen", "c1.2", 25), - Entry("AMD 1st Gen without 
overprovisioning", "s1a.8d", 25), + Entry("Intel 3rd Gen", "c3i.2", 28), + Entry("Intel 2rd Gen", "c2i.2", 28), + Entry("Intel 1st Gen", "c1.2", 28), + Entry("AMD 1st Gen without overprovisioning", "s1a.8d", 28), Entry("AMD 2nd Gen without overprovisioning", "s2a.8d", 159), Entry("Nvidia GPU", "n2.14d.g1", 10), Entry("Nvidia GPU", "n2.56d.g4", 10), - Entry("ARM Gen1Link without CPU-overprovisioning ARM Gen1", "g1r.4d", 25), + Entry("ARM Gen1Link without CPU-overprovisioning ARM Gen1", "g1r.4d", 28), ) }) }) diff --git a/pkg/csi/util/mount/mount_darwin.go b/pkg/csi/util/mount/mount_darwin.go index 122f4c1c..389fd6cb 100644 --- a/pkg/csi/util/mount/mount_darwin.go +++ b/pkg/csi/util/mount/mount_darwin.go @@ -17,3 +17,8 @@ func newDeviceStats(statfs *unix.Statfs_t) *DeviceStats { UsedInodes: int64(statfs.Files) - int64(statfs.Ffree), } } + +func CountNonVirtioBlockDevices() (int64, error) { + // not implemented + return 0, nil +} diff --git a/pkg/csi/util/mount/mount_linux.go b/pkg/csi/util/mount/mount_linux.go index b525b753..f8925708 100644 --- a/pkg/csi/util/mount/mount_linux.go +++ b/pkg/csi/util/mount/mount_linux.go @@ -4,6 +4,15 @@ package mount import "golang.org/x/sys/unix" +var ( + pciAddressRegex = regexp.MustCompile(`^[0-9a-fA-F]{4}:[0-9a-fA-F]{2}:[0-9a-fA-F]{2}\.[0-9a-fA-F]$`) +) + +const ( + RedhatVendor = "0x1af4" + VirtioBlockDevice = "0x1042" +) + func newDeviceStats(statfs *unix.Statfs_t) *DeviceStats { return &DeviceStats{ Block: false, @@ -17,3 +26,74 @@ func newDeviceStats(statfs *unix.Statfs_t) *DeviceStats { UsedInodes: int64(statfs.Files) - int64(statfs.Ffree), } } + +// CountNonVirtioBlockDevices returns the number of PCIe Root ports who +// are currently occupied by anything else than an VIRTIO 1.0 Block Device +// returns zero when something went wrong +func CountNonVirtioBlockDevices() (int64, error) { + const pciPath = "/sys/bus/pci/devices" + + // Get all PCI devices + devices, err := os.ReadDir(pciPath) + if err != nil { + return 0, 
fmt.Errorf("failed to read PCI bus: %w", err) + } + + pcieSlotsOccupiedByNonBlockDevice := 0 + + for _, dev := range devices { + devPath := filepath.Join(pciPath, dev.Name()) + + // 1. Identify if it's a Root Port / Bridge + // We check the 'class' file. PCI Bridge class code starts with 0x0604 + classBuf, err := os.ReadFile(filepath.Join(devPath, "class")) + if err != nil { + klog.Errorf("failed to read PCI device class %s : %v", devPath, err) + continue + } + class := strings.TrimSpace(string(classBuf)) + + // Class 0x060400 is a PCI-to-PCI bridge (standard for Root Ports) + if strings.HasPrefix(class, "0x0604") { + // 2. Check if the port has downstream devices + // If the bridge has children, they appear as subdirectories + // matching the PCI address format (e.g., 0000:01:00.0) + files, err2 := os.ReadDir(devPath) + if err2 != nil { + klog.Errorf("failed to read dir %s : %v", devPath, err2) + } + for _, file := range files { + // Ignore PCI bus directories such as pci001 pci002 and pci010 + // Devices must follow format + if pciAddressRegex.MatchString(file.Name()) { + isNonBlockDevice := IsNonBlockDevice(devPath, file) + if isNonBlockDevice { + pcieSlotsOccupiedByNonBlockDevice++ + } + break + } + } + } else { + klog.V(4).Infof("skipping class %s: path: %s", class, devPath) + } + } + + return int64(pcieSlotsOccupiedByNonBlockDevice), nil +} + +func IsNonBlockDevice(devPath string, file os.DirEntry) bool { + var isNonBlockDevice bool + pciDevicePath := filepath.Join(devPath, file.Name()) + vendorBuf, err := os.ReadFile(filepath.Join(pciDevicePath, "vendor")) + if err != nil { + klog.Errorf("failed to read PCI device vendor %s : %v", pciDevicePath, err) + } + deviceBuf, err := os.ReadFile(filepath.Join(pciDevicePath, "device")) + if err != nil { + klog.Errorf("failed to read PCI device file %s : %v", pciDevicePath, err) + } + if strings.TrimSpace(string(vendorBuf)) == RedhatVendor && strings.TrimSpace(string(deviceBuf)) != VirtioBlockDevice { + isNonBlockDevice 
= true + } + return isNonBlockDevice +} diff --git a/pkg/stackit/stackiterrors/errors.go b/pkg/stackit/stackiterrors/errors.go index ae19b7d7..0e37be49 100644 --- a/pkg/stackit/stackiterrors/errors.go +++ b/pkg/stackit/stackiterrors/errors.go @@ -4,9 +4,10 @@ import ( "errors" "fmt" "net/http" + "strings" oapiError "github.com/stackitcloud/stackit-sdk-go/core/oapierror" - wait "github.com/stackitcloud/stackit-sdk-go/services/iaas/v2api/wait" + "github.com/stackitcloud/stackit-sdk-go/services/iaas/v2api/wait" ) var ErrNotFound = errors.New("failed to find object") @@ -20,6 +21,17 @@ func IsNotFound(err error) bool { return oAPIError.StatusCode == http.StatusNotFound } +func IsTooManyDevicesError(err error) bool { + var oAPIError *oapiError.GenericOpenAPIError + if ok := errors.As(err, &oAPIError); !ok { + return false + } + + // TODO: Improve this if possible + return oAPIError.StatusCode == http.StatusForbidden && + strings.Contains(oAPIError.ErrorMessage, "maximum allowed number of disk devices") +} + func IgnoreNotFound(err error) error { if IsNotFound(err) { return nil From 9072ccda0ca474966fb3b1926517b0982bfe4aef Mon Sep 17 00:00:00 2001 From: Niclas Schad Date: Fri, 15 May 2026 14:39:28 +0200 Subject: [PATCH 2/5] fix imports for linux Signed-off-by: Niclas Schad --- pkg/csi/util/mount/mount_linux.go | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/pkg/csi/util/mount/mount_linux.go b/pkg/csi/util/mount/mount_linux.go index f8925708..c6259b47 100644 --- a/pkg/csi/util/mount/mount_linux.go +++ b/pkg/csi/util/mount/mount_linux.go @@ -2,7 +2,16 @@ package mount -import "golang.org/x/sys/unix" +import ( + "fmt" + "os" + "path/filepath" + "regexp" + "strings" + + "golang.org/x/sys/unix" + "k8s.io/klog/v2" +) var ( pciAddressRegex = regexp.MustCompile(`^[0-9a-fA-F]{4}:[0-9a-fA-F]{2}:[0-9a-fA-F]{2}\.[0-9a-fA-F]$`) From 517a7171ac45810140169cdc3d5da91089075b26 Mon Sep 17 00:00:00 2001 From: Niclas Schad Date: Tue, 19 May 2026 13:54:12 +0200 
Subject: [PATCH 3/5] subtract one from maxVolumes for root partition Signed-off-by: Niclas Schad --- pkg/csi/blockstorage/nodeserver.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pkg/csi/blockstorage/nodeserver.go b/pkg/csi/blockstorage/nodeserver.go index 0b390d2b..44e5c450 100644 --- a/pkg/csi/blockstorage/nodeserver.go +++ b/pkg/csi/blockstorage/nodeserver.go @@ -320,6 +320,9 @@ func (ns *nodeServer) NodeGetInfo(ctx context.Context, _ *csi.NodeGetInfoRequest klog.V(4).Infof("Determined %d PCIe ports occupied by non virtio block devices", emptyPCIeRootPorts) klog.V(4).Infof("Determined node to support %d volumes", maxVolumesPerNode) + // always subtract one for every SKE node, because they always have a root partition + maxVolumesPerNode -= 1 + nodeInfo := &csi.NodeGetInfoResponse{ NodeId: nodeID, MaxVolumesPerNode: maxVolumesPerNode, From 2dba8e703120f7899a9bb68f5caf9be987e881b6 Mon Sep 17 00:00:00 2001 From: Niclas Schad Date: Mon, 1 Jun 2026 11:57:50 +0200 Subject: [PATCH 4/5] WIP Signed-off-by: Niclas Schad --- pkg/csi/blockstorage/nodeserver.go | 22 ++++++------ pkg/csi/blockstorage/utils.go | 4 +-- pkg/csi/util/mount/mount_darwin.go | 6 +++- pkg/csi/util/mount/mount_linux.go | 57 +++++++++++++++++++++--------- 4 files changed, 58 insertions(+), 31 deletions(-) diff --git a/pkg/csi/blockstorage/nodeserver.go b/pkg/csi/blockstorage/nodeserver.go index 44e5c450..9ef5c3ab 100644 --- a/pkg/csi/blockstorage/nodeserver.go +++ b/pkg/csi/blockstorage/nodeserver.go @@ -302,26 +302,24 @@ func (ns *nodeServer) NodeGetInfo(ctx context.Context, _ *csi.NodeGetInfoRequest return nil, status.Errorf(codes.Internal, "[NodeGetInfo] unable to retrieve instance id of node %v", err) } - flavor, err := ns.Metadata.GetFlavor(ctx) - if err != nil { - return nil, status.Errorf(codes.Internal, "[NodeGetInfo] unable to retrieve flavor of node %v", err) - } - - maxVolumesPerNode := DetermineMaxVolumesByFlavor(flavor) + //flavor, err := ns.Metadata.GetFlavor(ctx) + //if err != 
nil { + // return nil, status.Errorf(codes.Internal, "[NodeGetInfo] unable to retrieve flavor of node %v", err) + //} // Subtract already mounted Volumes - emptyPCIeRootPorts, err := mount.CountNonVirtioBlockDevices() + emptyPCIeRootPorts, err := mount.CountFreePCIeSlots() if err != nil { klog.Errorf("[NodeGetInfo] unable to retrieve PCIe root ports %v", err) emptyPCIeRootPorts = 0 } - maxVolumesPerNode -= emptyPCIeRootPorts - klog.V(4).Infof("Determined %d PCIe ports occupied by non virtio block devices", emptyPCIeRootPorts) - klog.V(4).Infof("Determined node to support %d volumes", maxVolumesPerNode) + vols, err := mount.CountLocalCSIVolumes(driverName) + if err != nil { + klog.Errorf("[NodeGetInfo] unable to retrieve volume count %v", err) + } - // always subtract one for every SKE node, because they always have a root partition - maxVolumesPerNode -= 1 + maxVolumesPerNode := emptyPCIeRootPorts + vols nodeInfo := &csi.NodeGetInfoResponse{ NodeId: nodeID, diff --git a/pkg/csi/blockstorage/utils.go b/pkg/csi/blockstorage/utils.go index eacb77f7..a14cdafb 100644 --- a/pkg/csi/blockstorage/utils.go +++ b/pkg/csi/blockstorage/utils.go @@ -90,8 +90,8 @@ func DetermineMaxVolumesByFlavor(flavor string) int64 { // The following numbers were specified by the IaaS team. They are based on actual tests. 
switch { case strings.HasPrefix(flavor, "n"): - // Flavors starting with 'n' are nvidia GPU flavors, all GPU VM's can only mount 10 volumes - return 10 + // Flavors starting with 'n' are nvidia GPU flavors + return 13 case strings.HasSuffix(flavorParts[0], "2a"): // AMD 2nd Gen return 159 diff --git a/pkg/csi/util/mount/mount_darwin.go b/pkg/csi/util/mount/mount_darwin.go index 389fd6cb..07dcba11 100644 --- a/pkg/csi/util/mount/mount_darwin.go +++ b/pkg/csi/util/mount/mount_darwin.go @@ -18,7 +18,11 @@ func newDeviceStats(statfs *unix.Statfs_t) *DeviceStats { } } -func CountNonVirtioBlockDevices() (int64, error) { +func CountLocalCSIVolumes(_ string) (int64, error) { // not implemented return 0, nil } + +func CountFreePCIeSlots() (int64, error) { + return 0, nil +} diff --git a/pkg/csi/util/mount/mount_linux.go b/pkg/csi/util/mount/mount_linux.go index c6259b47..efa5a6bd 100644 --- a/pkg/csi/util/mount/mount_linux.go +++ b/pkg/csi/util/mount/mount_linux.go @@ -7,6 +7,7 @@ import ( "os" "path/filepath" "regexp" + "slices" "strings" "golang.org/x/sys/unix" @@ -36,10 +37,9 @@ func newDeviceStats(statfs *unix.Statfs_t) *DeviceStats { } } -// CountNonVirtioBlockDevices returns the number of PCIe Root ports who -// are currently occupied by anything else than an VIRTIO 1.0 Block Device -// returns zero when something went wrong -func CountNonVirtioBlockDevices() (int64, error) { +// CountFreePCIeSlots returns the number of PCIe Root ports who +// are currently not occupied by anything. 
+func CountFreePCIeSlots() (int64, error) { const pciPath = "/sys/bus/pci/devices" // Get all PCI devices @@ -48,7 +48,7 @@ func CountNonVirtioBlockDevices() (int64, error) { return 0, fmt.Errorf("failed to read PCI bus: %w", err) } - pcieSlotsOccupiedByNonBlockDevice := 0 + freePCIeSlots := 0 for _, dev := range devices { devPath := filepath.Join(pciPath, dev.Name()) @@ -71,23 +71,48 @@ func CountNonVirtioBlockDevices() (int64, error) { if err2 != nil { klog.Errorf("failed to read dir %s : %v", devPath, err2) } - for _, file := range files { - // Ignore PCI bus directories such as pci001 pci002 and pci010 - // Devices must follow format - if pciAddressRegex.MatchString(file.Name()) { - isNonBlockDevice := IsNonBlockDevice(devPath, file) - if isNonBlockDevice { - pcieSlotsOccupiedByNonBlockDevice++ - } - break - } + hasDownStreamFolder := slices.ContainsFunc(files, func(s os.DirEntry) bool { + return pciAddressRegex.MatchString(s.Name()) + }) + if !hasDownStreamFolder { + freePCIeSlots += 1 } } else { klog.V(4).Infof("skipping class %s: path: %s", class, devPath) } } - return int64(pcieSlotsOccupiedByNonBlockDevice), nil + return int64(freePCIeSlots), nil +} + +// CountLocalCSIVolumes tries to count how many volumes are mounted for a given driverName. 
+func CountLocalCSIVolumes(driverName string) (int64, error) { + const kubeletDir = "/var/lib/kubelet" + volumeCount := 0 + // The path where Kubelet mounts global tracking directories for a specific CSI driver + targetDir := filepath.Join(kubeletDir, "plugins", "kubernetes.io", "csi", driverName) + + if _, err := os.Stat(targetDir); os.IsNotExist(err) { + return 0, nil + } else if err != nil { + return 0, fmt.Errorf("failed to check directory: %w", err) + } + + volumes, err := os.ReadDir(targetDir) + if err != nil { + return 0, fmt.Errorf("failed to read dir %s: %w", targetDir, err) + } + for _, vol := range volumes { + // Check if volume has a "globalmount" dir to determine if it's mounted correctly + globalMountPath := filepath.Join(vol.Name(), "globalmount") + if _, err := os.Stat(globalMountPath); os.IsNotExist(err) { + continue + } + + volumeCount++ + } + + return int64(volumeCount), nil } func IsNonBlockDevice(devPath string, file os.DirEntry) bool { From b158724614454b82de9695b3eb882513a0f244f1 Mon Sep 17 00:00:00 2001 From: Niclas Schad Date: Mon, 1 Jun 2026 15:38:30 +0200 Subject: [PATCH 5/5] parse Body instead of ErrorMessage field in IsTooManyDevicesError() Signed-off-by: Niclas Schad --- pkg/stackit/stackiterrors/errors.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/stackit/stackiterrors/errors.go b/pkg/stackit/stackiterrors/errors.go index 0e37be49..98b0f528 100644 --- a/pkg/stackit/stackiterrors/errors.go +++ b/pkg/stackit/stackiterrors/errors.go @@ -29,7 +29,7 @@ func IsTooManyDevicesError(err error) bool { // TODO: Improve this if possible return oAPIError.StatusCode == http.StatusForbidden && - strings.Contains(oAPIError.ErrorMessage, "maximum allowed number of disk devices") + strings.Contains(string(oAPIError.Body), "maximum allowed number of disk devices") } func IgnoreNotFound(err error) error {