From b972092252c2f25334b355891817ac688b2ee843 Mon Sep 17 00:00:00 2001 From: Olev Kartau Date: Fri, 23 Aug 2019 14:42:35 +0300 Subject: [PATCH 1/2] pmem-device-manager: cleaned alignment and size compensating code pmd-lvm.go: Delete old irrelevant comments about selection strategy. LVM uses byte-aligned size arg instead or MB. Check for zero size at upper level before calling either LVM or ndctl layer, as this is common concern for both. This lets us to treat size as non zero in both device managers. --- pkg/pmem-csi-driver/controllerserver-node.go | 7 ++++- pkg/pmem-device-manager/pmd-lvm.go | 33 ++++++++++---------- pkg/pmem-device-manager/pmd-ndctl.go | 25 +++++++-------- 3 files changed, 33 insertions(+), 32 deletions(-) diff --git a/pkg/pmem-csi-driver/controllerserver-node.go b/pkg/pmem-csi-driver/controllerserver-node.go index ade07929f1..22243adeca 100644 --- a/pkg/pmem-csi-driver/controllerserver-node.go +++ b/pkg/pmem-csi-driver/controllerserver-node.go @@ -206,7 +206,12 @@ func (cs *nodeControllerServer) CreateVolume(ctx context.Context, req *csi.Creat } }(volumeID) } - + if asked <= 0 { + // Allocating volumes of zero size isn't supported. + // We use some arbitrary small minimum size instead. + // It will get rounded up by below layer to meet the alignment. + asked = 1 + } if err := cs.dm.CreateDevice(volumeID, uint64(asked), nsmode); err != nil { return nil, status.Errorf(codes.Internal, "Node CreateVolume: failed to create volume: %s", err.Error()) } diff --git a/pkg/pmem-device-manager/pmd-lvm.go b/pkg/pmem-device-manager/pmd-lvm.go index c8881a45bb..e57debd34f 100644 --- a/pkg/pmem-device-manager/pmd-lvm.go +++ b/pkg/pmem-device-manager/pmd-lvm.go @@ -11,6 +11,11 @@ import ( "k8s.io/klog" ) +const ( + // 4 MB alignment is used by LVM + lvmAlign uint64 = 4 * 1024 * 1024 +) + type pmemLvm struct { volumeGroups []string devices map[string]PmemDeviceInfo @@ -94,30 +99,24 @@ func (lvm *pmemLvm) CreateDevice(name string, size uint64, nsmode string) error if err == nil { return fmt.Errorf("CreateDevice: Failed: namespace with that name '%s' exists", name) } - // pick a region, few possible strategies: - // 1. pick first with enough available space: simplest, regions get filled in order; - // 2. pick first with largest available space: regions get used round-robin, i.e. load-balanced, but does not leave large unused; - // 3. pick first with smallest available which satisfies the request: ordered initially, but later leaves bigger free available; - // Let's implement strategy 1 for now, simplest to code as no need to compare sizes in all regions - // NOTE: We walk buses and regions in ndctl context, but avail.size we check in LV context vgs, err := getVolumeGroups(lvm.volumeGroups, nsmode) if err != nil { return err } - // lvcreate takes size in MBytes if no unit. - // We use MBytes here to avoid problems with byte-granularity, as lvcreate - // may refuse to create some arbitrary sizes. - // Division by 1M should not result in smaller-than-asked here - // as lvcreate will round up to next 4MB boundary. - sizeM := int(size / (1024 * 1024)) - // Asked==zero means unspecified by CSI spec, we create a small 4 Mbyte volume - // as lvcreate does not allow zero size (csi-sanity creates zero-sized volumes) - if sizeM <= 0 { - sizeM = 4 + // Adjust up to next alignment boundary, if not aligned already. + // This logic relies on size guaranteed to be nonzero, + // which is now achieved by check in upper layer. + // If zero size possible then we would need to check and increment by lvmAlign, + // because LVM does not tolerate creation of zero size. + if reminder := size % lvmAlign; reminder != 0 { + klog.V(5).Infof("CreateDevice align size up by %v: from %v", lvmAlign-reminder, size) + size += lvmAlign - reminder + klog.V(5).Infof("CreateDevice align size up: to %v", size) } - strSz := strconv.Itoa(sizeM) + strSz := strconv.FormatUint(size, 10) + "B" for _, vg := range vgs { + // use first Vgroup with enough available space if vg.free >= size { // In some container environments clearing device fails with race condition. // So, we ask lvm not to clear(-Zn) the newly created device, instead we do ourself in later stage. diff --git a/pkg/pmem-device-manager/pmd-ndctl.go b/pkg/pmem-device-manager/pmd-ndctl.go index 8c5b8cdd8b..230ec81f9e 100644 --- a/pkg/pmem-device-manager/pmd-ndctl.go +++ b/pkg/pmem-device-manager/pmd-ndctl.go @@ -9,6 +9,12 @@ import ( "k8s.io/kubernetes/pkg/util/mount" ) +const ( + // 1 GB align in ndctl creation request has proven to be reliable. + // Newer kernels may allow smaller alignment but we do not want to introduce kernel depenency. + ndctlAlign uint64 = 1024 * 1024 * 1024 +) + type pmemNdctl struct { ctx *ndctl.Context } @@ -60,10 +66,9 @@ func (pmem *pmemNdctl) GetCapacity() (map[string]uint64, error) { Capacity := map[string]uint64{} nsmodes := []ndctl.NamespaceMode{ndctl.FsdaxMode, ndctl.SectorMode} var capacity uint64 - align := uint64(1024 * 1024 * 1024) for _, bus := range pmem.ctx.GetBuses() { for _, r := range bus.ActiveRegions() { - realalign := align * r.InterleaveWays() + realalign := ndctlAlign * r.InterleaveWays() available := r.MaxAvailableExtent() // align down, avoid claiming more than what we really can serve klog.V(4).Infof("GetCapacity: available before realalign: %d", available) @@ -97,26 +102,18 @@ func (pmem *pmemNdctl) CreateDevice(name string, size uint64, nsmode string) err klog.V(4).Infof("Device with name: %s already exists, refuse to create another", name) return fmt.Errorf("CreateDevice: Failed: namespace with that name exists") } - if size <= 0 { - // Allocating volumes of zero size isn't supported. - // We use some arbitrary small minimum size instead. - // It will get rounded up by libndctl to meet the alignment. - size = 1 - } - // Pass align = 1 GB into creation request as that has proven to be reliable. - const align uint64 = 1024 * 1024 * 1024 // libndctl needs to store meta data and will use some of the allocated // space for that (https://github.com/pmem/ndctl/issues/79). // We don't know exactly how much space that is, just // that it should be a small amount. But because libndctl // rounds up to the alignment, in practice that means we need // to request `align` additional bytes. - compensatedsize := size + align - klog.V(4).Infof("CreateDevice:%s: Compensate for libndctl creating one alignment step smaller: change size %d to %d", name, size, compensatedsize) + size += ndctlAlign + klog.V(4).Infof("Compensate for libndctl creating one alignment step smaller: increase size to %d", size) ns, err := pmem.ctx.CreateNamespace(ndctl.CreateNamespaceOpts{ Name: name, - Size: compensatedsize, - Align: align, + Size: size, + Align: ndctlAlign, Mode: ndctl.NamespaceMode(nsmode), }) if err != nil { From 9d5cd7c521cddfcc1a5f8c6612d72e5fc3a76dd6 Mon Sep 17 00:00:00 2001 From: Olev Kartau Date: Tue, 8 Oct 2019 13:03:21 +0300 Subject: [PATCH 2/2] Use VolumeId instead of "name" in Device Manager layer In CSI spec, Name and VolumeID are different things. In PMEM-CSI we use VolumeID for naming volumes and namespaces, so let's stick to the style and use VolumeId in code, otherwise it may be confusing why we suddenly switch back to "Name" when code gets down to device-manager level. --- pkg/pmem-csi-driver/controllerserver-node.go | 2 +- pkg/pmem-device-manager/pmd-lvm.go | 46 ++++++++++---------- pkg/pmem-device-manager/pmd-manager.go | 4 +- pkg/pmem-device-manager/pmd-ndctl.go | 36 +++++++-------- 4 files changed, 44 insertions(+), 44 deletions(-) diff --git a/pkg/pmem-csi-driver/controllerserver-node.go b/pkg/pmem-csi-driver/controllerserver-node.go index 22243adeca..03bb43a460 100644 --- a/pkg/pmem-csi-driver/controllerserver-node.go +++ b/pkg/pmem-csi-driver/controllerserver-node.go @@ -95,7 +95,7 @@ func NewNodeControllerServer(nodeID string, dm pmdmanager.PmemDeviceManager, sm err = sm.GetAll(v, func(id string) bool { // See if the device data stored at StateManager is still valid for _, devInfo := range devices { - if devInfo.Name == id { + if devInfo.VolumeId == id { ncs.pmemVolumes[id] = v.Copy() return true } diff --git a/pkg/pmem-device-manager/pmd-lvm.go b/pkg/pmem-device-manager/pmd-lvm.go index e57debd34f..30d53a337b 100644 --- a/pkg/pmem-device-manager/pmd-lvm.go +++ b/pkg/pmem-device-manager/pmd-lvm.go @@ -84,20 +84,20 @@ func (lvm *pmemLvm) GetCapacity() (map[string]uint64, error) { } // nsmode is expected to be either "fsdax" or "sector" -func (lvm *pmemLvm) CreateDevice(name string, size uint64, nsmode string) error { +func (lvm *pmemLvm) CreateDevice(volumeId string, size uint64, nsmode string) error { if nsmode != string(ndctl.FsdaxMode) && nsmode != string(ndctl.SectorMode) { return fmt.Errorf("Unknown nsmode(%v)", nsmode) } lvmMutex.Lock() defer lvmMutex.Unlock() - // Check that such name does not exist. In certain error states, for example when + // Check that such volume does not exist. In certain error states, for example when // namespace creation works but device zeroing fails (missing /dev/pmemX.Y in container), // this function is asked to create new devices repeatedly, forcing running out of space. // Avoid device filling with garbage entries by returning error. - // Overall, no point having more than one namespace with same name. - _, err := lvm.getDevice(name) + // Overall, no point having more than one namespace with same volumeId. + _, err := lvm.getDevice(volumeId) if err == nil { - return fmt.Errorf("CreateDevice: Failed: namespace with that name '%s' exists", name) + return fmt.Errorf("CreateDevice: Failed: volume with that name '%s' exists", volumeId) } vgs, err := getVolumeGroups(lvm.volumeGroups, nsmode) if err != nil { @@ -121,11 +121,11 @@ func (lvm *pmemLvm) CreateDevice(name string, size uint64, nsmode string) error // In some container environments clearing device fails with race condition. // So, we ask lvm not to clear(-Zn) the newly created device, instead we do ourself in later stage. // lvcreate takes size in MBytes if no unit - if _, err := pmemexec.RunCommand("lvcreate", "-Zn", "-L", strSz, "-n", name, vg.name); err != nil { + if _, err := pmemexec.RunCommand("lvcreate", "-Zn", "-L", strSz, "-n", volumeId, vg.name); err != nil { klog.V(3).Infof("lvcreate failed with error: %v, trying for next free region", err) } else { // clear start of device to avoid old data being recognized as file system - device, err := getUncachedDevice(name, vg.name) + device, err := getUncachedDevice(volumeId, vg.name) if err != nil { return err } @@ -138,7 +138,7 @@ func (lvm *pmemLvm) CreateDevice(name string, size uint64, nsmode string) error return err } - lvm.devices[device.Name] = device + lvm.devices[device.VolumeId] = device return nil } @@ -147,11 +147,11 @@ func (lvm *pmemLvm) CreateDevice(name string, size uint64, nsmode string) error return fmt.Errorf("No region is having enough space required(%v)", size) } -func (lvm *pmemLvm) DeleteDevice(name string, flush bool) error { +func (lvm *pmemLvm) DeleteDevice(volumeId string, flush bool) error { lvmMutex.Lock() defer lvmMutex.Unlock() - device, err := lvm.getDevice(name) + device, err := lvm.getDevice(volumeId) if err != nil { return err } @@ -164,16 +164,16 @@ func (lvm *pmemLvm) DeleteDevice(name string, flush bool) error { } // Remove device from cache - delete(lvm.devices, name) + delete(lvm.devices, volumeId) return nil } -func (lvm *pmemLvm) FlushDeviceData(name string) error { +func (lvm *pmemLvm) FlushDeviceData(volumeId string) error { lvmMutex.Lock() defer lvmMutex.Unlock() - device, err := lvm.getDevice(name) + device, err := lvm.getDevice(volumeId) if err != nil { return err } @@ -193,31 +193,31 @@ func (lvm *pmemLvm) ListDevices() ([]PmemDeviceInfo, error) { return devices, nil } -func (lvm *pmemLvm) GetDevice(id string) (PmemDeviceInfo, error) { +func (lvm *pmemLvm) GetDevice(volumeId string) (PmemDeviceInfo, error) { lvmMutex.Lock() defer lvmMutex.Unlock() - return lvm.getDevice(id) + return lvm.getDevice(volumeId) } -func (lvm *pmemLvm) getDevice(id string) (PmemDeviceInfo, error) { - if dev, ok := lvm.devices[id]; ok { +func (lvm *pmemLvm) getDevice(volumeId string) (PmemDeviceInfo, error) { + if dev, ok := lvm.devices[volumeId]; ok { return dev, nil } - return PmemDeviceInfo{}, fmt.Errorf("Device not found with name %s", id) + return PmemDeviceInfo{}, fmt.Errorf("Device not found with name %s", volumeId) } -func getUncachedDevice(id string, volumeGroup string) (PmemDeviceInfo, error) { +func getUncachedDevice(volumeId string, volumeGroup string) (PmemDeviceInfo, error) { devices, err := listDevices(volumeGroup) if err != nil { return PmemDeviceInfo{}, err } - if dev, ok := devices[id]; ok { + if dev, ok := devices[volumeId]; ok { return dev, nil } - return PmemDeviceInfo{}, fmt.Errorf("Device not found with name %s", id) + return PmemDeviceInfo{}, fmt.Errorf("Device not found with name %s", volumeId) } // listDevices Lists available logical devices in given volume groups @@ -245,11 +245,11 @@ func parseLVSOuput(output string) (map[string]PmemDeviceInfo, error) { } dev := PmemDeviceInfo{} - dev.Name = fields[0] + dev.VolumeId = fields[0] dev.Path = fields[1] dev.Size, _ = strconv.ParseUint(fields[2], 10, 64) - devices[dev.Name] = dev + devices[dev.VolumeId] = dev } return devices, nil diff --git a/pkg/pmem-device-manager/pmd-manager.go b/pkg/pmem-device-manager/pmd-manager.go index 210f9f8854..e3677fc1e1 100644 --- a/pkg/pmem-device-manager/pmd-manager.go +++ b/pkg/pmem-device-manager/pmd-manager.go @@ -2,8 +2,8 @@ package pmdmanager //PmemDeviceInfo represents a block device type PmemDeviceInfo struct { - //Name name of the block device - Name string + //VolumeId is name of the block device + VolumeId string //Path actual device path Path string //Size size allocated for block device diff --git a/pkg/pmem-device-manager/pmd-ndctl.go b/pkg/pmem-device-manager/pmd-ndctl.go index 230ec81f9e..88a3d35d9a 100644 --- a/pkg/pmem-device-manager/pmd-ndctl.go +++ b/pkg/pmem-device-manager/pmd-ndctl.go @@ -89,17 +89,17 @@ func (pmem *pmemNdctl) GetCapacity() (map[string]uint64, error) { return Capacity, nil } -func (pmem *pmemNdctl) CreateDevice(name string, size uint64, nsmode string) error { +func (pmem *pmemNdctl) CreateDevice(volumeId string, size uint64, nsmode string) error { ndctlMutex.Lock() defer ndctlMutex.Unlock() - // Check that such name does not exist. In certain error states, for example when + // Check that such volume does not exist. In certain error states, for example when // namespace creation works but device zeroing fails (missing /dev/pmemX.Y in container), // this function is asked to create new devices repeatedly, forcing running out of space. // Avoid device filling with garbage entries by returning error. // Overall, no point having more than one namespace with same name. - _, err := pmem.getDevice(name) + _, err := pmem.getDevice(volumeId) if err == nil { - klog.V(4).Infof("Device with name: %s already exists, refuse to create another", name) + klog.V(4).Infof("Device with name: %s already exists, refuse to create another", volumeId) return fmt.Errorf("CreateDevice: Failed: namespace with that name exists") } // libndctl needs to store meta data and will use some of the allocated @@ -111,7 +111,7 @@ func (pmem *pmemNdctl) CreateDevice(name string, size uint64, nsmode string) err size += ndctlAlign klog.V(4).Infof("Compensate for libndctl creating one alignment step smaller: increase size to %d", size) ns, err := pmem.ctx.CreateNamespace(ndctl.CreateNamespaceOpts{ - Name: name, + Name: volumeId, Size: size, Align: ndctlAlign, Mode: ndctl.NamespaceMode(nsmode), @@ -122,7 +122,7 @@ func (pmem *pmemNdctl) CreateDevice(name string, size uint64, nsmode string) err data, _ := ns.MarshalJSON() //nolint: gosec klog.V(3).Infof("Namespace created: %s", data) // clear start of device to avoid old data being recognized as file system - device, err := pmem.getDevice(name) + device, err := pmem.getDevice(volumeId) if err != nil { return err } @@ -134,11 +134,11 @@ func (pmem *pmemNdctl) CreateDevice(name string, size uint64, nsmode string) err return nil } -func (pmem *pmemNdctl) DeleteDevice(name string, flush bool) error { +func (pmem *pmemNdctl) DeleteDevice(volumeId string, flush bool) error { ndctlMutex.Lock() defer ndctlMutex.Unlock() - device, err := pmem.getDevice(name) + device, err := pmem.getDevice(volumeId) if err != nil { return err } @@ -146,25 +146,25 @@ func (pmem *pmemNdctl) DeleteDevice(name string, flush bool) error { if err != nil { return err } - return pmem.ctx.DestroyNamespaceByName(name) + return pmem.ctx.DestroyNamespaceByName(volumeId) } -func (pmem *pmemNdctl) FlushDeviceData(name string) error { +func (pmem *pmemNdctl) FlushDeviceData(volumeId string) error { ndctlMutex.Lock() defer ndctlMutex.Unlock() - device, err := pmem.getDevice(name) + device, err := pmem.getDevice(volumeId) if err != nil { return err } return ClearDevice(device, true) } -func (pmem *pmemNdctl) GetDevice(name string) (PmemDeviceInfo, error) { +func (pmem *pmemNdctl) GetDevice(volumeId string) (PmemDeviceInfo, error) { ndctlMutex.Lock() defer ndctlMutex.Unlock() - return pmem.getDevice(name) + return pmem.getDevice(volumeId) } func (pmem *pmemNdctl) ListDevices() ([]PmemDeviceInfo, error) { @@ -178,8 +178,8 @@ func (pmem *pmemNdctl) ListDevices() ([]PmemDeviceInfo, error) { return devices, nil } -func (pmem *pmemNdctl) getDevice(name string) (PmemDeviceInfo, error) { - ns, err := pmem.ctx.GetNamespaceByName(name) +func (pmem *pmemNdctl) getDevice(volumeId string) (PmemDeviceInfo, error) { + ns, err := pmem.ctx.GetNamespaceByName(volumeId) if err != nil { return PmemDeviceInfo{}, err } @@ -189,8 +189,8 @@ func (pmem *pmemNdctl) getDevice(name string) (PmemDeviceInfo, error) { func namespaceToPmemInfo(ns *ndctl.Namespace) PmemDeviceInfo { return PmemDeviceInfo{ - Name: ns.Name(), - Path: "/dev/" + ns.BlockDeviceName(), - Size: ns.Size(), + VolumeId: ns.Name(), + Path: "/dev/" + ns.BlockDeviceName(), + Size: ns.Size(), } }