Skip to content

Commit

Permalink
Merge pull request intel#391 from okartau/sizecalc
Browse files Browse the repository at this point in the history
cleaned alignment code, unified naming to use VolumeID more
  • Loading branch information
okartau authored Oct 22, 2019
2 parents 0230715 + 9d5cd7c commit 0332d08
Show file tree
Hide file tree
Showing 4 changed files with 77 additions and 76 deletions.
9 changes: 7 additions & 2 deletions pkg/pmem-csi-driver/controllerserver-node.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ func NewNodeControllerServer(nodeID string, dm pmdmanager.PmemDeviceManager, sm
err = sm.GetAll(v, func(id string) bool {
// See if the device data stored at StateManager is still valid
for _, devInfo := range devices {
if devInfo.Name == id {
if devInfo.VolumeId == id {
ncs.pmemVolumes[id] = v.Copy()
return true
}
Expand Down Expand Up @@ -206,7 +206,12 @@ func (cs *nodeControllerServer) CreateVolume(ctx context.Context, req *csi.Creat
}
}(volumeID)
}

if asked <= 0 {
// Allocating volumes of zero size isn't supported.
// We use some arbitrary small minimum size instead.
// It will get rounded up by below layer to meet the alignment.
asked = 1
}
if err := cs.dm.CreateDevice(volumeID, uint64(asked), nsmode); err != nil {
return nil, status.Errorf(codes.Internal, "Node CreateVolume: failed to create volume: %s", err.Error())
}
Expand Down
79 changes: 39 additions & 40 deletions pkg/pmem-device-manager/pmd-lvm.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@ import (
"k8s.io/klog"
)

const (
// 4 MB alignment is used by LVM
lvmAlign uint64 = 4 * 1024 * 1024
)

type pmemLvm struct {
volumeGroups []string
devices map[string]PmemDeviceInfo
Expand Down Expand Up @@ -79,54 +84,48 @@ func (lvm *pmemLvm) GetCapacity() (map[string]uint64, error) {
}

// nsmode is expected to be either "fsdax" or "sector"
func (lvm *pmemLvm) CreateDevice(name string, size uint64, nsmode string) error {
func (lvm *pmemLvm) CreateDevice(volumeId string, size uint64, nsmode string) error {
if nsmode != string(ndctl.FsdaxMode) && nsmode != string(ndctl.SectorMode) {
return fmt.Errorf("Unknown nsmode(%v)", nsmode)
}
lvmMutex.Lock()
defer lvmMutex.Unlock()
// Check that such name does not exist. In certain error states, for example when
// Check that such volume does not exist. In certain error states, for example when
// namespace creation works but device zeroing fails (missing /dev/pmemX.Y in container),
// this function is asked to create new devices repeatedly, forcing running out of space.
// Avoid device filling with garbage entries by returning error.
// Overall, no point having more than one namespace with same name.
_, err := lvm.getDevice(name)
// Overall, no point having more than one namespace with same volumeId.
_, err := lvm.getDevice(volumeId)
if err == nil {
return fmt.Errorf("CreateDevice: Failed: namespace with that name '%s' exists", name)
return fmt.Errorf("CreateDevice: Failed: volume with that name '%s' exists", volumeId)
}
// pick a region, few possible strategies:
// 1. pick first with enough available space: simplest, regions get filled in order;
// 2. pick first with largest available space: regions get used round-robin, i.e. load-balanced, but does not leave large unused;
// 3. pick first with smallest available which satisfies the request: ordered initially, but later leaves bigger free available;
// Let's implement strategy 1 for now, simplest to code as no need to compare sizes in all regions
// NOTE: We walk buses and regions in ndctl context, but avail.size we check in LV context
vgs, err := getVolumeGroups(lvm.volumeGroups, nsmode)
if err != nil {
return err
}
// lvcreate takes size in MBytes if no unit.
// We use MBytes here to avoid problems with byte-granularity, as lvcreate
// may refuse to create some arbitrary sizes.
// Division by 1M should not result in smaller-than-asked here
// as lvcreate will round up to next 4MB boundary.
sizeM := int(size / (1024 * 1024))
// Asked==zero means unspecified by CSI spec, we create a small 4 Mbyte volume
// as lvcreate does not allow zero size (csi-sanity creates zero-sized volumes)
if sizeM <= 0 {
sizeM = 4
// Adjust up to next alignment boundary, if not aligned already.
// This logic relies on size guaranteed to be nonzero,
// which is now achieved by check in upper layer.
// If zero size possible then we would need to check and increment by lvmAlign,
// because LVM does not tolerate creation of zero size.
if reminder := size % lvmAlign; reminder != 0 {
klog.V(5).Infof("CreateDevice align size up by %v: from %v", lvmAlign-reminder, size)
size += lvmAlign - reminder
klog.V(5).Infof("CreateDevice align size up: to %v", size)
}
strSz := strconv.Itoa(sizeM)
strSz := strconv.FormatUint(size, 10) + "B"

for _, vg := range vgs {
// use first Vgroup with enough available space
if vg.free >= size {
// In some container environments clearing device fails with race condition.
// So, we ask lvm not to clear(-Zn) the newly created device, instead we do ourself in later stage.
// lvcreate takes size in MBytes if no unit
if _, err := pmemexec.RunCommand("lvcreate", "-Zn", "-L", strSz, "-n", name, vg.name); err != nil {
if _, err := pmemexec.RunCommand("lvcreate", "-Zn", "-L", strSz, "-n", volumeId, vg.name); err != nil {
klog.V(3).Infof("lvcreate failed with error: %v, trying for next free region", err)
} else {
// clear start of device to avoid old data being recognized as file system
device, err := getUncachedDevice(name, vg.name)
device, err := getUncachedDevice(volumeId, vg.name)
if err != nil {
return err
}
Expand All @@ -139,7 +138,7 @@ func (lvm *pmemLvm) CreateDevice(name string, size uint64, nsmode string) error
return err
}

lvm.devices[device.Name] = device
lvm.devices[device.VolumeId] = device

return nil
}
Expand All @@ -148,11 +147,11 @@ func (lvm *pmemLvm) CreateDevice(name string, size uint64, nsmode string) error
return fmt.Errorf("No region is having enough space required(%v)", size)
}

func (lvm *pmemLvm) DeleteDevice(name string, flush bool) error {
func (lvm *pmemLvm) DeleteDevice(volumeId string, flush bool) error {
lvmMutex.Lock()
defer lvmMutex.Unlock()

device, err := lvm.getDevice(name)
device, err := lvm.getDevice(volumeId)
if err != nil {
return err
}
Expand All @@ -165,16 +164,16 @@ func (lvm *pmemLvm) DeleteDevice(name string, flush bool) error {
}

// Remove device from cache
delete(lvm.devices, name)
delete(lvm.devices, volumeId)

return nil
}

func (lvm *pmemLvm) FlushDeviceData(name string) error {
func (lvm *pmemLvm) FlushDeviceData(volumeId string) error {
lvmMutex.Lock()
defer lvmMutex.Unlock()

device, err := lvm.getDevice(name)
device, err := lvm.getDevice(volumeId)
if err != nil {
return err
}
Expand All @@ -194,31 +193,31 @@ func (lvm *pmemLvm) ListDevices() ([]PmemDeviceInfo, error) {
return devices, nil
}

func (lvm *pmemLvm) GetDevice(id string) (PmemDeviceInfo, error) {
func (lvm *pmemLvm) GetDevice(volumeId string) (PmemDeviceInfo, error) {
lvmMutex.Lock()
defer lvmMutex.Unlock()

return lvm.getDevice(id)
return lvm.getDevice(volumeId)
}

func (lvm *pmemLvm) getDevice(id string) (PmemDeviceInfo, error) {
if dev, ok := lvm.devices[id]; ok {
func (lvm *pmemLvm) getDevice(volumeId string) (PmemDeviceInfo, error) {
if dev, ok := lvm.devices[volumeId]; ok {
return dev, nil
}

return PmemDeviceInfo{}, fmt.Errorf("Device not found with name %s", id)
return PmemDeviceInfo{}, fmt.Errorf("Device not found with name %s", volumeId)
}

func getUncachedDevice(id string, volumeGroup string) (PmemDeviceInfo, error) {
func getUncachedDevice(volumeId string, volumeGroup string) (PmemDeviceInfo, error) {
devices, err := listDevices(volumeGroup)
if err != nil {
return PmemDeviceInfo{}, err
}

if dev, ok := devices[id]; ok {
if dev, ok := devices[volumeId]; ok {
return dev, nil
}
return PmemDeviceInfo{}, fmt.Errorf("Device not found with name %s", id)
return PmemDeviceInfo{}, fmt.Errorf("Device not found with name %s", volumeId)
}

// listDevices Lists available logical devices in given volume groups
Expand Down Expand Up @@ -246,11 +245,11 @@ func parseLVSOuput(output string) (map[string]PmemDeviceInfo, error) {
}

dev := PmemDeviceInfo{}
dev.Name = fields[0]
dev.VolumeId = fields[0]
dev.Path = fields[1]
dev.Size, _ = strconv.ParseUint(fields[2], 10, 64)

devices[dev.Name] = dev
devices[dev.VolumeId] = dev
}

return devices, nil
Expand Down
4 changes: 2 additions & 2 deletions pkg/pmem-device-manager/pmd-manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ package pmdmanager

//PmemDeviceInfo represents a block device
type PmemDeviceInfo struct {
//Name name of the block device
Name string
//VolumeId is name of the block device
VolumeId string
//Path actual device path
Path string
//Size size allocated for block device
Expand Down
61 changes: 29 additions & 32 deletions pkg/pmem-device-manager/pmd-ndctl.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,12 @@ import (
"k8s.io/kubernetes/pkg/util/mount"
)

const (
// 1 GB align in ndctl creation request has proven to be reliable.
// Newer kernels may allow smaller alignment but we do not want to introduce kernel depenency.
ndctlAlign uint64 = 1024 * 1024 * 1024
)

type pmemNdctl struct {
ctx *ndctl.Context
}
Expand Down Expand Up @@ -60,10 +66,9 @@ func (pmem *pmemNdctl) GetCapacity() (map[string]uint64, error) {
Capacity := map[string]uint64{}
nsmodes := []ndctl.NamespaceMode{ndctl.FsdaxMode, ndctl.SectorMode}
var capacity uint64
align := uint64(1024 * 1024 * 1024)
for _, bus := range pmem.ctx.GetBuses() {
for _, r := range bus.ActiveRegions() {
realalign := align * r.InterleaveWays()
realalign := ndctlAlign * r.InterleaveWays()
available := r.MaxAvailableExtent()
// align down, avoid claiming more than what we really can serve
klog.V(4).Infof("GetCapacity: available before realalign: %d", available)
Expand All @@ -84,39 +89,31 @@ func (pmem *pmemNdctl) GetCapacity() (map[string]uint64, error) {
return Capacity, nil
}

func (pmem *pmemNdctl) CreateDevice(name string, size uint64, nsmode string) error {
func (pmem *pmemNdctl) CreateDevice(volumeId string, size uint64, nsmode string) error {
ndctlMutex.Lock()
defer ndctlMutex.Unlock()
// Check that such name does not exist. In certain error states, for example when
// Check that such volume does not exist. In certain error states, for example when
// namespace creation works but device zeroing fails (missing /dev/pmemX.Y in container),
// this function is asked to create new devices repeatedly, forcing running out of space.
// Avoid device filling with garbage entries by returning error.
// Overall, no point having more than one namespace with same name.
_, err := pmem.getDevice(name)
_, err := pmem.getDevice(volumeId)
if err == nil {
klog.V(4).Infof("Device with name: %s already exists, refuse to create another", name)
klog.V(4).Infof("Device with name: %s already exists, refuse to create another", volumeId)
return fmt.Errorf("CreateDevice: Failed: namespace with that name exists")
}
if size <= 0 {
// Allocating volumes of zero size isn't supported.
// We use some arbitrary small minimum size instead.
// It will get rounded up by libndctl to meet the alignment.
size = 1
}
// Pass align = 1 GB into creation request as that has proven to be reliable.
const align uint64 = 1024 * 1024 * 1024
// libndctl needs to store meta data and will use some of the allocated
// space for that (https://github.com/pmem/ndctl/issues/79).
// We don't know exactly how much space that is, just
// that it should be a small amount. But because libndctl
// rounds up to the alignment, in practice that means we need
// to request `align` additional bytes.
compensatedsize := size + align
klog.V(4).Infof("CreateDevice:%s: Compensate for libndctl creating one alignment step smaller: change size %d to %d", name, size, compensatedsize)
size += ndctlAlign
klog.V(4).Infof("Compensate for libndctl creating one alignment step smaller: increase size to %d", size)
ns, err := pmem.ctx.CreateNamespace(ndctl.CreateNamespaceOpts{
Name: name,
Size: compensatedsize,
Align: align,
Name: volumeId,
Size: size,
Align: ndctlAlign,
Mode: ndctl.NamespaceMode(nsmode),
})
if err != nil {
Expand All @@ -125,7 +122,7 @@ func (pmem *pmemNdctl) CreateDevice(name string, size uint64, nsmode string) err
data, _ := ns.MarshalJSON() //nolint: gosec
klog.V(3).Infof("Namespace created: %s", data)
// clear start of device to avoid old data being recognized as file system
device, err := pmem.getDevice(name)
device, err := pmem.getDevice(volumeId)
if err != nil {
return err
}
Expand All @@ -137,37 +134,37 @@ func (pmem *pmemNdctl) CreateDevice(name string, size uint64, nsmode string) err
return nil
}

func (pmem *pmemNdctl) DeleteDevice(name string, flush bool) error {
func (pmem *pmemNdctl) DeleteDevice(volumeId string, flush bool) error {
ndctlMutex.Lock()
defer ndctlMutex.Unlock()

device, err := pmem.getDevice(name)
device, err := pmem.getDevice(volumeId)
if err != nil {
return err
}
err = ClearDevice(device, flush)
if err != nil {
return err
}
return pmem.ctx.DestroyNamespaceByName(name)
return pmem.ctx.DestroyNamespaceByName(volumeId)
}

func (pmem *pmemNdctl) FlushDeviceData(name string) error {
func (pmem *pmemNdctl) FlushDeviceData(volumeId string) error {
ndctlMutex.Lock()
defer ndctlMutex.Unlock()

device, err := pmem.getDevice(name)
device, err := pmem.getDevice(volumeId)
if err != nil {
return err
}
return ClearDevice(device, true)
}

func (pmem *pmemNdctl) GetDevice(name string) (PmemDeviceInfo, error) {
func (pmem *pmemNdctl) GetDevice(volumeId string) (PmemDeviceInfo, error) {
ndctlMutex.Lock()
defer ndctlMutex.Unlock()

return pmem.getDevice(name)
return pmem.getDevice(volumeId)
}

func (pmem *pmemNdctl) ListDevices() ([]PmemDeviceInfo, error) {
Expand All @@ -181,8 +178,8 @@ func (pmem *pmemNdctl) ListDevices() ([]PmemDeviceInfo, error) {
return devices, nil
}

func (pmem *pmemNdctl) getDevice(name string) (PmemDeviceInfo, error) {
ns, err := pmem.ctx.GetNamespaceByName(name)
func (pmem *pmemNdctl) getDevice(volumeId string) (PmemDeviceInfo, error) {
ns, err := pmem.ctx.GetNamespaceByName(volumeId)
if err != nil {
return PmemDeviceInfo{}, err
}
Expand All @@ -192,8 +189,8 @@ func (pmem *pmemNdctl) getDevice(name string) (PmemDeviceInfo, error) {

func namespaceToPmemInfo(ns *ndctl.Namespace) PmemDeviceInfo {
return PmemDeviceInfo{
Name: ns.Name(),
Path: "/dev/" + ns.BlockDeviceName(),
Size: ns.Size(),
VolumeId: ns.Name(),
Path: "/dev/" + ns.BlockDeviceName(),
Size: ns.Size(),
}
}

0 comments on commit 0332d08

Please sign in to comment.