This repository has been archived by the owner on Dec 24, 2019. It is now read-only.

Commit 7625f37
Merge pull request #13 from hjacobs/allocatable-resources

Allocatable resources

hjacobs authored Feb 12, 2017
2 parents e0596b0 + 5154d94
Showing 3 changed files with 36 additions and 24 deletions.
12 changes: 11 additions & 1 deletion README.rst
@@ -21,6 +21,8 @@ Goals:
* respect Availability Zones, i.e. make sure that all AZs provide enough capacity
* be deterministic and predictable, i.e. the ``DesiredCapacity`` is only calculated based on the current cluster state
* scale down slowly to mitigate service disruptions, i.e. at most one node at a time
* support "elastic" workloads like daily up/down scaling
* support AWS Spot Fleet (not yet implemented)
* require a minimum amount of configuration (preferably none)
* keep it simple

@@ -32,6 +34,13 @@ This hack was created as a proof of concept and born out of frustration with the
* it requires unnecessary configuration
* the code is quite complex

Disclaimer
==========

**Use at your own risk!**
This autoscaler was only tested with Kubernetes version 1.5.2.
There is no guarantee that it works in previous Kubernetes versions.


How it works
============
@@ -48,7 +57,7 @@ The ``autoscale`` function performs the following task:
* iterate through every ASG/AZ combination
* use the calculated resource usage (sum of resource requests) and add the resource requests of any unassigned pods (pods not scheduled on any node yet)
* apply the configured buffer values (10% extra for CPU and memory by default)
* find the capacity of the weakest node
* find the `allocatable capacity`_ of the weakest node
* calculate the number of required nodes by adding up the capacity of the weakest node until the sum is greater than or equal to requested+buffer for both CPU and memory
* sum up the number of required nodes from all AZ for the ASG
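
A minimal sketch of this calculation in Python (illustration only: the function name and the example numbers are made up, and the per-resource buffer is collapsed into a single percentage; the real logic lives in ``calculate_required_auto_scaling_group_sizes`` in ``kube_aws_autoscaler/main.py``, changed further down in this diff)::

    RESOURCES = ['cpu', 'memory', 'pods']

    def required_nodes_for(requested: dict, weakest_allocatable: dict,
                           buffer_percentage: float = 10.0) -> int:
        '''Stack copies of the weakest node until every resource fits.'''
        # apply the relative buffer (10% extra by default)
        with_buffer = {r: requested.get(r, 0) * (1 + buffer_percentage / 100)
                       for r in RESOURCES}
        allocatable = {r: 0 for r in RESOURCES}
        nodes = 0
        # keep adding the weakest node's allocatable capacity until every
        # resource covers the buffered requests
        while any(with_buffer[r] > allocatable[r] for r in RESOURCES):
            for r in RESOURCES:
                allocatable[r] += weakest_allocatable[r]
            nodes += 1
        return nodes

    # hypothetical numbers: pods in one ASG/AZ request 5 cores, 20 GiB and 30 pod slots;
    # the weakest node offers 2 cores, 7.5 GiB and 110 pods of allocatable capacity
    print(required_nodes_for({'cpu': 5, 'memory': 20 * 1024**3, 'pods': 30},
                             {'cpu': 2, 'memory': 7.5 * 1024**3, 'pods': 110}))  # -> 3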

@@ -99,3 +108,4 @@ The following command line options are supported:


.. _"official" cluster-autoscaler: https://github.com/kubernetes/contrib/tree/master/cluster-autoscaler
.. _allocatable capacity: https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md
34 changes: 18 additions & 16 deletions kube_aws_autoscaler/main.py
@@ -47,9 +47,9 @@ def parse_resource(v: str):
return int(match.group(1)) * factor


def get_node_capacity_tuple(node: dict):
capacity = node['capacity']
return tuple(capacity[resource] for resource in RESOURCES)
def get_node_allocatable_tuple(node: dict):
allocatable = node['allocatable']
return tuple(allocatable[resource] for resource in RESOURCES)


def apply_buffer(requested: dict, buffer_percentage: dict, buffer_fixed: dict):
@@ -60,11 +60,11 @@ def apply_buffer(requested: dict, buffer_percentage: dict, buffer_fixed: dict):


def find_weakest_node(nodes):
return sorted(nodes, key=get_node_capacity_tuple)[0]
return sorted(nodes, key=get_node_allocatable_tuple)[0]


def is_sufficient(requested: dict, capacity: dict):
for resource, cap in capacity.items():
def is_sufficient(requested: dict, allocatable: dict):
for resource, cap in allocatable.items():
if requested.get(resource, 0) > cap:
return False
return True
@@ -86,13 +86,15 @@ def get_nodes(api) -> dict:
region = node.labels['failure-domain.beta.kubernetes.io/region']
zone = node.labels['failure-domain.beta.kubernetes.io/zone']
instance_type = node.labels['beta.kubernetes.io/instance-type']
capacity = {}
for key, val in node.obj['status']['capacity'].items():
capacity[key] = parse_resource(val)
allocatable = {}
# Use the Node Allocatable Resources to account for any kube/system reservations:
# https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md
for key, val in node.obj['status']['allocatable'].items():
allocatable[key] = parse_resource(val)
instance_id = node.obj['spec']['externalID']
obj = {'name': node.name,
'region': region, 'zone': zone, 'instance_id': instance_id, 'instance_type': instance_type,
'capacity': capacity,
'allocatable': allocatable,
'ready': is_node_ready(node),
'unschedulable': node.obj['spec'].get('unschedulable', False),
'master': node.labels.get('master', 'false') == 'true'}
@@ -201,10 +203,10 @@ def calculate_required_auto_scaling_group_sizes(nodes_by_asg_zone: dict, usage_b
requested_with_buffer = apply_buffer(requested, buffer_percentage, buffer_fixed)
weakest_node = find_weakest_node(nodes)
required_nodes = 0
capacity = {resource: 0 for resource in RESOURCES}
while not is_sufficient(requested_with_buffer, capacity):
for resource in capacity:
capacity[resource] += weakest_node['capacity'][resource]
allocatable = {resource: 0 for resource in RESOURCES}
while not is_sufficient(requested_with_buffer, allocatable):
for resource in allocatable:
allocatable[resource] += weakest_node['allocatable'][resource]
required_nodes += 1

for node in nodes:
@@ -215,7 +217,7 @@ def calculate_required_auto_scaling_group_sizes(nodes_by_asg_zone: dict, usage_b
required_nodes += 1

overprovisioned = {resource: 0 for resource in RESOURCES}
for resource, value in capacity.items():
for resource, value in allocatable.items():
overprovisioned[resource] = value - requested[resource]

if dump_info:
@@ -226,7 +228,7 @@ def calculate_required_auto_scaling_group_sizes(nodes_by_asg_zone: dict, usage_b
logger.info('{}/{}: with buffer: {}'.format(asg_name, zone,
' '.join([format_resource(requested_with_buffer[r], r).rjust(10) for r in RESOURCES])))
logger.info('{}/{}: weakest node: {}'.format(asg_name, zone,
' '.join([format_resource(weakest_node['capacity'][r], r).rjust(10) for r in RESOURCES])))
' '.join([format_resource(weakest_node['allocatable'][r], r).rjust(10) for r in RESOURCES])))
logger.info('{}/{}: overprovision: {}'.format(asg_name, zone,
' '.join([format_resource(overprovisioned[r], r).rjust(10) for r in RESOURCES])))
logger.info('{}/{}: => {} nodes required (current: {})'.format(asg_name, zone, required_nodes, len(nodes)))
14 changes: 7 additions & 7 deletions tests/test_autoscaler.py
@@ -66,20 +66,20 @@ def test_calculate_usage_by_asg_zone():

def test_calculate_required_auto_scaling_group_sizes():
assert calculate_required_auto_scaling_group_sizes({}, {}, {}, {}) == {}
node = {'capacity': {'cpu': 1, 'memory': 1, 'pods': 1}, 'unschedulable': False, 'master': False}
node = {'allocatable': {'cpu': 1, 'memory': 1, 'pods': 1}, 'unschedulable': False, 'master': False}
assert calculate_required_auto_scaling_group_sizes({('a1', 'z1'): [node]}, {}, {}, {}) == {'a1': 0}
assert calculate_required_auto_scaling_group_sizes({('a1', 'z1'): [node]}, {('a1', 'z1'): {'cpu': 1, 'memory': 1, 'pods': 1}}, {}, {}) == {'a1': 1}
assert calculate_required_auto_scaling_group_sizes({('a1', 'z1'): [node]}, {('unknown', 'unknown'): {'cpu': 1, 'memory': 1, 'pods': 1}}, {}, {}) == {'a1': 1}


def test_calculate_required_auto_scaling_group_sizes_cordon():
node = {'name': 'mynode', 'capacity': {'cpu': 1, 'memory': 1, 'pods': 1}, 'unschedulable': True, 'master': False, 'asg_lifecycle_state': 'InService'}
node = {'name': 'mynode', 'allocatable': {'cpu': 1, 'memory': 1, 'pods': 1}, 'unschedulable': True, 'master': False, 'asg_lifecycle_state': 'InService'}
assert calculate_required_auto_scaling_group_sizes({('a1', 'z1'): [node]}, {}, {}, {}) == {'a1': 1}
assert calculate_required_auto_scaling_group_sizes({('a1', 'z1'): [node]}, {('a1', 'z1'): {'cpu': 1, 'memory': 1, 'pods': 1}}, {}, {}) == {'a1': 2}


def test_calculate_required_auto_scaling_group_sizes_unschedulable_terminating():
node = {'name': 'mynode', 'capacity': {'cpu': 1, 'memory': 1, 'pods': 1}, 'unschedulable': True, 'master': False, 'asg_lifecycle_state': 'Terminating'}
node = {'name': 'mynode', 'allocatable': {'cpu': 1, 'memory': 1, 'pods': 1}, 'unschedulable': True, 'master': False, 'asg_lifecycle_state': 'Terminating'}
# do not compensate if the instance is terminating.. (it will probably be replaced by ASG)
assert calculate_required_auto_scaling_group_sizes({('a1', 'z1'): [node]}, {}, {}, {}) == {'a1': 0}
assert calculate_required_auto_scaling_group_sizes({('a1', 'z1'): [node]}, {('a1', 'z1'): {'cpu': 1, 'memory': 1, 'pods': 1}}, {}, {}) == {'a1': 1}
@@ -246,7 +246,7 @@ def test_get_nodes(monkeypatch):
'beta.kubernetes.io/instance-type': 'x1.mega'
}
node.obj = {
'status': {'capacity': {'cpu': '2', 'memory': '16Gi', 'pods': '10'}},
'status': {'allocatable': {'cpu': '2', 'memory': '16Gi', 'pods': '10'}},
'spec': {'externalID': 'i-123'}
}

assert get_nodes(api) == {'n1': {
'name': 'n1',
'region': 'eu-north-1', 'zone': 'eu-north-1a', 'instance_id': 'i-123', 'instance_type': 'x1.mega',
'capacity': {'cpu': 2, 'memory': 16*1024*1024*1024, 'pods': 10},
'allocatable': {'cpu': 2, 'memory': 16*1024*1024*1024, 'pods': 10},
'ready': False,
'unschedulable': False,
'master': False}}
get_nodes.return_value = {'n1': {
'name': 'n1',
'region': 'eu-north-1', 'zone': 'eu-north-1a', 'instance_id': 'i-123', 'instance_type': 'x1.mega',
'capacity': {'cpu': 2, 'memory': 16*1024*1024*1024, 'pods': 10},
'allocatable': {'cpu': 2, 'memory': 16*1024*1024*1024, 'pods': 10},
'ready': True,
'unschedulable': False,
'master': False}}
@@ -309,7 +309,7 @@ def test_autoscale_node_without_asg(monkeypatch):
get_nodes.return_value = {'n1': {
'name': 'n1',
'region': 'eu-north-1', 'zone': 'eu-north-1a', 'instance_id': 'i-123', 'instance_type': 'x1.mega',
'capacity': {'cpu': 2, 'memory': 16*1024*1024*1024, 'pods': 10},
'allocatable': {'cpu': 2, 'memory': 16*1024*1024*1024, 'pods': 10},
'ready': True,
'unschedulable': False,
'master': False}}
