Handle capacity algorithm corner cases

Instance has gone lost, and jobs are still either running
or waiting inside of its instance group
RBAC - user does not have permission to see some of the
groups that would be used in the capacity calculation

For some cases, a naive capacity dictionary is returned,
main goal is to not throw errors and avoid unpredicted behavior

Detailed capacity tests are moved into new unit test file.
This commit is contained in:
AlanCoding
2017-08-28 15:50:13 -04:00
parent 5327a4c622
commit d54eb93f26
5 changed files with 176 additions and 73 deletions

View File

@@ -1,10 +1,8 @@
import pytest
import mock
from django.test import TransactionTestCase
from awx.main.models import (
Job,
Instance,
InstanceGroup,
)
@@ -28,42 +26,9 @@ class TestCapacityMapping(TransactionTestCase):
def test_mapping(self):
self.sample_cluster()
with self.assertNumQueries(2):
inst_map, ig_map = Instance.objects.capacity_mapping()
inst_map, ig_map = InstanceGroup.objects.capacity_mapping()
assert inst_map['i1'] == set(['ig_small'])
assert inst_map['i2'] == set(['ig_large', 'tower'])
assert ig_map['ig_small'] == set(['ig_small'])
assert ig_map['ig_large'] == set(['ig_large', 'tower'])
assert ig_map['tower'] == set(['ig_large', 'tower'])
def test_committed_capacity(self):
tower, ig_large, ig_small = self.sample_cluster()
tasks = [
Job(status='waiting', instance_group=tower),
Job(status='waiting', instance_group=ig_large),
Job(status='waiting', instance_group=ig_small)
]
with mock.patch.object(Job, 'task_impact', new=mock.PropertyMock(return_value=43)):
capacities = InstanceGroup.objects.capacity_values(
tasks=tasks, breakdown=True
)
# Jobs submitted to either tower or ig_larg must count toward both
assert capacities['tower']['committed_capacity'] == 43 * 2
assert capacities['ig_large']['committed_capacity'] == 43 * 2
assert capacities['ig_small']['committed_capacity'] == 43
def test_running_capacity(self):
tower, ig_large, ig_small = self.sample_cluster()
tasks = [
Job(status='running', execution_node='i1'),
Job(status='running', execution_node='i2'),
Job(status='running', execution_node='i3')
]
with mock.patch.object(Job, 'task_impact', new=mock.PropertyMock(return_value=43)):
capacities = InstanceGroup.objects.capacity_values(
tasks=tasks, breakdown=True
)
# Tower is only given 1 instance
assert capacities['tower']['running_capacity'] == 43
# Large IG has 2 instances
assert capacities['ig_large']['running_capacity'] == 43 * 2
assert capacities['ig_small']['running_capacity'] == 43

View File

@@ -0,0 +1,135 @@
import pytest
from awx.main.models import InstanceGroup
class FakeObject(object):
def __init__(self, **kwargs):
for k, v in kwargs.items():
setattr(self, k, v)
class Job(FakeObject):
task_impact = 43
def log_format(self):
return 'job 382 (fake)'
@pytest.fixture
def sample_cluster():
def stand_up_cluster():
class Instances(FakeObject):
def add(self, *args):
for instance in args:
self.obj.instance_list.append(instance)
def all(self):
return self.obj.instance_list
class InstanceGroup(FakeObject):
def __init__(self, **kwargs):
super(InstanceGroup, self).__init__(**kwargs)
self.instance_list = []
@property
def instances(self):
mgr = Instances(obj=self)
return mgr
class Instance(FakeObject):
pass
ig_small = InstanceGroup(name='ig_small')
ig_large = InstanceGroup(name='ig_large')
tower = InstanceGroup(name='tower')
i1 = Instance(hostname='i1', capacity=200)
i2 = Instance(hostname='i2', capacity=200)
i3 = Instance(hostname='i3', capacity=200)
ig_small.instances.add(i1)
ig_large.instances.add(i2, i3)
tower.instances.add(i2)
return [tower, ig_large, ig_small]
return stand_up_cluster
def test_committed_capacity(sample_cluster):
tower, ig_large, ig_small = sample_cluster()
tasks = [
Job(status='waiting', instance_group=tower),
Job(status='waiting', instance_group=ig_large),
Job(status='waiting', instance_group=ig_small)
]
capacities = InstanceGroup.objects.capacity_values(
qs=[tower, ig_large, ig_small], tasks=tasks, breakdown=True
)
# Jobs submitted to either tower or ig_larg must count toward both
assert capacities['tower']['committed_capacity'] == 43 * 2
assert capacities['ig_large']['committed_capacity'] == 43 * 2
assert capacities['ig_small']['committed_capacity'] == 43
def test_running_capacity(sample_cluster):
tower, ig_large, ig_small = sample_cluster()
tasks = [
Job(status='running', execution_node='i1'),
Job(status='running', execution_node='i2'),
Job(status='running', execution_node='i3')
]
capacities = InstanceGroup.objects.capacity_values(
qs=[tower, ig_large, ig_small], tasks=tasks, breakdown=True
)
# Tower is only given 1 instance
assert capacities['tower']['running_capacity'] == 43
# Large IG has 2 instances
assert capacities['ig_large']['running_capacity'] == 43 * 2
assert capacities['ig_small']['running_capacity'] == 43
def test_offline_node_running(sample_cluster):
"""
Assure that algorithm doesn't explode if a job is marked running
in an offline node
"""
tower, ig_large, ig_small = sample_cluster()
ig_small.instance_list[0].capacity = 0
tasks = [Job(status='running', execution_node='i1', instance_group=ig_small)]
capacities = InstanceGroup.objects.capacity_values(
qs=[tower, ig_large, ig_small], tasks=tasks)
assert capacities['ig_small']['consumed_capacity'] == 43
def test_offline_node_waiting(sample_cluster):
"""
Same but for a waiting job
"""
tower, ig_large, ig_small = sample_cluster()
ig_small.instance_list[0].capacity = 0
tasks = [Job(status='waiting', instance_group=ig_small)]
capacities = InstanceGroup.objects.capacity_values(
qs=[tower, ig_large, ig_small], tasks=tasks)
assert capacities['ig_small']['consumed_capacity'] == 43
def test_RBAC_reduced_filter(sample_cluster):
"""
User can see jobs that are running in `ig_small` and `ig_large` IGs,
but user does not have permission to see those actual instance groups.
Verify that this does not blow everything up.
"""
tower, ig_large, ig_small = sample_cluster()
tasks = [
Job(status='waiting', instance_group=tower),
Job(status='waiting', instance_group=ig_large),
Job(status='waiting', instance_group=ig_small)
]
capacities = InstanceGroup.objects.capacity_values(
qs=[tower], tasks=tasks, breakdown=True
)
# Cross-links between groups not visible to current user,
# so a naieve accounting of capacities is returned instead
assert capacities['tower']['committed_capacity'] == 43

View File

@@ -20,7 +20,7 @@ class TestCleanupInconsistentCeleryTasks():
@mock.patch.object(cache, 'get', return_value=None)
@mock.patch.object(TaskManager, 'get_active_tasks', return_value=([], {}))
@mock.patch.object(TaskManager, 'get_running_tasks', return_value=({'host1': [Job(id=2), Job(id=3),]}, []))
@mock.patch.object(InstanceGroup.objects, 'all', return_value=[])
@mock.patch.object(InstanceGroup.objects, 'prefetch_related', return_value=[])
@mock.patch.object(Instance.objects, 'get', side_effect=Instance.DoesNotExist)
@mock.patch('awx.main.scheduler.logger')
def test_instance_does_not_exist(self, logger_mock, *args):
@@ -36,7 +36,7 @@ class TestCleanupInconsistentCeleryTasks():
@mock.patch.object(cache, 'get', return_value=None)
@mock.patch.object(TaskManager, 'get_active_tasks', return_value=([], {'host1': []}))
@mock.patch.object(InstanceGroup.objects, 'all', return_value=[])
@mock.patch.object(InstanceGroup.objects, 'prefetch_related', return_value=[])
@mock.patch.object(TaskManager, 'get_running_tasks')
@mock.patch('awx.main.scheduler.logger')
def test_save_failed(self, logger_mock, get_running_tasks, *args):