From 05cb876df599065df1828a268478d9f22e777c3a Mon Sep 17 00:00:00 2001 From: Ryan Petrello Date: Tue, 16 Mar 2021 13:30:56 -0400 Subject: [PATCH 01/27] implement an initial development environment for receptor-based clusters --- Makefile | 18 +- awx/api/serializers.py | 4 + awx/api/views/__init__.py | 9 - .../management/commands/list_instances.py | 5 +- awx/main/models/ha.py | 27 +-- awx/main/scheduler/task_manager.py | 12 +- awx/main/tasks.py | 214 +++++++++++++++--- awx/main/tests/functional/test_jobs.py | 36 +-- awx/main/utils/common.py | 12 +- awx/main/wsbroadcast.py | 2 +- awx/settings/development.py | 2 + .../Instances/InstanceListItem.js | 3 + tools/docker-compose-cluster.yml | 119 ---------- tools/docker-compose-cluster/.gitignore | 0 .../awx-1-receptor.conf | 38 ++-- .../awx-2-receptor.conf | 38 ++-- .../awx-3-receptor.conf | 38 ++-- .../awx-hop-receptor.conf | 26 +++ tools/docker-compose-cluster/certs/.gitignore | 2 + .../sources/templates/docker-compose.yml.j2 | 1 + tools/docker-compose/receptor.conf | 16 +- tools/docker-receptor.yml | 60 +++++ 22 files changed, 396 insertions(+), 286 deletions(-) delete mode 100644 tools/docker-compose-cluster.yml create mode 100644 tools/docker-compose-cluster/.gitignore create mode 100644 tools/docker-compose-cluster/awx-hop-receptor.conf create mode 100644 tools/docker-compose-cluster/certs/.gitignore create mode 100644 tools/docker-receptor.yml diff --git a/Makefile b/Makefile index 519a21b007..c541ab16a4 100644 --- a/Makefile +++ b/Makefile @@ -173,7 +173,20 @@ init: . 
$(VENV_BASE)/awx/bin/activate; \ fi; \ $(MANAGEMENT_COMMAND) provision_instance --hostname=$(COMPOSE_HOST); \ - $(MANAGEMENT_COMMAND) register_queue --queuename=controlplane --instance_percent=100; + $(MANAGEMENT_COMMAND) register_queue --queuename=controlplane --instance_percent=100;\ + if [ "$(AWX_GROUP_QUEUES)" == "tower,thepentagon" ]; then \ + $(MANAGEMENT_COMMAND) provision_instance --hostname=isolated; \ + $(MANAGEMENT_COMMAND) register_queue --queuename='thepentagon' --hostnames=isolated --controller=tower; \ + $(MANAGEMENT_COMMAND) generate_isolated_key > /awx_devel/awx/main/isolated/authorized_keys; \ + fi; + if [ ! -f /etc/receptor/certs/awx.key ]; then \ + rm -f /etc/receptor/certs/*; \ + receptor --cert-init commonname="AWX Test CA" bits=2048 outcert=/etc/receptor/certs/ca.crt outkey=/etc/receptor/certs/ca.key; \ + for node in $(RECEPTOR_MUTUAL_TLS); do \ + receptor --cert-makereq bits=2048 commonname="$$node test cert" dnsname=$$node nodeid=$$node outreq=/etc/receptor/certs/$$node.csr outkey=/etc/receptor/certs/$$node.key; \ + receptor --cert-signreq req=/etc/receptor/certs/$$node.csr cacert=/etc/receptor/certs/ca.crt cakey=/etc/receptor/certs/ca.key outcert=/etc/receptor/certs/$$node.crt verify=yes; \ + done; \ + fi; \ # Refresh development environment after pulling new code. 
refresh: clean requirements_dev version_file develop migrate @@ -536,6 +549,9 @@ docker-refresh: docker-clean docker-compose docker-compose-elk: docker-auth awx/projects docker-compose-sources docker-compose -f tools/docker-compose/_sources/docker-compose.yml -f tools/elastic/docker-compose.logstash-link.yml -f tools/elastic/docker-compose.elastic-override.yml up --no-recreate +docker-compose-cluster: docker-auth awx/projects docker-compose-sources + docker-compose -f tools/docker-compose/_sources/docker-compose.yml -f tools/docker-receptor.yml up + docker-compose-cluster-elk: docker-auth awx/projects docker-compose-sources docker-compose -f tools/docker-compose/_sources/docker-compose.yml -f tools/elastic/docker-compose.logstash-link-cluster.yml -f tools/elastic/docker-compose.elastic-override.yml up --no-recreate diff --git a/awx/api/serializers.py b/awx/api/serializers.py index be4d5581e2..eabb94db75 100644 --- a/awx/api/serializers.py +++ b/awx/api/serializers.py @@ -4773,6 +4773,7 @@ class InstanceSerializer(BaseSerializer): percent_capacity_remaining = serializers.SerializerMethodField() jobs_running = serializers.IntegerField(help_text=_('Count of jobs in the running or waiting state that ' 'are targeted for this instance'), read_only=True) jobs_total = serializers.IntegerField(help_text=_('Count of all jobs that target this instance'), read_only=True) + is_receptor = serializers.SerializerMethodField() class Meta: model = Instance @@ -4817,6 +4818,9 @@ class InstanceSerializer(BaseSerializer): else: return float("{0:.2f}".format(((float(obj.capacity) - float(obj.consumed_capacity)) / (float(obj.capacity))) * 100)) + def get_is_receptor(self, obj): + return obj.is_receptor() + class InstanceGroupSerializer(BaseSerializer): diff --git a/awx/api/views/__init__.py b/awx/api/views/__init__.py index b0cb4aa86a..8ff25c10f9 100644 --- a/awx/api/views/__init__.py +++ b/awx/api/views/__init__.py @@ -370,15 +370,6 @@ class InstanceDetail(RetrieveUpdateAPIView): model 
= models.Instance serializer_class = serializers.InstanceSerializer - def update(self, request, *args, **kwargs): - r = super(InstanceDetail, self).update(request, *args, **kwargs) - if status.is_success(r.status_code): - obj = self.get_object() - obj.refresh_capacity() - obj.save() - r.data = serializers.InstanceSerializer(obj, context=self.get_serializer_context()).to_representation(obj) - return r - class InstanceUnifiedJobsList(SubListAPIView): diff --git a/awx/main/management/commands/list_instances.py b/awx/main/management/commands/list_instances.py index 7568f0b45c..7302953a76 100644 --- a/awx/main/management/commands/list_instances.py +++ b/awx/main/management/commands/list_instances.py @@ -48,7 +48,8 @@ class Command(BaseCommand): if no_color: color = '' fmt = '\t' + color + '{0.hostname} capacity={0.capacity} version={1}' - if x.capacity: - fmt += ' heartbeat="{0.modified:%Y-%m-%d %H:%M:%S}"' + if x.last_isolated_check: + fmt += ' last_isolated_check="{0.last_isolated_check:%Y-%m-%d %H:%M:%S}"' + fmt += ' heartbeat="{0.modified:%Y-%m-%d %H:%M:%S}"' print((fmt + '\033[0m').format(x, x.version or '?')) print('') diff --git a/awx/main/models/ha.py b/awx/main/models/ha.py index 5f4657b230..edfa1a0dca 100644 --- a/awx/main/models/ha.py +++ b/awx/main/models/ha.py @@ -125,33 +125,18 @@ class Instance(HasPolicyEditsMixin, BaseModel): def jobs_total(self): return UnifiedJob.objects.filter(execution_node=self.hostname).count() + @staticmethod + def choose_online_control_plane_node(): + return random.choice(Instance.objects.filter(enabled=True).exclude(version__startswith='ansible-runner-').values_list('hostname', flat=True)) + def is_lost(self, ref_time=None): if ref_time is None: ref_time = now() grace_period = 120 return self.modified < ref_time - timedelta(seconds=grace_period) - def refresh_capacity(self): - cpu = get_cpu_capacity() - mem = get_mem_capacity() - if self.enabled: - self.capacity = get_system_task_capacity(self.capacity_adjustment) - else: - 
self.capacity = 0 - - try: - # if redis is down for some reason, that means we can't persist - # playbook event data; we should consider this a zero capacity event - redis.Redis.from_url(settings.BROKER_URL).ping() - except redis.ConnectionError: - self.capacity = 0 - - self.cpu = cpu[0] - self.memory = mem[0] - self.cpu_capacity = cpu[1] - self.mem_capacity = mem[1] - self.version = awx_application_version - self.save(update_fields=['capacity', 'version', 'modified', 'cpu', 'memory', 'cpu_capacity', 'mem_capacity']) + def is_receptor(self): + return self.version.startswith('ansible-runner-') class InstanceGroup(HasPolicyEditsMixin, BaseModel, RelatedJobsMixin): diff --git a/awx/main/scheduler/task_manager.py b/awx/main/scheduler/task_manager.py index 867a4d649a..ed80cc6d4f 100644 --- a/awx/main/scheduler/task_manager.py +++ b/awx/main/scheduler/task_manager.py @@ -253,6 +253,12 @@ class TaskManager: } dependencies = [{'type': get_type_for_model(type(t)), 'id': t.id} for t in dependent_tasks] + try: + controller_node = Instance.choose_online_control_plane_node() + except IndexError: + logger.warning("No control plane nodes available to manage {}".format(task.log_format)) + return + task.status = 'waiting' (start_status, opts) = task.pre_start() @@ -289,9 +295,9 @@ class TaskManager: logger.debug('Submitting containerized {} to queue {}.'.format(task.log_format, task.execution_node)) else: task.instance_group = rampart_group - if instance is not None: - task.execution_node = instance.hostname - logger.debug('Submitting {} to <{},{}>.'.format(task.log_format, task.instance_group_id, task.execution_node)) + task.execution_node = instance.hostname + task.controller_node = controller_node + logger.debug('Submitting job {} to queue {} controlled by {}.'.format(task.log_format, task.execution_node, controller_node)) with disable_activity_stream(): task.celery_task_id = str(uuid.uuid4()) task.save() diff --git a/awx/main/tasks.py b/awx/main/tasks.py index 
77eef071ae..5435c3fad3 100644 --- a/awx/main/tasks.py +++ b/awx/main/tasks.py @@ -55,7 +55,11 @@ import ansible_runner # Receptor from receptorctl.socket_interface import ReceptorControl +# dateutil +from dateutil.parser import parse as parse_date + # AWX +from awx import MODE from awx import __version__ as awx_application_version from awx.main.constants import PRIVILEGE_ESCALATION_METHODS, STANDARD_INVENTORY_UPDATE_ENV, MINIMAL_EVENTS from awx.main.access import access_registry @@ -98,6 +102,9 @@ from awx.main.utils.common import ( parse_yaml_or_json, cleanup_new_process, create_partition, + get_mem_capacity, + get_cpu_capacity, + get_system_task_capacity, ) from awx.main.utils.execution_environments import get_default_pod_spec, CONTAINER_ROOT, to_container_path from awx.main.utils.ansible import read_ansible_config @@ -114,6 +121,9 @@ from awx.main.analytics.subsystem_metrics import Metrics from rest_framework.exceptions import PermissionDenied +RECEPTOR_SOCK = '/var/run/receptor/receptor.sock' +RECEPTOR_PENDING = 'ansible-runner-???' 
+ __all__ = [ 'RunJob', @@ -398,48 +408,74 @@ def cleanup_execution_environment_images(): if process.returncode != 0: logger.debug(f"Failed to delete image {image_name}") +@task(queue=get_local_queuename) +def check_heartbeat(node): + AWXReceptorJob.check_heartbeat(node) + + +def discover_receptor_nodes(): + ctl = ReceptorControl(RECEPTOR_SOCK) + connections = ctl.simple_command('status')['Advertisements'] + nowtime = now() + for ad in connections: + hostname = ad['NodeID'] + commands = ad['WorkCommands'] or [] + if 'ansible-runner' not in commands: + if 'local' in commands: + # this node is strictly a control plane node, and does not + # provide ansible-runner as a work command + (changed, instance) = Instance.objects.register(hostname=hostname) + if changed: + logger.info("Registered tower control node '{}'".format(hostname)) + instance.capacity = instance.cpu = instance.memory = instance.cpu_capacity = instance.mem_capacity = 0 # noqa + instance.version = get_awx_version() + instance.save(update_fields=['capacity', 'version', 'cpu', 'memory', 'cpu_capacity', 'mem_capacity']) + continue + (changed, instance) = Instance.objects.register(hostname=hostname) + was_lost = instance.is_lost(ref_time=nowtime) + if changed: + logger.info("Registered tower execution node '{}'".format(hostname)) + instance.capacity = 0 + instance.version = RECEPTOR_PENDING + instance.save(update_fields=['capacity', 'version', 'modified']) + check_heartbeat.apply_async([hostname]) + else: + last_seen = parse_date(ad['Time']) + logger.debug("Updated tower control node '{}' last seen {}".format(hostname, last_seen)) + instance.modified = last_seen + if instance.is_lost(ref_time=nowtime): + # if the instance hasn't advertised in awhile, + # don't save a new modified time + continue + + instance.save(update_fields=['modified']) + if was_lost: + # if the instance *was* lost, but has appeared again, + # attempt to re-establish the initial capacity and version + # check + logger.warning('Attempting 
to rejoin the cluster as instance {}.'.format(hostname)) + check_heartbeat.apply_async([hostname]) + @task(queue=get_local_queuename) def cluster_node_heartbeat(): logger.debug("Cluster node heartbeat task.") nowtime = now() instance_list = list(Instance.objects.all()) - this_inst = None lost_instances = [] (changed, instance) = Instance.objects.get_or_register() if changed: - logger.info("Registered tower node '{}'".format(instance.hostname)) + logger.info("Registered tower control node '{}'".format(instance.hostname)) + + discover_receptor_nodes() for inst in list(instance_list): if inst.hostname == settings.CLUSTER_HOST_ID: - this_inst = inst instance_list.remove(inst) elif inst.is_lost(ref_time=nowtime): lost_instances.append(inst) instance_list.remove(inst) - if this_inst: - startup_event = this_inst.is_lost(ref_time=nowtime) - this_inst.refresh_capacity() - if startup_event: - logger.warning('Rejoining the cluster as instance {}.'.format(this_inst.hostname)) - return - else: - raise RuntimeError("Cluster Host Not Found: {}".format(settings.CLUSTER_HOST_ID)) - # IFF any node has a greater version than we do, then we'll shutdown services - for other_inst in instance_list: - if other_inst.version == "": - continue - if Version(other_inst.version.split('-', 1)[0]) > Version(awx_application_version.split('-', 1)[0]) and not settings.DEBUG: - logger.error( - "Host {} reports version {}, but this node {} is at {}, shutting down".format( - other_inst.hostname, other_inst.version, this_inst.hostname, this_inst.version - ) - ) - # Shutdown signal will set the capacity to zero to ensure no Jobs get added to this instance. - # The heartbeat task will reset the capacity to the system capacity after upgrade. 
- stop_local_services(communicate=False) - raise RuntimeError("Shutting down.") for other_inst in lost_instances: try: reaper.reap(other_inst) @@ -1720,6 +1756,7 @@ class RunJob(BaseTask): ] ) + params['process_isolation'] = False if MODE == 'development' else True return params def pre_run_hook(self, job, private_data_dir): @@ -2793,6 +2830,10 @@ class RunAdHocCommand(BaseTask): d[r'Password:\s*?$'] = 'ssh_password' return d + def build_execution_environment_params(self, instance, private_data_dir): + params = super(RunAdHocCommand, self).build_execution_environment_params(instance) + params['process_isolation'] = False if MODE == 'development' else True + return params @task(queue=get_local_queuename) class RunSystemJob(BaseTask): @@ -2927,18 +2968,120 @@ class AWXReceptorJob: execution_environment_params = self.task.build_execution_environment_params(self.task.instance, runner_params['private_data_dir']) self.runner_params['settings'].update(execution_environment_params) - def run(self): + def run(self, work_type=None): # We establish a connection to the Receptor socket - receptor_ctl = get_receptor_ctl() + # TODO: Seems like we already have a method for doing this now? 
+ # receptor_ctl = get_receptor_ctl() + receptor_ctl = ReceptorControl(RECEPTOR_SOCK) try: - return self._run_internal(receptor_ctl) + return self._run_internal(receptor_ctl, work_type=work_type) finally: # Make sure to always release the work unit if we established it if self.unit_id is not None and settings.RECEPTOR_RELEASE_WORK: receptor_ctl.simple_command(f"work release {self.unit_id}") - def _run_internal(self, receptor_ctl): + @classmethod + def check_heartbeat(cls, node): + # make a private data dir and env dir + private_data_dir = tempfile.mkdtemp(prefix='awx_heartbeat_', dir=settings.AWX_ISOLATION_BASE_PATH) + env_path = os.path.join(private_data_dir, 'env') + os.makedirs(os.path.join(env_path), mode=0o700) + # write a cmdline file for adhoc + f = os.fdopen(os.open(os.path.join(env_path, 'cmdline'), os.O_RDWR | os.O_CREAT, stat.S_IREAD | stat.S_IWRITE), 'w') + f.write(ansible_runner.utils.args2cmdline('localhost')) + f.close() + # write a custom facts.d to report the runner version + facts_path = os.path.join(private_data_dir, 'facts.d') + os.makedirs(facts_path, mode=0o700) + with open(os.path.join(facts_path, 'ansible_runner.fact'), 'w') as f: + os.chmod(f.name, 0o700) + f.write("""#!/usr/bin/env sh\necho "{\\"version\\": \\"`ansible-runner --version`\\"}"\n""") # noqa + # write a local inventory + inventory_path = os.path.join(private_data_dir, 'inventory') + os.makedirs(inventory_path, mode=0o700) + fn = os.path.join(inventory_path, 'hosts') + with open(fn, 'w') as f: + os.chmod(fn, stat.S_IRUSR | stat.S_IXUSR | stat.S_IWUSR) + f.write('localhost ansible_connection=local') + + runner_params = { + 'ident': str(uuid4()), + 'private_data_dir': private_data_dir, + 'module': 'setup', + 'module_args': f'fact_path={private_data_dir}/facts.d', + 'inventory': inventory_path, + 'only_transmit_kwargs': False, + 'settings': { + "container_image": get_default_execution_environment().image, + "container_options": ['--user=root'], + "process_isolation": False if MODE 
== 'development' else True, + }, + } + + class _Instance(object): + pk = -1 + job_env = {} + + @property + def is_container_group_task(self): + return False + + @property + def execution_node(self): + return node + + class _BaseTask(object): + instance = _Instance() + cpus = 0 + mem_mb = 0 + version = RECEPTOR_PENDING + + def build_execution_environment_params(self, instance, private_data_dir): + return {} + + def event_handler(self, event_data): + if event_data.get('event') == 'runner_on_ok': + facts = event_data.get('event_data', {}).get('res', {}).get('ansible_facts', {}) + if facts: + self.cpus = facts.get('ansible_processor_vcpus', 0) + self.mem_mb = facts.get('ansible_memtotal_mb', 0) + version = facts.get('ansible_local', {}).get('ansible_runner', {}).get('version', '') # noqa + if version: + self.version = f'ansible-runner-{version}' + + def finished_callback(self, runner_obj): + pass + + def cancel_callback(self): + pass + + def status_handler(self, status_data, runner_config): + pass + + def update_model(self, *args, **kw): + pass + + task = _BaseTask() + receptor_job = cls(task, runner_params) + res = receptor_job.run(work_type='ansible-runner') + if res.status == 'successful': + cpu = get_cpu_capacity(task.cpus) + mem = get_cpu_capacity(task.mem_mb) + instance = Instance.objects.get(hostname=node) + instance.cpu = cpu[0] + instance.cpu_capacity = cpu[1] + instance.memory = mem[0] + instance.memory_capacity = mem[1] + instance.capacity = get_system_task_capacity( + instance.capacity_adjustment, + instance.cpu_capacity, + instance.mem_capacity, + ) + instance.version = task.version + instance.save(update_fields=['capacity', 'version', 'cpu', 'memory', 'cpu_capacity', 'mem_capacity']) + + def _run_internal(self, receptor_ctl, work_type=None): # Create a socketpair. Where the left side will be used for writing our payload # (private data dir, kwargs). The right side will be passed to Receptor for # reading. 
@@ -2949,7 +3092,14 @@ class AWXReceptorJob: # submit our work, passing # in the right side of our socketpair for reading. - result = receptor_ctl.submit_work(worktype=self.work_type, payload=sockout.makefile('rb'), params=self.receptor_params) + _kw = {} + work_type = work_type or self.work_type + if work_type == 'ansible-runner': + _kw['node'] = self.task.instance.execution_node + logger.debug(f'receptorctl.submit_work(node={_kw["node"]})') + else: + logger.debug(f'receptorctl.submit_work({work_type})') + result = receptor_ctl.submit_work(worktype=work_type, payload=sockout.makefile('rb'), params=self.receptor_params, **_kw) self.unit_id = result['unitid'] self.task.update_model(self.task.instance.pk, work_unit_id=result['unitid']) @@ -3005,7 +3155,7 @@ class AWXReceptorJob: # write our payload to the left side of our socketpair. @cleanup_new_process def transmit(self, _socket): - if not settings.IS_K8S and self.work_type == 'local': + if not settings.IS_K8S and self.work_type == 'local' and 'only_transmit_kwargs' not in self.runner_params: self.runner_params['only_transmit_kwargs'] = True try: @@ -3052,6 +3202,8 @@ class AWXReceptorJob: work_type = 'kubernetes-runtime-auth' else: work_type = 'kubernetes-incluster-auth' + elif isinstance(self.task.instance, (Job, AdHocCommand)): + work_type = 'ansible-runner' else: work_type = 'local' diff --git a/awx/main/tests/functional/test_jobs.py b/awx/main/tests/functional/test_jobs.py index 197c1197e1..0a3966fc56 100644 --- a/awx/main/tests/functional/test_jobs.py +++ b/awx/main/tests/functional/test_jobs.py @@ -1,11 +1,7 @@ -import redis import pytest -from unittest import mock import json -from awx.main.models import Job, Instance, JobHostSummary, InventoryUpdate, InventorySource, Project, ProjectUpdate, SystemJob, AdHocCommand -from awx.main.tasks import cluster_node_heartbeat -from django.test.utils import override_settings +from awx.main.models import Job, JobHostSummary, InventoryUpdate, InventorySource, Project, 
ProjectUpdate, SystemJob, AdHocCommand @pytest.mark.django_db @@ -19,36 +15,6 @@ def test_orphan_unified_job_creation(instance, inventory): assert job2.launch_type == 'relaunch' -@pytest.mark.django_db -@mock.patch('awx.main.utils.common.get_cpu_capacity', lambda: (2, 8)) -@mock.patch('awx.main.utils.common.get_mem_capacity', lambda: (8000, 62)) -def test_job_capacity_and_with_inactive_node(): - i = Instance.objects.create(hostname='test-1') - with mock.patch.object(redis.client.Redis, 'ping', lambda self: True): - i.refresh_capacity() - assert i.capacity == 62 - i.enabled = False - i.save() - with override_settings(CLUSTER_HOST_ID=i.hostname): - cluster_node_heartbeat() - i = Instance.objects.get(id=i.id) - assert i.capacity == 0 - - -@pytest.mark.django_db -@mock.patch('awx.main.utils.common.get_cpu_capacity', lambda: (2, 8)) -@mock.patch('awx.main.utils.common.get_mem_capacity', lambda: (8000, 62)) -def test_job_capacity_with_redis_disabled(): - i = Instance.objects.create(hostname='test-1') - - def _raise(self): - raise redis.ConnectionError() - - with mock.patch.object(redis.client.Redis, 'ping', _raise): - i.refresh_capacity() - assert i.capacity == 0 - - @pytest.mark.django_db def test_job_type_name(): job = Job.objects.create() diff --git a/awx/main/utils/common.py b/awx/main/utils/common.py index 35ce3da945..7a09a8615f 100644 --- a/awx/main/utils/common.py +++ b/awx/main/utils/common.py @@ -15,7 +15,6 @@ import urllib.parse import threading import contextlib import tempfile -import psutil from functools import reduce, wraps from decimal import Decimal @@ -699,7 +698,7 @@ def parse_yaml_or_json(vars_str, silent_failure=True): return vars_dict -def get_cpu_capacity(): +def get_cpu_capacity(raw): from django.conf import settings settings_forkcpu = getattr(settings, 'SYSTEM_TASK_FORKS_CPU', None) @@ -713,18 +712,16 @@ def get_cpu_capacity(): elif settings_abscpu is not None: return 0, int(settings_abscpu) - cpu = psutil.cpu_count() - if env_forkcpu: forkcpu = 
int(env_forkcpu) elif settings_forkcpu: forkcpu = int(settings_forkcpu) else: forkcpu = 4 - return (cpu, cpu * forkcpu) + return (raw, raw * forkcpu) -def get_mem_capacity(): +def get_mem_capacity(raw_mb): from django.conf import settings settings_forkmem = getattr(settings, 'SYSTEM_TASK_FORKS_MEM', None) @@ -745,8 +742,7 @@ def get_mem_capacity(): else: forkmem = 100 - mem = psutil.virtual_memory().total - return (mem, max(1, ((mem // 1024 // 1024) - 2048) // forkmem)) + return (raw_mb, max(1, ((raw_mb // 1024 // 1024) - 2048) // forkmem)) def get_system_task_capacity(scale=Decimal(1.0), cpu_capacity=None, mem_capacity=None): diff --git a/awx/main/wsbroadcast.py b/awx/main/wsbroadcast.py index b35747aee3..afd444ff5b 100644 --- a/awx/main/wsbroadcast.py +++ b/awx/main/wsbroadcast.py @@ -32,7 +32,7 @@ def unwrap_broadcast_msg(payload: dict): def get_broadcast_hosts(): Instance = apps.get_model('main', 'Instance') - instances = Instance.objects.exclude(hostname=Instance.objects.me().hostname).order_by('hostname').values('hostname', 'ip_address').distinct() + instances = Instance.objects.exclude(hostname=Instance.objects.me().hostname).exclude(version__startswith='ansible-runner-').order_by('hostname').values('hostname', 'ip_address').distinct() return {i['hostname']: i['ip_address'] or i['hostname'] for i in instances} diff --git a/awx/settings/development.py b/awx/settings/development.py index 3c41dde684..7bba10351a 100644 --- a/awx/settings/development.py +++ b/awx/settings/development.py @@ -125,6 +125,8 @@ if "pytest" in sys.modules: } } +CELERYBEAT_SCHEDULE['cluster_heartbeat']['schedule'] = timedelta(seconds=10) + CLUSTER_HOST_ID = socket.gethostname() AWX_CALLBACK_PROFILE = True diff --git a/awx/ui/src/screens/InstanceGroup/Instances/InstanceListItem.js b/awx/ui/src/screens/InstanceGroup/Instances/InstanceListItem.js index 442f3b464f..461d66ca4a 100644 --- a/awx/ui/src/screens/InstanceGroup/Instances/InstanceListItem.js +++ 
b/awx/ui/src/screens/InstanceGroup/Instances/InstanceListItem.js @@ -65,6 +65,9 @@ function InstanceListItem({ const labelId = `check-action-${instance.id}`; function usedCapacity(item) { + if (!item.is_receptor) { + return ``; + } if (item.enabled) { return ( +# +# ^-------------- +# ^-------------- +# ^-------------- +# +--- +version: '2' +services: + awx_1: + environment: + RECEPTOR_MUTUAL_TLS: awx receptor-hop receptor-1 receptor-2 receptor-3 + volumes: + - "./docker-compose-cluster:/etc/receptor" + receptor-hop: + image: quay.io/ansible/awx-ee + user: root + container_name: tools_receptor_hop + hostname: receptor-hop + command: 'receptor --config /etc/receptor/awx-hop-receptor.conf' + links: + - awx_1 + ports: + - "5555:5555" + volumes: + - "../../docker-compose-cluster:/etc/receptor" + receptor-1: + image: quay.io/ansible/awx-ee + user: root + container_name: tools_receptor_1 + hostname: receptor-1 + command: 'receptor --config /etc/receptor/awx-1-receptor.conf' + links: + - receptor-hop + volumes: + - "../../docker-compose-cluster:/etc/receptor" + receptor-2: + image: quay.io/ansible/awx-ee + user: root + container_name: tools_receptor_2 + hostname: receptor-2 + command: 'receptor --config /etc/receptor/awx-2-receptor.conf' + links: + - receptor-hop + volumes: + - "../../docker-compose-cluster:/etc/receptor" + receptor-3: + image: quay.io/ansible/awx-ee + user: root + container_name: tools_receptor_3 + hostname: receptor-3 + command: 'receptor --config /etc/receptor/awx-3-receptor.conf' + links: + - receptor-hop + volumes: + - "../../docker-compose-cluster:/etc/receptor" From b10a8b0fa9026170633a50fe2472e0b56b74f7e7 Mon Sep 17 00:00:00 2001 From: Alan Rominger Date: Mon, 12 Jul 2021 13:29:54 -0400 Subject: [PATCH 02/27] Initial functionality tweaks --- tools/docker-compose-cluster/awx-1-receptor.conf | 2 +- tools/docker-compose-cluster/awx-2-receptor.conf | 2 +- tools/docker-compose-cluster/awx-3-receptor.conf | 2 +- tools/docker-receptor.yml | 8 ++++---- 4 
files changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/docker-compose-cluster/awx-1-receptor.conf b/tools/docker-compose-cluster/awx-1-receptor.conf index bbcec384d1..82cd21471b 100644 --- a/tools/docker-compose-cluster/awx-1-receptor.conf +++ b/tools/docker-compose-cluster/awx-1-receptor.conf @@ -11,7 +11,7 @@ # rootcas: /etc/receptor/certs/ca.crt - tcp-peer: - address: receptor-hop:5555 + address: tools_receptor_hop:5555 redial: true #tls: mutual-tls-client diff --git a/tools/docker-compose-cluster/awx-2-receptor.conf b/tools/docker-compose-cluster/awx-2-receptor.conf index b180cbf9a6..af85fe0ea6 100644 --- a/tools/docker-compose-cluster/awx-2-receptor.conf +++ b/tools/docker-compose-cluster/awx-2-receptor.conf @@ -11,7 +11,7 @@ # rootcas: /etc/receptor/certs/ca.crt - tcp-peer: - address: receptor-hop:5555 + address: tools_receptor_hop:5555 redial: true #tls: mutual-tls-client diff --git a/tools/docker-compose-cluster/awx-3-receptor.conf b/tools/docker-compose-cluster/awx-3-receptor.conf index fb40add836..095ff8fee9 100644 --- a/tools/docker-compose-cluster/awx-3-receptor.conf +++ b/tools/docker-compose-cluster/awx-3-receptor.conf @@ -11,7 +11,7 @@ # rootcas: /etc/receptor/certs/ca.crt - tcp-peer: - address: receptor-hop:5555 + address: tools_receptor_hop:5555 redial: true #tls: mutual-tls-client diff --git a/tools/docker-receptor.yml b/tools/docker-receptor.yml index 06045753a0..02a30cbf66 100644 --- a/tools/docker-receptor.yml +++ b/tools/docker-receptor.yml @@ -17,7 +17,7 @@ services: volumes: - "./docker-compose-cluster:/etc/receptor" receptor-hop: - image: quay.io/ansible/awx-ee + image: quay.io/ansible/awx-ee:devel user: root container_name: tools_receptor_hop hostname: receptor-hop @@ -29,7 +29,7 @@ services: volumes: - "../../docker-compose-cluster:/etc/receptor" receptor-1: - image: quay.io/ansible/awx-ee + image: quay.io/ansible/awx-ee:devel user: root container_name: tools_receptor_1 hostname: receptor-1 @@ -39,7 +39,7 @@ services: volumes: 
- "../../docker-compose-cluster:/etc/receptor" receptor-2: - image: quay.io/ansible/awx-ee + image: quay.io/ansible/awx-ee:devel user: root container_name: tools_receptor_2 hostname: receptor-2 @@ -49,7 +49,7 @@ services: volumes: - "../../docker-compose-cluster:/etc/receptor" receptor-3: - image: quay.io/ansible/awx-ee + image: quay.io/ansible/awx-ee:devel user: root container_name: tools_receptor_3 hostname: receptor-3 From 39e23db5237da4317ff4a2023817b6608869a88c Mon Sep 17 00:00:00 2001 From: Alan Rominger Date: Mon, 12 Jul 2021 15:32:17 -0400 Subject: [PATCH 03/27] Make minor changes to add needed imports --- awx/main/models/ha.py | 1 + awx/main/tasks.py | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/awx/main/models/ha.py b/awx/main/models/ha.py index edfa1a0dca..17116c5c3c 100644 --- a/awx/main/models/ha.py +++ b/awx/main/models/ha.py @@ -2,6 +2,7 @@ # All Rights Reserved. from decimal import Decimal +import random from django.core.validators import MinValueValidator from django.db import models, connection diff --git a/awx/main/tasks.py b/awx/main/tasks.py index 5435c3fad3..e5a8d4a6c6 100644 --- a/awx/main/tasks.py +++ b/awx/main/tasks.py @@ -106,7 +106,7 @@ from awx.main.utils.common import ( get_cpu_capacity, get_system_task_capacity, ) -from awx.main.utils.execution_environments import get_default_pod_spec, CONTAINER_ROOT, to_container_path +from awx.main.utils.execution_environments import get_default_execution_environment, get_default_pod_spec, CONTAINER_ROOT, to_container_path from awx.main.utils.ansible import read_ansible_config from awx.main.utils.external_logging import reconfigure_rsyslog from awx.main.utils.safe_yaml import safe_dump, sanitize_jinja @@ -408,6 +408,7 @@ def cleanup_execution_environment_images(): if process.returncode != 0: logger.debug(f"Failed to delete image {image_name}") + @task(queue=get_local_queuename) def check_heartbeat(node): AWXReceptorJob.check_heartbeat(node) @@ -2835,6 +2836,7 @@ class 
RunAdHocCommand(BaseTask): params['process_isolation'] = False if MODE == 'development' else True return params + @task(queue=get_local_queuename) class RunSystemJob(BaseTask): From b09da4883517642f78cf626b4f699e865a5213b2 Mon Sep 17 00:00:00 2001 From: Alan Rominger Date: Tue, 13 Jul 2021 20:23:19 -0400 Subject: [PATCH 04/27] Remove some diff that we dont want from PoC --- Makefile | 5 ----- awx/api/serializers.py | 4 ---- .../src/screens/InstanceGroup/Instances/InstanceListItem.js | 3 --- 3 files changed, 12 deletions(-) diff --git a/Makefile b/Makefile index c541ab16a4..ca7ada2b61 100644 --- a/Makefile +++ b/Makefile @@ -174,11 +174,6 @@ init: fi; \ $(MANAGEMENT_COMMAND) provision_instance --hostname=$(COMPOSE_HOST); \ $(MANAGEMENT_COMMAND) register_queue --queuename=controlplane --instance_percent=100;\ - if [ "$(AWX_GROUP_QUEUES)" == "tower,thepentagon" ]; then \ - $(MANAGEMENT_COMMAND) provision_instance --hostname=isolated; \ - $(MANAGEMENT_COMMAND) register_queue --queuename='thepentagon' --hostnames=isolated --controller=tower; \ - $(MANAGEMENT_COMMAND) generate_isolated_key > /awx_devel/awx/main/isolated/authorized_keys; \ - fi; if [ ! 
-f /etc/receptor/certs/awx.key ]; then \ rm -f /etc/receptor/certs/*; \ receptor --cert-init commonname="AWX Test CA" bits=2048 outcert=/etc/receptor/certs/ca.crt outkey=/etc/receptor/certs/ca.key; \ diff --git a/awx/api/serializers.py b/awx/api/serializers.py index eabb94db75..be4d5581e2 100644 --- a/awx/api/serializers.py +++ b/awx/api/serializers.py @@ -4773,7 +4773,6 @@ class InstanceSerializer(BaseSerializer): percent_capacity_remaining = serializers.SerializerMethodField() jobs_running = serializers.IntegerField(help_text=_('Count of jobs in the running or waiting state that ' 'are targeted for this instance'), read_only=True) jobs_total = serializers.IntegerField(help_text=_('Count of all jobs that target this instance'), read_only=True) - is_receptor = serializers.SerializerMethodField() class Meta: model = Instance @@ -4818,9 +4817,6 @@ class InstanceSerializer(BaseSerializer): else: return float("{0:.2f}".format(((float(obj.capacity) - float(obj.consumed_capacity)) / (float(obj.capacity))) * 100)) - def get_is_receptor(self, obj): - return obj.is_receptor() - class InstanceGroupSerializer(BaseSerializer): diff --git a/awx/ui/src/screens/InstanceGroup/Instances/InstanceListItem.js b/awx/ui/src/screens/InstanceGroup/Instances/InstanceListItem.js index 461d66ca4a..442f3b464f 100644 --- a/awx/ui/src/screens/InstanceGroup/Instances/InstanceListItem.js +++ b/awx/ui/src/screens/InstanceGroup/Instances/InstanceListItem.js @@ -65,9 +65,6 @@ function InstanceListItem({ const labelId = `check-action-${instance.id}`; function usedCapacity(item) { - if (!item.is_receptor) { - return ``; - } if (item.enabled) { return ( Date: Wed, 14 Jul 2021 14:53:50 -0400 Subject: [PATCH 05/27] Update rebase to keep old control plane capacity check Also do some basic work to separate control versus execution capacity this is to assure that we don't send jobs to the control node --- .../management/commands/list_instances.py | 5 +-- awx/main/models/ha.py | 30 +++++++++++++++ 
awx/main/scheduler/task_manager.py | 3 +- awx/main/tasks.py | 38 +++++++++++++------ awx/main/tests/functional/test_jobs.py | 36 +++++++++++++++++- awx/main/utils/common.py | 10 ++++- 6 files changed, 104 insertions(+), 18 deletions(-) diff --git a/awx/main/management/commands/list_instances.py b/awx/main/management/commands/list_instances.py index 7302953a76..7568f0b45c 100644 --- a/awx/main/management/commands/list_instances.py +++ b/awx/main/management/commands/list_instances.py @@ -48,8 +48,7 @@ class Command(BaseCommand): if no_color: color = '' fmt = '\t' + color + '{0.hostname} capacity={0.capacity} version={1}' - if x.last_isolated_check: - fmt += ' last_isolated_check="{0.last_isolated_check:%Y-%m-%d %H:%M:%S}"' - fmt += ' heartbeat="{0.modified:%Y-%m-%d %H:%M:%S}"' + if x.capacity: + fmt += ' heartbeat="{0.modified:%Y-%m-%d %H:%M:%S}"' print((fmt + '\033[0m').format(x, x.version or '?')) print('') diff --git a/awx/main/models/ha.py b/awx/main/models/ha.py index 17116c5c3c..3ab2439b95 100644 --- a/awx/main/models/ha.py +++ b/awx/main/models/ha.py @@ -136,6 +136,28 @@ class Instance(HasPolicyEditsMixin, BaseModel): grace_period = 120 return self.modified < ref_time - timedelta(seconds=grace_period) + def refresh_capacity(self): + cpu = get_cpu_capacity() + mem = get_mem_capacity() + if self.enabled: + self.capacity = get_system_task_capacity(self.capacity_adjustment) + else: + self.capacity = 0 + + try: + # if redis is down for some reason, that means we can't persist + # playbook event data; we should consider this a zero capacity event + redis.Redis.from_url(settings.BROKER_URL).ping() + except redis.ConnectionError: + self.capacity = 0 + + self.cpu = cpu[0] + self.memory = mem[0] + self.cpu_capacity = cpu[1] + self.mem_capacity = mem[1] + self.version = awx_application_version + self.save(update_fields=['capacity', 'version', 'modified', 'cpu', 'memory', 'cpu_capacity', 'mem_capacity']) + def is_receptor(self): return 
self.version.startswith('ansible-runner-') @@ -184,6 +206,11 @@ class InstanceGroup(HasPolicyEditsMixin, BaseModel, RelatedJobsMixin): def capacity(self): return sum([inst.capacity for inst in self.instances.all()]) + @property + def execution_capacity(self): + # TODO: update query to exclude based on node_type field + return sum([inst.capacity for inst in self.instances.exclude(version__startswith='ansible-runner-')]) + @property def jobs_running(self): return UnifiedJob.objects.filter(status__in=('running', 'waiting'), instance_group=self).count() @@ -206,6 +233,9 @@ class InstanceGroup(HasPolicyEditsMixin, BaseModel, RelatedJobsMixin): def fit_task_to_most_remaining_capacity_instance(task, instances): instance_most_capacity = None for i in instances: + # TODO: change this to check if "execution" is in node_type field + if not i.version.startswith('ansible-runner'): + continue if i.remaining_capacity >= task.task_impact and ( instance_most_capacity is None or i.remaining_capacity > instance_most_capacity.remaining_capacity ): diff --git a/awx/main/scheduler/task_manager.py b/awx/main/scheduler/task_manager.py index ed80cc6d4f..56a18a1412 100644 --- a/awx/main/scheduler/task_manager.py +++ b/awx/main/scheduler/task_manager.py @@ -75,6 +75,7 @@ class TaskManager: instances_partial = [ SimpleNamespace( obj=instance, + version=instance.version, remaining_capacity=instance.remaining_capacity, capacity=instance.capacity, jobs_running=instance.jobs_running, @@ -86,7 +87,7 @@ class TaskManager: instances_by_hostname = {i.hostname: i for i in instances_partial} for rampart_group in InstanceGroup.objects.prefetch_related('instances'): - self.graph[rampart_group.name] = dict(graph=DependencyGraph(), capacity_total=rampart_group.capacity, consumed_capacity=0, instances=[]) + self.graph[rampart_group.name] = dict(graph=DependencyGraph(), capacity_total=rampart_group.execution_capacity, consumed_capacity=0, instances=[]) for instance in 
rampart_group.instances.filter(enabled=True).order_by('hostname'): if instance.hostname in instances_by_hostname: self.graph[rampart_group.name]['instances'].append(instances_by_hostname[instance.hostname]) diff --git a/awx/main/tasks.py b/awx/main/tasks.py index e5a8d4a6c6..27a3482766 100644 --- a/awx/main/tasks.py +++ b/awx/main/tasks.py @@ -102,7 +102,6 @@ from awx.main.utils.common import ( parse_yaml_or_json, cleanup_new_process, create_partition, - get_mem_capacity, get_cpu_capacity, get_system_task_capacity, ) @@ -422,15 +421,6 @@ def discover_receptor_nodes(): hostname = ad['NodeID'] commands = ad['WorkCommands'] or [] if 'ansible-runner' not in commands: - if 'local' in commands: - # this node is strictly a control plane node, and does not - # provide ansible-runner as a work command - (changed, instance) = Instance.objects.register(hostname=hostname) - if changed: - logger.info("Registered tower control node '{}'".format(hostname)) - instance.capacity = instance.cpu = instance.memory = instance.cpu_capacity = instance.mem_capacity = 0 # noqa - instance.version = get_awx_version() - instance.save(update_fields=['capacity', 'version', 'cpu', 'memory', 'cpu_capacity', 'mem_capacity']) continue (changed, instance) = Instance.objects.register(hostname=hostname) was_lost = instance.is_lost(ref_time=nowtime) @@ -454,7 +444,7 @@ def discover_receptor_nodes(): # if the instance *was* lost, but has appeared again, # attempt to re-establish the initial capacity and version # check - logger.warning('Attempting to rejoin the cluster as instance {}.'.format(hostname)) + logger.warning('Execution node attempting to rejoin as instance {}.'.format(hostname)) check_heartbeat.apply_async([hostname]) @@ -463,6 +453,7 @@ def cluster_node_heartbeat(): logger.debug("Cluster node heartbeat task.") nowtime = now() instance_list = list(Instance.objects.all()) + this_inst = None lost_instances = [] (changed, instance) = Instance.objects.get_or_register() @@ -473,10 +464,35 @@ def 
cluster_node_heartbeat(): for inst in list(instance_list): if inst.hostname == settings.CLUSTER_HOST_ID: + this_inst = inst instance_list.remove(inst) elif inst.is_lost(ref_time=nowtime): lost_instances.append(inst) instance_list.remove(inst) + + if this_inst: + startup_event = this_inst.is_lost(ref_time=nowtime) + this_inst.refresh_capacity() + if startup_event: + logger.warning('Rejoining the cluster as instance {}.'.format(this_inst.hostname)) + return + else: + raise RuntimeError("Cluster Host Not Found: {}".format(settings.CLUSTER_HOST_ID)) + # IFF any node has a greater version than we do, then we'll shutdown services + for other_inst in instance_list: + if other_inst.version == "" or other_inst.version.startswith('ansible-runner'): + continue + if Version(other_inst.version.split('-', 1)[0]) > Version(awx_application_version.split('-', 1)[0]) and not settings.DEBUG: + logger.error( + "Host {} reports version {}, but this node {} is at {}, shutting down".format( + other_inst.hostname, other_inst.version, this_inst.hostname, this_inst.version + ) + ) + # Shutdown signal will set the capacity to zero to ensure no Jobs get added to this instance. + # The heartbeat task will reset the capacity to the system capacity after upgrade. 
+ stop_local_services(communicate=False) + raise RuntimeError("Shutting down.") + for other_inst in lost_instances: try: reaper.reap(other_inst) diff --git a/awx/main/tests/functional/test_jobs.py b/awx/main/tests/functional/test_jobs.py index 0a3966fc56..197c1197e1 100644 --- a/awx/main/tests/functional/test_jobs.py +++ b/awx/main/tests/functional/test_jobs.py @@ -1,7 +1,11 @@ +import redis import pytest +from unittest import mock import json -from awx.main.models import Job, JobHostSummary, InventoryUpdate, InventorySource, Project, ProjectUpdate, SystemJob, AdHocCommand +from awx.main.models import Job, Instance, JobHostSummary, InventoryUpdate, InventorySource, Project, ProjectUpdate, SystemJob, AdHocCommand +from awx.main.tasks import cluster_node_heartbeat +from django.test.utils import override_settings @pytest.mark.django_db @@ -15,6 +19,36 @@ def test_orphan_unified_job_creation(instance, inventory): assert job2.launch_type == 'relaunch' +@pytest.mark.django_db +@mock.patch('awx.main.utils.common.get_cpu_capacity', lambda: (2, 8)) +@mock.patch('awx.main.utils.common.get_mem_capacity', lambda: (8000, 62)) +def test_job_capacity_and_with_inactive_node(): + i = Instance.objects.create(hostname='test-1') + with mock.patch.object(redis.client.Redis, 'ping', lambda self: True): + i.refresh_capacity() + assert i.capacity == 62 + i.enabled = False + i.save() + with override_settings(CLUSTER_HOST_ID=i.hostname): + cluster_node_heartbeat() + i = Instance.objects.get(id=i.id) + assert i.capacity == 0 + + +@pytest.mark.django_db +@mock.patch('awx.main.utils.common.get_cpu_capacity', lambda: (2, 8)) +@mock.patch('awx.main.utils.common.get_mem_capacity', lambda: (8000, 62)) +def test_job_capacity_with_redis_disabled(): + i = Instance.objects.create(hostname='test-1') + + def _raise(self): + raise redis.ConnectionError() + + with mock.patch.object(redis.client.Redis, 'ping', _raise): + i.refresh_capacity() + assert i.capacity == 0 + + @pytest.mark.django_db def 
test_job_type_name(): job = Job.objects.create() diff --git a/awx/main/utils/common.py b/awx/main/utils/common.py index 7a09a8615f..e034dda02c 100644 --- a/awx/main/utils/common.py +++ b/awx/main/utils/common.py @@ -15,6 +15,7 @@ import urllib.parse import threading import contextlib import tempfile +import psutil from functools import reduce, wraps from decimal import Decimal @@ -698,7 +699,7 @@ def parse_yaml_or_json(vars_str, silent_failure=True): return vars_dict -def get_cpu_capacity(raw): +def get_cpu_capacity(raw=None): from django.conf import settings settings_forkcpu = getattr(settings, 'SYSTEM_TASK_FORKS_CPU', None) @@ -712,6 +713,9 @@ def get_cpu_capacity(raw): elif settings_abscpu is not None: return 0, int(settings_abscpu) + if raw is None: + raw = psutil.cpu_count() + if env_forkcpu: forkcpu = int(env_forkcpu) elif settings_forkcpu: @@ -721,7 +725,7 @@ def get_cpu_capacity(raw): return (raw, raw * forkcpu) -def get_mem_capacity(raw_mb): +def get_mem_capacity(raw_mb=None): from django.conf import settings settings_forkmem = getattr(settings, 'SYSTEM_TASK_FORKS_MEM', None) @@ -742,6 +746,8 @@ def get_mem_capacity(raw_mb): else: forkmem = 100 + if raw_mb is None: + raw_mb = psutil.virtual_memory().total return (raw_mb, max(1, ((raw_mb // 1024 // 1024) - 2048) // forkmem)) From e7be86867deee71d5e81d738671cbf767055d4f2 Mon Sep 17 00:00:00 2001 From: Alan Rominger Date: Mon, 19 Jul 2021 09:43:46 -0400 Subject: [PATCH 06/27] Fix rebase bug specific to ad hoc commands --- awx/main/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/awx/main/tasks.py b/awx/main/tasks.py index 27a3482766..1098fdbfac 100644 --- a/awx/main/tasks.py +++ b/awx/main/tasks.py @@ -2848,7 +2848,7 @@ class RunAdHocCommand(BaseTask): return d def build_execution_environment_params(self, instance, private_data_dir): - params = super(RunAdHocCommand, self).build_execution_environment_params(instance) + params = super(RunAdHocCommand, 
self).build_execution_environment_params(instance, private_data_dir) params['process_isolation'] = False if MODE == 'development' else True return params From f597205fa7afd5f98b00bd416707aea7069a0d8a Mon Sep 17 00:00:00 2001 From: Alan Rominger Date: Wed, 21 Jul 2021 09:53:22 -0400 Subject: [PATCH 07/27] Run capacity checks with container isolation (#10688) This requires swapping out the container images for the execution nodes from awx-ee to the awx image For completeness, the hop node image is switched to the raw receptor image A few outright bugs are fixed here memory calculation just was not right at all the execution_capacity calculation was reverse of intention Drop in a few TODOs about error handling from debugging --- awx/main/models/ha.py | 2 +- awx/main/tasks.py | 32 +++++++++++++++++++++----------- tools/docker-receptor.yml | 20 +++++++++++++------- 3 files changed, 35 insertions(+), 19 deletions(-) diff --git a/awx/main/models/ha.py b/awx/main/models/ha.py index 3ab2439b95..5cb1ec2e09 100644 --- a/awx/main/models/ha.py +++ b/awx/main/models/ha.py @@ -209,7 +209,7 @@ class InstanceGroup(HasPolicyEditsMixin, BaseModel, RelatedJobsMixin): @property def execution_capacity(self): # TODO: update query to exclude based on node_type field - return sum([inst.capacity for inst in self.instances.exclude(version__startswith='ansible-runner-')]) + return sum([inst.capacity for inst in self.instances.filter(version__startswith='ansible-runner-')]) @property def jobs_running(self): diff --git a/awx/main/tasks.py b/awx/main/tasks.py index 1098fdbfac..07bc6af986 100644 --- a/awx/main/tasks.py +++ b/awx/main/tasks.py @@ -59,7 +59,6 @@ from receptorctl.socket_interface import ReceptorControl from dateutil.parser import parse as parse_date # AWX -from awx import MODE from awx import __version__ as awx_application_version from awx.main.constants import PRIVILEGE_ESCALATION_METHODS, STANDARD_INVENTORY_UPDATE_ENV, MINIMAL_EVENTS from awx.main.access import access_registry @@ 
-103,6 +102,7 @@ from awx.main.utils.common import ( cleanup_new_process, create_partition, get_cpu_capacity, + get_mem_capacity, get_system_task_capacity, ) from awx.main.utils.execution_environments import get_default_execution_environment, get_default_pod_spec, CONTAINER_ROOT, to_container_path @@ -466,6 +466,8 @@ def cluster_node_heartbeat(): if inst.hostname == settings.CLUSTER_HOST_ID: this_inst = inst instance_list.remove(inst) + elif inst.version.startswith('ansible-runner'): # TODO: use proper field when introduced + continue elif inst.is_lost(ref_time=nowtime): lost_instances.append(inst) instance_list.remove(inst) @@ -1773,7 +1775,6 @@ class RunJob(BaseTask): ] ) - params['process_isolation'] = False if MODE == 'development' else True return params def pre_run_hook(self, job, private_data_dir): @@ -2847,11 +2848,6 @@ class RunAdHocCommand(BaseTask): d[r'Password:\s*?$'] = 'ssh_password' return d - def build_execution_environment_params(self, instance, private_data_dir): - params = super(RunAdHocCommand, self).build_execution_environment_params(instance, private_data_dir) - params['process_isolation'] = False if MODE == 'development' else True - return params - @task(queue=get_local_queuename) class RunSystemJob(BaseTask): @@ -3000,7 +2996,8 @@ class AWXReceptorJob: receptor_ctl.simple_command(f"work release {self.unit_id}") @classmethod - def check_heartbeat(cls, node): + def check_heartbeat(cls, node): # TODO: rename most of these "heartbeat" things + logger.info(f'Checking capacity of execution node {node}') # make a private data dir and env dir private_data_dir = tempfile.mkdtemp(prefix='awx_heartbeat_', dir=settings.AWX_ISOLATION_BASE_PATH) env_path = os.path.join(private_data_dir, 'env') @@ -3022,6 +3019,10 @@ class AWXReceptorJob: with open(fn, 'w') as f: os.chmod(fn, stat.S_IRUSR | stat.S_IXUSR | stat.S_IWUSR) f.write('localhost ansible_connection=local') + # we have to create the project directory because it is --workdir and crun needs it to 
exist + # https://github.com/ansible/ansible-runner/issues/758 + project_path = os.path.join(private_data_dir, 'project') + os.makedirs(project_path, mode=0o700) runner_params = { 'ident': str(uuid4()), @@ -3033,7 +3034,7 @@ class AWXReceptorJob: 'settings': { "container_image": get_default_execution_environment().image, "container_options": ['--user=root'], - "process_isolation": False if MODE == 'development' else True, + "process_isolation": True, }, } @@ -3067,6 +3068,7 @@ class AWXReceptorJob: version = facts.get('ansible_local', {}).get('ansible_runner', {}).get('version', '') # noqa if version: self.version = f'ansible-runner-{version}' + # TODO: save event_data["stdout"] and log when errors happen def finished_callback(self, runner_obj): pass @@ -3075,6 +3077,7 @@ class AWXReceptorJob: pass def status_handler(self, status_data, runner_config): + # TODO: log error cases pass def update_model(self, *args, **kw): @@ -3085,12 +3088,13 @@ class AWXReceptorJob: res = receptor_job.run(work_type='ansible-runner') if res.status == 'successful': cpu = get_cpu_capacity(task.cpus) - mem = get_cpu_capacity(task.mem_mb) + mem = get_mem_capacity(task.mem_mb * 1000000) + logger.info(f'Calculated memory capacity: {task.mem_mb}, out: {mem}') instance = Instance.objects.get(hostname=node) instance.cpu = cpu[0] instance.cpu_capacity = cpu[1] instance.memory = mem[0] - instance.memory_capacity = mem[1] + instance.mem_capacity = mem[1] instance.capacity = get_system_task_capacity( instance.capacity_adjustment, instance.cpu_capacity, @@ -3098,6 +3102,12 @@ class AWXReceptorJob: ) instance.version = task.version instance.save(update_fields=['capacity', 'version', 'cpu', 'memory', 'cpu_capacity', 'mem_capacity']) + logger.info(f'Updated capacity of {node} to cpu: {instance.cpu_capacity} mem: {instance.mem_capacity}') + else: + # TODO: error handling like we do with jobs + # receptorctl work results + # receptorctl work list + logger.info(f'Capacity check not successful for 
execution node {node}') def _run_internal(self, receptor_ctl, work_type=None): # Create a socketpair. Where the left side will be used for writing our payload diff --git a/tools/docker-receptor.yml b/tools/docker-receptor.yml index 02a30cbf66..57e3ca657d 100644 --- a/tools/docker-receptor.yml +++ b/tools/docker-receptor.yml @@ -17,7 +17,7 @@ services: volumes: - "./docker-compose-cluster:/etc/receptor" receptor-hop: - image: quay.io/ansible/awx-ee:devel + image: quay.io/project-receptor/receptor:latest user: root container_name: tools_receptor_hop hostname: receptor-hop @@ -29,8 +29,8 @@ services: volumes: - "../../docker-compose-cluster:/etc/receptor" receptor-1: - image: quay.io/ansible/awx-ee:devel - user: root + image: quay.io/awx/awx_devel:devel + user: "1000" container_name: tools_receptor_1 hostname: receptor-1 command: 'receptor --config /etc/receptor/awx-1-receptor.conf' @@ -38,9 +38,11 @@ services: - receptor-hop volumes: - "../../docker-compose-cluster:/etc/receptor" + - "/sys/fs/cgroup:/sys/fs/cgroup" + privileged: true receptor-2: - image: quay.io/ansible/awx-ee:devel - user: root + image: quay.io/awx/awx_devel:devel + user: "1000" container_name: tools_receptor_2 hostname: receptor-2 command: 'receptor --config /etc/receptor/awx-2-receptor.conf' @@ -48,9 +50,11 @@ services: - receptor-hop volumes: - "../../docker-compose-cluster:/etc/receptor" + - "/sys/fs/cgroup:/sys/fs/cgroup" + privileged: true receptor-3: - image: quay.io/ansible/awx-ee:devel - user: root + image: quay.io/awx/awx_devel:devel + user: "1000" container_name: tools_receptor_3 hostname: receptor-3 command: 'receptor --config /etc/receptor/awx-3-receptor.conf' @@ -58,3 +62,5 @@ services: - receptor-hop volumes: - "../../docker-compose-cluster:/etc/receptor" + - "/sys/fs/cgroup:/sys/fs/cgroup" + privileged: true From c3843004aa65c1a26bfb4323ce030677e4919880 Mon Sep 17 00:00:00 2001 From: Bianca Henderson Date: Wed, 21 Jul 2021 11:57:18 -0400 Subject: [PATCH 08/27] Update docker-compose 
(#10664) * Update docker-compose - Deploys 1 control and 1 execution node * Add a new Receptor cluster configuration file * update receptor peer to awx_1 to match how hop node is configured in cluster (Jim Ladd's commit) * Move receptor_1 instantiation in the docker-compose setup * Hard code receptor_1 name * Update execution node name, move standalone conf file to docker-compose directory * Reformat docker-compose file, mount another volume, change privileges --- .../sources/templates/docker-compose.yml.j2 | 17 ++++++++++--- .../awx-1-receptor-standalone.conf | 25 +++++++++++++++++++ 2 files changed, 39 insertions(+), 3 deletions(-) create mode 100644 tools/docker-compose/awx-1-receptor-standalone.conf diff --git a/tools/docker-compose/ansible/roles/sources/templates/docker-compose.yml.j2 b/tools/docker-compose/ansible/roles/sources/templates/docker-compose.yml.j2 index 5d084f51be..1fd016f554 100644 --- a/tools/docker-compose/ansible/roles/sources/templates/docker-compose.yml.j2 +++ b/tools/docker-compose/ansible/roles/sources/templates/docker-compose.yml.j2 @@ -27,6 +27,7 @@ services: links: - postgres - redis_{{ container_postfix }} + - execution_node_1 working_dir: "/awx_devel" volumes: - "../../../:/awx_devel" @@ -40,7 +41,7 @@ services: - "/sys/fs/cgroup:/sys/fs/cgroup" - "~/.kube/config:/var/lib/awx/.kube/config" - "redis_socket_{{ container_postfix }}:/var/run/redis/:rw" - - "receptor_{{ container_postfix }}:/var/run/receptor/" + - "execution_node_1:/var/run/receptor/" privileged: true tty: true ports: @@ -62,6 +63,16 @@ services: entrypoint: ["redis-server"] command: ["/usr/local/etc/redis/redis.conf"] {% endfor %} + execution_node_1: + image: quay.io/awx/awx_devel:devel + user: root + container_name: tools_execution_node_1 + hostname: execution_node_1 + command: 'receptor --config /etc/receptor/awx-1-receptor-standalone.conf' + volumes: + - "../../docker-compose-cluster:/etc/receptor" + - "/sys/fs/cgroup:/sys/fs/cgroup" + privileged: true {% if 
cluster_node_count|int > 1 %} haproxy: image: haproxy @@ -99,8 +110,8 @@ volumes: name: tools_awx_db {% for i in range(cluster_node_count|int) -%} {% set container_postfix = loop.index %} - receptor_{{ container_postfix }}: - name: tools_receptor_{{ container_postfix }} + execution_node_1: + name: tools_execution_node_1 redis_socket_{{ container_postfix }}: name: tools_redis_socket_{{ container_postfix }} {% endfor -%} diff --git a/tools/docker-compose/awx-1-receptor-standalone.conf b/tools/docker-compose/awx-1-receptor-standalone.conf new file mode 100644 index 0000000000..4c2912ea88 --- /dev/null +++ b/tools/docker-compose/awx-1-receptor-standalone.conf @@ -0,0 +1,25 @@ +--- +- node: + id: execution_node_1 + +- log-level: info + +#- tls-client: +# name: mutual-tls-client +# cert: /etc/receptor/certs/receptor_1.crt +# key: /etc/receptor/certs/receptor_1.key +# rootcas: /etc/receptor/certs/ca.crt + +- tcp-peer: + address: awx_1:4444 + redial: true + #tls: mutual-tls-client + +- work-command: + worktype: ansible-runner + command: ansible-runner + params: worker + allowruntimeparams: true + +- control-service: + service: control From f8d074db01ecb2ab161771445cd8906740eb56e2 Mon Sep 17 00:00:00 2001 From: beeankha Date: Wed, 21 Jul 2021 15:05:15 -0400 Subject: [PATCH 09/27] Point to correct config file for execution_node_1 --- .../ansible/roles/sources/templates/docker-compose.yml.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/docker-compose/ansible/roles/sources/templates/docker-compose.yml.j2 b/tools/docker-compose/ansible/roles/sources/templates/docker-compose.yml.j2 index 1fd016f554..f320437cba 100644 --- a/tools/docker-compose/ansible/roles/sources/templates/docker-compose.yml.j2 +++ b/tools/docker-compose/ansible/roles/sources/templates/docker-compose.yml.j2 @@ -70,7 +70,7 @@ services: hostname: execution_node_1 command: 'receptor --config /etc/receptor/awx-1-receptor-standalone.conf' volumes: - - 
"../../docker-compose-cluster:/etc/receptor" + - "../../docker-compose:/etc/receptor" - "/sys/fs/cgroup:/sys/fs/cgroup" privileged: true {% if cluster_node_count|int > 1 %} From a79d7444e55c8a85e41c466f90fa6aebf5d3025b Mon Sep 17 00:00:00 2001 From: Jim Ladd Date: Wed, 21 Jul 2021 17:41:16 -0700 Subject: [PATCH 10/27] set userid to 1000 (#10714) --- .../ansible/roles/sources/templates/docker-compose.yml.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/docker-compose/ansible/roles/sources/templates/docker-compose.yml.j2 b/tools/docker-compose/ansible/roles/sources/templates/docker-compose.yml.j2 index f320437cba..4cfa3038eb 100644 --- a/tools/docker-compose/ansible/roles/sources/templates/docker-compose.yml.j2 +++ b/tools/docker-compose/ansible/roles/sources/templates/docker-compose.yml.j2 @@ -65,7 +65,7 @@ services: {% endfor %} execution_node_1: image: quay.io/awx/awx_devel:devel - user: root + user: "1000" container_name: tools_execution_node_1 hostname: execution_node_1 command: 'receptor --config /etc/receptor/awx-1-receptor-standalone.conf' From 2fc581c24971cff28aee467e23a2bf8c4e9eec95 Mon Sep 17 00:00:00 2001 From: beeankha Date: Thu, 22 Jul 2021 13:03:49 -0400 Subject: [PATCH 11/27] Pull in user's uid vs hardcode to 1000 --- .../ansible/roles/sources/templates/docker-compose.yml.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/docker-compose/ansible/roles/sources/templates/docker-compose.yml.j2 b/tools/docker-compose/ansible/roles/sources/templates/docker-compose.yml.j2 index 4cfa3038eb..f448384020 100644 --- a/tools/docker-compose/ansible/roles/sources/templates/docker-compose.yml.j2 +++ b/tools/docker-compose/ansible/roles/sources/templates/docker-compose.yml.j2 @@ -65,7 +65,7 @@ services: {% endfor %} execution_node_1: image: quay.io/awx/awx_devel:devel - user: "1000" + user: "{{ ansible_user_uid }}" container_name: tools_execution_node_1 hostname: execution_node_1 command: 'receptor --config 
/etc/receptor/awx-1-receptor-standalone.conf' From 264c560a8a7eafb15251ad0275bf9436631e4b40 Mon Sep 17 00:00:00 2001 From: beeankha Date: Thu, 22 Jul 2021 13:28:13 -0400 Subject: [PATCH 12/27] Template docker receptor yaml file, update Makefile to reflect this change --- Makefile | 2 +- tools/docker-compose/ansible/roles/sources/tasks/main.yml | 6 ++++++ .../roles/sources/templates/docker-receptor.yml.j2} | 8 ++++---- 3 files changed, 11 insertions(+), 5 deletions(-) rename tools/{docker-receptor.yml => docker-compose/ansible/roles/sources/templates/docker-receptor.yml.j2} (93%) diff --git a/Makefile b/Makefile index ca7ada2b61..f6a0675323 100644 --- a/Makefile +++ b/Makefile @@ -545,7 +545,7 @@ docker-compose-elk: docker-auth awx/projects docker-compose-sources docker-compose -f tools/docker-compose/_sources/docker-compose.yml -f tools/elastic/docker-compose.logstash-link.yml -f tools/elastic/docker-compose.elastic-override.yml up --no-recreate docker-compose-cluster: docker-auth awx/projects docker-compose-sources - docker-compose -f tools/docker-compose/_sources/docker-compose.yml -f tools/docker-receptor.yml up + docker-compose -f tools/docker-compose/_sources/docker-compose.yml -f tools/docker-compose/_sources/docker-receptor.yml up docker-compose-cluster-elk: docker-auth awx/projects docker-compose-sources docker-compose -f tools/docker-compose/_sources/docker-compose.yml -f tools/elastic/docker-compose.logstash-link-cluster.yml -f tools/elastic/docker-compose.elastic-override.yml up --no-recreate diff --git a/tools/docker-compose/ansible/roles/sources/tasks/main.yml b/tools/docker-compose/ansible/roles/sources/tasks/main.yml index 55937b4051..9cb959441e 100644 --- a/tools/docker-compose/ansible/roles/sources/tasks/main.yml +++ b/tools/docker-compose/ansible/roles/sources/tasks/main.yml @@ -81,3 +81,9 @@ src: docker-compose.yml.j2 dest: "{{ sources_dest }}/{{ compose_name }}" mode: '0600' + +- name: Render Docker-Compose (Receptor) + template: + src: 
docker-receptor.yml.j2 + dest: "{{ sources_dest }}/docker-receptor.yml" + mode: '0600' diff --git a/tools/docker-receptor.yml b/tools/docker-compose/ansible/roles/sources/templates/docker-receptor.yml.j2 similarity index 93% rename from tools/docker-receptor.yml rename to tools/docker-compose/ansible/roles/sources/templates/docker-receptor.yml.j2 index 57e3ca657d..574349cf43 100644 --- a/tools/docker-receptor.yml +++ b/tools/docker-compose/ansible/roles/sources/templates/docker-receptor.yml.j2 @@ -9,7 +9,7 @@ # ^-------------- # --- -version: '2' +version: '2.1' services: awx_1: environment: @@ -30,7 +30,7 @@ services: - "../../docker-compose-cluster:/etc/receptor" receptor-1: image: quay.io/awx/awx_devel:devel - user: "1000" + user: "{{ ansible_user_uid }}" container_name: tools_receptor_1 hostname: receptor-1 command: 'receptor --config /etc/receptor/awx-1-receptor.conf' @@ -42,7 +42,7 @@ services: privileged: true receptor-2: image: quay.io/awx/awx_devel:devel - user: "1000" + user: "{{ ansible_user_uid }}" container_name: tools_receptor_2 hostname: receptor-2 command: 'receptor --config /etc/receptor/awx-2-receptor.conf' @@ -54,7 +54,7 @@ services: privileged: true receptor-3: image: quay.io/awx/awx_devel:devel - user: "1000" + user: "{{ ansible_user_uid }}" container_name: tools_receptor_3 hostname: receptor-3 command: 'receptor --config /etc/receptor/awx-3-receptor.conf' From 9881bb72b8e205f5a4ed9ca480cd4eeefaa13a42 Mon Sep 17 00:00:00 2001 From: Alan Rominger Date: Thu, 22 Jul 2021 15:35:10 -0400 Subject: [PATCH 13/27] Treat the awx_1 node as a hybrid node for now, use local work type (#10726) --- awx/main/models/ha.py | 2 +- awx/main/tasks.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/awx/main/models/ha.py b/awx/main/models/ha.py index 5cb1ec2e09..984daed346 100644 --- a/awx/main/models/ha.py +++ b/awx/main/models/ha.py @@ -209,7 +209,7 @@ class InstanceGroup(HasPolicyEditsMixin, BaseModel, RelatedJobsMixin): @property def 
execution_capacity(self): # TODO: update query to exclude based on node_type field - return sum([inst.capacity for inst in self.instances.filter(version__startswith='ansible-runner-')]) + return sum([inst.capacity for inst in self.instances.all()]) @property def jobs_running(self): diff --git a/awx/main/tasks.py b/awx/main/tasks.py index 07bc6af986..1e9e57ba45 100644 --- a/awx/main/tasks.py +++ b/awx/main/tasks.py @@ -3231,7 +3231,10 @@ class AWXReceptorJob: else: work_type = 'kubernetes-incluster-auth' elif isinstance(self.task.instance, (Job, AdHocCommand)): - work_type = 'ansible-runner' + if self.task.instance.execution_node == self.task.instance.controller_node: + work_type = 'local' + else: + work_type = 'ansible-runner' else: work_type = 'local' From 460c7c337921b855609b3eea6b7bc1157472ea61 Mon Sep 17 00:00:00 2001 From: Shane McDonald Date: Thu, 22 Jul 2021 19:30:00 -0400 Subject: [PATCH 14/27] Allow for dynamically scaling automation mesh in dev env --- Makefile | 9 ++- tools/docker-compose-cluster/.gitignore | 0 .../awx-2-receptor.conf | 25 ------- .../awx-3-receptor.conf | 25 ------- .../awx-hop-receptor.conf | 26 -------- tools/docker-compose-cluster/certs/.gitignore | 2 - tools/docker-compose/README.md | 4 +- .../ansible/roles/sources/defaults/main.yml | 2 +- .../ansible/roles/sources/tasks/main.yml | 20 ++++-- .../sources/templates/docker-compose.yml.j2 | 53 +++++++++------ .../sources/templates/docker-receptor.yml.j2 | 66 ------------------- .../roles/sources/templates/haproxy.cfg.j2 | 4 +- .../sources/templates/receptor-hop.conf.j2 | 12 ++++ .../templates/receptor-worker.conf.j2} | 9 +-- 14 files changed, 71 insertions(+), 186 deletions(-) delete mode 100644 tools/docker-compose-cluster/.gitignore delete mode 100644 tools/docker-compose-cluster/awx-2-receptor.conf delete mode 100644 tools/docker-compose-cluster/awx-3-receptor.conf delete mode 100644 tools/docker-compose-cluster/awx-hop-receptor.conf delete mode 100644 
tools/docker-compose-cluster/certs/.gitignore delete mode 100644 tools/docker-compose/ansible/roles/sources/templates/docker-receptor.yml.j2 create mode 100644 tools/docker-compose/ansible/roles/sources/templates/receptor-hop.conf.j2 rename tools/{docker-compose-cluster/awx-1-receptor.conf => docker-compose/ansible/roles/sources/templates/receptor-worker.conf.j2} (53%) diff --git a/Makefile b/Makefile index f6a0675323..f8269bdd1b 100644 --- a/Makefile +++ b/Makefile @@ -479,7 +479,8 @@ awx/projects: COMPOSE_UP_OPTS ?= COMPOSE_OPTS ?= -CLUSTER_NODE_COUNT ?= 1 +CONTROL_PLANE_NODE_COUNT ?= 1 +EXECUTION_NODE_COUNT ?= 2 MINIKUBE_CONTAINER_GROUP ?= false docker-compose-sources: .git/hooks/pre-commit @@ -490,7 +491,8 @@ docker-compose-sources: .git/hooks/pre-commit ansible-playbook -i tools/docker-compose/inventory tools/docker-compose/ansible/sources.yml \ -e awx_image=$(DEV_DOCKER_TAG_BASE)/awx_devel \ -e awx_image_tag=$(COMPOSE_TAG) \ - -e cluster_node_count=$(CLUSTER_NODE_COUNT) \ + -e control_plane_node_count=$(CONTROL_PLANE_NODE_COUNT) \ + -e execution_node_count=$(EXECUTION_NODE_COUNT) \ -e minikube_container_group=$(MINIKUBE_CONTAINER_GROUP) @@ -544,9 +546,6 @@ docker-refresh: docker-clean docker-compose docker-compose-elk: docker-auth awx/projects docker-compose-sources docker-compose -f tools/docker-compose/_sources/docker-compose.yml -f tools/elastic/docker-compose.logstash-link.yml -f tools/elastic/docker-compose.elastic-override.yml up --no-recreate -docker-compose-cluster: docker-auth awx/projects docker-compose-sources - docker-compose -f tools/docker-compose/_sources/docker-compose.yml -f tools/docker-compose/_sources/docker-receptor.yml up - docker-compose-cluster-elk: docker-auth awx/projects docker-compose-sources docker-compose -f tools/docker-compose/_sources/docker-compose.yml -f tools/elastic/docker-compose.logstash-link-cluster.yml -f tools/elastic/docker-compose.elastic-override.yml up --no-recreate diff --git 
a/tools/docker-compose-cluster/.gitignore b/tools/docker-compose-cluster/.gitignore deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tools/docker-compose-cluster/awx-2-receptor.conf b/tools/docker-compose-cluster/awx-2-receptor.conf deleted file mode 100644 index af85fe0ea6..0000000000 --- a/tools/docker-compose-cluster/awx-2-receptor.conf +++ /dev/null @@ -1,25 +0,0 @@ ---- -- node: - id: receptor-2 - -- log-level: info - -#- tls-client: -# name: mutual-tls-client -# cert: /etc/receptor/certs/receptor-2.crt -# key: /etc/receptor/certs/receptor-2.key -# rootcas: /etc/receptor/certs/ca.crt - -- tcp-peer: - address: tools_receptor_hop:5555 - redial: true - #tls: mutual-tls-client - -- work-command: - worktype: ansible-runner - command: ansible-runner - params: worker - allowruntimeparams: true - -- control-service: - service: control diff --git a/tools/docker-compose-cluster/awx-3-receptor.conf b/tools/docker-compose-cluster/awx-3-receptor.conf deleted file mode 100644 index 095ff8fee9..0000000000 --- a/tools/docker-compose-cluster/awx-3-receptor.conf +++ /dev/null @@ -1,25 +0,0 @@ ---- -- node: - id: receptor-3 - -- log-level: info - -#- tls-client: -# name: mutual-tls-client -# cert: /etc/receptor/certs/receptor-3.crt -# key: /etc/receptor/certs/receptor-3.key -# rootcas: /etc/receptor/certs/ca.crt - -- tcp-peer: - address: tools_receptor_hop:5555 - redial: true - #tls: mutual-tls-client - -- work-command: - worktype: ansible-runner - command: ansible-runner - params: worker - allowruntimeparams: true - -- control-service: - service: control diff --git a/tools/docker-compose-cluster/awx-hop-receptor.conf b/tools/docker-compose-cluster/awx-hop-receptor.conf deleted file mode 100644 index 07a1140e66..0000000000 --- a/tools/docker-compose-cluster/awx-hop-receptor.conf +++ /dev/null @@ -1,26 +0,0 @@ ---- -- node: - id: receptor-hop - -- log-level: info - -#- tls-server: -# name: mutual-tls-server -# cert: /etc/receptor/certs/receptor-hop.crt -# key: 
/etc/receptor/certs/receptor-hop.key -# clientcas: /etc/receptor/certs/ca.crt - -- tls-client: - name: mutual-tls-client - cert: /etc/receptor/certs/receptor-hop.crt - key: /etc/receptor/certs/receptor-hop.key - rootcas: /etc/receptor/certs/ca.crt - -- tcp-peer: - address: awx_1:4444 - redial: true - #tls: mutual-tls-client - -- tcp-listener: - port: 5555 - #tls: mutual-tls-server diff --git a/tools/docker-compose-cluster/certs/.gitignore b/tools/docker-compose-cluster/certs/.gitignore deleted file mode 100644 index d6b7ef32c8..0000000000 --- a/tools/docker-compose-cluster/certs/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -* -!.gitignore diff --git a/tools/docker-compose/README.md b/tools/docker-compose/README.md index 7396dd9326..13e7a5be21 100644 --- a/tools/docker-compose/README.md +++ b/tools/docker-compose/README.md @@ -211,10 +211,10 @@ need to call `bootstrap_development.sh` first. Certain features or bugs are only applicable when running a cluster of AWX nodes. To bring up a 3 node cluster development environment simply run the below command. ```bash -(host)$ CLUSTER_NODE_COUNT=3 make docker-compose +(host)$ CONTROL_PLANE_NODE_COUNT=3 make docker-compose ``` -`CLUSTER_NODE_COUNT` is configurable and defaults to 1, effectively a non-clustered AWX. +`CONTROL_PLANE_NODE_COUNT` is configurable and defaults to 1, effectively a non-clustered AWX. Note that you may see multiple messages of the form `2021-03-04 20:11:47,666 WARNING [-] awx.main.wsbroadcast Connection from awx_2 to awx_5 failed: 'Cannot connect to host awx_5:8013 ssl:False [Name or service not known]'.`. This can happen when you bring up a cluster of many nodes, say 10, then you bring up a cluster of less nodes, say 3. In this example, there will be 7 `Instance` records in the database that represent AWX instances. The AWX development environment mimics the VM deployment (vs. kubernetes) and expects the missing nodes to be brought back to healthy by the admin. 
The warning message you are seeing is all of the AWX nodes trying to connect the websocket backplane. You can manually delete the `Instance` records from the database i.e. `Instance.objects.get(hostname='awx_9').delete()` to stop the warnings. diff --git a/tools/docker-compose/ansible/roles/sources/defaults/main.yml b/tools/docker-compose/ansible/roles/sources/defaults/main.yml index b64afc5012..62d90ca4b9 100644 --- a/tools/docker-compose/ansible/roles/sources/defaults/main.yml +++ b/tools/docker-compose/ansible/roles/sources/defaults/main.yml @@ -5,5 +5,5 @@ awx_image: 'quay.io/ansible/awx_devel' pg_port: 5432 pg_username: 'awx' pg_database: 'awx' -cluster_node_count: 1 +control_plane_node_count: 1 minikube_container_group: false diff --git a/tools/docker-compose/ansible/roles/sources/tasks/main.yml b/tools/docker-compose/ansible/roles/sources/tasks/main.yml index 9cb959441e..57bc4ed922 100644 --- a/tools/docker-compose/ansible/roles/sources/tasks/main.yml +++ b/tools/docker-compose/ansible/roles/sources/tasks/main.yml @@ -1,10 +1,13 @@ --- -- name: Create _sources directory +- name: Create _sources directories file: - path: "{{ sources_dest }}/secrets" + path: "{{ sources_dest }}/{{ item }}" state: 'directory' mode: '0700' + loop: + - secrets + - receptor - name: Detect secrets stat: @@ -82,8 +85,15 @@ dest: "{{ sources_dest }}/{{ compose_name }}" mode: '0600' -- name: Render Docker-Compose (Receptor) +- name: Render Receptor Hop Config template: - src: docker-receptor.yml.j2 - dest: "{{ sources_dest }}/docker-receptor.yml" + src: "receptor-hop.conf.j2" + dest: "{{ sources_dest }}/receptor/receptor-hop.conf" mode: '0600' + +- name: Render Receptor Worker Config(s) + template: + src: "receptor-worker.conf.j2" + dest: "{{ sources_dest }}/receptor/receptor-worker-{{ item }}.conf" + mode: '0600' + with_sequence: start=1 end={{ execution_node_count }} diff --git a/tools/docker-compose/ansible/roles/sources/templates/docker-compose.yml.j2 
b/tools/docker-compose/ansible/roles/sources/templates/docker-compose.yml.j2 index f448384020..c24e7e85f6 100644 --- a/tools/docker-compose/ansible/roles/sources/templates/docker-compose.yml.j2 +++ b/tools/docker-compose/ansible/roles/sources/templates/docker-compose.yml.j2 @@ -1,7 +1,7 @@ --- version: '2.1' services: -{% for i in range(cluster_node_count|int) %} +{% for i in range(control_plane_node_count|int) %} {% set container_postfix = loop.index %} {% set awx_sdb_port_start = 7899 + (loop.index0*1000) | int %} {% set awx_sdb_port_end = 7999 + (loop.index0*1000) | int %} @@ -27,7 +27,6 @@ services: links: - postgres - redis_{{ container_postfix }} - - execution_node_1 working_dir: "/awx_devel" volumes: - "../../../:/awx_devel" @@ -41,12 +40,11 @@ services: - "/sys/fs/cgroup:/sys/fs/cgroup" - "~/.kube/config:/var/lib/awx/.kube/config" - "redis_socket_{{ container_postfix }}:/var/run/redis/:rw" - - "execution_node_1:/var/run/receptor/" privileged: true tty: true ports: - "{{ awx_sdb_port_start }}-{{ awx_sdb_port_end }}:{{ awx_sdb_port_start }}-{{ awx_sdb_port_end }}" # sdb-listen -{% if cluster_node_count|int == 1 %} +{% if control_plane_node_count|int == 1 %} - "6899:6899" - "8080:8080" # unused but mapped for debugging - "8888:8888" # jupyter notebook @@ -63,17 +61,7 @@ services: entrypoint: ["redis-server"] command: ["/usr/local/etc/redis/redis.conf"] {% endfor %} - execution_node_1: - image: quay.io/awx/awx_devel:devel - user: "{{ ansible_user_uid }}" - container_name: tools_execution_node_1 - hostname: execution_node_1 - command: 'receptor --config /etc/receptor/awx-1-receptor-standalone.conf' - volumes: - - "../../docker-compose:/etc/receptor" - - "/sys/fs/cgroup:/sys/fs/cgroup" - privileged: true -{% if cluster_node_count|int > 1 %} +{% if control_plane_node_count|int > 1 %} haproxy: image: haproxy user: "{{ ansible_user_uid }}" @@ -84,7 +72,7 @@ services: - "8043:8043" - "1936:1936" depends_on: - {% for i in range(cluster_node_count|int) -%} + {% for i 
in range(control_plane_node_count|int) -%} {% set container_postfix = loop.index %} - "awx_{{ container_postfix }}" {% endfor %} @@ -105,13 +93,40 @@ services: POSTGRES_PASSWORD: {{ pg_password }} volumes: - "awx_db:/var/lib/postgresql/data" +{% if execution_node_count|int > 0 %} + receptor-hop: + image: quay.io/project-receptor/receptor:latest + user: root + container_name: tools_receptor_hop + hostname: receptor-hop + command: 'receptor --config /etc/receptor/receptor.conf' + links: + - awx_1 + ports: + - "5555:5555" + volumes: + - "../../docker-compose/_sources/receptor/receptor-hop.conf:/etc/receptor/receptor.conf" + {% for i in range(execution_node_count|int) -%} + receptor-{{ loop.index }}: + image: quay.io/awx/awx_devel:devel + user: "{{ ansible_user_uid }}" + container_name: tools_receptor_{{ loop.index }} + hostname: receptor-1 + command: 'receptor --config /etc/receptor/receptor.conf' + links: + - receptor-hop + volumes: + - "../../docker-compose/_sources/receptor/receptor-worker-{{ loop.index }}.conf:/etc/receptor/receptor.conf" + - "/sys/fs/cgroup:/sys/fs/cgroup" + privileged: true + {% endfor %} +{% endif %} + volumes: awx_db: name: tools_awx_db -{% for i in range(cluster_node_count|int) -%} +{% for i in range(control_plane_node_count|int) -%} {% set container_postfix = loop.index %} - execution_node_1: - name: tools_execution_node_1 redis_socket_{{ container_postfix }}: name: tools_redis_socket_{{ container_postfix }} {% endfor -%} diff --git a/tools/docker-compose/ansible/roles/sources/templates/docker-receptor.yml.j2 b/tools/docker-compose/ansible/roles/sources/templates/docker-receptor.yml.j2 deleted file mode 100644 index 574349cf43..0000000000 --- a/tools/docker-compose/ansible/roles/sources/templates/docker-receptor.yml.j2 +++ /dev/null @@ -1,66 +0,0 @@ -# -# a simple receptor network with a single control plane node and one hop -# to nodes where receptor + runner are installed -# -# -# -# ^-------------- -# ^-------------- -# ^-------------- -# 
---- -version: '2.1' -services: - awx_1: - environment: - RECEPTOR_MUTUAL_TLS: awx receptor-hop receptor-1 receptor-2 receptor-3 - volumes: - - "./docker-compose-cluster:/etc/receptor" - receptor-hop: - image: quay.io/project-receptor/receptor:latest - user: root - container_name: tools_receptor_hop - hostname: receptor-hop - command: 'receptor --config /etc/receptor/awx-hop-receptor.conf' - links: - - awx_1 - ports: - - "5555:5555" - volumes: - - "../../docker-compose-cluster:/etc/receptor" - receptor-1: - image: quay.io/awx/awx_devel:devel - user: "{{ ansible_user_uid }}" - container_name: tools_receptor_1 - hostname: receptor-1 - command: 'receptor --config /etc/receptor/awx-1-receptor.conf' - links: - - receptor-hop - volumes: - - "../../docker-compose-cluster:/etc/receptor" - - "/sys/fs/cgroup:/sys/fs/cgroup" - privileged: true - receptor-2: - image: quay.io/awx/awx_devel:devel - user: "{{ ansible_user_uid }}" - container_name: tools_receptor_2 - hostname: receptor-2 - command: 'receptor --config /etc/receptor/awx-2-receptor.conf' - links: - - receptor-hop - volumes: - - "../../docker-compose-cluster:/etc/receptor" - - "/sys/fs/cgroup:/sys/fs/cgroup" - privileged: true - receptor-3: - image: quay.io/awx/awx_devel:devel - user: "{{ ansible_user_uid }}" - container_name: tools_receptor_3 - hostname: receptor-3 - command: 'receptor --config /etc/receptor/awx-3-receptor.conf' - links: - - receptor-hop - volumes: - - "../../docker-compose-cluster:/etc/receptor" - - "/sys/fs/cgroup:/sys/fs/cgroup" - privileged: true diff --git a/tools/docker-compose/ansible/roles/sources/templates/haproxy.cfg.j2 b/tools/docker-compose/ansible/roles/sources/templates/haproxy.cfg.j2 index 33837ded1f..1def668189 100644 --- a/tools/docker-compose/ansible/roles/sources/templates/haproxy.cfg.j2 +++ b/tools/docker-compose/ansible/roles/sources/templates/haproxy.cfg.j2 @@ -29,7 +29,7 @@ backend nodes http-request set-header X-Forwarded-Port %[dst_port] http-request add-header 
X-Forwarded-Proto https if { ssl_fc } option httpchk HEAD / HTTP/1.1\r\nHost:localhost -{% for i in range(cluster_node_count|int) %} +{% for i in range(control_plane_node_count|int) %} {% set container_postfix = loop.index %} server tools_awx_{{ container_postfix }} tools_awx_{{ container_postfix }}:8013 check {% endfor %} @@ -37,7 +37,7 @@ backend nodes backend nodes_ssl mode tcp balance roundrobin -{% for i in range(cluster_node_count|int) %} +{% for i in range(control_plane_node_count|int) %} {% set container_postfix = loop.index %} server tools_awx_{{ container_postfix }} tools_awx_{{ container_postfix }}:8043 check {% endfor %} diff --git a/tools/docker-compose/ansible/roles/sources/templates/receptor-hop.conf.j2 b/tools/docker-compose/ansible/roles/sources/templates/receptor-hop.conf.j2 new file mode 100644 index 0000000000..69fd33c3ca --- /dev/null +++ b/tools/docker-compose/ansible/roles/sources/templates/receptor-hop.conf.j2 @@ -0,0 +1,12 @@ +--- +- node: + id: receptor-hop + +- log-level: info + +- tcp-peer: + address: awx_1:4444 + redial: true + +- tcp-listener: + port: 5555 diff --git a/tools/docker-compose-cluster/awx-1-receptor.conf b/tools/docker-compose/ansible/roles/sources/templates/receptor-worker.conf.j2 similarity index 53% rename from tools/docker-compose-cluster/awx-1-receptor.conf rename to tools/docker-compose/ansible/roles/sources/templates/receptor-worker.conf.j2 index 82cd21471b..ef31f4d443 100644 --- a/tools/docker-compose-cluster/awx-1-receptor.conf +++ b/tools/docker-compose/ansible/roles/sources/templates/receptor-worker.conf.j2 @@ -1,19 +1,12 @@ --- - node: - id: receptor-1 + id: receptor-{{ item }} - log-level: info -#- tls-client: -# name: mutual-tls-client -# cert: /etc/receptor/certs/receptor-1.crt -# key: /etc/receptor/certs/receptor-1.key -# rootcas: /etc/receptor/certs/ca.crt - - tcp-peer: address: tools_receptor_hop:5555 redial: true - #tls: mutual-tls-client - work-command: worktype: ansible-runner From 
289beb85d2d48d1e8c190fa663e5b1d8e869fea6 Mon Sep 17 00:00:00 2001 From: Alan Rominger Date: Mon, 26 Jul 2021 14:51:41 -0400 Subject: [PATCH 15/27] Add developer docs for incoming receptor mesh features (#10747) * Add developer docs for incoming receptor mesh features * Additional wording about the receptor mesh process * Wrap up docs feedback changes and polishing * Add in way more terminology introductions, delete statement about past * Fix typo around OCP-incluster type --- docs/receptor_mesh.md | 76 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 docs/receptor_mesh.md diff --git a/docs/receptor_mesh.md b/docs/receptor_mesh.md new file mode 100644 index 0000000000..be6e6b5cfc --- /dev/null +++ b/docs/receptor_mesh.md @@ -0,0 +1,76 @@ +## Receptor Mesh + +AWX uses a [Receptor](https://github.com/ansible/receptor) mesh to transmit "user-space" unified jobs: + - jobs + - ad hoc commands + - inventory updates + +to the node where they run. + +> NOTE: user-space jobs are what carry out the user's Ansible automation. These job types run inside of the designated execution environment so that the needed content is available. + +> NOTE: The word "node" corresponds to entries in the `Instance` database model, or the `/api/v2/instances/` endpoint, and is a machine participating in the cluster / mesh. + +The unified jobs API reports `controller_node` and `execution_node` fields. +The execution node is where the job runs, and the controller node interfaces between the job and server functions. + +Before a job can start, the controller node prepares the `private_data_dir` needed for the job to run. +Next, the controller node sends the data via `ansible-runner`'s `transmit`, and connects to the output stream with `process`. +For details on these commands, see the [ansible-runner docs on remote execution](https://ansible-runner.readthedocs.io/en/latest/remote_jobs.html). 
+ +On the other side, the execution node runs the job under `ansible-runner worker`. + +### Split of Control Plane versus Execution Plane + +Instances in the **control plane** run persistent AWX services (like the web server, task dispatcher, etc.), project updates, and management jobs. Instances in the **execution plane** run user-space jobs. + +The task manager logic will not send user-space jobs to **control-only** nodes. +In the inventory definition, the user can set a flag to designate this node type. + +**Execution-only** nodes have a minimal set of software requirements needed to participate in the receptor mesh and run jobs under ansible-runner with podman isolation. +These _only_ run user-space jobs, and may be geographically separated (with high latency) from the control plane. +They may not even have a direct connection to the cluster, and use other receptor **hop** nodes to communicate. + +**Hybrid** (control & execution nodes) are instances in the control plane that are allowed to run user-space jobs. + +#### Receptor Configuration Work Type + +Execution-only nodes need to advertise the "ansible-runner" work type. +User-space jobs are submitted as a receptor work unit with this work type. + +An entry like this should appear in its `receptor.conf` (receptor configuration file): + +``` +- work-command: + worktype: ansible-runner + command: ansible-runner + params: worker + allowruntimeparams: true +``` + +Control (and hybrid) nodes advertise the "local" work type instead. +So the entry is the same as above, except that it has `worktype: local`. +Project updates are submitted as this work type. +If user-space jobs run on a hybrid node, they will also run as the "local" work type. 
+ +Here is a listing of work types that you may encounter: + + - `local` - any ansible-runner job ran in a traditional install + - `ansible-runner` - remote execution of user-space jobs + - `kubernetes-runtime-auth` - user-space jobs ran in a container group + - `kubernetes-incluster-auth` - project updates and management jobs on OpenShift Container Platform + +### Auto-discovery of execution nodes + +Instances in control plane must be registered by the installer via `awx-manage` +commands like `awx-manage register_queue` or `awx-manage register_instance`. + +Execution-only nodes are automatically discovered after they have been configured and join the receptor mesh. +Control nodes should see them as a "Known Node". + +Control nodes check the receptor network (reported via `receptorctl status`) when their heartbeat task runs. +Nodes on the receptor network are compared against the `Instance` model in the database. + +If a node appears in the mesh network which is not in the database, then a "health check" is started. +This will submit a work unit to the execution node which then outputs important node data via `ansible-runner`. +The `capacity` field will obtain a non-zero value through this process, which is necessary to run jobs. 
From 46ccc5874900dd07142e496446866a969e50ed35 Mon Sep 17 00:00:00 2001 From: Alan Rominger Date: Tue, 27 Jul 2021 11:19:07 -0400 Subject: [PATCH 16/27] Make the AWX nodes fully connected in the development environment (#10758) --- .../ansible/roles/sources/tasks/main.yml | 9 ++++++- .../sources/templates/docker-compose.yml.j2 | 2 +- .../sources/templates/receptor-awx.conf.j2} | 12 +++++---- .../sources/templates/receptor-hop.conf.j2 | 2 +- .../awx-1-receptor-standalone.conf | 25 ------------------- 5 files changed, 17 insertions(+), 33 deletions(-) rename tools/docker-compose/{receptor.conf => ansible/roles/sources/templates/receptor-awx.conf.j2} (82%) delete mode 100644 tools/docker-compose/awx-1-receptor-standalone.conf diff --git a/tools/docker-compose/ansible/roles/sources/tasks/main.yml b/tools/docker-compose/ansible/roles/sources/tasks/main.yml index 57bc4ed922..5d0d7170bb 100644 --- a/tools/docker-compose/ansible/roles/sources/tasks/main.yml +++ b/tools/docker-compose/ansible/roles/sources/tasks/main.yml @@ -85,12 +85,19 @@ dest: "{{ sources_dest }}/{{ compose_name }}" mode: '0600' +- name: Render Receptor Config(s) for Control Plane + template: + src: "receptor-awx.conf.j2" + dest: "{{ sources_dest }}/receptor/receptor-awx-{{ item }}.conf" + mode: '0600' + with_sequence: start=1 end={{ control_plane_node_count }} + - name: Render Receptor Hop Config template: src: "receptor-hop.conf.j2" dest: "{{ sources_dest }}/receptor/receptor-hop.conf" mode: '0600' - + - name: Render Receptor Worker Config(s) template: src: "receptor-worker.conf.j2" diff --git a/tools/docker-compose/ansible/roles/sources/templates/docker-compose.yml.j2 b/tools/docker-compose/ansible/roles/sources/templates/docker-compose.yml.j2 index c24e7e85f6..cd30a90792 100644 --- a/tools/docker-compose/ansible/roles/sources/templates/docker-compose.yml.j2 +++ b/tools/docker-compose/ansible/roles/sources/templates/docker-compose.yml.j2 @@ -35,7 +35,7 @@ services: - 
"../../docker-compose/_sources/websocket_secret.py:/etc/tower/conf.d/websocket_secret.py" - "../../docker-compose/_sources/local_settings.py:/etc/tower/conf.d/local_settings.py" - "../../docker-compose/_sources/SECRET_KEY:/etc/tower/SECRET_KEY" - - "../../docker-compose/receptor.conf:/etc/receptor/receptor.conf" + - "../../docker-compose/_sources/receptor/receptor-awx-{{ loop.index }}.conf:/etc/receptor/receptor.conf" - "../../docker-compose-cluster/certs:/etc/receptor/certs" - "/sys/fs/cgroup:/sys/fs/cgroup" - "~/.kube/config:/var/lib/awx/.kube/config" diff --git a/tools/docker-compose/receptor.conf b/tools/docker-compose/ansible/roles/sources/templates/receptor-awx.conf.j2 similarity index 82% rename from tools/docker-compose/receptor.conf rename to tools/docker-compose/ansible/roles/sources/templates/receptor-awx.conf.j2 index 94f360a14e..e95097e1ef 100644 --- a/tools/docker-compose/receptor.conf +++ b/tools/docker-compose/ansible/roles/sources/templates/receptor-awx.conf.j2 @@ -1,6 +1,6 @@ --- - node: - id: awx_1 + id: awx_{{ item }} - log-level: debug @@ -8,6 +8,12 @@ - tcp-listener: port: 2222 +{% for i in range(item | int + 1, control_plane_node_count | int + 1) %} +- tcp-peer: + address: awx_{{ i }}:2222 + redial: true +{% endfor %} + #- tls-server: # name: mutual-tls # cert: /etc/receptor/certs/awx.crt @@ -38,7 +44,3 @@ allowruntimeauth: true allowruntimepod: true allowruntimeparams: true - -- tcp-listener: - port: 4444 - #tls: mutual-tls diff --git a/tools/docker-compose/ansible/roles/sources/templates/receptor-hop.conf.j2 b/tools/docker-compose/ansible/roles/sources/templates/receptor-hop.conf.j2 index 69fd33c3ca..4f055fa1f3 100644 --- a/tools/docker-compose/ansible/roles/sources/templates/receptor-hop.conf.j2 +++ b/tools/docker-compose/ansible/roles/sources/templates/receptor-hop.conf.j2 @@ -5,7 +5,7 @@ - log-level: info - tcp-peer: - address: awx_1:4444 + address: awx_1:2222 redial: true - tcp-listener: diff --git 
a/tools/docker-compose/awx-1-receptor-standalone.conf b/tools/docker-compose/awx-1-receptor-standalone.conf deleted file mode 100644 index 4c2912ea88..0000000000 --- a/tools/docker-compose/awx-1-receptor-standalone.conf +++ /dev/null @@ -1,25 +0,0 @@ ---- -- node: - id: execution_node_1 - -- log-level: info - -#- tls-client: -# name: mutual-tls-client -# cert: /etc/receptor/certs/receptor_1.crt -# key: /etc/receptor/certs/receptor_1.key -# rootcas: /etc/receptor/certs/ca.crt - -- tcp-peer: - address: awx_1:4444 - redial: true - #tls: mutual-tls-client - -- work-command: - worktype: ansible-runner - command: ansible-runner - params: worker - allowruntimeparams: true - -- control-service: - service: control From b53d3bc81d5e4620667cc19fea200fb989a66d20 Mon Sep 17 00:00:00 2001 From: Alan Rominger Date: Wed, 28 Jul 2021 13:33:35 -0400 Subject: [PATCH 17/27] Undo some things not compatible with hybrid node hack (#10763) --- awx/main/models/ha.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/awx/main/models/ha.py b/awx/main/models/ha.py index 984daed346..36217ebf45 100644 --- a/awx/main/models/ha.py +++ b/awx/main/models/ha.py @@ -128,6 +128,7 @@ class Instance(HasPolicyEditsMixin, BaseModel): @staticmethod def choose_online_control_plane_node(): + # TODO: update query to use node_type field return random.choice(Instance.objects.filter(enabled=True).exclude(version__startswith='ansible-runner-').values_list('hostname', flat=True)) def is_lost(self, ref_time=None): @@ -158,9 +159,6 @@ class Instance(HasPolicyEditsMixin, BaseModel): self.version = awx_application_version self.save(update_fields=['capacity', 'version', 'modified', 'cpu', 'memory', 'cpu_capacity', 'mem_capacity']) - def is_receptor(self): - return self.version.startswith('ansible-runner-') - class InstanceGroup(HasPolicyEditsMixin, BaseModel, RelatedJobsMixin): """A model representing a Queue/Group of AWX Instances.""" @@ -233,9 +231,7 @@ class InstanceGroup(HasPolicyEditsMixin, 
BaseModel, RelatedJobsMixin): def fit_task_to_most_remaining_capacity_instance(task, instances): instance_most_capacity = None for i in instances: - # TODO: change this to check if "execution" is in node_type field - if not i.version.startswith('ansible-runner'): - continue + # TODO: continue if node is control-only node type if i.remaining_capacity >= task.task_impact and ( instance_most_capacity is None or i.remaining_capacity > instance_most_capacity.remaining_capacity ): From 5d4ab133867870193ee3a4beb7fe7c8553bb6855 Mon Sep 17 00:00:00 2001 From: Alan Rominger Date: Wed, 28 Jul 2021 15:07:34 -0400 Subject: [PATCH 18/27] Add topology of docker-compose to docs, remove old mount (#10773) --- docs/receptor_mesh.md | 30 +++++++++++++++++++ .../sources/templates/docker-compose.yml.j2 | 2 +- 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/docs/receptor_mesh.md b/docs/receptor_mesh.md index be6e6b5cfc..f1023eb7f9 100644 --- a/docs/receptor_mesh.md +++ b/docs/receptor_mesh.md @@ -74,3 +74,33 @@ Nodes on the receptor network are compared against the `Instance` model in the d If a node appears in the mesh network which is not in the database, then a "health check" is started. This will submit a work unit to the execution node which then outputs important node data via `ansible-runner`. The `capacity` field will obtain a non-zero value through this process, which is necessary to run jobs. + +### Development Environment + +A "toy" cluster with execution nodes and a hop node is created by the docker-compose Makefile target. +By default, it will create 1 hybrid node, 1 hop node, and 2 execution nodes. +The number of nodes can be changed with environment variables, for example: + +``` +CONTROL_PLANE_NODE_COUNT=2 EXECUTION_NODE_COUNT=3 COMPOSE_TAG=devel make docker-compose +``` + +This will spin up a topology represented below. 
+(names are the receptor node names, which differ from the AWX Instance names and network address in some cases) + +``` +---v + -----v + + + ^-------------- + ^-------------- + ^-------------- +``` + +All execution (`receptor-*`) nodes connect to the hop node. +Only the `awx_1` node connects to the hop node out of the AWX cluster. +`awx_1` connects to `awx_2`, fulfilling the requirement that the AWX cluster is fully connected. + +For an example, if a job is launched with `awx_2` as the `controller_node` and `receptor-3` as the `execution_node`, +then `awx_2` communicates to `receptor-3` via `awx_1` and then `receptor-hop`. diff --git a/tools/docker-compose/ansible/roles/sources/templates/docker-compose.yml.j2 b/tools/docker-compose/ansible/roles/sources/templates/docker-compose.yml.j2 index cd30a90792..3a503f106c 100644 --- a/tools/docker-compose/ansible/roles/sources/templates/docker-compose.yml.j2 +++ b/tools/docker-compose/ansible/roles/sources/templates/docker-compose.yml.j2 @@ -36,7 +36,7 @@ services: - "../../docker-compose/_sources/local_settings.py:/etc/tower/conf.d/local_settings.py" - "../../docker-compose/_sources/SECRET_KEY:/etc/tower/SECRET_KEY" - "../../docker-compose/_sources/receptor/receptor-awx-{{ loop.index }}.conf:/etc/receptor/receptor.conf" - - "../../docker-compose-cluster/certs:/etc/receptor/certs" + # - "../../docker-compose/_sources/certs:/etc/receptor/certs" # TODO: optionally generate certs - "/sys/fs/cgroup:/sys/fs/cgroup" - "~/.kube/config:/var/lib/awx/.kube/config" - "redis_socket_{{ container_postfix }}:/var/run/redis/:rw" From f47eb126e2d08976b5ce8c2ddf33664480bc88dc Mon Sep 17 00:00:00 2001 From: Alan Rominger Date: Wed, 4 Aug 2021 10:59:44 -0400 Subject: [PATCH 19/27] Adopt the node_type field in receptor logic (#10802) * Adopt the node_type field in receptor logic * Refactor Instance.objects.register so we do not reset capacity to 0 --- awx/main/constants.py | 4 ++++ awx/main/managers.py | 15 +++++++++++---- awx/main/models/ha.py 
| 14 ++++---------- awx/main/scheduler/task_manager.py | 1 + awx/main/tasks.py | 15 ++++++--------- 5 files changed, 26 insertions(+), 23 deletions(-) diff --git a/awx/main/constants.py b/awx/main/constants.py index f2af99d167..b7a5813f7f 100644 --- a/awx/main/constants.py +++ b/awx/main/constants.py @@ -77,3 +77,7 @@ LOGGER_BLOCKLIST = ( # loggers that may be called getting logging settings 'awx.conf', ) + +# Reported version for node seen in receptor mesh but for which capacity check +# failed or is in progress +RECEPTOR_PENDING = 'ansible-runner-???' diff --git a/awx/main/managers.py b/awx/main/managers.py index b26d476fd1..01fb223df5 100644 --- a/awx/main/managers.py +++ b/awx/main/managers.py @@ -10,6 +10,7 @@ from django.conf import settings from awx.main.utils.filters import SmartFilter from awx.main.utils.pglock import advisory_lock +from awx.main.constants import RECEPTOR_PENDING ___all__ = ['HostManager', 'InstanceManager', 'InstanceGroupManager', 'DeferJobCreatedManager'] @@ -111,13 +112,11 @@ class InstanceManager(models.Manager): return node[0] raise RuntimeError("No instance found with the current cluster host id") - def register(self, uuid=None, hostname=None, ip_address=None, node_type=None): + def register(self, uuid=None, hostname=None, ip_address=None, node_type='hybrid', defaults=None): if not uuid: uuid = settings.SYSTEM_UUID if not hostname: hostname = settings.CLUSTER_HOST_ID - if not node_type: - node_type = "hybrid" with advisory_lock('instance_registration_%s' % hostname): if settings.AWX_AUTO_DEPROVISION_INSTANCES: # detect any instances with the same IP address. 
@@ -130,6 +129,7 @@ class InstanceManager(models.Manager): other_inst.save(update_fields=['ip_address']) logger.warning("IP address {0} conflict detected, ip address unset for host {1}.".format(ip_address, other_hostname)) + # Return existing instance that matches hostname instance = self.filter(hostname=hostname) if instance.exists(): instance = instance.get() @@ -145,7 +145,14 @@ class InstanceManager(models.Manager): return (True, instance) else: return (False, instance) - instance = self.create(uuid=uuid, hostname=hostname, ip_address=ip_address, capacity=0, node_type=node_type) + + # Create new instance, and fill in default values + create_defaults = dict(capacity=0, uuid=uuid) + if defaults is not None: + create_defaults.update(defaults) + if node_type == 'execution' and 'version' not in create_defaults: + create_defaults['version'] = RECEPTOR_PENDING + instance = self.create(hostname=hostname, ip_address=ip_address, node_type=node_type, **create_defaults) return (True, instance) def get_or_register(self): diff --git a/awx/main/models/ha.py b/awx/main/models/ha.py index 36217ebf45..ebef8dc7be 100644 --- a/awx/main/models/ha.py +++ b/awx/main/models/ha.py @@ -107,11 +107,6 @@ class Instance(HasPolicyEditsMixin, BaseModel): def remaining_capacity(self): return self.capacity - self.consumed_capacity - @property - def role(self): - # NOTE: TODO: Likely to repurpose this once standalone ramparts are a thing - return "awx" - @property def jobs_running(self): return UnifiedJob.objects.filter( @@ -128,8 +123,7 @@ class Instance(HasPolicyEditsMixin, BaseModel): @staticmethod def choose_online_control_plane_node(): - # TODO: update query to use node_type field - return random.choice(Instance.objects.filter(enabled=True).exclude(version__startswith='ansible-runner-').values_list('hostname', flat=True)) + return random.choice(Instance.objects.filter(enabled=True).filter(node_type__in=['control', 'hybrid']).values_list('hostname', flat=True)) def is_lost(self, 
ref_time=None): if ref_time is None: @@ -206,8 +200,7 @@ class InstanceGroup(HasPolicyEditsMixin, BaseModel, RelatedJobsMixin): @property def execution_capacity(self): - # TODO: update query to exclude based on node_type field - return sum([inst.capacity for inst in self.instances.all()]) + return sum([inst.capacity for inst in self.instances.filter(node_type__in=['hybrid', 'execution'])]) @property def jobs_running(self): @@ -231,7 +224,8 @@ class InstanceGroup(HasPolicyEditsMixin, BaseModel, RelatedJobsMixin): def fit_task_to_most_remaining_capacity_instance(task, instances): instance_most_capacity = None for i in instances: - # TODO: continue if node is control-only node type + if i.node_type == 'control': + continue if i.remaining_capacity >= task.task_impact and ( instance_most_capacity is None or i.remaining_capacity > instance_most_capacity.remaining_capacity ): diff --git a/awx/main/scheduler/task_manager.py b/awx/main/scheduler/task_manager.py index 56a18a1412..4eaf2743cc 100644 --- a/awx/main/scheduler/task_manager.py +++ b/awx/main/scheduler/task_manager.py @@ -76,6 +76,7 @@ class TaskManager: SimpleNamespace( obj=instance, version=instance.version, + node_type=instance.node_type, remaining_capacity=instance.remaining_capacity, capacity=instance.capacity, jobs_running=instance.jobs_running, diff --git a/awx/main/tasks.py b/awx/main/tasks.py index 1e9e57ba45..830bc2ad0c 100644 --- a/awx/main/tasks.py +++ b/awx/main/tasks.py @@ -85,7 +85,7 @@ from awx.main.models import ( SystemJobEvent, build_safe_env, ) -from awx.main.constants import ACTIVE_STATES +from awx.main.constants import ACTIVE_STATES, RECEPTOR_PENDING from awx.main.exceptions import AwxTaskError, PostRunError from awx.main.queue import CallbackQueueDispatcher from awx.main.dispatch.publish import task @@ -121,7 +121,6 @@ from awx.main.analytics.subsystem_metrics import Metrics from rest_framework.exceptions import PermissionDenied RECEPTOR_SOCK = '/var/run/receptor/receptor.sock' 
-RECEPTOR_PENDING = 'ansible-runner-???' __all__ = [ @@ -422,17 +421,14 @@ def discover_receptor_nodes(): commands = ad['WorkCommands'] or [] if 'ansible-runner' not in commands: continue - (changed, instance) = Instance.objects.register(hostname=hostname) + (changed, instance) = Instance.objects.register(hostname=hostname, node_type='execution') was_lost = instance.is_lost(ref_time=nowtime) if changed: - logger.info("Registered tower execution node '{}'".format(hostname)) - instance.capacity = 0 - instance.version = RECEPTOR_PENDING - instance.save(update_fields=['capacity', 'version', 'modified']) + logger.info("Registered execution node '{}'".format(hostname)) check_heartbeat.apply_async([hostname]) else: last_seen = parse_date(ad['Time']) - logger.debug("Updated tower control node '{}' last seen {}".format(hostname, last_seen)) + logger.debug("Updated execution node '{}' modified from {} to {}".format(hostname, instance.modified, last_seen)) instance.modified = last_seen if instance.is_lost(ref_time=nowtime): # if the instance hasn't advertised in awhile, @@ -466,7 +462,8 @@ def cluster_node_heartbeat(): if inst.hostname == settings.CLUSTER_HOST_ID: this_inst = inst instance_list.remove(inst) - elif inst.version.startswith('ansible-runner'): # TODO: use proper field when introduced + elif inst.node_type == 'execution': + # Only considering control plane for this logic continue elif inst.is_lost(ref_time=nowtime): lost_instances.append(inst) From 4e84c7c4c4c8b5d28d9fbca4e201f3ec2eed92cb Mon Sep 17 00:00:00 2001 From: Alan Rominger Date: Wed, 4 Aug 2021 11:02:04 -0400 Subject: [PATCH 20/27] Use the existing get_receptor_ctl method (#10813) --- awx/main/tasks.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/awx/main/tasks.py b/awx/main/tasks.py index 830bc2ad0c..6ca32e83b2 100644 --- a/awx/main/tasks.py +++ b/awx/main/tasks.py @@ -120,8 +120,6 @@ from awx.main.analytics.subsystem_metrics import Metrics from rest_framework.exceptions 
import PermissionDenied -RECEPTOR_SOCK = '/var/run/receptor/receptor.sock' - __all__ = [ 'RunJob', @@ -413,7 +411,7 @@ def check_heartbeat(node): def discover_receptor_nodes(): - ctl = ReceptorControl(RECEPTOR_SOCK) + ctl = get_receptor_ctl() connections = ctl.simple_command('status')['Advertisements'] nowtime = now() for ad in connections: @@ -2981,9 +2979,7 @@ class AWXReceptorJob: def run(self, work_type=None): # We establish a connection to the Receptor socket - # TODO: Seems like we already have a method for doing this now? - # receptor_ctl = get_receptor_ctl() - receptor_ctl = ReceptorControl(RECEPTOR_SOCK) + receptor_ctl = get_receptor_ctl() try: return self._run_internal(receptor_ctl, work_type=work_type) From 3b1e40d227cc22510d03fe6b375a17e43c6b0661 Mon Sep 17 00:00:00 2001 From: Alan Rominger Date: Wed, 11 Aug 2021 10:14:20 -0400 Subject: [PATCH 21/27] Use the ansible-runner worker --worker-info to perform execution node capacity checks (#10825) * Introduce utilities for --worker-info health check integration * Handle case where ansible-runner is not installed * Add ttl parameter for health check * Reformulate return data structure and add lots of error cases * Move up the cleanup tasks, close sockets * Integrate new --worker-info into the execution node capacity check * Undo the raw value override from the PoC * Additional refinement to execution node check frequency * Put in more complete network diagram * Followup on comment to remove modified from from health check responsibilities --- awx/main/models/ha.py | 7 ++ awx/main/tasks.py | 208 +++++++++++-------------------------- awx/main/utils/common.py | 53 ++++++---- awx/main/utils/receptor.py | 77 ++++++++++++++ docs/receptor_mesh.md | 45 ++++++-- 5 files changed, 212 insertions(+), 178 deletions(-) create mode 100644 awx/main/utils/receptor.py diff --git a/awx/main/models/ha.py b/awx/main/models/ha.py index ebef8dc7be..aaf0b990ec 100644 --- a/awx/main/models/ha.py +++ b/awx/main/models/ha.py @@ -131,6 
+131,13 @@ class Instance(HasPolicyEditsMixin, BaseModel): grace_period = 120 return self.modified < ref_time - timedelta(seconds=grace_period) + def mark_offline(self, on_good_terms=False): + self.cpu = self.cpu_capacity = self.memory = self.mem_capacity = self.capacity = 0 + update_fields = ['capacity', 'cpu', 'memory', 'cpu_capacity', 'mem_capacity'] + if on_good_terms: + update_fields.append('modified') + self.save() + def refresh_capacity(self): cpu = get_cpu_capacity() mem = get_mem_capacity() diff --git a/awx/main/tasks.py b/awx/main/tasks.py index 6ca32e83b2..56f10dacc0 100644 --- a/awx/main/tasks.py +++ b/awx/main/tasks.py @@ -52,9 +52,6 @@ from gitdb.exc import BadName as BadGitName # Runner import ansible_runner -# Receptor -from receptorctl.socket_interface import ReceptorControl - # dateutil from dateutil.parser import parse as parse_date @@ -85,7 +82,7 @@ from awx.main.models import ( SystemJobEvent, build_safe_env, ) -from awx.main.constants import ACTIVE_STATES, RECEPTOR_PENDING +from awx.main.constants import ACTIVE_STATES from awx.main.exceptions import AwxTaskError, PostRunError from awx.main.queue import CallbackQueueDispatcher from awx.main.dispatch.publish import task @@ -101,17 +98,18 @@ from awx.main.utils.common import ( parse_yaml_or_json, cleanup_new_process, create_partition, - get_cpu_capacity, - get_mem_capacity, + get_cpu_effective_capacity, + get_mem_effective_capacity, get_system_task_capacity, ) -from awx.main.utils.execution_environments import get_default_execution_environment, get_default_pod_spec, CONTAINER_ROOT, to_container_path +from awx.main.utils.execution_environments import get_default_pod_spec, CONTAINER_ROOT, to_container_path from awx.main.utils.ansible import read_ansible_config from awx.main.utils.external_logging import reconfigure_rsyslog from awx.main.utils.safe_yaml import safe_dump, sanitize_jinja from awx.main.utils.reload import stop_local_services from awx.main.utils.pglock import advisory_lock from 
awx.main.utils.handlers import SpecialInventoryHandler +from awx.main.utils.receptor import get_receptor_ctl, worker_info from awx.main.consumers import emit_channel_notification from awx.main import analytics from awx.conf import settings_registry @@ -182,8 +180,7 @@ def dispatch_startup(): def inform_cluster_of_shutdown(): try: this_inst = Instance.objects.get(hostname=settings.CLUSTER_HOST_ID) - this_inst.capacity = 0 # No thank you to new jobs while shut down - this_inst.save(update_fields=['capacity', 'modified']) + this_inst.mark_offline(on_good_terms=True) # No thank you to new jobs while shut down try: reaper.reap(this_inst) except Exception: @@ -407,7 +404,34 @@ def cleanup_execution_environment_images(): @task(queue=get_local_queuename) def check_heartbeat(node): - AWXReceptorJob.check_heartbeat(node) + try: + instance = Instance.objects.get(hostname=node) + except Instance.DoesNotExist: + logger.warn(f'Instance record for {node} missing, could not check capacity.') + return + data = worker_info(node) + + if data['Errors']: + formatted_error = "\n".join(data["Errors"]) + if instance.capacity: + logger.warn(f'Health check marking execution node {node} as lost, errors:\n{formatted_error}') + else: + logger.info(f'Failed to find capacity of new or lost execution node {node}, errors:\n{formatted_error}') + instance.mark_offline() + else: + # TODO: spin off new instance method from refresh_capacity that calculates derived fields + instance.cpu = data['CPU Capacity'] # TODO: rename field on runner side to not say "Capacity" + instance.cpu_capacity = get_cpu_effective_capacity(instance.cpu) + instance.memory = data['Memory Capacity'] * 1000 # TODO: double-check the multiplier here + instance.mem_capacity = get_mem_effective_capacity(instance.memory) + instance.capacity = get_system_task_capacity( + instance.capacity_adjustment, + instance.cpu_capacity, + instance.mem_capacity, + ) + instance.version = 'ansible-runner-' + data['Version'] + 
instance.save(update_fields=['capacity', 'version', 'cpu', 'memory', 'cpu_capacity', 'mem_capacity']) + logger.info('Set capacity of execution node {} to {}, worker info data:\n{}'.format(node, instance.capacity, json.dumps(data, indent=2))) def discover_receptor_nodes(): @@ -421,25 +445,30 @@ def discover_receptor_nodes(): continue (changed, instance) = Instance.objects.register(hostname=hostname, node_type='execution') was_lost = instance.is_lost(ref_time=nowtime) - if changed: - logger.info("Registered execution node '{}'".format(hostname)) - check_heartbeat.apply_async([hostname]) - else: - last_seen = parse_date(ad['Time']) - logger.debug("Updated execution node '{}' modified from {} to {}".format(hostname, instance.modified, last_seen)) - instance.modified = last_seen - if instance.is_lost(ref_time=nowtime): - # if the instance hasn't advertised in awhile, - # don't save a new modified time - continue + last_seen = parse_date(ad['Time']) + if instance.modified == last_seen: + continue + instance.modified = last_seen + if instance.is_lost(ref_time=nowtime): + # if the instance hasn't advertised in awhile, don't save a new modified time + # this is so multiple cluster nodes do all make repetitive updates + continue - instance.save(update_fields=['modified']) - if was_lost: - # if the instance *was* lost, but has appeared again, - # attempt to re-establish the initial capacity and version - # check - logger.warning('Execution node attempting to rejoin as instance {}.'.format(hostname)) - check_heartbeat.apply_async([hostname]) + instance.save(update_fields=['modified']) + if changed: + logger.warn("Registered execution node '{}'".format(hostname)) + check_heartbeat.apply_async([hostname]) + elif was_lost: + # if the instance *was* lost, but has appeared again, + # attempt to re-establish the initial capacity and version + # check + logger.warn(f'Execution node attempting to rejoin as instance {hostname}.') + check_heartbeat.apply_async([hostname]) + elif 
instance.capacity == 0: + # Periodically re-run the health check of errored nodes, in case someone fixed it + # TODO: perhaps decrease the frequency of these checks + logger.debug(f'Restarting health check for execution node {hostname} with known errors.') + check_heartbeat.apply_async([hostname]) @task(queue=get_local_queuename) @@ -460,7 +489,7 @@ def cluster_node_heartbeat(): if inst.hostname == settings.CLUSTER_HOST_ID: this_inst = inst instance_list.remove(inst) - elif inst.node_type == 'execution': + elif inst.node_type == 'execution': # TODO: zero out capacity of execution nodes that are MIA # Only considering control plane for this logic continue elif inst.is_lost(ref_time=nowtime): @@ -504,8 +533,7 @@ def cluster_node_heartbeat(): # If auto deprovisining is on, don't bother setting the capacity to 0 # since we will delete the node anyway. if other_inst.capacity != 0 and not settings.AWX_AUTO_DEPROVISION_INSTANCES: - other_inst.capacity = 0 - other_inst.save(update_fields=['capacity']) + other_inst.mark_offline() logger.error("Host {} last checked in at {}, marked as lost.".format(other_inst.hostname, other_inst.modified)) elif settings.AWX_AUTO_DEPROVISION_INSTANCES: deprovision_hostname = other_inst.hostname @@ -802,10 +830,6 @@ def with_path_cleanup(f): return _wrapped -def get_receptor_ctl(): - return ReceptorControl('/var/run/receptor/receptor.sock') - - class BaseTask(object): model = None event_model = None @@ -2988,120 +3012,6 @@ class AWXReceptorJob: if self.unit_id is not None and settings.RECEPTOR_RELEASE_WORK: receptor_ctl.simple_command(f"work release {self.unit_id}") - @classmethod - def check_heartbeat(cls, node): # TODO: rename most of these "heartbeat" things - logger.info(f'Checking capacity of execution node {node}') - # make a private data dir and env dir - private_data_dir = tempfile.mkdtemp(prefix='awx_heartbeat_', dir=settings.AWX_ISOLATION_BASE_PATH) - env_path = os.path.join(private_data_dir, 'env') - 
os.makedirs(os.path.join(env_path), mode=0o700) - # write a cmdline file for adhoc - f = os.fdopen(os.open(os.path.join(env_path, 'cmdline'), os.O_RDWR | os.O_CREAT, stat.S_IREAD | stat.S_IWRITE), 'w') - f.write(ansible_runner.utils.args2cmdline('localhost')) - f.close() - # write a custom facts.d to report the runner version - facts_path = os.path.join(private_data_dir, 'facts.d') - os.makedirs(facts_path, mode=0o700) - with open(os.path.join(facts_path, 'ansible_runner.fact'), 'w') as f: - os.chmod(f.name, 0o700) - f.write("""#!/usr/bin/env sh\necho "{\\"version\\": \\"`ansible-runner --version`\\"}"\n""") # noqa - # write a local inventory - inventory_path = os.path.join(private_data_dir, 'inventory') - os.makedirs(inventory_path, mode=0o700) - fn = os.path.join(inventory_path, 'hosts') - with open(fn, 'w') as f: - os.chmod(fn, stat.S_IRUSR | stat.S_IXUSR | stat.S_IWUSR) - f.write('localhost ansible_connection=local') - # we have to create the project directory because it is --workdir and crun needs it to exist - # https://github.com/ansible/ansible-runner/issues/758 - project_path = os.path.join(private_data_dir, 'project') - os.makedirs(project_path, mode=0o700) - - runner_params = { - 'ident': str(uuid4()), - 'private_data_dir': private_data_dir, - 'module': 'setup', - 'module_args': f'fact_path={private_data_dir}/facts.d', - 'inventory': inventory_path, - 'only_transmit_kwargs': False, - 'settings': { - "container_image": get_default_execution_environment().image, - "container_options": ['--user=root'], - "process_isolation": True, - }, - } - - class _Instance(object): - pk = -1 - job_env = {} - - @property - def is_container_group_task(self): - return False - - @property - def execution_node(self): - return node - - class _BaseTask(object): - instance = _Instance() - cpus = 0 - mem_mb = 0 - version = RECEPTOR_PENDING - - def build_execution_environment_params(self, instance, private_data_dir): - return {} - - def event_handler(self, event_data): - if 
event_data.get('event') == 'runner_on_ok': - facts = event_data.get('event_data', {}).get('res', {}).get('ansible_facts', {}) - if facts: - self.cpus = facts.get('ansible_processor_vcpus', 0) - self.mem_mb = facts.get('ansible_memtotal_mb', 0) - version = facts.get('ansible_local', {}).get('ansible_runner', {}).get('version', '') # noqa - if version: - self.version = f'ansible-runner-{version}' - # TODO: save event_data["stdout"] and log when errors happen - - def finished_callback(self, runner_obj): - pass - - def cancel_callback(self): - pass - - def status_handler(self, status_data, runner_config): - # TODO: log error cases - pass - - def update_model(self, *args, **kw): - pass - - task = _BaseTask() - receptor_job = cls(task, runner_params) - res = receptor_job.run(work_type='ansible-runner') - if res.status == 'successful': - cpu = get_cpu_capacity(task.cpus) - mem = get_mem_capacity(task.mem_mb * 1000000) - logger.info(f'Calculated memory capacity: {task.mem_mb}, out: {mem}') - instance = Instance.objects.get(hostname=node) - instance.cpu = cpu[0] - instance.cpu_capacity = cpu[1] - instance.memory = mem[0] - instance.mem_capacity = mem[1] - instance.capacity = get_system_task_capacity( - instance.capacity_adjustment, - instance.cpu_capacity, - instance.mem_capacity, - ) - instance.version = task.version - instance.save(update_fields=['capacity', 'version', 'cpu', 'memory', 'cpu_capacity', 'mem_capacity']) - logger.info(f'Updated capacity of {node} to cpu: {instance.cpu_capacity} mem: {instance.mem_capacity}') - else: - # TODO: error handling like we do with jobs - # receptorctl work results - # receptorctl work list - logger.info(f'Capacity check not successful for execution node {node}') - def _run_internal(self, receptor_ctl, work_type=None): # Create a socketpair. Where the left side will be used for writing our payload # (private data dir, kwargs). 
The right side will be passed to Receptor for diff --git a/awx/main/utils/common.py b/awx/main/utils/common.py index e034dda02c..9194fa51d9 100644 --- a/awx/main/utils/common.py +++ b/awx/main/utils/common.py @@ -699,12 +699,25 @@ def parse_yaml_or_json(vars_str, silent_failure=True): return vars_dict -def get_cpu_capacity(raw=None): +def get_cpu_effective_capacity(cpu_count): from django.conf import settings settings_forkcpu = getattr(settings, 'SYSTEM_TASK_FORKS_CPU', None) env_forkcpu = os.getenv('SYSTEM_TASK_FORKS_CPU', None) + if env_forkcpu: + forkcpu = int(env_forkcpu) + elif settings_forkcpu: + forkcpu = int(settings_forkcpu) + else: + forkcpu = 4 + + return cpu_count * forkcpu + + +def get_cpu_capacity(): + from django.conf import settings + settings_abscpu = getattr(settings, 'SYSTEM_TASK_ABS_CPU', None) env_abscpu = os.getenv('SYSTEM_TASK_ABS_CPU', None) @@ -713,24 +726,30 @@ def get_cpu_capacity(raw=None): elif settings_abscpu is not None: return 0, int(settings_abscpu) - if raw is None: - raw = psutil.cpu_count() + cpu = psutil.cpu_count() - if env_forkcpu: - forkcpu = int(env_forkcpu) - elif settings_forkcpu: - forkcpu = int(settings_forkcpu) - else: - forkcpu = 4 - return (raw, raw * forkcpu) + return (cpu, get_cpu_effective_capacity(cpu)) -def get_mem_capacity(raw_mb=None): +def get_mem_effective_capacity(mem_mb): from django.conf import settings settings_forkmem = getattr(settings, 'SYSTEM_TASK_FORKS_MEM', None) env_forkmem = os.getenv('SYSTEM_TASK_FORKS_MEM', None) + if env_forkmem: + forkmem = int(env_forkmem) + elif settings_forkmem: + forkmem = int(settings_forkmem) + else: + forkmem = 100 + + return max(1, ((mem_mb // 1024 // 1024) - 2048) // forkmem) + + +def get_mem_capacity(): + from django.conf import settings + settings_absmem = getattr(settings, 'SYSTEM_TASK_ABS_MEM', None) env_absmem = os.getenv('SYSTEM_TASK_ABS_MEM', None) @@ -739,16 +758,8 @@ def get_mem_capacity(raw_mb=None): elif settings_absmem is not None: return 0, 
int(settings_absmem) - if env_forkmem: - forkmem = int(env_forkmem) - elif settings_forkmem: - forkmem = int(settings_forkmem) - else: - forkmem = 100 - - if raw_mb is None: - raw_mb = psutil.virtual_memory().total - return (raw_mb, max(1, ((raw_mb // 1024 // 1024) - 2048) // forkmem)) + mem = psutil.virtual_memory().total + return (mem, get_mem_effective_capacity(mem)) def get_system_task_capacity(scale=Decimal(1.0), cpu_capacity=None, mem_capacity=None): diff --git a/awx/main/utils/receptor.py b/awx/main/utils/receptor.py new file mode 100644 index 0000000000..8a205334c1 --- /dev/null +++ b/awx/main/utils/receptor.py @@ -0,0 +1,77 @@ +import logging +import yaml +import time + +from receptorctl.socket_interface import ReceptorControl + + +logger = logging.getLogger('awx.main.utils.receptor') + + +def get_receptor_ctl(): + return ReceptorControl('/var/run/receptor/receptor.sock') + + +def worker_info(node_name): + receptor_ctl = get_receptor_ctl() + transmit_start = time.time() + error_list = [] + data = {'Errors': error_list, 'transmit_timing': 0.0} + + result = receptor_ctl.submit_work(worktype='ansible-runner', payload='', params={"params": f"--worker-info"}, ttl='20s', node=node_name) + + unit_id = result['unitid'] + run_start = time.time() + data['transmit_timing'] = run_start - transmit_start + data['run_timing'] = 0.0 + + try: + + resultfile = receptor_ctl.get_work_results(unit_id) + + stdout = '' + + while data['run_timing'] < 20.0: + status = receptor_ctl.simple_command(f'work status {unit_id}') + state_name = status.get('StateName') + if state_name not in ('Pending', 'Running'): + break + data['run_timing'] = time.time() - run_start + time.sleep(0.5) + else: + error_list.append(f'Timeout getting worker info on {node_name}, state remains in {state_name}') + + stdout = resultfile.read() + stdout = str(stdout, encoding='utf-8') + + finally: + + res = receptor_ctl.simple_command(f"work release {unit_id}") + if res != {'released': unit_id}: + 
logger.warn(f'Could not confirm release of receptor work unit id {unit_id} from {node_name}, data: {res}') + + receptor_ctl.close() + + if state_name.lower() == 'failed': + work_detail = status.get('Detail', '') + if not work_detail.startswith('exit status'): + error_list.append(f'Receptor error getting worker info from {node_name}, detail:\n{work_detail}') + elif 'unrecognized arguments: --worker-info' in stdout: + error_list.append(f'Old version (2.0.1 or earlier) of ansible-runner on node {node_name} without --worker-info') + else: + error_list.append(f'Unknown ansible-runner error on node {node_name}, stdout:\n{stdout}') + else: + yaml_stdout = stdout.strip() + remote_data = {} + try: + remote_data = yaml.safe_load(yaml_stdout) + except Exception as json_e: + error_list.append(f'Failed to parse node {node_name} --worker-info output as YAML, error: {json_e}, data:\n{yaml_stdout}') + + if not isinstance(remote_data, dict): + error_list.append(f'Remote node {node_name} --worker-info output is not a YAML dict, output:{stdout}') + else: + error_list.extend(remote_data.pop('Errors')) # merge both error lists + data.update(remote_data) + + return data diff --git a/docs/receptor_mesh.md b/docs/receptor_mesh.md index f1023eb7f9..78c893b119 100644 --- a/docs/receptor_mesh.md +++ b/docs/receptor_mesh.md @@ -72,9 +72,28 @@ Control nodes check the receptor network (reported via `receptorctl status`) whe Nodes on the receptor network are compared against the `Instance` model in the database. If a node appears in the mesh network which is not in the database, then a "health check" is started. -This will submit a work unit to the execution node which then outputs important node data via `ansible-runner`. The `capacity` field will obtain a non-zero value through this process, which is necessary to run jobs. 
+#### Health Check Mechanics
+
+All relevant data for health checks is reported from the ansible-runner command:
+
+```
+ansible-runner worker --worker-info
+```
+
+This will output YAML data to standard out containing CPU, memory, and other metrics used to compute `capacity`.
+
+AWX invokes this command by submitting a receptor work unit (of type `ansible-runner`) to the target execution node.
+If you have the development environment running, you can run a one-off health check of a node with this command:
+
+```
+echo "from awx.main.utils.receptor import worker_info; worker_info('receptor-1')" | awx-manage shell_plus --quiet
+```
+
+This must be run as the awx user inside one of the hybrid or control nodes.
+This will not affect the actual `Instance` record, but will just run the command and report the data.
+
 
 ### Development Environment
 
 A "toy" cluster with execution nodes and a hop node is created by the docker-compose Makefile target.
@@ -89,13 +108,23 @@ This will spin up a topology represented below.
 (names are the receptor node names, which differ from the AWX Instance names and network address in some cases)
 
 ```
----v
-    -----v
-
-
-    ^--------------
-    ^--------------
-    ^--------------
+                                           ┌──────────────┐
+                                           │              │
+┌──────────────┐                ┌──────────┤  receptor-1  │
+│              │                │          │              │
+│    awx_1     │◄─────────┐     │          └──────────────┘
+│              │          │     ▼
+└──────┬───────┘   ┌──────┴───────┐        ┌──────────────┐
+       │           │              │        │              │
+       │           │ receptor-hop │◄───────┤  receptor-2  │
+       ▼           │              │        │              │
+┌──────────────┐   └──────────────┘        └──────────────┘
+│              │          ▲
+│    awx_2     │          │                ┌──────────────┐
+│              │          │                │              │
+└──────────────┘          └────────────────┤  receptor-3  │
+                                           │              │
+                                           └──────────────┘
 ```
 
 All execution (`receptor-*`) nodes connect to the hop node.
From 1a9fcdccc24ed40a465731fab8daa168a85a91b8 Mon Sep 17 00:00:00 2001 From: beeankha Date: Mon, 16 Aug 2021 11:58:57 -0400 Subject: [PATCH 22/27] Change place where controller node is being looked for in the task manager --- awx/main/scheduler/task_manager.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/awx/main/scheduler/task_manager.py b/awx/main/scheduler/task_manager.py index 4eaf2743cc..313c89f5c7 100644 --- a/awx/main/scheduler/task_manager.py +++ b/awx/main/scheduler/task_manager.py @@ -255,12 +255,6 @@ class TaskManager: } dependencies = [{'type': get_type_for_model(type(t)), 'id': t.id} for t in dependent_tasks] - try: - controller_node = Instance.choose_online_control_plane_node() - except IndexError: - logger.warning("No control plane nodes available to manage {}".format(task.log_format)) - return - task.status = 'waiting' (start_status, opts) = task.pre_start() @@ -298,8 +292,13 @@ class TaskManager: else: task.instance_group = rampart_group task.execution_node = instance.hostname + try: + controller_node = Instance.choose_online_control_plane_node() + except IndexError: + logger.warning("No control plane nodes available to manage {}".format(task.log_format)) + return task.controller_node = controller_node - logger.debug('Submitting job {} to queue {} controlled by {}.'.format(task.log_format, task.execution_node, controller_node)) + logger.debug('Submitting job {} to queue {} controlled by {}.'.format(task.log_format, task.execution_node, task.controller_node)) with disable_activity_stream(): task.celery_task_id = str(uuid.uuid4()) task.save() From 928c35ede559ce48bd990c6b5097352fa0de3c21 Mon Sep 17 00:00:00 2001 From: Alan Rominger Date: Tue, 17 Aug 2021 11:52:57 -0400 Subject: [PATCH 23/27] Model changes for instance last_seen field to replace modified (#10870) * Model changes for instance last_seen field to replace modified * Break up refresh_capacity into smaller units * Rename execution node methods, fix 
last_seen clustering * Use update_fields to make it clear save only affects capacity * Restructing to pass unit tests * Fix bug where a PATCH did not update capacity value --- awx/api/views/__init__.py | 9 ++ awx/main/managers.py | 2 +- .../migrations/0153_instance_last_seen.py | 27 +++++ awx/main/models/ha.py | 109 +++++++++++++----- awx/main/tasks.py | 95 ++++++++------- .../tests/functional/api/test_instance.py | 32 +++++ awx/main/tests/functional/test_jobs.py | 19 +-- awx/main/tests/unit/models/test_ha.py | 15 ++- awx/main/utils/common.py | 52 +++------ awx/settings/defaults.py | 10 +- .../sources/templates/receptor-awx.conf.j2 | 3 +- 11 files changed, 246 insertions(+), 127 deletions(-) create mode 100644 awx/main/migrations/0153_instance_last_seen.py create mode 100644 awx/main/tests/functional/api/test_instance.py diff --git a/awx/api/views/__init__.py b/awx/api/views/__init__.py index 8ff25c10f9..2223c590e9 100644 --- a/awx/api/views/__init__.py +++ b/awx/api/views/__init__.py @@ -370,6 +370,15 @@ class InstanceDetail(RetrieveUpdateAPIView): model = models.Instance serializer_class = serializers.InstanceSerializer + def update(self, request, *args, **kwargs): + r = super(InstanceDetail, self).update(request, *args, **kwargs) + if status.is_success(r.status_code): + obj = self.get_object() + obj.set_capacity_value() + obj.save(update_fields=['capacity']) + r.data = serializers.InstanceSerializer(obj, context=self.get_serializer_context()).to_representation(obj) + return r + class InstanceUnifiedJobsList(SubListAPIView): diff --git a/awx/main/managers.py b/awx/main/managers.py index 01fb223df5..05ffb3ecbb 100644 --- a/awx/main/managers.py +++ b/awx/main/managers.py @@ -105,7 +105,7 @@ class InstanceManager(models.Manager): """Return the currently active instance.""" # If we are running unit tests, return a stub record. 
if settings.IS_TESTING(sys.argv) or hasattr(sys, '_called_from_test'): - return self.model(id=1, hostname='localhost', uuid='00000000-0000-0000-0000-000000000000') + return self.model(id=1, hostname=settings.CLUSTER_HOST_ID, uuid='00000000-0000-0000-0000-000000000000') node = self.filter(hostname=settings.CLUSTER_HOST_ID) if node.exists(): diff --git a/awx/main/migrations/0153_instance_last_seen.py b/awx/main/migrations/0153_instance_last_seen.py new file mode 100644 index 0000000000..408146cda8 --- /dev/null +++ b/awx/main/migrations/0153_instance_last_seen.py @@ -0,0 +1,27 @@ +# Generated by Django 2.2.20 on 2021-08-12 13:55 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('main', '0152_instance_node_type'), + ] + + operations = [ + migrations.AddField( + model_name='instance', + name='last_seen', + field=models.DateTimeField( + editable=False, + help_text='Last time instance ran its heartbeat task for main cluster nodes. Last known connection to receptor mesh for execution nodes.', + null=True, + ), + ), + migrations.AlterField( + model_name='instance', + name='memory', + field=models.BigIntegerField(default=0, editable=False, help_text='Total system memory of this instance in bytes.'), + ), + ] diff --git a/awx/main/models/ha.py b/awx/main/models/ha.py index aaf0b990ec..ebd1ad44b0 100644 --- a/awx/main/models/ha.py +++ b/awx/main/models/ha.py @@ -21,7 +21,7 @@ from awx.main.managers import InstanceManager, InstanceGroupManager from awx.main.fields import JSONField from awx.main.models.base import BaseModel, HasEditsMixin, prevent_search from awx.main.models.unified_jobs import UnifiedJob -from awx.main.utils import get_cpu_capacity, get_mem_capacity, get_system_task_capacity +from awx.main.utils.common import measure_cpu, get_corrected_cpu, get_cpu_effective_capacity, measure_memory, get_corrected_memory, get_mem_effective_capacity from awx.main.models.mixins import RelatedJobsMixin __all__ = 
('Instance', 'InstanceGroup', 'TowerScheduleState') @@ -52,6 +52,7 @@ class Instance(HasPolicyEditsMixin, BaseModel): objects = InstanceManager() + # Fields set in instance registration uuid = models.CharField(max_length=40) hostname = models.CharField(max_length=250, unique=True) ip_address = models.CharField( @@ -61,16 +62,11 @@ class Instance(HasPolicyEditsMixin, BaseModel): max_length=50, unique=True, ) + # Auto-fields, implementation is different from BaseModel created = models.DateTimeField(auto_now_add=True) modified = models.DateTimeField(auto_now=True) + # Fields defined in health check or heartbeat version = models.CharField(max_length=120, blank=True) - capacity = models.PositiveIntegerField( - default=100, - editable=False, - ) - capacity_adjustment = models.DecimalField(default=Decimal(1.0), max_digits=3, decimal_places=2, validators=[MinValueValidator(0)]) - enabled = models.BooleanField(default=True) - managed_by_policy = models.BooleanField(default=True) cpu = models.IntegerField( default=0, editable=False, @@ -78,7 +74,22 @@ class Instance(HasPolicyEditsMixin, BaseModel): memory = models.BigIntegerField( default=0, editable=False, + help_text=_('Total system memory of this instance in bytes.'), ) + last_seen = models.DateTimeField( + null=True, + editable=False, + help_text=_('Last time instance ran its heartbeat task for main cluster nodes. 
Last known connection to receptor mesh for execution nodes.'), + ) + # Capacity management + capacity = models.PositiveIntegerField( + default=100, + editable=False, + ) + capacity_adjustment = models.DecimalField(default=Decimal(1.0), max_digits=3, decimal_places=2, validators=[MinValueValidator(0)]) + enabled = models.BooleanField(default=True) + managed_by_policy = models.BooleanField(default=True) + cpu_capacity = models.IntegerField( default=0, editable=False, @@ -126,39 +137,83 @@ class Instance(HasPolicyEditsMixin, BaseModel): return random.choice(Instance.objects.filter(enabled=True).filter(node_type__in=['control', 'hybrid']).values_list('hostname', flat=True)) def is_lost(self, ref_time=None): + if self.last_seen is None: + return True if ref_time is None: ref_time = now() - grace_period = 120 - return self.modified < ref_time - timedelta(seconds=grace_period) + grace_period = settings.CLUSTER_NODE_HEARTBEAT_PERIOD * 2 + if self.node_type == 'execution': + grace_period += settings.RECEPTOR_SERVICE_ADVERTISEMENT_PERIOD + return self.last_seen < ref_time - timedelta(seconds=grace_period) - def mark_offline(self, on_good_terms=False): - self.cpu = self.cpu_capacity = self.memory = self.mem_capacity = self.capacity = 0 - update_fields = ['capacity', 'cpu', 'memory', 'cpu_capacity', 'mem_capacity'] - if on_good_terms: - update_fields.append('modified') - self.save() + def mark_offline(self, update_last_seen=False, perform_save=True): + if self.cpu_capacity == 0 and self.mem_capacity == 0 and self.capacity == 0 and (not update_last_seen): + return + self.cpu_capacity = self.mem_capacity = self.capacity = 0 + if update_last_seen: + self.last_seen = now() - def refresh_capacity(self): - cpu = get_cpu_capacity() - mem = get_mem_capacity() + if perform_save: + update_fields = ['capacity', 'cpu_capacity', 'mem_capacity'] + if update_last_seen: + update_fields += ['last_seen'] + self.save(update_fields=update_fields) + + def set_capacity_value(self): + """Sets 
capacity according to capacity adjustment rule (no save)""" if self.enabled: - self.capacity = get_system_task_capacity(self.capacity_adjustment) + lower_cap = min(self.mem_capacity, self.cpu_capacity) + higher_cap = max(self.mem_capacity, self.cpu_capacity) + self.capacity = lower_cap + (higher_cap - lower_cap) * self.capacity_adjustment else: self.capacity = 0 + def refresh_capacity_fields(self): + """Update derived capacity fields from cpu and memory (no save)""" + self.cpu_capacity = get_cpu_effective_capacity(self.cpu) + self.mem_capacity = get_mem_effective_capacity(self.memory) + self.set_capacity_value() + + def save_health_data(self, version, cpu, memory, last_seen=None, has_error=False): + update_fields = [] + + if last_seen is not None and self.last_seen != last_seen: + self.last_seen = last_seen + update_fields.append('last_seen') + + if self.version != version: + self.version = version + update_fields.append('version') + + new_cpu = get_corrected_cpu(cpu) + if new_cpu != self.cpu: + self.cpu = new_cpu + update_fields.append('cpu') + + new_memory = get_corrected_memory(memory) + if new_memory != self.memory: + self.memory = new_memory + update_fields.append('memory') + + if not has_error: + self.refresh_capacity_fields() + else: + self.mark_offline(perform_save=False) + update_fields.extend(['cpu_capacity', 'mem_capacity', 'capacity']) + + self.save(update_fields=update_fields) + + def local_health_check(self): + """Only call this method on the instance that this record represents""" + has_error = False try: # if redis is down for some reason, that means we can't persist # playbook event data; we should consider this a zero capacity event redis.Redis.from_url(settings.BROKER_URL).ping() except redis.ConnectionError: - self.capacity = 0 + has_error = True - self.cpu = cpu[0] - self.memory = mem[0] - self.cpu_capacity = cpu[1] - self.mem_capacity = mem[1] - self.version = awx_application_version - self.save(update_fields=['capacity', 'version', 
'modified', 'cpu', 'memory', 'cpu_capacity', 'mem_capacity']) + self.save_health_data(awx_application_version, measure_cpu(), measure_memory(), last_seen=now(), has_error=has_error) class InstanceGroup(HasPolicyEditsMixin, BaseModel, RelatedJobsMixin): diff --git a/awx/main/tasks.py b/awx/main/tasks.py index 56f10dacc0..9dc0065459 100644 --- a/awx/main/tasks.py +++ b/awx/main/tasks.py @@ -98,9 +98,6 @@ from awx.main.utils.common import ( parse_yaml_or_json, cleanup_new_process, create_partition, - get_cpu_effective_capacity, - get_mem_effective_capacity, - get_system_task_capacity, ) from awx.main.utils.execution_environments import get_default_pod_spec, CONTAINER_ROOT, to_container_path from awx.main.utils.ansible import read_ansible_config @@ -180,7 +177,7 @@ def dispatch_startup(): def inform_cluster_of_shutdown(): try: this_inst = Instance.objects.get(hostname=settings.CLUSTER_HOST_ID) - this_inst.mark_offline(on_good_terms=True) # No thank you to new jobs while shut down + this_inst.mark_offline(update_last_seen=True) # No thank you to new jobs while shut down try: reaper.reap(this_inst) except Exception: @@ -403,7 +400,7 @@ def cleanup_execution_environment_images(): @task(queue=get_local_queuename) -def check_heartbeat(node): +def execution_node_health_check(node): try: instance = Instance.objects.get(hostname=node) except Instance.DoesNotExist: @@ -411,64 +408,66 @@ def check_heartbeat(node): return data = worker_info(node) + prior_capacity = instance.capacity + + instance.save_health_data( + 'ansible-runner-' + data.get('Version', '???'), + data.get('CPU Capacity', 0), # TODO: rename field on runner side to not say "Capacity" + data.get('Memory Capacity', 0) * 1000, # TODO: double-check the multiplier here + has_error=bool(data.get('Errors')), + ) + if data['Errors']: formatted_error = "\n".join(data["Errors"]) - if instance.capacity: + if prior_capacity: logger.warn(f'Health check marking execution node {node} as lost, errors:\n{formatted_error}') else: 
logger.info(f'Failed to find capacity of new or lost execution node {node}, errors:\n{formatted_error}') - instance.mark_offline() else: - # TODO: spin off new instance method from refresh_capacity that calculates derived fields - instance.cpu = data['CPU Capacity'] # TODO: rename field on runner side to not say "Capacity" - instance.cpu_capacity = get_cpu_effective_capacity(instance.cpu) - instance.memory = data['Memory Capacity'] * 1000 # TODO: double-check the multiplier here - instance.mem_capacity = get_mem_effective_capacity(instance.memory) - instance.capacity = get_system_task_capacity( - instance.capacity_adjustment, - instance.cpu_capacity, - instance.mem_capacity, - ) - instance.version = 'ansible-runner-' + data['Version'] - instance.save(update_fields=['capacity', 'version', 'cpu', 'memory', 'cpu_capacity', 'mem_capacity']) logger.info('Set capacity of execution node {} to {}, worker info data:\n{}'.format(node, instance.capacity, json.dumps(data, indent=2))) -def discover_receptor_nodes(): +def inspect_execution_nodes(instance_list): + node_lookup = {} + for inst in instance_list: + if inst.node_type == 'execution': + node_lookup[inst.hostname] = inst + ctl = get_receptor_ctl() connections = ctl.simple_command('status')['Advertisements'] nowtime = now() for ad in connections: hostname = ad['NodeID'] - commands = ad['WorkCommands'] or [] + commands = ad.get('WorkCommands') or [] if 'ansible-runner' not in commands: continue - (changed, instance) = Instance.objects.register(hostname=hostname, node_type='execution') + changed = False + if hostname in node_lookup: + instance = node_lookup[hostname] + else: + (changed, instance) = Instance.objects.register(hostname=hostname, node_type='execution') was_lost = instance.is_lost(ref_time=nowtime) last_seen = parse_date(ad['Time']) - if instance.modified == last_seen: - continue - instance.modified = last_seen - if instance.is_lost(ref_time=nowtime): - # if the instance hasn't advertised in awhile, don't save a 
new modified time - # this is so multiple cluster nodes do all make repetitive updates - continue - instance.save(update_fields=['modified']) + if instance.last_seen and instance.last_seen >= last_seen: + continue + instance.last_seen = last_seen + instance.save(update_fields=['last_seen']) + if changed: logger.warn("Registered execution node '{}'".format(hostname)) - check_heartbeat.apply_async([hostname]) + execution_node_health_check.apply_async([hostname]) elif was_lost: # if the instance *was* lost, but has appeared again, # attempt to re-establish the initial capacity and version # check logger.warn(f'Execution node attempting to rejoin as instance {hostname}.') - check_heartbeat.apply_async([hostname]) + execution_node_health_check.apply_async([hostname]) elif instance.capacity == 0: # Periodically re-run the health check of errored nodes, in case someone fixed it # TODO: perhaps decrease the frequency of these checks logger.debug(f'Restarting health check for execution node {hostname} with known errors.') - check_heartbeat.apply_async([hostname]) + execution_node_health_check.apply_async([hostname]) @task(queue=get_local_queuename) @@ -479,34 +478,34 @@ def cluster_node_heartbeat(): this_inst = None lost_instances = [] - (changed, instance) = Instance.objects.get_or_register() - if changed: - logger.info("Registered tower control node '{}'".format(instance.hostname)) - - discover_receptor_nodes() - - for inst in list(instance_list): + for inst in instance_list: if inst.hostname == settings.CLUSTER_HOST_ID: this_inst = inst instance_list.remove(inst) - elif inst.node_type == 'execution': # TODO: zero out capacity of execution nodes that are MIA - # Only considering control plane for this logic - continue - elif inst.is_lost(ref_time=nowtime): + break + else: + (changed, this_inst) = Instance.objects.get_or_register() + if changed: + logger.info("Registered tower control node '{}'".format(this_inst.hostname)) + + inspect_execution_nodes(instance_list) + + for 
inst in list(instance_list): + if inst.is_lost(ref_time=nowtime): lost_instances.append(inst) instance_list.remove(inst) if this_inst: startup_event = this_inst.is_lost(ref_time=nowtime) - this_inst.refresh_capacity() - if startup_event: + this_inst.local_health_check() + if startup_event and this_inst.capacity != 0: logger.warning('Rejoining the cluster as instance {}.'.format(this_inst.hostname)) return else: raise RuntimeError("Cluster Host Not Found: {}".format(settings.CLUSTER_HOST_ID)) # IFF any node has a greater version than we do, then we'll shutdown services for other_inst in instance_list: - if other_inst.version == "" or other_inst.version.startswith('ansible-runner'): + if other_inst.version == "" or other_inst.version.startswith('ansible-runner') or other_inst.node_type == 'execution': continue if Version(other_inst.version.split('-', 1)[0]) > Version(awx_application_version.split('-', 1)[0]) and not settings.DEBUG: logger.error( @@ -534,7 +533,7 @@ def cluster_node_heartbeat(): # since we will delete the node anyway. 
if other_inst.capacity != 0 and not settings.AWX_AUTO_DEPROVISION_INSTANCES: other_inst.mark_offline() - logger.error("Host {} last checked in at {}, marked as lost.".format(other_inst.hostname, other_inst.modified)) + logger.error("Host {} last checked in at {}, marked as lost.".format(other_inst.hostname, other_inst.last_seen)) elif settings.AWX_AUTO_DEPROVISION_INSTANCES: deprovision_hostname = other_inst.hostname other_inst.delete() diff --git a/awx/main/tests/functional/api/test_instance.py b/awx/main/tests/functional/api/test_instance.py new file mode 100644 index 0000000000..88f0586fd9 --- /dev/null +++ b/awx/main/tests/functional/api/test_instance.py @@ -0,0 +1,32 @@ +import pytest + +from awx.api.versioning import reverse + +from awx.main.models.ha import Instance + + +@pytest.mark.django_db +def test_disabled_zeros_capacity(patch, admin_user): + instance = Instance.objects.create(hostname='example-host', cpu=6, memory=36000000000, cpu_capacity=6, mem_capacity=42) + + url = reverse('api:instance_detail', kwargs={'pk': instance.pk}) + + r = patch(url=url, data={'enabled': False}, user=admin_user) + assert r.data['capacity'] == 0 + + instance.refresh_from_db() + assert instance.capacity == 0 + + +@pytest.mark.django_db +def test_enabled_sets_capacity(patch, admin_user): + instance = Instance.objects.create(hostname='example-host', enabled=False, cpu=6, memory=36000000000, cpu_capacity=6, mem_capacity=42, capacity=0) + assert instance.capacity == 0 + + url = reverse('api:instance_detail', kwargs={'pk': instance.pk}) + + r = patch(url=url, data={'enabled': True}, user=admin_user) + assert r.data['capacity'] > 0 + + instance.refresh_from_db() + assert instance.capacity > 0 diff --git a/awx/main/tests/functional/test_jobs.py b/awx/main/tests/functional/test_jobs.py index 197c1197e1..7d97aa0b9b 100644 --- a/awx/main/tests/functional/test_jobs.py +++ b/awx/main/tests/functional/test_jobs.py @@ -20,24 +20,27 @@ def test_orphan_unified_job_creation(instance, 
inventory): @pytest.mark.django_db -@mock.patch('awx.main.utils.common.get_cpu_capacity', lambda: (2, 8)) -@mock.patch('awx.main.utils.common.get_mem_capacity', lambda: (8000, 62)) +@mock.patch('awx.main.tasks.inspect_execution_nodes', lambda *args, **kwargs: None) +@mock.patch('awx.main.models.ha.get_cpu_effective_capacity', lambda cpu: 8) +@mock.patch('awx.main.models.ha.get_mem_effective_capacity', lambda mem: 62) def test_job_capacity_and_with_inactive_node(): i = Instance.objects.create(hostname='test-1') - with mock.patch.object(redis.client.Redis, 'ping', lambda self: True): - i.refresh_capacity() + i.save_health_data('18.0.1', 2, 8000) + assert i.enabled is True + assert i.capacity_adjustment == 1.0 assert i.capacity == 62 i.enabled = False i.save() with override_settings(CLUSTER_HOST_ID=i.hostname): - cluster_node_heartbeat() + with mock.patch.object(redis.client.Redis, 'ping', lambda self: True): + cluster_node_heartbeat() i = Instance.objects.get(id=i.id) assert i.capacity == 0 @pytest.mark.django_db -@mock.patch('awx.main.utils.common.get_cpu_capacity', lambda: (2, 8)) -@mock.patch('awx.main.utils.common.get_mem_capacity', lambda: (8000, 62)) +@mock.patch('awx.main.models.ha.get_cpu_effective_capacity', lambda cpu: 8) +@mock.patch('awx.main.models.ha.get_mem_effective_capacity', lambda mem: 62) def test_job_capacity_with_redis_disabled(): i = Instance.objects.create(hostname='test-1') @@ -45,7 +48,7 @@ def test_job_capacity_with_redis_disabled(): raise redis.ConnectionError() with mock.patch.object(redis.client.Redis, 'ping', _raise): - i.refresh_capacity() + i.local_health_check() assert i.capacity == 0 diff --git a/awx/main/tests/unit/models/test_ha.py b/awx/main/tests/unit/models/test_ha.py index 52d5fdc16c..ec71a47fc2 100644 --- a/awx/main/tests/unit/models/test_ha.py +++ b/awx/main/tests/unit/models/test_ha.py @@ -1,10 +1,19 @@ import pytest from unittest import mock from unittest.mock import Mock +from decimal import Decimal -from awx.main.models 
import ( - InstanceGroup, -) +from awx.main.models import InstanceGroup, Instance + + +@pytest.mark.parametrize('capacity_adjustment', [0.0, 0.25, 0.5, 0.75, 1, 1.5, 3]) +def test_capacity_adjustment_no_save(capacity_adjustment): + inst = Instance(hostname='test-host', capacity_adjustment=Decimal(capacity_adjustment), capacity=0, cpu_capacity=10, mem_capacity=1000) + assert inst.capacity == 0 + assert inst.capacity_adjustment == capacity_adjustment # sanity + inst.set_capacity_value() + assert inst.capacity > 0 + assert inst.capacity == (float(inst.capacity_adjustment) * abs(inst.mem_capacity - inst.cpu_capacity) + min(inst.mem_capacity, inst.cpu_capacity)) def T(impact): diff --git a/awx/main/utils/common.py b/awx/main/utils/common.py index 9194fa51d9..dc324efaf5 100644 --- a/awx/main/utils/common.py +++ b/awx/main/utils/common.py @@ -18,8 +18,6 @@ import tempfile import psutil from functools import reduce, wraps -from decimal import Decimal - # Django from django.core.exceptions import ObjectDoesNotExist, FieldDoesNotExist from django.utils.dateparse import parse_datetime @@ -72,9 +70,6 @@ __all__ = [ 'set_current_apps', 'extract_ansible_vars', 'get_search_fields', - 'get_system_task_capacity', - 'get_cpu_capacity', - 'get_mem_capacity', 'model_to_dict', 'NullablePromptPseudoField', 'model_instance_diff', @@ -715,7 +710,14 @@ def get_cpu_effective_capacity(cpu_count): return cpu_count * forkcpu -def get_cpu_capacity(): +def measure_cpu(): # TODO: replace with import from ansible-runner + return psutil.cpu_count() + + +def get_corrected_cpu(cpu_count): # formerlly get_cpu_capacity + """Some environments will do a correction to the reported CPU number + because the given OpenShift value is a lie + """ from django.conf import settings settings_abscpu = getattr(settings, 'SYSTEM_TASK_ABS_CPU', None) @@ -726,9 +728,7 @@ def get_cpu_capacity(): elif settings_abscpu is not None: return 0, int(settings_abscpu) - cpu = psutil.cpu_count() - - return (cpu, 
get_cpu_effective_capacity(cpu)) + return cpu_count # no correction def get_mem_effective_capacity(mem_mb): @@ -747,7 +747,11 @@ def get_mem_effective_capacity(mem_mb): return max(1, ((mem_mb // 1024 // 1024) - 2048) // forkmem) -def get_mem_capacity(): +def measure_memory(): # TODO: replace with import from ansible-runner + return psutil.virtual_memory().total + + +def get_corrected_memory(memory): from django.conf import settings settings_absmem = getattr(settings, 'SYSTEM_TASK_ABS_MEM', None) @@ -758,33 +762,7 @@ def get_mem_capacity(): elif settings_absmem is not None: return 0, int(settings_absmem) - mem = psutil.virtual_memory().total - return (mem, get_mem_effective_capacity(mem)) - - -def get_system_task_capacity(scale=Decimal(1.0), cpu_capacity=None, mem_capacity=None): - """ - Measure system memory and use it as a baseline for determining the system's capacity - """ - from django.conf import settings - - settings_forks = getattr(settings, 'SYSTEM_TASK_FORKS_CAPACITY', None) - env_forks = os.getenv('SYSTEM_TASK_FORKS_CAPACITY', None) - - if env_forks: - return int(env_forks) - elif settings_forks: - return int(settings_forks) - - if cpu_capacity is None: - _, cpu_cap = get_cpu_capacity() - else: - cpu_cap = cpu_capacity - if mem_capacity is None: - _, mem_cap = get_mem_capacity() - else: - mem_cap = mem_capacity - return min(mem_cap, cpu_cap) + ((max(mem_cap, cpu_cap) - min(mem_cap, cpu_cap)) * scale) + return memory _inventory_updates = threading.local() diff --git a/awx/settings/defaults.py b/awx/settings/defaults.py index d87ddc072a..b4b5876863 100644 --- a/awx/settings/defaults.py +++ b/awx/settings/defaults.py @@ -419,10 +419,18 @@ DEVSERVER_DEFAULT_PORT = '8013' # Set default ports for live server tests. 
os.environ.setdefault('DJANGO_LIVE_TEST_SERVER_ADDRESS', 'localhost:9013-9199') +# heartbeat period can factor into some forms of logic, so it is maintained as a setting here +CLUSTER_NODE_HEARTBEAT_PERIOD = 60 +RECEPTOR_SERVICE_ADVERTISEMENT_PERIOD = 60 # https://github.com/ansible/receptor/blob/aa1d589e154d8a0cb99a220aff8f98faf2273be6/pkg/netceptor/netceptor.go#L34 + BROKER_URL = 'unix:///var/run/redis/redis.sock' CELERYBEAT_SCHEDULE = { 'tower_scheduler': {'task': 'awx.main.tasks.awx_periodic_scheduler', 'schedule': timedelta(seconds=30), 'options': {'expires': 20}}, - 'cluster_heartbeat': {'task': 'awx.main.tasks.cluster_node_heartbeat', 'schedule': timedelta(seconds=60), 'options': {'expires': 50}}, + 'cluster_heartbeat': { + 'task': 'awx.main.tasks.cluster_node_heartbeat', + 'schedule': timedelta(seconds=CLUSTER_NODE_HEARTBEAT_PERIOD), + 'options': {'expires': 50}, + }, 'gather_analytics': {'task': 'awx.main.tasks.gather_analytics', 'schedule': timedelta(minutes=5)}, 'task_manager': {'task': 'awx.main.scheduler.tasks.run_task_manager', 'schedule': timedelta(seconds=20), 'options': {'expires': 20}}, 'k8s_reaper': {'task': 'awx.main.tasks.awx_k8s_reaper', 'schedule': timedelta(seconds=60), 'options': {'expires': 50}}, diff --git a/tools/docker-compose/ansible/roles/sources/templates/receptor-awx.conf.j2 b/tools/docker-compose/ansible/roles/sources/templates/receptor-awx.conf.j2 index e95097e1ef..a274b5b2b7 100644 --- a/tools/docker-compose/ansible/roles/sources/templates/receptor-awx.conf.j2 +++ b/tools/docker-compose/ansible/roles/sources/templates/receptor-awx.conf.j2 @@ -2,8 +2,7 @@ - node: id: awx_{{ item }} -- log-level: - debug +- log-level: info - tcp-listener: port: 2222 From c3ad479fc64600b78361c9ad7ce5ef3ab3d31f0f Mon Sep 17 00:00:00 2001 From: Alan Rominger Date: Wed, 18 Aug 2021 07:53:12 -0400 Subject: [PATCH 24/27] Minor tweaks for the mesh_code branch from review (#10902) --- Makefile | 2 +- awx/main/scheduler/task_manager.py | 1 - 
awx/main/wsbroadcast.py | 8 +++++++- awx/settings/development.py | 2 -- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index f8269bdd1b..7e35284667 100644 --- a/Makefile +++ b/Makefile @@ -181,7 +181,7 @@ init: receptor --cert-makereq bits=2048 commonname="$$node test cert" dnsname=$$node nodeid=$$node outreq=/etc/receptor/certs/$$node.csr outkey=/etc/receptor/certs/$$node.key; \ receptor --cert-signreq req=/etc/receptor/certs/$$node.csr cacert=/etc/receptor/certs/ca.crt cakey=/etc/receptor/certs/ca.key outcert=/etc/receptor/certs/$$node.crt verify=yes; \ done; \ - fi; \ + fi # Refresh development environment after pulling new code. refresh: clean requirements_dev version_file develop migrate diff --git a/awx/main/scheduler/task_manager.py b/awx/main/scheduler/task_manager.py index 313c89f5c7..99fd8015f7 100644 --- a/awx/main/scheduler/task_manager.py +++ b/awx/main/scheduler/task_manager.py @@ -75,7 +75,6 @@ class TaskManager: instances_partial = [ SimpleNamespace( obj=instance, - version=instance.version, node_type=instance.node_type, remaining_capacity=instance.remaining_capacity, capacity=instance.capacity, diff --git a/awx/main/wsbroadcast.py b/awx/main/wsbroadcast.py index afd444ff5b..1ec4fa1256 100644 --- a/awx/main/wsbroadcast.py +++ b/awx/main/wsbroadcast.py @@ -32,7 +32,13 @@ def unwrap_broadcast_msg(payload: dict): def get_broadcast_hosts(): Instance = apps.get_model('main', 'Instance') - instances = Instance.objects.exclude(hostname=Instance.objects.me().hostname).exclude(version__startswith='ansible-runner-').order_by('hostname').values('hostname', 'ip_address').distinct() + instances = ( + Instance.objects.exclude(hostname=Instance.objects.me().hostname) + .exclude(node_type='execution') + .order_by('hostname') + .values('hostname', 'ip_address') + .distinct() + ) return {i['hostname']: i['ip_address'] or i['hostname'] for i in instances} diff --git a/awx/settings/development.py b/awx/settings/development.py index 
7bba10351a..3c41dde684 100644 --- a/awx/settings/development.py +++ b/awx/settings/development.py @@ -125,8 +125,6 @@ if "pytest" in sys.modules: } } -CELERYBEAT_SCHEDULE['cluster_heartbeat']['schedule'] = timedelta(seconds=10) - CLUSTER_HOST_ID = socket.gethostname() AWX_CALLBACK_PROFILE = True From 940c189c12bdc0843372ce1d88051d05869860f2 Mon Sep 17 00:00:00 2001 From: Alan Rominger Date: Tue, 24 Aug 2021 08:30:55 -0400 Subject: [PATCH 25/27] Corresponding AWX changes for runner --worker-info schema update (#10926) --- awx/main/models/ha.py | 11 ++++++++++- awx/main/tasks.py | 13 +++++++------ awx/main/utils/receptor.py | 9 +++++++-- 3 files changed, 24 insertions(+), 9 deletions(-) diff --git a/awx/main/models/ha.py b/awx/main/models/ha.py index ebd1ad44b0..00475254bd 100644 --- a/awx/main/models/ha.py +++ b/awx/main/models/ha.py @@ -3,6 +3,7 @@ from decimal import Decimal import random +import logging from django.core.validators import MinValueValidator from django.db import models, connection @@ -26,6 +27,8 @@ from awx.main.models.mixins import RelatedJobsMixin __all__ = ('Instance', 'InstanceGroup', 'TowerScheduleState') +logger = logging.getLogger('awx.main.models.ha') + class HasPolicyEditsMixin(HasEditsMixin): class Meta: @@ -174,13 +177,19 @@ class Instance(HasPolicyEditsMixin, BaseModel): self.mem_capacity = get_mem_effective_capacity(self.memory) self.set_capacity_value() - def save_health_data(self, version, cpu, memory, last_seen=None, has_error=False): + def save_health_data(self, version, cpu, memory, uuid=None, last_seen=None, has_error=False): update_fields = [] if last_seen is not None and self.last_seen != last_seen: self.last_seen = last_seen update_fields.append('last_seen') + if uuid is not None and self.uuid != uuid: + if self.uuid is not None: + logger.warn(f'Self-reported uuid of {self.hostname} changed from {self.uuid} to {uuid}') + self.uuid = uuid + update_fields.append('uuid') + if self.version != version: self.version = version 
update_fields.append('version') diff --git a/awx/main/tasks.py b/awx/main/tasks.py index 9dc0065459..c577a426e8 100644 --- a/awx/main/tasks.py +++ b/awx/main/tasks.py @@ -411,14 +411,15 @@ def execution_node_health_check(node): prior_capacity = instance.capacity instance.save_health_data( - 'ansible-runner-' + data.get('Version', '???'), - data.get('CPU Capacity', 0), # TODO: rename field on runner side to not say "Capacity" - data.get('Memory Capacity', 0) * 1000, # TODO: double-check the multiplier here - has_error=bool(data.get('Errors')), + version='ansible-runner-' + data.get('runner_version', '???'), + cpu=data.get('cpu_count', 0), + memory=data.get('mem_in_bytes', 0), + uuid=data.get('uuid'), + has_error=bool(data.get('errors')), ) - if data['Errors']: - formatted_error = "\n".join(data["Errors"]) + if data['errors']: + formatted_error = "\n".join(data["errors"]) if prior_capacity: logger.warn(f'Health check marking execution node {node} as lost, errors:\n{formatted_error}') else: diff --git a/awx/main/utils/receptor.py b/awx/main/utils/receptor.py index 8a205334c1..b792561cf8 100644 --- a/awx/main/utils/receptor.py +++ b/awx/main/utils/receptor.py @@ -16,7 +16,7 @@ def worker_info(node_name): receptor_ctl = get_receptor_ctl() transmit_start = time.time() error_list = [] - data = {'Errors': error_list, 'transmit_timing': 0.0} + data = {'errors': error_list, 'transmit_timing': 0.0} result = receptor_ctl.submit_work(worktype='ansible-runner', payload='', params={"params": f"--worker-info"}, ttl='20s', node=node_name) @@ -71,7 +71,12 @@ def worker_info(node_name): if not isinstance(remote_data, dict): error_list.append(f'Remote node {node_name} --worker-info output is not a YAML dict, output:{stdout}') else: - error_list.extend(remote_data.pop('Errors')) # merge both error lists + error_list.extend(remote_data.pop('errors', [])) # merge both error lists data.update(remote_data) + # see tasks.py usage of keys + missing_keys = set(('runner_version', 
'mem_in_bytes', 'cpu_count')) - set(data.keys()) + if missing_keys: + data['errors'].append('Worker failed to return keys {}'.format(' '.join(missing_keys))) + return data From 274e487a964969996dfba080c37375ce666e8ec0 Mon Sep 17 00:00:00 2001 From: Shane McDonald Date: Tue, 24 Aug 2021 10:33:00 -0400 Subject: [PATCH 26/27] Attempt to surface streaming errors that were being eaten (#10918) --- awx/main/tasks.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/awx/main/tasks.py b/awx/main/tasks.py index c577a426e8..fa65f97f40 100644 --- a/awx/main/tasks.py +++ b/awx/main/tasks.py @@ -3078,7 +3078,13 @@ class AWXReceptorJob: return res if not self.task.instance.result_traceback: - raise RuntimeError(detail) + try: + resultsock = receptor_ctl.get_work_results(self.unit_id, return_sockfile=True) + lines = resultsock.readlines() + self.task.instance.result_traceback = b"".join(lines).decode() + self.task.instance.save(update_fields=['result_traceback']) + except Exception: + raise RuntimeError(detail) return res From 42484cf98ddd2e9ffb8d9c0cf8eb35abb2eb3a93 Mon Sep 17 00:00:00 2001 From: Alan Rominger Date: Tue, 24 Aug 2021 11:20:21 -0400 Subject: [PATCH 27/27] Obtain receptor sockfile from the receptor.conf file (#10932) --- awx/main/utils/receptor.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/awx/main/utils/receptor.py b/awx/main/utils/receptor.py index b792561cf8..cec32756f9 100644 --- a/awx/main/utils/receptor.py +++ b/awx/main/utils/receptor.py @@ -8,8 +8,24 @@ from receptorctl.socket_interface import ReceptorControl logger = logging.getLogger('awx.main.utils.receptor') +def get_receptor_sockfile(): + receptor_conf = '/etc/receptor/receptor.conf' + with open(receptor_conf, 'r') as f: + data = yaml.safe_load(f) + for section in data: + for entry_name, entry_data in section.items(): + if entry_name == 'control-service': + if 'filename' in entry_data: + return entry_data['filename'] + else: + raise 
RuntimeError(f'Receptor conf {receptor_conf} control-service entry does not have a filename parameter') + else: + raise RuntimeError(f'Receptor conf {receptor_conf} does not have control-service entry needed to get sockfile') + + def get_receptor_ctl(): - return ReceptorControl('/var/run/receptor/receptor.sock') + receptor_sockfile = get_receptor_sockfile() + return ReceptorControl(receptor_sockfile) def worker_info(node_name):