add workflow manager

Author: Seth Foster
Date:   2022-06-28 16:35:07 -04:00
Parent: a8a823525a
Commit: 109f39de75

6 changed files with 180 additions and 167 deletions


@@ -188,7 +188,6 @@ class Metrics:
             SetFloatM('task_manager_start_task_seconds', 'Time spent starting task'),
             SetFloatM('task_manager_process_running_tasks_seconds', 'Time spent processing running tasks'),
             SetFloatM('task_manager_process_pending_tasks_seconds', 'Time spent processing pending tasks'),
-            SetFloatM('task_manager_spawn_workflow_graph_jobs_seconds', 'Time spent spawning workflow jobs'),
             SetFloatM('task_manager__schedule_seconds', 'Time spent in running the entire _schedule'),
             IntM('task_manager_schedule_calls', 'Number of calls to task manager schedule'),
             SetFloatM('task_manager_recorded_timestamp', 'Unix timestamp when metrics were last recorded'),
@@ -196,12 +195,16 @@ class Metrics:
             SetIntM('task_manager_running_processed', 'Number of running tasks processed'),
             SetIntM('task_manager_pending_processed', 'Number of pending tasks processed'),
             SetIntM('task_manager_tasks_blocked', 'Number of tasks blocked from running'),
-            SetFloatM('task_prepper_get_tasks_seconds', 'Time spent in loading all tasks from db'),
-            SetFloatM('task_prepper_generate_dependencies_seconds', 'Time spent generating dependencies for pending tasks'),
-            SetFloatM('task_prepper__schedule_seconds', 'Time spent in running the entire _schedule'),
-            IntM('task_prepper_schedule_calls', 'Number of calls to task manager schedule'),
-            SetFloatM('task_prepper_recorded_timestamp', 'Unix timestamp when metrics were last recorded'),
-            SetIntM('task_prepper_pending_processed', 'Number of pending tasks processed'),
+            SetFloatM('dependency_manager_get_tasks_seconds', 'Time spent in loading all tasks from db'),
+            SetFloatM('dependency_manager_generate_dependencies_seconds', 'Time spent generating dependencies for pending tasks'),
+            SetFloatM('dependency_manager__schedule_seconds', 'Time spent in running the entire _schedule'),
+            IntM('dependency_manager_schedule_calls', 'Number of calls to task manager schedule'),
+            SetFloatM('dependency_manager_recorded_timestamp', 'Unix timestamp when metrics were last recorded'),
+            SetIntM('dependency_manager_pending_processed', 'Number of pending tasks processed'),
+            SetFloatM('workflow_manager__schedule_seconds', 'Time spent in running the entire _schedule'),
+            IntM('workflow_manager_schedule_calls', 'Number of calls to task manager schedule'),
+            SetFloatM('workflow_manager_recorded_timestamp', 'Unix timestamp when metrics were last recorded'),
+            SetFloatM('workflow_manager_spawn_workflow_graph_jobs_seconds', 'Time spent spawning workflow jobs'),
         ]
         # turn metric list into dictionary with the metric name as a key
         self.METRICS = {}
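The hunk ends just before the dictionary is built. A minimal sketch of the shape the comment describes, using a hypothetical Metric stand-in for SetFloatM/IntM/SetIntM (assumption: each metric object exposes its name as a `field` attribute):

from dataclasses import dataclass

@dataclass
class Metric:  # hypothetical stand-in for SetFloatM / IntM / SetIntM
    field: str
    help_text: str

metrics_list = [
    Metric('workflow_manager__schedule_seconds', 'Time spent in running the entire _schedule'),
    Metric('workflow_manager_schedule_calls', 'Number of calls to task manager schedule'),
]
# keyed by name so later inc()/set() calls can look a metric up in O(1)
METRICS = {m.field: m for m in metrics_list}
assert 'workflow_manager_schedule_calls' in METRICS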


@@ -1,6 +1,6 @@
 # Copyright (c) 2017 Ansible, Inc.
 #
-from .task_manager import TaskManager, TaskPrepper
+from .task_manager import TaskManager, DependencyManager, WorkflowManager

-__all__ = ['TaskManager', 'TaskPrepper']
+__all__ = ['TaskManager', 'DependencyManager', 'WorkflowManager']
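With both new classes exported, either manager can be driven directly, e.g. from a shell; `schedule()` is the shared TaskBase entry point around each subclass's `_schedule()`:

from awx.main.scheduler import DependencyManager, WorkflowManager

WorkflowManager().schedule()    # runs WorkflowManager._schedule() via TaskBase
DependencyManager().schedule()  # likewise for dependency generation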


@@ -53,11 +53,7 @@ def timeit(func):
         t_now = time.perf_counter()
         result = func(*args, **kwargs)
         dur = time.perf_counter() - t_now
-        if func.__qualname__.startswith("TaskManager"):
-            prefix = "task_manager"
-        else:
-            prefix = "task_prepper"
-        args[0].subsystem_metrics.inc(f"{prefix}_{func.__name__}_seconds", dur)
+        args[0].subsystem_metrics.inc(f"{args[0].prefix}_{func.__name__}_seconds", dur)
         return result

     return inner
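Deriving the metric name from `args[0].prefix` instead of inspecting `func.__qualname__` means the decorator no longer enumerates subclasses: any TaskBase subclass that sets `self.prefix` gets correctly named timings for free. A runnable sketch with a fake metrics object (FakeMetrics and Demo are illustrative, not the real SubsystemMetrics API):

import time
from functools import wraps

class FakeMetrics:  # hypothetical stand-in for the real subsystem_metrics object
    def __init__(self):
        self.data = {}

    def inc(self, name, value):
        self.data[name] = self.data.get(name, 0.0) + value

def timeit(func):
    @wraps(func)
    def inner(*args, **kwargs):
        t_now = time.perf_counter()
        result = func(*args, **kwargs)
        dur = time.perf_counter() - t_now
        # args[0] is `self`; its prefix selects the metric family
        args[0].subsystem_metrics.inc(f"{args[0].prefix}_{func.__name__}_seconds", dur)
        return result

    return inner

class Demo:
    prefix = "workflow_manager"
    subsystem_metrics = FakeMetrics()

    @timeit
    def _schedule(self):
        time.sleep(0.01)

Demo()._schedule()
assert "workflow_manager__schedule_seconds" in Demo.subsystem_metrics.data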
@@ -129,9 +125,160 @@ class TaskBase:
         logger.debug(f"Finishing {self.prefix} Scheduler")


-class TaskPrepper(TaskBase):
+class WorkflowManager(TaskBase):
     def __init__(self):
-        self.prefix = "task_prepper"
+        self.prefix = "workflow_manager"
         super().__init__()

+    @timeit
+    def spawn_workflow_graph_jobs(self, workflow_jobs):
+        logger.debug(f"=== {workflow_jobs}")
+        for workflow_job in workflow_jobs:
+            if workflow_job.cancel_flag:
+                logger.debug('Not spawning jobs for %s because it is pending cancelation.', workflow_job.log_format)
+                continue
+            dag = WorkflowDAG(workflow_job)
+            spawn_nodes = dag.bfs_nodes_to_run()
+            if spawn_nodes:
+                logger.debug('Spawning jobs for %s', workflow_job.log_format)
+            else:
+                logger.debug('No nodes to spawn for %s', workflow_job.log_format)
+            for spawn_node in spawn_nodes:
+                if spawn_node.unified_job_template is None:
+                    continue
+                kv = spawn_node.get_job_kwargs()
+                job = spawn_node.unified_job_template.create_unified_job(**kv)
+                spawn_node.job = job
+                spawn_node.save()
+                logger.debug('Spawned %s in %s for node %s', job.log_format, workflow_job.log_format, spawn_node.pk)
+                can_start = True
+                if isinstance(spawn_node.unified_job_template, WorkflowJobTemplate):
+                    workflow_ancestors = job.get_ancestor_workflows()
+                    if spawn_node.unified_job_template in set(workflow_ancestors):
+                        can_start = False
+                        logger.info(
+                            'Refusing to start recursive workflow-in-workflow id={}, wfjt={}, ancestors={}'.format(
+                                job.id, spawn_node.unified_job_template.pk, [wa.pk for wa in workflow_ancestors]
+                            )
+                        )
+                        display_list = [spawn_node.unified_job_template] + workflow_ancestors
+                        job.job_explanation = gettext_noop(
+                            "Workflow Job spawned from workflow could not start because it " "would result in recursion (spawn order, most recent first: {})"
+                        ).format(', '.join(['<{}>'.format(tmp) for tmp in display_list]))
+                    else:
+                        logger.debug(
+                            'Starting workflow-in-workflow id={}, wfjt={}, ancestors={}'.format(
+                                job.id, spawn_node.unified_job_template.pk, [wa.pk for wa in workflow_ancestors]
+                            )
+                        )
+                if not job._resources_sufficient_for_launch():
+                    can_start = False
+                    job.job_explanation = gettext_noop(
+                        "Job spawned from workflow could not start because it " "was missing a related resource such as project or inventory"
+                    )
+                if can_start:
+                    if workflow_job.start_args:
+                        start_args = json.loads(decrypt_field(workflow_job, 'start_args'))
+                    else:
+                        start_args = {}
+                    can_start = job.signal_start(**start_args)
+                    if not can_start:
+                        job.job_explanation = gettext_noop(
+                            "Job spawned from workflow could not start because it " "was not in the right state or required manual credentials"
+                        )
+                if not can_start:
+                    job.status = 'failed'
+                    job.save(update_fields=['status', 'job_explanation'])
+                    job.websocket_emit_status('failed')
+
+                # TODO: should we emit a status on the socket here similar to tasks.py awx_periodic_scheduler() ?
+                # emit_websocket_notification('/socket.io/jobs', '', dict(id=))
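The `workflow_ancestors` check above is the guard against infinite workflow-in-workflow nesting: a node may not launch a workflow template that already appears in the spawning job's ancestry. The core test, reduced to a self-contained sketch over primary keys (values here are hypothetical):

def would_recurse(node_template_pk, ancestor_pks):
    # launching a template that is already an ancestor would loop forever
    return node_template_pk in set(ancestor_pks)

assert would_recurse(7, [3, 7, 12]) is True    # refused: recursion
assert would_recurse(8, [3, 7, 12]) is False   # safe to start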
+
+    def process_finished_workflow_jobs(self, workflow_jobs):
+        result = []
+        for workflow_job in workflow_jobs:
+            dag = WorkflowDAG(workflow_job)
+            status_changed = False
+            if workflow_job.cancel_flag:
+                workflow_job.workflow_nodes.filter(do_not_run=False, job__isnull=True).update(do_not_run=True)
+                logger.debug('Canceling spawned jobs of %s due to cancel flag.', workflow_job.log_format)
+                cancel_finished = dag.cancel_node_jobs()
+                if cancel_finished:
+                    logger.info('Marking %s as canceled, all spawned jobs have concluded.', workflow_job.log_format)
+                    workflow_job.status = 'canceled'
+                    workflow_job.start_args = ''  # blank field to remove encrypted passwords
+                    workflow_job.save(update_fields=['status', 'start_args'])
+                    status_changed = True
+            else:
+                workflow_nodes = dag.mark_dnr_nodes()
+                for n in workflow_nodes:
+                    n.save(update_fields=['do_not_run'])
+                is_done = dag.is_workflow_done()
+                if not is_done:
+                    continue
+                has_failed, reason = dag.has_workflow_failed()
+                logger.debug('Marking %s as %s.', workflow_job.log_format, 'failed' if has_failed else 'successful')
+                result.append(workflow_job.id)
+                new_status = 'failed' if has_failed else 'successful'
+                logger.debug("Transitioning {} to {} status.".format(workflow_job.log_format, new_status))
+                update_fields = ['status', 'start_args']
+                workflow_job.status = new_status
+                if reason:
+                    logger.info(f'Workflow job {workflow_job.id} failed due to reason: {reason}')
+                    workflow_job.job_explanation = gettext_noop("No error handling paths found, marking workflow as failed")
+                    update_fields.append('job_explanation')
+                workflow_job.start_args = ''  # blank field to remove encrypted passwords
+                workflow_job.save(update_fields=update_fields)
+                status_changed = True
+            if status_changed:
+                if workflow_job.spawned_by_workflow:
+                    schedule_task_manager()
+                workflow_job.websocket_emit_status(workflow_job.status)
+                # Operations whose queries rely on modifications made during the atomic scheduling session
+                workflow_job.send_notification_templates('succeeded' if workflow_job.status == 'successful' else 'failed')
+        return result
+
+    def timeout_approval_node(self):
+        workflow_approvals = WorkflowApproval.objects.filter(status='pending')
+        now = tz_now()
+        for task in workflow_approvals:
+            approval_timeout_seconds = timedelta(seconds=task.timeout)
+            if task.timeout == 0:
+                continue
+            if (now - task.created) >= approval_timeout_seconds:
+                timeout_message = _("The approval node {name} ({pk}) has expired after {timeout} seconds.").format(
+                    name=task.name, pk=task.pk, timeout=task.timeout
+                )
+                logger.warning(timeout_message)
+                task.timed_out = True
+                task.status = 'failed'
+                task.send_approval_notification('timed_out')
+                task.websocket_emit_status(task.status)
+                task.job_explanation = timeout_message
+                task.save(update_fields=['status', 'job_explanation', 'timed_out'])
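The timeout test above is plain wall-clock arithmetic, with `timeout == 0` meaning "never expire". The same predicate as a standalone sketch:

from datetime import datetime, timedelta, timezone

def approval_expired(created, timeout_seconds, now=None):
    if timeout_seconds == 0:  # 0 means the approval never times out
        return False
    now = now or datetime.now(timezone.utc)
    return (now - created) >= timedelta(seconds=timeout_seconds)

created = datetime.now(timezone.utc) - timedelta(seconds=90)
assert approval_expired(created, 60) is True
assert approval_expired(created, 0) is False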
+
+    @timeit
+    def _schedule(self):
+        running_workflow_tasks = self.get_running_workflow_jobs()
+        if len(running_workflow_tasks) > 0:
+            self.process_finished_workflow_jobs(running_workflow_tasks)
+
+            previously_running_workflow_tasks = running_workflow_tasks
+            running_workflow_tasks = []
+            for workflow_job in previously_running_workflow_tasks:
+                if workflow_job.status == 'running':
+                    running_workflow_tasks.append(workflow_job)
+                else:
+                    logger.debug('Removed %s from job spawning consideration.', workflow_job.log_format)
+
+            self.spawn_workflow_graph_jobs(running_workflow_tasks)
+
+            self.timeout_approval_node()
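The loop that rebuilds `running_workflow_tasks` drops jobs that `process_finished_workflow_jobs` just moved out of the 'running' state, so nodes are only spawned for workflows that are still live. The same filter as a toy one-liner:

from types import SimpleNamespace

jobs = [SimpleNamespace(status='running'), SimpleNamespace(status='successful')]
still_running = [j for j in jobs if j.status == 'running']
assert len(still_running) == 1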
+
+
+class DependencyManager(TaskBase):
+    def __init__(self):
+        self.prefix = "dependency_manager"
+        super().__init__()
+
     def create_project_update(self, task, project_id=None):
@@ -433,133 +580,6 @@ class TaskManager(TaskBase):
         task.websocket_emit_status(task.status)  # adds to on_commit
         connection.on_commit(post_commit)

-    @timeit
-    def spawn_workflow_graph_jobs(self, workflow_jobs):
-        logger.debug(f"=== {workflow_jobs}")
-        for workflow_job in workflow_jobs:
-            if workflow_job.cancel_flag:
-                logger.debug('Not spawning jobs for %s because it is pending cancelation.', workflow_job.log_format)
-                continue
-            dag = WorkflowDAG(workflow_job)
-            spawn_nodes = dag.bfs_nodes_to_run()
-            if spawn_nodes:
-                logger.debug('Spawning jobs for %s', workflow_job.log_format)
-            else:
-                logger.debug('No nodes to spawn for %s', workflow_job.log_format)
-            for spawn_node in spawn_nodes:
-                if spawn_node.unified_job_template is None:
-                    continue
-                kv = spawn_node.get_job_kwargs()
-                job = spawn_node.unified_job_template.create_unified_job(**kv)
-                spawn_node.job = job
-                spawn_node.save()
-                logger.debug('Spawned %s in %s for node %s', job.log_format, workflow_job.log_format, spawn_node.pk)
-                can_start = True
-                if isinstance(spawn_node.unified_job_template, WorkflowJobTemplate):
-                    workflow_ancestors = job.get_ancestor_workflows()
-                    if spawn_node.unified_job_template in set(workflow_ancestors):
-                        can_start = False
-                        logger.info(
-                            'Refusing to start recursive workflow-in-workflow id={}, wfjt={}, ancestors={}'.format(
-                                job.id, spawn_node.unified_job_template.pk, [wa.pk for wa in workflow_ancestors]
-                            )
-                        )
-                        display_list = [spawn_node.unified_job_template] + workflow_ancestors
-                        job.job_explanation = gettext_noop(
-                            "Workflow Job spawned from workflow could not start because it " "would result in recursion (spawn order, most recent first: {})"
-                        ).format(', '.join(['<{}>'.format(tmp) for tmp in display_list]))
-                    else:
-                        logger.debug(
-                            'Starting workflow-in-workflow id={}, wfjt={}, ancestors={}'.format(
-                                job.id, spawn_node.unified_job_template.pk, [wa.pk for wa in workflow_ancestors]
-                            )
-                        )
-                if not job._resources_sufficient_for_launch():
-                    can_start = False
-                    job.job_explanation = gettext_noop(
-                        "Job spawned from workflow could not start because it " "was missing a related resource such as project or inventory"
-                    )
-                if can_start:
-                    if workflow_job.start_args:
-                        start_args = json.loads(decrypt_field(workflow_job, 'start_args'))
-                    else:
-                        start_args = {}
-                    can_start = job.signal_start(**start_args)
-                    if not can_start:
-                        job.job_explanation = gettext_noop(
-                            "Job spawned from workflow could not start because it " "was not in the right state or required manual credentials"
-                        )
-                if not can_start:
-                    job.status = 'failed'
-                    job.save(update_fields=['status', 'job_explanation'])
-                    job.websocket_emit_status('failed')
-
-                # TODO: should we emit a status on the socket here similar to tasks.py awx_periodic_scheduler() ?
-                # emit_websocket_notification('/socket.io/jobs', '', dict(id=))
-
-    def process_finished_workflow_jobs(self, workflow_jobs):
-        result = []
-        for workflow_job in workflow_jobs:
-            dag = WorkflowDAG(workflow_job)
-            status_changed = False
-            if workflow_job.cancel_flag:
-                workflow_job.workflow_nodes.filter(do_not_run=False, job__isnull=True).update(do_not_run=True)
-                logger.debug('Canceling spawned jobs of %s due to cancel flag.', workflow_job.log_format)
-                cancel_finished = dag.cancel_node_jobs()
-                if cancel_finished:
-                    logger.info('Marking %s as canceled, all spawned jobs have concluded.', workflow_job.log_format)
-                    workflow_job.status = 'canceled'
-                    workflow_job.start_args = ''  # blank field to remove encrypted passwords
-                    workflow_job.save(update_fields=['status', 'start_args'])
-                    status_changed = True
-            else:
-                workflow_nodes = dag.mark_dnr_nodes()
-                for n in workflow_nodes:
-                    n.save(update_fields=['do_not_run'])
-                is_done = dag.is_workflow_done()
-                if not is_done:
-                    continue
-                has_failed, reason = dag.has_workflow_failed()
-                logger.debug('Marking %s as %s.', workflow_job.log_format, 'failed' if has_failed else 'successful')
-                result.append(workflow_job.id)
-                new_status = 'failed' if has_failed else 'successful'
-                logger.debug("Transitioning {} to {} status.".format(workflow_job.log_format, new_status))
-                update_fields = ['status', 'start_args']
-                workflow_job.status = new_status
-                if reason:
-                    logger.info(f'Workflow job {workflow_job.id} failed due to reason: {reason}')
-                    workflow_job.job_explanation = gettext_noop("No error handling paths found, marking workflow as failed")
-                    update_fields.append('job_explanation')
-                workflow_job.start_args = ''  # blank field to remove encrypted passwords
-                workflow_job.save(update_fields=update_fields)
-                status_changed = True
-            if status_changed:
-                if workflow_job.spawned_by_workflow:
-                    schedule_task_manager()
-                workflow_job.websocket_emit_status(workflow_job.status)
-                # Operations whose queries rely on modifications made during the atomic scheduling session
-                workflow_job.send_notification_templates('succeeded' if workflow_job.status == 'successful' else 'failed')
-        return result
-
-    def timeout_approval_node(self):
-        workflow_approvals = WorkflowApproval.objects.filter(status='pending')
-        now = tz_now()
-        for task in workflow_approvals:
-            approval_timeout_seconds = timedelta(seconds=task.timeout)
-            if task.timeout == 0:
-                continue
-            if (now - task.created) >= approval_timeout_seconds:
-                timeout_message = _("The approval node {name} ({pk}) has expired after {timeout} seconds.").format(
-                    name=task.name, pk=task.pk, timeout=task.timeout
-                )
-                logger.warning(timeout_message)
-                task.timed_out = True
-                task.status = 'failed'
-                task.send_approval_notification('timed_out')
-                task.websocket_emit_status(task.status)
-                task.job_explanation = timeout_message
-                task.save(update_fields=['status', 'job_explanation', 'timed_out'])
-
     @timeit
     def process_running_tasks(self, running_tasks):
         for task in running_tasks:
@@ -713,19 +733,4 @@ class TaskManager(TaskBase):
             self.reap_jobs_from_orphaned_instances()

         if len(all_sorted_tasks) > 0:
-            running_workflow_tasks = self.get_running_workflow_jobs()
-            self.process_finished_workflow_jobs(running_workflow_tasks)
-
-            previously_running_workflow_tasks = running_workflow_tasks
-            running_workflow_tasks = []
-            for workflow_job in previously_running_workflow_tasks:
-                if workflow_job.status == 'running':
-                    running_workflow_tasks.append(workflow_job)
-                else:
-                    logger.debug('Removed %s from job spawning consideration.', workflow_job.log_format)
-
-            self.spawn_workflow_graph_jobs(running_workflow_tasks)
-
-            self.timeout_approval_node()
-
             self.process_tasks(all_sorted_tasks)
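Taken together, this file now splits the old monolithic scheduler into TaskBase subclasses that differ only in `prefix` and `_schedule()`; the workflow logic above moved verbatim out of TaskManager into WorkflowManager. A hypothetical skeleton of the pattern — the real TaskBase also logs "Finishing {prefix} Scheduler" (seen above) and, presumably, handles locking and metric recording:

class TaskBase:
    def schedule(self):  # shared entry point; sketch omits locking/metrics
        self._schedule()

class WorkflowManager(TaskBase):
    def __init__(self):
        self.prefix = "workflow_manager"

    def _schedule(self):
        pass  # finish/cancel workflows, spawn DAG nodes, time out approvals

class DependencyManager(TaskBase):
    def __init__(self):
        self.prefix = "dependency_manager"

    def _schedule(self):
        pass  # generate dependencies (e.g. project updates) for pending tasks

class TaskManager(TaskBase):
    def __init__(self):
        self.prefix = "task_manager"

    def _schedule(self):
        pass  # start pending jobs once capacity and dependencies allow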


@@ -2,7 +2,7 @@
 import logging

 # AWX
-from awx.main.scheduler import TaskManager, TaskPrepper
+from awx.main.scheduler import TaskManager, DependencyManager, WorkflowManager
 from awx.main.dispatch.publish import task
 from awx.main.dispatch import get_local_queuename

@@ -11,16 +11,20 @@ logger = logging.getLogger('awx.main.scheduler')

 @task(queue=get_local_queuename)
 def task_manager():
     logger.debug("=== Running task manager.")
     TaskManager().schedule()


 @task(queue=get_local_queuename)
-def task_prepper():
-    logger.debug("=== Running task prepper.")
-    TaskPrepper().schedule()
+def dependency_manager():
+    DependencyManager().schedule()


+@task(queue=get_local_queuename)
+def workflow_manager():
+    WorkflowManager().schedule()
+
+
 def run_task_manager():
     task_manager()
-    task_prepper()
+    dependency_manager()
+    workflow_manager()
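Each function decorated with `@task(queue=get_local_queuename)` can also be submitted asynchronously with `.delay()` (the form `_schedule_task_manager` uses in the next file), while `run_task_manager()` simply calls all three in-process:

from awx.main.scheduler.tasks import dependency_manager, workflow_manager

# asynchronous: push a message onto the local dispatcher queue
dependency_manager.delay()
workflow_manager.delay()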


@@ -862,12 +862,12 @@ def ignore_inventory_computed_fields():

 def _schedule_task_manager():
-    from awx.main.scheduler.tasks import task_manager, task_prepper
+    from awx.main.scheduler.tasks import task_manager, dependency_manager
     from django.db import connection

     # runs right away if not in transaction
     connection.on_commit(lambda: task_manager.delay())
-    connection.on_commit(lambda: task_prepper.delay())
+    connection.on_commit(lambda: dependency_manager.delay())


 @contextlib.contextmanager
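`connection.on_commit` is what makes this safe to call from inside `transaction.atomic()` blocks: Django defers the callback until the surrounding transaction commits, and (per the inline comment) runs it immediately under autocommit. A sketch of the behavior inside a configured Django project:

from django.db import connection, transaction

def submit():
    print("fires only after a successful commit")

with transaction.atomic():
    connection.on_commit(submit)  # queued here ...
# ... executed here, once the atomic block commits; skipped on rollback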


@@ -443,7 +443,8 @@ CELERYBEAT_SCHEDULE = {
     },
     'gather_analytics': {'task': 'awx.main.tasks.system.gather_analytics', 'schedule': timedelta(minutes=5)},
     'task_manager': {'task': 'awx.main.scheduler.tasks.task_manager', 'schedule': timedelta(seconds=20), 'options': {'expires': 20}},
-    'task_prepper': {'task': 'awx.main.scheduler.tasks.task_prepper', 'schedule': timedelta(seconds=20), 'options': {'expires': 20}},
+    'dependency_manager': {'task': 'awx.main.scheduler.tasks.dependency_manager', 'schedule': timedelta(seconds=20), 'options': {'expires': 20}},
+    'workflow_manager': {'task': 'awx.main.scheduler.tasks.workflow_manager', 'schedule': timedelta(seconds=20), 'options': {'expires': 20}},
     'k8s_reaper': {'task': 'awx.main.tasks.system.awx_k8s_reaper', 'schedule': timedelta(seconds=60), 'options': {'expires': 50}},
     'receptor_reaper': {'task': 'awx.main.tasks.system.awx_receptor_workunit_reaper', 'schedule': timedelta(seconds=60)},
     'send_subsystem_metrics': {'task': 'awx.main.analytics.analytics_tasks.send_subsystem_metrics', 'schedule': timedelta(seconds=20)},
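Both new entries run on the same 20-second cadence as `task_manager`, and `'expires': 20` appears intended to let a beat message that sat unclaimed for a full period be discarded rather than pile up behind a slow worker (a reading of the option, not a documented guarantee). Expanded, the new workflow entry is:

'workflow_manager': {
    'task': 'awx.main.scheduler.tasks.workflow_manager',
    'schedule': timedelta(seconds=20),  # run every 20s
    'options': {'expires': 20},         # drop if not picked up within 20s
},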