diff --git a/awx/main/dispatch/pool.py b/awx/main/dispatch/pool.py index 576f6bf799..a302325993 100644 --- a/awx/main/dispatch/pool.py +++ b/awx/main/dispatch/pool.py @@ -406,8 +406,8 @@ class AutoscalePool(WorkerPool): w.managed_tasks[current_task['uuid']]['started'] = time.time() age = time.time() - current_task['started'] w.managed_tasks[current_task['uuid']]['age'] = age - if age > (60 * 5): - logger.error(f'run_task_manager has held the advisory lock for >5m, sending SIGTERM to {w.pid}') # noqa + if age > (settings.TASK_MANAGER_TIMEOUT + settings.TASK_MANAGER_TIMEOUT_GRACE_PERIOD): + logger.error(f'run_task_manager has held the advisory lock for {age}, sending SIGTERM to {w.pid}') # noqa os.kill(w.pid, signal.SIGTERM) for m in orphaned: diff --git a/awx/settings/defaults.py b/awx/settings/defaults.py index 6a02d0d710..a2b19a82d8 100644 --- a/awx/settings/defaults.py +++ b/awx/settings/defaults.py @@ -248,6 +248,11 @@ SUBSYSTEM_METRICS_TASK_MANAGER_RECORD_INTERVAL = 15 # The maximum allowed jobs to start on a given task manager cycle START_TASK_LIMIT = 100 +# Time out the task manager once a run has lasted longer than TASK_MANAGER_TIMEOUT seconds plus TASK_MANAGER_TIMEOUT_GRACE_PERIOD seconds. +# The grace period lets the task manager bail out on its own before it is forcibly timed out. +TASK_MANAGER_TIMEOUT = 300 +TASK_MANAGER_TIMEOUT_GRACE_PERIOD = 60 + # Disallow sending session cookies over insecure connections SESSION_COOKIE_SECURE = True