From 06883dc88272ea112c871908db6b65ef0929c2e4 Mon Sep 17 00:00:00 2001 From: Elijah DeLee Date: Wed, 6 Jul 2022 23:11:49 -0400 Subject: [PATCH] add settings to define task manager timeout Also have a separate setting for a grace period beyond the timeout. This is so in the future, we can exit after the timeout but before the grace period. --- awx/main/dispatch/pool.py | 4 ++-- awx/settings/defaults.py | 5 +++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/awx/main/dispatch/pool.py b/awx/main/dispatch/pool.py index 576f6bf799..a302325993 100644 --- a/awx/main/dispatch/pool.py +++ b/awx/main/dispatch/pool.py @@ -406,8 +406,8 @@ class AutoscalePool(WorkerPool): w.managed_tasks[current_task['uuid']]['started'] = time.time() age = time.time() - current_task['started'] w.managed_tasks[current_task['uuid']]['age'] = age - if age > (60 * 5): - logger.error(f'run_task_manager has held the advisory lock for >5m, sending SIGTERM to {w.pid}') # noqa + if age > (settings.TASK_MANAGER_TIMEOUT + settings.TASK_MANAGER_TIMEOUT_GRACE_PERIOD): + logger.error(f'run_task_manager has held the advisory lock for {age}, sending SIGTERM to {w.pid}') # noqa os.kill(w.pid, signal.SIGTERM) for m in orphaned: diff --git a/awx/settings/defaults.py b/awx/settings/defaults.py index ef389e5151..ea84d235fe 100644 --- a/awx/settings/defaults.py +++ b/awx/settings/defaults.py @@ -248,6 +248,11 @@ SUBSYSTEM_METRICS_TASK_MANAGER_RECORD_INTERVAL = 15 # The maximum allowed jobs to start on a given task manager cycle START_TASK_LIMIT = 100 +# Time out task managers if they take longer than this many seconds, plus TASK_MANAGER_TIMEOUT_GRACE_PERIOD +# We have the grace period so the task manager can bail out before the timeout. +TASK_MANAGER_TIMEOUT = 300 +TASK_MANAGER_TIMEOUT_GRACE_PERIOD = 60 + # Disallow sending session cookies over insecure connections SESSION_COOKIE_SECURE = True