Upgrade RQ to v1.5 (#5207)

* upgrade RQ to v1.5

* set job's started_at

* update healthcheck to match string worker names

* delay the worker healthcheck for 5 minutes after start, to allow enough time to load when many workers start simultaneously

* log when worker cannot be found
Omer Lachish
2021-02-15 22:52:53 +02:00
committed by GitHub
parent 640fea5e47
commit 46e97a08cc
9 changed files with 40 additions and 73 deletions

View File

@@ -50,30 +50,22 @@ def worker(queues):
 class WorkerHealthcheck(base.BaseCheck):
-    NAME = 'RQ Worker Healthcheck'
-    INTERVAL = datetime.timedelta(minutes=5)
-    _last_check_time = {}
-
-    def time_to_check(self, pid):
-        now = datetime.datetime.utcnow()
-        if pid not in self._last_check_time:
-            self._last_check_time[pid] = now
-
-        if now - self._last_check_time[pid] >= self.INTERVAL:
-            self._last_check_time[pid] = now
-            return True
-
-        return False
+    NAME = "RQ Worker Healthcheck"
 
     def __call__(self, process_spec):
-        pid = process_spec['pid']
-        if not self.time_to_check(pid):
-            return True
-
+        pid = process_spec["pid"]
         all_workers = Worker.all(connection=rq_redis_connection)
-        worker = [w for w in all_workers if w.hostname == socket.gethostname().encode() and
-                  w.pid == pid].pop()
+        workers = [
+            w
+            for w in all_workers
+            if w.hostname == socket.gethostname() and w.pid == pid
+        ]
+        if not workers:
+            self._log(f"Cannot find worker for hostname {socket.gethostname()} and pid {pid}. ==> Is healthy? False")
+            return False
+
+        worker = workers.pop()
 
         is_busy = worker.get_state() == WorkerStatus.BUSY
@@ -85,12 +77,19 @@ class WorkerHealthcheck(base.BaseCheck):
         is_healthy = is_busy or seen_lately or has_nothing_to_do
 
-        self._log("Worker %s healthcheck: Is busy? %s. "
-                  "Seen lately? %s (%d seconds ago). "
-                  "Has nothing to do? %s (%d jobs in watched queues). "
-                  "==> Is healthy? %s",
-                  worker.key, is_busy, seen_lately, time_since_seen.seconds,
-                  has_nothing_to_do, total_jobs_in_watched_queues, is_healthy)
+        self._log(
+            "Worker %s healthcheck: Is busy? %s. "
+            "Seen lately? %s (%d seconds ago). "
+            "Has nothing to do? %s (%d jobs in watched queues). "
+            "==> Is healthy? %s",
+            worker.key,
+            is_busy,
+            seen_lately,
+            time_since_seen.seconds,
+            has_nothing_to_do,
+            total_jobs_in_watched_queues,
+            is_healthy,
+        )
 
         return is_healthy
@@ -98,4 +97,5 @@ class WorkerHealthcheck(base.BaseCheck):
 @manager.command()
 def healthcheck():
     return check_runner.CheckRunner(
-        'worker_healthcheck', 'worker', None, [(WorkerHealthcheck, {})]).run()
+        "worker_healthcheck", "worker", None, [(WorkerHealthcheck, {})]
+    ).run()
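
The worker-matching change above reflects the commit note about matching string worker names: in RQ 1.5 Worker.hostname comes back as a plain string, so the old comparison against socket.gethostname().encode() no longer matches, and the rewrite also logs and reports unhealthy when no matching worker exists instead of letting .pop() raise on an empty list. A minimal sketch of the same matching logic outside the supervisord check runner, assuming a default local Redis connection; the find_worker helper is illustrative, not part of this commit:

import socket

from redis import Redis
from rq import Worker


def find_worker(pid, connection=None):
    # Assumption: RQ >= 1.5, where Worker.hostname is a str rather than bytes.
    connection = connection or Redis()
    workers = [
        w
        for w in Worker.all(connection=connection)
        if w.hostname == socket.gethostname() and w.pid == pid
    ]
    return workers[0] if workers else None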

View File

@@ -3,7 +3,6 @@ from .general import (
     version_check,
     send_mail,
     sync_user_details,
-    purge_failed_jobs,
 )
 from .queries import (
     enqueue_query,

View File

@@ -2,13 +2,10 @@ import requests
 from datetime import datetime
 from flask_mail import Message
-from rq import Connection, Queue
-from rq.registry import FailedJobRegistry
-from rq.job import Job
 
-from redash import mail, models, settings, rq_redis_connection
+from redash import mail, models, settings
 from redash.models import users
 from redash.version_check import run_version_check
-from redash.worker import job, get_job_logger, default_operational_queues
+from redash.worker import job, get_job_logger
 from redash.tasks.worker import Queue
 from redash.query_runner import NotSupported
@@ -94,35 +91,3 @@ def get_schema(data_source_id, refresh):
 def sync_user_details():
     users.sync_last_active_at()
-
-
-def purge_failed_jobs():
-    with Connection(rq_redis_connection):
-        queues = [q for q in Queue.all() if q.name not in default_operational_queues]
-        for queue in queues:
-            failed_job_ids = FailedJobRegistry(queue=queue).get_job_ids()
-            failed_jobs = Job.fetch_many(failed_job_ids, rq_redis_connection)
-            stale_jobs = []
-            for failed_job in failed_jobs:
-                # the job may not actually exist anymore in Redis
-                if not failed_job:
-                    continue
-                # the job could have an empty ended_at value in case
-                # of a worker dying before it can save the ended_at value,
-                # in which case we also consider them stale
-                if not failed_job.ended_at:
-                    stale_jobs.append(failed_job)
-                elif (
-                    datetime.utcnow() - failed_job.ended_at
-                ).total_seconds() > settings.JOB_DEFAULT_FAILURE_TTL:
-                    stale_jobs.append(failed_job)
-
-            for stale_job in stale_jobs:
-                stale_job.delete()
-
-            if stale_jobs:
-                logger.info(
-                    "Purged %d old failed jobs from the %s queue.",
-                    len(stale_jobs),
-                    queue.name,
-                )

View File

@@ -90,6 +90,7 @@ def enqueue_query(
"scheduled_query_id": scheduled_query_id, "scheduled_query_id": scheduled_query_id,
"is_api_key": is_api_key, "is_api_key": is_api_key,
"job_timeout": time_limit, "job_timeout": time_limit,
"failure_ttl": settings.JOB_DEFAULT_FAILURE_TTL,
"meta": { "meta": {
"data_source_id": data_source.id, "data_source_id": data_source.id,
"org_id": data_source.org_id, "org_id": data_source.org_id,

View File

@@ -15,7 +15,6 @@ from redash.tasks import (
     empty_schedules,
     refresh_schemas,
     cleanup_query_results,
-    purge_failed_jobs,
     version_check,
     send_aggregated_errors,
     Queue,
@@ -71,14 +70,13 @@ def periodic_job_definitions():
         {
             "func": refresh_schemas,
             "interval": timedelta(minutes=settings.SCHEMAS_REFRESH_SCHEDULE),
         },
         {
             "func": sync_user_details,
             "timeout": 60,
             "interval": timedelta(minutes=1),
             "result_ttl": 600,
         },
-        {"func": purge_failed_jobs, "timeout": 3600, "interval": timedelta(days=1)},
         {
             "func": send_aggregated_errors,
             "interval": timedelta(minutes=settings.SEND_FAILURE_EMAIL_INTERVAL),

View File

@@ -101,12 +101,13 @@ class HardLimitingWorker(HerokuWorker):
             )
             self.kill_horse()
 
-    def monitor_work_horse(self, job):
+    def monitor_work_horse(self, job, queue):
         """The worker will monitor the work horse and make sure that it
         either executes successfully or the status of the job is set to
         failed
         """
         self.monitor_started = utcnow()
+        job.started_at = utcnow()
         while True:
             try:
                 with UnixSignalDeathPenalty(
@@ -158,6 +159,7 @@ class HardLimitingWorker(HerokuWorker):
                 self.handle_job_failure(
                     job,
+                    queue=queue,
                     exc_string="Work-horse process was terminated unexpectedly "
                     "(waitpid returned %s)" % ret_val,
                 )
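
Both edits in this file track RQ 1.5's worker API: monitor_work_horse now receives the queue alongside the job, handle_job_failure expects it as a queue= keyword, and the override stamps job.started_at when monitoring begins, per the commit note about setting the job's started_at. A small hedged sketch of reading that timestamp back, with an illustrative job id and Redis connection:

from datetime import datetime

from redis import Redis
from rq.job import Job

# Assumption: "some-job-id" names a job that a worker has already started.
job = Job.fetch("some-job-id", connection=Redis())
if job.started_at:
    # started_at is a naive UTC datetime in RQ, so compare against utcnow().
    elapsed = (datetime.utcnow() - job.started_at).total_seconds()
    print(f"job {job.id} has been running for {elapsed:.0f} seconds")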

View File

@@ -30,7 +30,7 @@ class StatsdRecordingJobDecorator(rq_job): # noqa
     queue_class = RedashQueue
 
 
-job = partial(StatsdRecordingJobDecorator, connection=rq_redis_connection)
+job = partial(StatsdRecordingJobDecorator, connection=rq_redis_connection, failure_ttl=settings.JOB_DEFAULT_FAILURE_TTL)
 
 
 class CurrentJobFilter(logging.Filter):

View File

@@ -24,7 +24,7 @@ psycopg2==2.8.3
 python-dateutil==2.8.0
 pytz>=2019.3
 PyYAML==5.1.2
-redis==3.3.11
+redis==3.5.0
 requests==2.21.0
 SQLAlchemy==1.3.10
 # We can't upgrade SQLAlchemy-Searchable version as newer versions require PostgreSQL > 9.6, but we target older versions at the moment.
@@ -34,8 +34,9 @@ pyparsing==2.3.0
 SQLAlchemy-Utils==0.34.2
 sqlparse==0.3.0
 statsd==3.3.0
+greenlet==0.4.16
 gunicorn==20.0.4
-rq==1.1.0
+rq==1.5.0
 rq-scheduler==0.9.1
 jsonschema==3.1.1
 RestrictedPython==5.0

View File

@@ -17,6 +17,7 @@ directory=/app
 stopsignal=TERM
 autostart=true
 autorestart=true
+startsecs=300
 stdout_logfile=/dev/stdout
 stdout_logfile_maxbytes=0
 stderr_logfile=/dev/stderr