mirror of
https://github.com/getredash/redash.git
synced 2025-12-19 17:37:19 -05:00
Upgrade RQ to v1.5 (#5207)
* Upgrade RQ to v1.5
* Set job's started_at
* Update healthcheck to match string worker names
* Delay worker healthcheck for 5 minutes from start to allow enough time to load in case many workers try to load simultaneously
* Log when worker cannot be found
This commit is contained in:
@@ -50,30 +50,22 @@ def worker(queues):
|
|||||||
|
|
||||||
|
|
||||||
class WorkerHealthcheck(base.BaseCheck):
|
class WorkerHealthcheck(base.BaseCheck):
|
||||||
NAME = 'RQ Worker Healthcheck'
|
NAME = "RQ Worker Healthcheck"
|
||||||
INTERVAL = datetime.timedelta(minutes=5)
|
|
||||||
_last_check_time = {}
|
|
||||||
|
|
||||||
def time_to_check(self, pid):
|
|
||||||
now = datetime.datetime.utcnow()
|
|
||||||
|
|
||||||
if pid not in self._last_check_time:
|
|
||||||
self._last_check_time[pid] = now
|
|
||||||
|
|
||||||
if now - self._last_check_time[pid] >= self.INTERVAL:
|
|
||||||
self._last_check_time[pid] = now
|
|
||||||
return True
|
|
||||||
|
|
||||||
return False
|
|
||||||
|
|
||||||
def __call__(self, process_spec):
|
def __call__(self, process_spec):
|
||||||
pid = process_spec['pid']
|
pid = process_spec["pid"]
|
||||||
if not self.time_to_check(pid):
|
|
||||||
return True
|
|
||||||
|
|
||||||
all_workers = Worker.all(connection=rq_redis_connection)
|
all_workers = Worker.all(connection=rq_redis_connection)
|
||||||
worker = [w for w in all_workers if w.hostname == socket.gethostname().encode() and
|
workers = [
|
||||||
w.pid == pid].pop()
|
w
|
||||||
|
for w in all_workers
|
||||||
|
if w.hostname == socket.gethostname() and w.pid == pid
|
||||||
|
]
|
||||||
|
|
||||||
|
if not workers:
|
||||||
|
self._log(f"Cannot find worker for hostname {socket.gethostname()} and pid {pid}. ==> Is healthy? False")
|
||||||
|
return False
|
||||||
|
|
||||||
|
worker = workers.pop()
|
||||||
|
|
||||||
is_busy = worker.get_state() == WorkerStatus.BUSY
|
is_busy = worker.get_state() == WorkerStatus.BUSY
|
||||||
|
|
||||||
@@ -85,12 +77,19 @@ class WorkerHealthcheck(base.BaseCheck):
|
|||||||
|
|
||||||
is_healthy = is_busy or seen_lately or has_nothing_to_do
|
is_healthy = is_busy or seen_lately or has_nothing_to_do
|
||||||
|
|
||||||
self._log("Worker %s healthcheck: Is busy? %s. "
|
self._log(
|
||||||
"Seen lately? %s (%d seconds ago). "
|
"Worker %s healthcheck: Is busy? %s. "
|
||||||
"Has nothing to do? %s (%d jobs in watched queues). "
|
"Seen lately? %s (%d seconds ago). "
|
||||||
"==> Is healthy? %s",
|
"Has nothing to do? %s (%d jobs in watched queues). "
|
||||||
worker.key, is_busy, seen_lately, time_since_seen.seconds,
|
"==> Is healthy? %s",
|
||||||
has_nothing_to_do, total_jobs_in_watched_queues, is_healthy)
|
worker.key,
|
||||||
|
is_busy,
|
||||||
|
seen_lately,
|
||||||
|
time_since_seen.seconds,
|
||||||
|
has_nothing_to_do,
|
||||||
|
total_jobs_in_watched_queues,
|
||||||
|
is_healthy,
|
||||||
|
)
|
||||||
|
|
||||||
return is_healthy
|
return is_healthy
|
||||||
|
|
||||||
@@ -98,4 +97,5 @@ class WorkerHealthcheck(base.BaseCheck):
|
|||||||
@manager.command()
|
@manager.command()
|
||||||
def healthcheck():
|
def healthcheck():
|
||||||
return check_runner.CheckRunner(
|
return check_runner.CheckRunner(
|
||||||
'worker_healthcheck', 'worker', None, [(WorkerHealthcheck, {})]).run()
|
"worker_healthcheck", "worker", None, [(WorkerHealthcheck, {})]
|
||||||
|
).run()
|
||||||
|
|||||||
@@ -3,7 +3,6 @@ from .general import (
|
|||||||
version_check,
|
version_check,
|
||||||
send_mail,
|
send_mail,
|
||||||
sync_user_details,
|
sync_user_details,
|
||||||
purge_failed_jobs,
|
|
||||||
)
|
)
|
||||||
from .queries import (
|
from .queries import (
|
||||||
enqueue_query,
|
enqueue_query,
|
||||||
|
|||||||
@@ -2,13 +2,10 @@ import requests
|
|||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
from flask_mail import Message
|
from flask_mail import Message
|
||||||
from rq import Connection, Queue
|
from redash import mail, models, settings
|
||||||
from rq.registry import FailedJobRegistry
|
|
||||||
from rq.job import Job
|
|
||||||
from redash import mail, models, settings, rq_redis_connection
|
|
||||||
from redash.models import users
|
from redash.models import users
|
||||||
from redash.version_check import run_version_check
|
from redash.version_check import run_version_check
|
||||||
from redash.worker import job, get_job_logger, default_operational_queues
|
from redash.worker import job, get_job_logger
|
||||||
from redash.tasks.worker import Queue
|
from redash.tasks.worker import Queue
|
||||||
from redash.query_runner import NotSupported
|
from redash.query_runner import NotSupported
|
||||||
|
|
||||||
@@ -94,35 +91,3 @@ def get_schema(data_source_id, refresh):
|
|||||||
|
|
||||||
def sync_user_details():
|
def sync_user_details():
|
||||||
users.sync_last_active_at()
|
users.sync_last_active_at()
|
||||||
|
|
||||||
|
|
||||||
def purge_failed_jobs():
|
|
||||||
with Connection(rq_redis_connection):
|
|
||||||
queues = [q for q in Queue.all() if q.name not in default_operational_queues]
|
|
||||||
for queue in queues:
|
|
||||||
failed_job_ids = FailedJobRegistry(queue=queue).get_job_ids()
|
|
||||||
failed_jobs = Job.fetch_many(failed_job_ids, rq_redis_connection)
|
|
||||||
stale_jobs = []
|
|
||||||
for failed_job in failed_jobs:
|
|
||||||
# the job may not actually exist anymore in Redis
|
|
||||||
if not failed_job:
|
|
||||||
continue
|
|
||||||
# the job could have an empty ended_at value in case
|
|
||||||
# of a worker dying before it can save the ended_at value,
|
|
||||||
# in which case we also consider them stale
|
|
||||||
if not failed_job.ended_at:
|
|
||||||
stale_jobs.append(failed_job)
|
|
||||||
elif (
|
|
||||||
datetime.utcnow() - failed_job.ended_at
|
|
||||||
).total_seconds() > settings.JOB_DEFAULT_FAILURE_TTL:
|
|
||||||
stale_jobs.append(failed_job)
|
|
||||||
|
|
||||||
for stale_job in stale_jobs:
|
|
||||||
stale_job.delete()
|
|
||||||
|
|
||||||
if stale_jobs:
|
|
||||||
logger.info(
|
|
||||||
"Purged %d old failed jobs from the %s queue.",
|
|
||||||
len(stale_jobs),
|
|
||||||
queue.name,
|
|
||||||
)
|
|
||||||
|
|||||||
@@ -90,6 +90,7 @@ def enqueue_query(
|
|||||||
"scheduled_query_id": scheduled_query_id,
|
"scheduled_query_id": scheduled_query_id,
|
||||||
"is_api_key": is_api_key,
|
"is_api_key": is_api_key,
|
||||||
"job_timeout": time_limit,
|
"job_timeout": time_limit,
|
||||||
|
"failure_ttl": settings.JOB_DEFAULT_FAILURE_TTL,
|
||||||
"meta": {
|
"meta": {
|
||||||
"data_source_id": data_source.id,
|
"data_source_id": data_source.id,
|
||||||
"org_id": data_source.org_id,
|
"org_id": data_source.org_id,
|
||||||
|
|||||||
@@ -15,7 +15,6 @@ from redash.tasks import (
|
|||||||
empty_schedules,
|
empty_schedules,
|
||||||
refresh_schemas,
|
refresh_schemas,
|
||||||
cleanup_query_results,
|
cleanup_query_results,
|
||||||
purge_failed_jobs,
|
|
||||||
version_check,
|
version_check,
|
||||||
send_aggregated_errors,
|
send_aggregated_errors,
|
||||||
Queue,
|
Queue,
|
||||||
@@ -71,14 +70,13 @@ def periodic_job_definitions():
|
|||||||
{
|
{
|
||||||
"func": refresh_schemas,
|
"func": refresh_schemas,
|
||||||
"interval": timedelta(minutes=settings.SCHEMAS_REFRESH_SCHEDULE),
|
"interval": timedelta(minutes=settings.SCHEMAS_REFRESH_SCHEDULE),
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"func": sync_user_details,
|
"func": sync_user_details,
|
||||||
"timeout": 60,
|
"timeout": 60,
|
||||||
"interval": timedelta(minutes=1),
|
"interval": timedelta(minutes=1),
|
||||||
"result_ttl": 600,
|
"result_ttl": 600,
|
||||||
},
|
},
|
||||||
{"func": purge_failed_jobs, "timeout": 3600, "interval": timedelta(days=1)},
|
|
||||||
{
|
{
|
||||||
"func": send_aggregated_errors,
|
"func": send_aggregated_errors,
|
||||||
"interval": timedelta(minutes=settings.SEND_FAILURE_EMAIL_INTERVAL),
|
"interval": timedelta(minutes=settings.SEND_FAILURE_EMAIL_INTERVAL),
|
||||||
|
|||||||
@@ -101,12 +101,13 @@ class HardLimitingWorker(HerokuWorker):
|
|||||||
)
|
)
|
||||||
self.kill_horse()
|
self.kill_horse()
|
||||||
|
|
||||||
def monitor_work_horse(self, job):
|
def monitor_work_horse(self, job, queue):
|
||||||
"""The worker will monitor the work horse and make sure that it
|
"""The worker will monitor the work horse and make sure that it
|
||||||
either executes successfully or the status of the job is set to
|
either executes successfully or the status of the job is set to
|
||||||
failed
|
failed
|
||||||
"""
|
"""
|
||||||
self.monitor_started = utcnow()
|
self.monitor_started = utcnow()
|
||||||
|
job.started_at = utcnow()
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
with UnixSignalDeathPenalty(
|
with UnixSignalDeathPenalty(
|
||||||
@@ -158,6 +159,7 @@ class HardLimitingWorker(HerokuWorker):
|
|||||||
|
|
||||||
self.handle_job_failure(
|
self.handle_job_failure(
|
||||||
job,
|
job,
|
||||||
|
queue=queue,
|
||||||
exc_string="Work-horse process was terminated unexpectedly "
|
exc_string="Work-horse process was terminated unexpectedly "
|
||||||
"(waitpid returned %s)" % ret_val,
|
"(waitpid returned %s)" % ret_val,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -30,7 +30,7 @@ class StatsdRecordingJobDecorator(rq_job): # noqa
|
|||||||
queue_class = RedashQueue
|
queue_class = RedashQueue
|
||||||
|
|
||||||
|
|
||||||
job = partial(StatsdRecordingJobDecorator, connection=rq_redis_connection)
|
job = partial(StatsdRecordingJobDecorator, connection=rq_redis_connection, failure_ttl=settings.JOB_DEFAULT_FAILURE_TTL)
|
||||||
|
|
||||||
|
|
||||||
class CurrentJobFilter(logging.Filter):
|
class CurrentJobFilter(logging.Filter):
|
||||||
|
|||||||
@@ -24,7 +24,7 @@ psycopg2==2.8.3
|
|||||||
python-dateutil==2.8.0
|
python-dateutil==2.8.0
|
||||||
pytz>=2019.3
|
pytz>=2019.3
|
||||||
PyYAML==5.1.2
|
PyYAML==5.1.2
|
||||||
redis==3.3.11
|
redis==3.5.0
|
||||||
requests==2.21.0
|
requests==2.21.0
|
||||||
SQLAlchemy==1.3.10
|
SQLAlchemy==1.3.10
|
||||||
# We can't upgrade SQLAlchemy-Searchable version as newer versions require PostgreSQL > 9.6, but we target older versions at the moment.
|
# We can't upgrade SQLAlchemy-Searchable version as newer versions require PostgreSQL > 9.6, but we target older versions at the moment.
|
||||||
@@ -34,8 +34,9 @@ pyparsing==2.3.0
|
|||||||
SQLAlchemy-Utils==0.34.2
|
SQLAlchemy-Utils==0.34.2
|
||||||
sqlparse==0.3.0
|
sqlparse==0.3.0
|
||||||
statsd==3.3.0
|
statsd==3.3.0
|
||||||
|
greenlet==0.4.16
|
||||||
gunicorn==20.0.4
|
gunicorn==20.0.4
|
||||||
rq==1.1.0
|
rq==1.5.0
|
||||||
rq-scheduler==0.9.1
|
rq-scheduler==0.9.1
|
||||||
jsonschema==3.1.1
|
jsonschema==3.1.1
|
||||||
RestrictedPython==5.0
|
RestrictedPython==5.0
|
||||||
|
|||||||
@@ -17,6 +17,7 @@ directory=/app
|
|||||||
stopsignal=TERM
|
stopsignal=TERM
|
||||||
autostart=true
|
autostart=true
|
||||||
autorestart=true
|
autorestart=true
|
||||||
|
startsecs=300
|
||||||
stdout_logfile=/dev/stdout
|
stdout_logfile=/dev/stdout
|
||||||
stdout_logfile_maxbytes=0
|
stdout_logfile_maxbytes=0
|
||||||
stderr_logfile=/dev/stderr
|
stderr_logfile=/dev/stderr
|
||||||
|
|||||||
Reference in New Issue
Block a user