Files
impala/tests/comparison/leopard/front_end.py
Michael Brown 723509a571 Leopard front end: protect cases where report and schedule paths are absent
If the leopard front end is run on a pristine system, the front end will
raise exceptions, because the directories containing reports and
schedules do not yet exist.

Since it's the controller's responsibility to create and populate these
filesystem artifacts, in the frontend, let's just protect against the
exception and log warnings.

Testing:

Before the fix, the reload_reports thread would immediately raise an
exception and the thread would die; loading the Web app in a browser
would cause an internal server error.

After the fix, the reload_reports thread logs a warning but continues to
run, and loading the Web app in a browser presents an appropriately
empty listing of run names and schedules.

Change-Id: I186174db136694d3df04d9159362c6eeaa70b5b8
Reviewed-on: http://gerrit.cloudera.org:8080/2164
Reviewed-by: Michael Brown <mikeb@cloudera.com>
Tested-by: Internal Jenkins
2016-03-09 00:16:31 +00:00

315 lines
12 KiB
Python
Executable File

#!/usr/bin/env impala-python
# Copyright (c) 2015 Cloudera, Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os
import pickle
import stat
import time
from time import sleep
from flask import Flask, render_template, request
from schedule_item import ScheduleItem
from controller import PATH_TO_REPORTS, PATH_TO_SCHEDULE
from threading import Thread
from tests.comparison.query_profile import DefaultProfile
from tests.comparison.db_types import (
Boolean,
Char,
Decimal,
Float,
Int,
TYPES,
Timestamp)
# Report files whose modification time is older than this many seconds are not
# loaded by reload_reports() (21 days).
MAX_REPORT_AGE = 21 * 24 * 3600

# Seconds reload_reports() sleeps between two scans (20 minutes).
SLEEP_LENGTH = 20 * 60

LOG = logging.getLogger('leopard.front_end')

# The Flask application. app.reports caches unpickled reports keyed by report id
# (the report's file name under PATH_TO_REPORTS).
app = Flask(__name__)
app.reports = {}

# Asset paths handed to every rendered template as 'assets'.
ASSETS = {
    'bootstrap_css': 'css/bootstrap.min.css',
    'hljs_css': 'css/default.css',
    'favicon': 'favicon.ico',
    'bootstrap_js': 'js/bootstrap.min.js',
    'hljs_js': 'js/highlight.pack.js',
}
@app.route('/reports/<report_id>')
def show_report(report_id):
    '''Renders a report as HTML.

    The report is taken from app.reports when it is cached there; otherwise it is
    unpickled from the file named report_id under PATH_TO_REPORTS and then cached.

    Args:
      report_id: name of the report file, taken from the URL.

    Returns:
      The rendered 'report.template' page.

    Raises:
      IOError/OSError: if the report is not cached and its file cannot be opened.
    '''
    # Bind the report to a local name rather than reading app.reports a second
    # time: reload_reports() rebinds app.reports, so a second lookup could miss an
    # entry that was inserted a moment ago.
    report = app.reports.get(report_id)
    if report is None:
        # Pickle data is binary, so the file is opened in binary mode.
        # NOTE(review): unpickling is only safe if report files are trusted.
        with open(os.path.join(PATH_TO_REPORTS, report_id), 'rb') as report_file:
            report = pickle.load(report_file)
        app.reports[report_id] = report

    def get_next_id():
        '''Generates all natural numbers as strings. '''
        i = 0
        while True:
            yield str(i)
            i += 1

    def sql_length(result):
        '''Sort key: length of the result's Impala query text.'''
        return len(result['test_sql'])

    # One id sequence is shared by every entry on the page.
    ids = get_next_id()

    # NOTE(review): the values below are interpolated into HTML fragments without
    # escaping -- confirm that report contents are trusted / already HTML-safe.

    # Generate HTML for displaying the crashes, one group per key of
    # report.grouped_stacks; the key doubles as the group title.
    outer_crashes_list = []
    for first_impala_frame in report.grouped_stacks:
        crashes_list = []
        # Results are sorted on the length of the query SQL.
        for result in sorted(report.grouped_stacks[first_impala_frame],
                             key=sql_length):
            inner_id = next(ids)
            inner_title = 'Lines in Stack: {0}'.format(
                len(result['formatted_stack'].split('\n')))
            # Only the first 50000 characters of the stack are shown.
            content = (
                '<h4>Impala Query:</h4><pre><code>{0}</code></pre>'
                '<h4>Stack:</h4><pre>{1}</pre>'
            ).format(result['test_sql'], result['formatted_stack'][:50000])
            crashes_list.append((inner_id, inner_title, content))
        # The group's own id is drawn after the ids of its members.
        group_id = next(ids)
        outer_crashes_list.append((group_id, first_impala_frame, crashes_list))

    # Generate HTML for displaying result row count mismatches.
    row_count_list = []
    for result in sorted(report.grouped_results['row_counts'], key=sql_length):
        entry_id = next(ids)
        title = 'Impala Rows: {0}, Postgres Rows: {1}'.format(
            result['test_row_count'], result['ref_row_count'])
        content = (
            '<h4>Impala Query:</h4><pre><code>{0}</code></pre>'
            '<h4>Postgres Query:</h4><pre><code>{1}</code></pre>'
        ).format(result['test_sql'], result['ref_sql'])
        row_count_list.append((entry_id, title, content))

    # Generate HTML for displaying result content mismatches.
    mismatch_list = []
    for result in sorted(report.grouped_results['mismatch'], key=sql_length):
        entry_id = next(ids)
        title = 'Query Length: {0}'.format(len(result['test_sql']))
        content = (
            '<h4>Impala Query:</h4><pre><code>{0}</code></pre>'
            '<h4>Postgres Query:</h4><pre><code>{1}</code></pre>'
            '<h4>Mismatch Impala Row:</h4><pre><code>{2}</code></pre>'
            '<h4>Mismatch Postgres Row:</h4><pre><code>{3}</code></pre>'
        ).format(
            result['test_sql'],
            result['ref_sql'],
            result['mismatch_test_row'],
            result['mismatch_ref_row'])
        mismatch_list.append((entry_id, title, content))

    return render_template(
        'report.template',
        assets=ASSETS,
        report=report,
        outer_crashes_list=outer_crashes_list,
        row_count_list=row_count_list,
        mismatch_list=mismatch_list)
def _profile_from_form(form):
    '''Builds a DefaultProfile overridden with the values of the custom-run form.

    Bounds and weights are converted with int(), probabilities with float(). Each
    analytic-design flag is set to whether its field is present in the form.

    Args:
      form: the submitted form (request.form), indexed by field name.

    Returns:
      The customised DefaultProfile instance.
    '''
    # (profile bound, form field prefix); the form carries '<prefix>_from' and
    # '<prefix>_to' for each bound.
    bounds = (
        ('MAX_NESTED_QUERY_COUNT', 'max_nested_query_count'),
        ('MAX_NESTED_EXPR_COUNT', 'max_nested_expr_count'),
        ('SELECT_ITEM_COUNT', 'select_item_count'),
        ('WITH_TABLE_COUNT', 'with_table_count'),
        ('TABLE_COUNT', 'table_count'),
        ('ANALYTIC_LEAD_LAG_OFFSET', 'analytic_lead_lag_offset'),
        ('ANALYTIC_WINDOW_OFFSET', 'analytic_window_offset'),
    )
    # (weight category, key within the category, form field)
    weights = (
        # Select Item Category
        ('SELECT_ITEM_CATEGORY', 'AGG', 'select_agg'),
        ('SELECT_ITEM_CATEGORY', 'ANALYTIC', 'select_analytic'),
        ('SELECT_ITEM_CATEGORY', 'BASIC', 'select_basic'),
        # Types
        ('TYPES', Boolean, 'types_boolean'),
        ('TYPES', Char, 'types_char'),
        ('TYPES', Decimal, 'types_decimal'),
        ('TYPES', Float, 'types_float'),
        ('TYPES', Int, 'types_int'),
        ('TYPES', Timestamp, 'types_timestamp'),
        # Join
        ('JOIN', 'INNER', 'join_inner'),
        ('JOIN', 'LEFT', 'join_left'),
        ('JOIN', 'RIGHT', 'join_right'),
        ('JOIN', 'FULL_OUTER', 'join_full_outer'),
        ('JOIN', 'CROSS', 'join_cross'),
    )
    # (probability category, key within the category, form field)
    probabilities = (
        # Optional Query Clauses Probabilities
        ('OPTIONAL_QUERY_CLAUSES', 'WITH', 'optional_with'),
        ('OPTIONAL_QUERY_CLAUSES', 'FROM', 'optional_from'),
        ('OPTIONAL_QUERY_CLAUSES', 'WHERE', 'optional_where'),
        ('OPTIONAL_QUERY_CLAUSES', 'GROUP_BY', 'optional_group_by'),
        ('OPTIONAL_QUERY_CLAUSES', 'HAVING', 'optional_having'),
        ('OPTIONAL_QUERY_CLAUSES', 'UNION', 'optional_union'),
        ('OPTIONAL_QUERY_CLAUSES', 'ORDER_BY', 'optional_order_by'),
        # Optional Analytic Clauses Probabilities
        ('OPTIONAL_ANALYTIC_CLAUSES', 'PARTITION_BY', 'optional_analytic_partition_by'),
        ('OPTIONAL_ANALYTIC_CLAUSES', 'ORDER_BY', 'optional_analytic_order_by'),
        ('OPTIONAL_ANALYTIC_CLAUSES', 'WINDOW', 'optional_analytic_window'),
        # Misc Probabilities
        ('MISC', 'INLINE_VIEW', 'misc_inline_view'),
        ('MISC', 'SELECT_DISTINCT', 'misc_select_distinct'),
        ('MISC', 'SCALAR_SUBQUERY', 'misc_scalar_subquery'),
        ('MISC', 'UNION_ALL', 'misc_union_all'),
    )
    # (analytic design flag, form field)
    analytic_designs = (
        ('TOP_LEVEL_QUERY_WITHOUT_LIMIT', 'analytic_designs_top_level_no_limit'),
        ('DETERMINISTIC_ORDER_BY', 'analytic_designs_deterministic_order_by'),
        ('NO_ORDER_BY', 'analytic_designs_no_order_by'),
        ('ONLY_SELECT_ITEM', 'analytic_designs_only_select_item'),
        ('UNBOUNDED_WINDOW', 'analytic_designs_unbounded_window'),
        ('RANK_FUNC', 'analytic_designs_rank_func'),
    )

    new_profile = DefaultProfile()
    for bound, prefix in bounds:
        new_profile._bounds[bound] = (
            int(form[prefix + '_from']), int(form[prefix + '_to']))
    for category, key, field in weights:
        new_profile._weights[category][key] = int(form[field])
    for category, key, field in probabilities:
        new_profile._probabilities[category][key] = float(form[field])
    for flag, field in analytic_designs:
        new_profile._flags['ANALYTIC_DESIGNS'][flag] = field in form
    return new_profile


@app.route('/start_run', methods=['POST', 'GET'])
def start_run():
    '''Receives a POST request, generates a schedule item and pickles it.

    A form containing 'time_limit' is a custom run: its query profile and time
    limit are read from the form and it has no parent job. Any other form starts a
    run based on a previous one: it uses the default profile, a 24 hour time limit
    and the submitted 'report_id' as parent job.

    Returns:
      'fail' for any request that is not a POST, otherwise 'success'.
    '''
    if request.method != 'POST':
        return 'fail'
    form = request.form
    if 'time_limit' in form:
        # This is a custom run because time_limit item is present only in the
        # custom_run form. Values will be extracted from the form and a new
        # profile will be generated.
        new_profile = _profile_from_form(form)
        schedule_item = ScheduleItem(
            run_name=form['run_name'],
            query_profile=new_profile,
            time_limit_sec=int(form['time_limit']),
            git_command=form['git_command'],
            parent_job='')
    else:
        # Run based on previous run
        schedule_item = ScheduleItem(
            run_name=form['run_name'],
            query_profile=DefaultProfile(),
            time_limit_sec=24 * 3600,  # Default time limit is 24 hours
            git_command=form['git_command'],
            parent_job=form['report_id'])
    schedule_item.save_pickle()
    return 'success'
@app.route("/custom_run")
def custom_run():
    '''Renders 'custom_run.template', the custom run page, with the shared ASSETS.'''
    page = render_template('custom_run.template', assets=ASSETS)
    return page
def _load_recent_reports(cached_reports):
    '''Makes one pass over PATH_TO_REPORTS and returns the recent reports.

    A report is recent when its file was modified less than MAX_REPORT_AGE seconds
    ago. Reports already present in cached_reports are reused; the others are
    unpickled from disk. Problems are logged as warnings and never raised: an
    unlistable directory yields an empty result, and a file that cannot be stat'ed
    or unpickled is skipped.

    Args:
      cached_reports: dict of previously loaded reports, keyed by report id.

    Returns:
      A new dict mapping report id (file name) to report.
    '''
    try:
        report_ids = os.listdir(PATH_TO_REPORTS)
    except EnvironmentError as e:
        LOG.warning('{0}: {1}'.format(e.filename, e.strerror))
        return {}

    recent_reports = {}
    for report_id in report_ids:
        report_path = os.path.join(PATH_TO_REPORTS, report_id)
        try:
            modified_at = os.stat(report_path)[stat.ST_MTIME]
        except EnvironmentError as e:
            # E.g. the file disappeared after the directory was listed.
            LOG.warning('{0}: {1}'.format(e.filename, e.strerror))
            continue
        if time.time() - modified_at >= MAX_REPORT_AGE:
            continue
        # We want this report
        if report_id in cached_reports:
            recent_reports[report_id] = cached_reports[report_id]
            continue
        try:
            # Pickle data is binary, so the file is opened in binary mode.
            with open(report_path, 'rb') as report_file:
                recent_reports[report_id] = pickle.load(report_file)
        except Exception as e:
            # Deliberately broad: unpickling can fail with many exception types,
            # and one unreadable file must not end the reloading thread.
            LOG.warning('Skipping unreadable report %s: %s', report_path, e)
    return recent_reports


def reload_reports():
    '''Reloads the reports directory every SLEEP_LENGTH seconds (20 minutes).

    Loaded reports are placed into app.reports, which allows new reports to appear
    on the front page. Only reports modified within the last MAX_REPORT_AGE seconds
    (21 days) are loaded. This function never returns and should be run in a
    separate thread.
    '''
    while True:
        # Build the new mapping first, then swap it in with a single assignment.
        app.reports = _load_recent_reports(app.reports)
        sleep(SLEEP_LENGTH)
@app.route("/")
def front_page():
    '''Renders the front page as HTML.

    The page lists the reports currently held in app.reports, most recent run_date
    first, and every schedule item unpickled from PATH_TO_SCHEDULE. If that
    directory cannot be listed, a warning is logged and no schedule items are
    shown.

    Returns:
      The rendered 'index.template' page.
    '''
    try:
        schedule_item_ids = os.listdir(PATH_TO_SCHEDULE)
    except EnvironmentError as e:
        schedule_item_ids = []
        LOG.warning('{0}: {1}'.format(e.filename, e.strerror))

    schedule_items = []
    for schedule_item_id in schedule_item_ids:
        item_path = os.path.join(PATH_TO_SCHEDULE, schedule_item_id)
        # 'with' closes the file after loading; pickle data is read in binary mode.
        with open(item_path, 'rb') as item_file:
            schedule_items.append(pickle.load(item_file))

    # Index into the (report_id, report) pair instead of unpacking it in the lambda
    # signature, which only Python 2 accepts.
    reports = sorted(
        app.reports.items(), key=lambda item: item[1].run_date, reverse=True)
    return render_template(
        'index.template',
        assets=ASSETS,
        reports=reports,
        schedule_items=schedule_items)
if __name__ == '__main__':
    # Script entry point: set up logging, start the report reloader, serve the app.
    log_format = ('%(asctime)s %(levelname)s [%(name)s.%(threadName)s:%(lineno)s]: '
                  '%(message)s')
    logging.basicConfig(format=log_format, level=logging.INFO)

    # reload_reports() loops forever, so it runs as a daemon thread that does not
    # keep the process alive on its own.
    reloader = Thread(name='reload_reports', target=reload_reports)
    reloader.daemon = True
    reloader.start()

    app.run(host='0.0.0.0', debug=False)