mirror of
https://github.com/apache/impala.git
synced 2026-01-08 12:02:54 -05:00
For files that have a Cloudera copyright (and no other copyright notice), make changes to follow the ASF source file header policy here: http://www.apache.org/legal/src-headers.html#headers Specifically: 1) Remove the Cloudera copyright. 2) Modify NOTICE.txt according to http://www.apache.org/legal/src-headers.html#notice to follow that format and add a line for Cloudera. 3) Replace or add the existing ASF license text with the one given on the website. Much of this change was automatically generated via: git grep -li 'Copyright.*Cloudera' > modified_files.txt cat modified_files.txt | xargs perl -n -i -e 'print unless m#Copyright.*Cloudera#i;' cat modified_files_txt | xargs fix_apache_license.py [1] Some manual fixups were performed following those steps, especially when license text was completely missing from the file. [1] https://gist.github.com/anonymous/ff71292094362fc5c594 with minor modification to ORIG_LICENSE to match Impala's license text. Change-Id: I2e0bd8420945b953e1b806041bea4d72a3943d86 Reviewed-on: http://gerrit.cloudera.org:8080/3779 Reviewed-by: Dan Hecht <dhecht@cloudera.com> Tested-by: Internal Jenkins
319 lines
12 KiB
Python
Executable File
319 lines
12 KiB
Python
Executable File
#!/usr/bin/env impala-python
|
|
#
|
|
# Licensed to the Apache Software Foundation (ASF) under one
|
|
# or more contributor license agreements. See the NOTICE file
|
|
# distributed with this work for additional information
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
# to you under the Apache License, Version 2.0 (the
|
|
# "License"); you may not use this file except in compliance
|
|
# with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
import logging
|
|
import os
|
|
import pickle
|
|
import stat
|
|
import time
|
|
from time import sleep
|
|
from flask import Flask, render_template, request
|
|
from schedule_item import ScheduleItem
|
|
from controller import PATH_TO_REPORTS, PATH_TO_SCHEDULE
|
|
from threading import Thread
|
|
from tests.comparison.query_profile import DefaultProfile
|
|
from tests.comparison.db_types import (
|
|
Boolean,
|
|
Char,
|
|
Decimal,
|
|
Float,
|
|
Int,
|
|
TYPES,
|
|
Timestamp)
|
|
|
|
MAX_REPORT_AGE = 21 * 24 * 3600 # 21 days
|
|
SLEEP_LENGTH = 20 * 60 # 20 min
|
|
|
|
LOG = logging.getLogger('leopard.front_end')
|
|
|
|
app = Flask(__name__)
|
|
app.reports = {}
|
|
|
|
|
|
ASSETS = {'bootstrap_css': 'css/bootstrap.min.css',
|
|
'hljs_css': 'css/default.css',
|
|
'favicon': 'favicon.ico',
|
|
'bootstrap_js': 'js/bootstrap.min.js',
|
|
'hljs_js': 'js/highlight.pack.js'}
|
|
|
|
@app.route('/reports/<report_id>')
|
|
def show_report(report_id):
|
|
'''Renders a report as HTML. '''
|
|
|
|
if report_id not in app.reports:
|
|
with open(os.path.join(PATH_TO_REPORTS, report_id), 'r') as f:
|
|
app.reports[report_id] = pickle.load(f)
|
|
|
|
report = app.reports[report_id]
|
|
|
|
def get_next_id():
|
|
'''Generates all natural numbers. '''
|
|
i = 0
|
|
while True:
|
|
yield str(i)
|
|
i += 1
|
|
|
|
gen = get_next_id()
|
|
|
|
# Generate HTML for displaying the crashes
|
|
outer_crashes_list = []
|
|
for first_impala_frame in report.grouped_stacks:
|
|
crashes_list = []
|
|
# results are sorted on the length of the query SQL
|
|
for result in sorted(report.grouped_stacks[first_impala_frame],
|
|
key = lambda result: len(result['test_sql'])):
|
|
inner_id = next(gen)
|
|
inner_title = 'Lines in Stack: {0}'.format(
|
|
len(result['formatted_stack'].split('\n')))
|
|
content = ('<h4>Impala Query:</h4><pre><code>{0}</code></pre>'
|
|
'<h4>Stack:</h4><pre>{1}</pre>').format(
|
|
result['test_sql'], result['formatted_stack'][:50000])
|
|
crashes_list.append((inner_id, inner_title, content))
|
|
id = next(gen)
|
|
title = first_impala_frame
|
|
outer_crashes_list.append((id, title, crashes_list))
|
|
|
|
# Generate HTML for displaying result row count mismatches
|
|
row_count_list = []
|
|
for result in sorted(report.grouped_results['row_counts'],
|
|
key = lambda result: len(result['test_sql'])):
|
|
id = next(gen)
|
|
title = 'Impala Rows: {0}, Postgres Rows: {1}'.format(
|
|
result['test_row_count'], result['ref_row_count'])
|
|
content = ('<h4>Impala Query:</h4><pre><code>{0}</code></pre>'
|
|
'<h4>Postgres Query:</h4><pre><code>{1}</code></pre>').format(
|
|
result['test_sql'], result['ref_sql'])
|
|
row_count_list.append((id, title, content))
|
|
|
|
# Generate HTML for displaying result content mismatches
|
|
mismatch_list = []
|
|
for result in sorted(report.grouped_results['mismatch'],
|
|
key = lambda result: len(result['test_sql'])):
|
|
id = next(gen)
|
|
title = 'Query Length: {0}'.format(len(result['test_sql']))
|
|
content = ('<h4>Impala Query:</h4><pre><code>{0}</code></pre>'
|
|
'<h4>Postgres Query:</h4><pre><code>{1}</code></pre>'
|
|
'<h4>Mismatch Impala Row:</h4><pre><code>{2}</code></pre>'
|
|
'<h4>Mismatch Postgres Row:</h4><pre><code>{3}</code></pre>').format(
|
|
result['test_sql'],
|
|
result['ref_sql'],
|
|
result['mismatch_test_row'],
|
|
result['mismatch_ref_row'])
|
|
mismatch_list.append((id, title, content))
|
|
|
|
return render_template(
|
|
'report.template',
|
|
assets=ASSETS,
|
|
report=report,
|
|
outer_crashes_list=outer_crashes_list,
|
|
row_count_list=row_count_list,
|
|
mismatch_list=mismatch_list)
|
|
|
|
@app.route('/start_run', methods=['POST', 'GET'])
|
|
def start_run():
|
|
'''Method that receives POST requests and gernerates a schedule item.'''
|
|
|
|
if request.method != 'POST': return 'fail'
|
|
|
|
if 'time_limit' in request.form:
|
|
# This is a custom run because time_limit item is present only in the custom_run form.
|
|
# Values will be extracted from the form and a new profile will be generated.
|
|
|
|
new_profile = DefaultProfile()
|
|
|
|
# Bounds
|
|
new_profile._bounds['MAX_NESTED_QUERY_COUNT'] = (
|
|
int(request.form['max_nested_query_count_from']),
|
|
int(request.form['max_nested_query_count_to']))
|
|
new_profile._bounds['MAX_NESTED_EXPR_COUNT'] = (
|
|
int(request.form['max_nested_expr_count_from']),
|
|
int(request.form['max_nested_expr_count_to']))
|
|
new_profile._bounds['SELECT_ITEM_COUNT'] = (
|
|
int(request.form['select_item_count_from']),
|
|
int(request.form['select_item_count_to']))
|
|
new_profile._bounds['WITH_TABLE_COUNT'] = (
|
|
int(request.form['with_table_count_from']),
|
|
int(request.form['with_table_count_to']))
|
|
new_profile._bounds['TABLE_COUNT'] = (
|
|
int(request.form['table_count_from']),
|
|
int(request.form['table_count_to']))
|
|
new_profile._bounds['ANALYTIC_LEAD_LAG_OFFSET'] = (
|
|
int(request.form['analytic_lead_lag_offset_from']),
|
|
int(request.form['analytic_lead_lag_offset_to']))
|
|
new_profile._bounds['ANALYTIC_WINDOW_OFFSET'] = (
|
|
int(request.form['analytic_window_offset_from']),
|
|
int(request.form['analytic_window_offset_to']))
|
|
|
|
# Select Item Category
|
|
new_profile._weights['SELECT_ITEM_CATEGORY']['AGG'] = int(
|
|
request.form['select_agg'])
|
|
new_profile._weights['SELECT_ITEM_CATEGORY']['ANALYTIC'] = int(
|
|
request.form['select_analytic'])
|
|
new_profile._weights['SELECT_ITEM_CATEGORY']['BASIC'] = int(
|
|
request.form['select_basic'])
|
|
|
|
# Types
|
|
new_profile._weights['TYPES'][Boolean] = int(request.form['types_boolean'])
|
|
new_profile._weights['TYPES'][Char] = int(request.form['types_char'])
|
|
new_profile._weights['TYPES'][Decimal] = int(request.form['types_decimal'])
|
|
new_profile._weights['TYPES'][Float] = int(request.form['types_float'])
|
|
new_profile._weights['TYPES'][Int] = int(request.form['types_int'])
|
|
new_profile._weights['TYPES'][Timestamp] = int(request.form['types_timestamp'])
|
|
|
|
# Join
|
|
new_profile._weights['JOIN']['INNER'] = int(request.form['join_inner'])
|
|
new_profile._weights['JOIN']['LEFT'] = int(request.form['join_left'])
|
|
new_profile._weights['JOIN']['RIGHT'] = int(request.form['join_right'])
|
|
new_profile._weights['JOIN']['FULL_OUTER'] = int(request.form['join_full_outer'])
|
|
new_profile._weights['JOIN']['CROSS'] = int(request.form['join_cross'])
|
|
|
|
# Optional Query Clauses Probabilities
|
|
new_profile._probabilities['OPTIONAL_QUERY_CLAUSES']['WITH'] = float(
|
|
request.form['optional_with'])
|
|
new_profile._probabilities['OPTIONAL_QUERY_CLAUSES']['FROM'] = float(
|
|
request.form['optional_from'])
|
|
new_profile._probabilities['OPTIONAL_QUERY_CLAUSES']['WHERE'] = float(
|
|
request.form['optional_where'])
|
|
new_profile._probabilities['OPTIONAL_QUERY_CLAUSES']['GROUP_BY'] = float(
|
|
request.form['optional_group_by'])
|
|
new_profile._probabilities['OPTIONAL_QUERY_CLAUSES']['HAVING'] = float(
|
|
request.form['optional_having'])
|
|
new_profile._probabilities['OPTIONAL_QUERY_CLAUSES']['UNION'] = float(
|
|
request.form['optional_union'])
|
|
new_profile._probabilities['OPTIONAL_QUERY_CLAUSES']['ORDER_BY'] = float(
|
|
request.form['optional_order_by'])
|
|
|
|
# Optional Analytic Clauses Probabilities
|
|
new_profile._probabilities['OPTIONAL_ANALYTIC_CLAUSES']['PARTITION_BY'] = float(
|
|
request.form['optional_analytic_partition_by'])
|
|
new_profile._probabilities['OPTIONAL_ANALYTIC_CLAUSES']['ORDER_BY'] = float(
|
|
request.form['optional_analytic_order_by'])
|
|
new_profile._probabilities['OPTIONAL_ANALYTIC_CLAUSES']['WINDOW'] = float(
|
|
request.form['optional_analytic_window'])
|
|
|
|
# Misc Probabilities
|
|
new_profile._probabilities['MISC']['INLINE_VIEW'] = float(
|
|
request.form['misc_inline_view'])
|
|
new_profile._probabilities['MISC']['SELECT_DISTINCT'] = float(
|
|
request.form['misc_select_distinct'])
|
|
new_profile._probabilities['MISC']['SCALAR_SUBQUERY'] = float(
|
|
request.form['misc_scalar_subquery'])
|
|
new_profile._probabilities['MISC']['UNION_ALL'] = float(
|
|
request.form['misc_union_all'])
|
|
|
|
# Analytic Designs
|
|
new_profile._flags['ANALYTIC_DESIGNS']['TOP_LEVEL_QUERY_WITHOUT_LIMIT'] = \
|
|
'analytic_designs_top_level_no_limit' in request.form
|
|
new_profile._flags['ANALYTIC_DESIGNS']['DETERMINISTIC_ORDER_BY'] = \
|
|
'analytic_designs_deterministic_order_by' in request.form
|
|
new_profile._flags['ANALYTIC_DESIGNS']['NO_ORDER_BY'] = \
|
|
'analytic_designs_no_order_by' in request.form
|
|
new_profile._flags['ANALYTIC_DESIGNS']['ONLY_SELECT_ITEM'] = \
|
|
'analytic_designs_only_select_item' in request.form
|
|
new_profile._flags['ANALYTIC_DESIGNS']['UNBOUNDED_WINDOW'] = \
|
|
'analytic_designs_unbounded_window' in request.form
|
|
new_profile._flags['ANALYTIC_DESIGNS']['RANK_FUNC'] = \
|
|
'analytic_designs_rank_func' in request.form
|
|
|
|
schedule_item = ScheduleItem(
|
|
run_name = request.form['run_name'],
|
|
query_profile = new_profile,
|
|
time_limit_sec = int(request.form['time_limit']),
|
|
git_command = request.form['git_command'],
|
|
parent_job = '')
|
|
else:
|
|
# Run based on previous run
|
|
schedule_item = ScheduleItem(
|
|
run_name = request.form['run_name'],
|
|
query_profile = DefaultProfile(),
|
|
time_limit_sec = 24 * 3600, # Default time limit is 24 hours
|
|
git_command = request.form['git_command'],
|
|
parent_job = request.form['report_id'])
|
|
|
|
schedule_item.save_pickle()
|
|
|
|
return 'success'
|
|
|
|
@app.route("/custom_run")
|
|
def custom_run():
|
|
'''Render the custom run page.
|
|
'''
|
|
return render_template(
|
|
'custom_run.template',
|
|
assets=ASSETS)
|
|
|
|
def reload_reports():
|
|
'''Reload reports in the reports directory every 20 minutes. Loaded reports are placed
|
|
into app.reports. This allows new reports to appear on the front page. Only reports
|
|
from the past 7 days are loaded. This method should be run in a separate thread.
|
|
'''
|
|
while True:
|
|
new_reports = {}
|
|
try:
|
|
report_ids = os.listdir(PATH_TO_REPORTS)
|
|
except EnvironmentError as e:
|
|
report_ids = []
|
|
LOG.warn('{0}: {1}'.format(e.filename, e.strerror))
|
|
for report_id in report_ids:
|
|
file_age = time.time() - os.stat(
|
|
os.path.join(PATH_TO_REPORTS, report_id))[stat.ST_MTIME]
|
|
if file_age < MAX_REPORT_AGE:
|
|
# We want this report
|
|
if report_id in app.reports:
|
|
new_reports[report_id] = app.reports[report_id]
|
|
else:
|
|
with open(os.path.join(PATH_TO_REPORTS, report_id), 'r') as f:
|
|
new_reports[report_id] = pickle.load(f)
|
|
app.reports = new_reports
|
|
sleep(SLEEP_LENGTH)
|
|
|
|
@app.route("/")
|
|
def front_page():
|
|
'''Renders the front page as HTML.
|
|
'''
|
|
|
|
try:
|
|
schedule_item_ids = os.listdir(PATH_TO_SCHEDULE)
|
|
except EnvironmentError as e:
|
|
schedule_item_ids = []
|
|
LOG.warn('{0}: {1}'.format(e.filename, e.strerror))
|
|
|
|
schedule_items = []
|
|
|
|
for schedule_item_id in schedule_item_ids:
|
|
schedule_items.append(pickle.load(
|
|
open(os.path.join(PATH_TO_SCHEDULE, schedule_item_id), 'r')))
|
|
|
|
return render_template(
|
|
'index.template',
|
|
assets=ASSETS,
|
|
reports=sorted(
|
|
app.reports.items(), key=lambda (k, report): report.run_date, reverse=True),
|
|
schedule_items=schedule_items)
|
|
|
|
if __name__ == '__main__':
|
|
logging.basicConfig(
|
|
format='%(asctime)s %(levelname)s [%(name)s.%(threadName)s:%(lineno)s]: '
|
|
'%(message)s',
|
|
level=logging.INFO)
|
|
thread = Thread(name='reload_reports', target=reload_reports)
|
|
thread.daemon = True
|
|
thread.start()
|
|
app.run(host='0.0.0.0', debug=False)
|