mirror of
https://github.com/apache/impala.git
synced 2025-12-25 02:03:09 -05:00
This takes steps to make Python 2 behave like Python 3 as
a way to flush out issues with running on Python 3. Specifically,
it handles two main differences:
1. Python 3 requires absolute imports within packages. This
can be emulated via "from __future__ import absolute_import"
2. Python 3 changed division to "true" division that doesn't
round to an integer. This can be emulated via
"from __future__ import division"
This changes all Python files to add imports for absolute_import
and division. For completeness, this also includes print_function in the
import.
I scrutinized each old-division location and converted the locations
that need an integer result (e.g. indices, counts of records, etc.)
to use the integer division '//' operator. Some code was also using
relative imports and needed to be adjusted to work with absolute_import.
This fixes all Pylint warnings about no-absolute-import and old-division,
and these warnings are now banned.
Testing:
- Ran core tests
Change-Id: Idb0fcbd11f3e8791f5951c4944be44fb580e576b
Reviewed-on: http://gerrit.cloudera.org:8080/19588
Reviewed-by: Joe McDonnell <joemcdonnell@cloudera.com>
Tested-by: Joe McDonnell <joemcdonnell@cloudera.com>
173 lines
6.1 KiB
Python
173 lines
6.1 KiB
Python
# Licensed to the Apache Software Foundation (ASF) under one
|
|
# or more contributor license agreements. See the NOTICE file
|
|
# distributed with this work for additional information
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
# to you under the Apache License, Version 2.0 (the
|
|
# "License"); you may not use this file except in compliance
|
|
# with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
from __future__ import absolute_import, division, print_function
|
|
import pickle
|
|
import re
|
|
import os
|
|
import job
|
|
from collections import defaultdict
|
|
|
|
class Report(object):
  '''Contains information about a completed job, such as the number of crashes and stack
  traces from every crash. The report is usually displayed on a web page.

  Constructing a Report immediately calls get_results(), which loads the pickled job
  identified by job_id and fills in the attributes below.
  '''

  # Ordered (compiled pattern, label) pairs used by classify_error(). The first pattern
  # that matches wins. Some patterns are unanchored and can match the same message as an
  # anchored one, so the order is kept explicit instead of relying on dict iteration.
  _ERROR_PATTERNS = tuple((re.compile(pattern), label) for pattern, label in (
      (r'LINE \d+:', 'Postgres_error'),
      (r'Permission denied', 'permission_denied'),
      (r'^AnalysisException', 'AnalysisException'),
      (r'^Column \d+ in row \d+ does not match', 'mismatch'),
      (r'^Could not connect', 'could_not_connect'),
      (r'^IllegalStateException', 'IllegalStateException'),
      (r'^Invalid query handle: ', 'invalid_query_handle'),
      (r'^Invalid or unknown query handle: ', 'invalid_query_handle'),
      (r'^Known issue:', 'known_issue'),
      (r'^Operation is in ERROR_STATE', 'error_state'),
      (r'^Query timed out after \d+ seconds', 'timeout'),
      (r'^Row counts do not match', 'row_counts'),
      (r'^Too much data', 'too_much_data'),
      # The label spelling below is kept as-is: it is a runtime key that other code may
      # look up.
      (r'^Unknown expr node type: \d+', 'unkown_node'),
      (r'^Year is out of valid range', 'year_range'),
      (r'^[A-Za-z]+ out of range', 'out_of_range'),
      (r'^division by zero', 'division_by_zero'),
  ))

  # Captures the text from 'impala::' up to the last ' (' on a stack trace line.
  _IMPALA_FRAME_RE = re.compile(r'(impala::.*) \(')
  # Matches frames of the form "#<n> 0x<address> in <rest>" and captures <rest>.
  _FRAME_WITH_ADDRESS_RE = re.compile(r'#\d+ *0x[0123456789abcdef]* in (.*)')
  # Matches frames without an address, like "#7 SLL_Next (this=0x9046780, ...".
  _FRAME_WITHOUT_ADDRESS_RE = re.compile(r'#\d+ *(\S.*)')

  def __init__(self, job_id):
    '''Creates the report for the finished job stored under job_id and loads its
    results right away. Any error raised while reading the job file propagates.
    '''
    # NOTE(review): num_queries is never updated in this class; get_results() stores the
    # count loaded from the job in num_queries_executed. Confirm which one consumers read.
    self.num_queries = 0
    self.num_queries_executed = 0
    self.run_time = 0
    self.run_date = 0
    self.job_name = ''
    self.num_crashes = 0
    self.num_row_count_mismatch = 0
    self.num_mismatch = 0
    self.job_id = job_id
    self.git_hash = ''
    self.grouped_results = None
    self.parent_job_name = ''
    self.num_queries_returned_correct_data = 0
    self.get_results()

  @property
  def run_time_str(self):
    '''Return the running time of the job as a string in human readable format
    (HH:MM:SS). Fractional seconds are truncated.
    '''
    minutes, seconds = divmod(self.run_time, 60)
    hours, minutes = divmod(minutes, 60)
    return '{0:02d}:{1:02d}:{2:02d}'.format(int(hours), int(minutes), int(seconds))

  def classify_error(self, error):
    '''Returns the label of the first entry in _ERROR_PATTERNS whose pattern is found in
    the error message, or 'unrecognized' if none of them match.
    '''
    for pattern, label in self._ERROR_PATTERNS:
      if pattern.search(error):
        return label
    return 'unrecognized'

  def group_queries(self, all_queries, group_func):
    '''General function that returns a dictionary with keys that are generated by
    group_func. all_queries is a list of queries.
    group_func should take query as a parameter and return a string containing an
    interesting property of the query which will be used as key in the dictionary.

    The result is a defaultdict(list), so looking up a key that no query produced
    yields (and stores) an empty list.
    '''
    grouped_queries = defaultdict(list)
    for query in all_queries:
      grouped_queries[group_func(query)].append(query)
    return grouped_queries

  def __str__(self):
    '''TODO: Render report as text.
    '''
    return ''

  def get_first_impala_frame(self, query_result):
    '''Extracts the first impala frame in the stack trace.

    query_result must have a 'formatted_stack' entry. Returns None when that entry is
    empty or when no line contains an impala frame.
    '''
    stack = query_result['formatted_stack']
    if not stack:
      return None
    for line in stack.split('\n'):
      match = self._IMPALA_FRAME_RE.search(line)
      if match:
        return match.group(1)
    return None

  @classmethod
  def _clean_frame(cls, frame):
    '''Strips the leading frame number (and memory address, if present) from a frame.
    Lines that do not look like a frame are returned unchanged.
    '''
    for frame_re in (cls._FRAME_WITH_ADDRESS_RE, cls._FRAME_WITHOUT_ADDRESS_RE):
      match = frame_re.match(frame)
      if match:
        return match.group(1)
    return frame

  def _format_stack(self, stack):
    '''Cleans up the stack trace.

    Everything before the first line starting with '#0' is dropped; that line and every
    line after it is passed through _clean_frame. Returns an empty string when stack is
    empty/None or contains no '#0' line.
    '''
    if not stack:
      return ''
    lines = stack.split('\n')
    for first_frame_idx, line in enumerate(lines):
      if line.startswith('#0'):
        break
    else:
      # No '#0' frame anywhere in the trace.
      return ''
    return '\n'.join(self._clean_frame(line) for line in lines[first_frame_idx:])

  @staticmethod
  def _load_pickle(path):
    '''Loads and returns the pickled object stored at path.

    The file is opened in binary mode because pickle reads bytes; a text-mode file
    object does not work with pickle.load on Python 3.
    NOTE: pickle must only be used on trusted files. These files are presumably written
    by this tool itself - confirm before pointing this at any other input.
    '''
    with open(path, 'rb') as f:
      return pickle.load(f)

  def get_results(self):
    '''Analyses the completed job and extracts important results into self. This method
    should be called as soon as the object is created.

    Reads the pickled job at PATH_TO_FINISHED_JOBS/<job_id> (and the parent job, if the
    job has one). Errors from opening or unpickling those files propagate to the caller.
    '''
    # Imported here rather than at module level, presumably to avoid a circular import
    # with controller - confirm.
    from controller import PATH_TO_FINISHED_JOBS

    def group_outer_func(query):
      # Queries that carry a stack trace are crashes; everything else is grouped by the
      # kind of error message.
      if 'stack' in query:
        return 'stack'
      return self.classify_error(query['error'])

    finished_job = self._load_pickle(os.path.join(PATH_TO_FINISHED_JOBS, self.job_id))
    self.grouped_results = self.group_queries(
        finished_job.result_list, group_outer_func)

    # Format the stack for queries that have a stack
    crashed_queries = self.grouped_results['stack']
    for query in crashed_queries:
      query['formatted_stack'] = self._format_stack(query['stack'])

    # grouped_results is a defaultdict, so missing groups count as zero.
    self.num_crashes = len(crashed_queries)
    self.num_row_count_mismatch = len(self.grouped_results['row_counts'])
    self.num_mismatch = len(self.grouped_results['mismatch'])

    self.grouped_stacks = self.group_queries(
        crashed_queries, self.get_first_impala_frame)

    self.run_time = finished_job.stop_time - finished_job.start_time
    self.run_date = finished_job.start_time
    self.job_name = finished_job.job_name
    self.git_hash = finished_job.git_hash
    self.num_queries_executed = finished_job.num_queries_executed
    self.num_queries_returned_correct_data = (
        finished_job.num_queries_returned_correct_data)
    if finished_job.parent_job:
      parent_job = self._load_pickle(
          os.path.join(PATH_TO_FINISHED_JOBS, finished_job.parent_job))
      self.parent_job_name = parent_job.job_name

  def save_pickle(self):
    '''Pickles this report to PATH_TO_REPORTS/<job_id>.

    The file is opened in binary mode because pickle writes bytes; a text-mode file
    object makes pickle.dump fail on Python 3.
    '''
    from controller import PATH_TO_REPORTS
    with open(os.path.join(PATH_TO_REPORTS, self.job_id), 'wb') as f:
      pickle.dump(self, f)
|