#!/usr/bin/env python
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
#
# This module provides common utilities for parsing test files
import collections
import logging
import re
from collections import defaultdict
from os.path import isfile, isdir
from tests.common.test_dimensions import TableFormatInfo
logging.basicConfig(level=logging.INFO, format='%(threadName)s: %(message)s')
LOG = logging.getLogger('impala_test_suite')
# The QueryTestSectionReader provides utility functions that help to parse content
# from a query test file
class QueryTestSectionReader(object):
  @staticmethod
  def replace_table_suffix(section_text, table_format_info):
    """
    Replaces the $TABLE suffix that is appended to all table names.
    $TABLE is replaced with a suffix built from the table format info (file_format, etc).
    TODO: This will be updated when we move to a naming scheme based on the database.
    """
    table_suffix = QueryTestSectionReader.__build_table_suffix(
        table_format_info.file_format, table_format_info.compression_codec,
        table_format_info.compression_type)
    return section_text.replace('$TABLE', table_suffix)
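
  # Example (illustrative values only; the section text and table format below are
  # hypothetical):
  #   replace_table_suffix('select * from alltypes$TABLE', table_format_info)
  #   with file_format='seq', compression_codec='snap', compression_type='block'
  #   would return 'select * from alltypes_seq_snap'.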

  @staticmethod
  def build_query(query_section_text, table_format_info, scale_factor):
    """
    Builds a well-formed query.
    Given the various test parameters, construct the query that will be executed. This
    does more work than replace_table_suffix because it also needs to substitute the
    proper database name based on the given scale factor.
    """
    query_section_text = remove_comments(query_section_text)
    dataset = table_format_info.dataset
    file_format, codec, compression_type = (table_format_info.file_format,
                                            table_format_info.compression_codec,
                                            table_format_info.compression_type)
    database_name =\
        QueryTestSectionReader.__database_name_to_use(dataset, scale_factor)
    table_suffix = QueryTestSectionReader.__build_table_suffix(file_format, codec,
                                                               compression_type)
    # $TABLE is used as a token for the table suffix in the queries. Here we replace the
    # token and prefix table names with the proper database name based on the dataset
    # and scale factor.
    # There may also be cases where there is a dbname.table_name without a $TABLE (in
    # the case of insert). These still need to be fixed up with the proper scale factor.
    replace_from =\
        r'(%(dataset)s\.)(?P<table_name>\w+)' % {'dataset': dataset}
    replace_by = '%s%s' % (database_name, r'\g<table_name>')
    query_str = re.sub(replace_from, replace_by, query_section_text)
    replace_from =\
        r'(%(dataset)s){0,1}(?P<table_name>\w+)\$TABLE' % {'dataset': database_name}
    replace_by = '%s%s%s' % (database_name, r'\g<table_name>', table_suffix)
    return re.sub(replace_from, replace_by, query_str).strip().rstrip(';')
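
  # Example (illustrative values only; the query text, dataset 'tpch' and scale factor
  # '300' are hypothetical):
  #   build_query('select * from lineitem$TABLE', table_format_info, '300')
  #   with dataset='tpch', file_format='seq', codec='snap', compression_type='block'
  #   prefixes the table with the scaled database name and expands the $TABLE token,
  #   producing roughly: select * from tpch300.lineitem_seq_snap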

  @staticmethod
  def __database_name_to_use(workload, scale_factor):
    """
    Returns the name of the database to use for the specified workload and scale factor.
    """
    if workload != 'functional':
      return '%s%s.' % (workload, scale_factor)
    return ''

  @staticmethod
  def __build_table_suffix(file_format, codec, compression_type):
    if file_format == 'text' and codec == 'none':
      return ''
    elif codec == 'none':
      return '_%s' % (file_format)
    elif compression_type == 'record':
      return '_%s_record_%s' % (file_format, codec)
    else:
      return '_%s_%s' % (file_format, codec)
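
  # A few illustrative suffixes produced by __build_table_suffix
  # (file_format, codec, compression_type values are examples only):
  #   ('text', 'none', 'none')  -> ''
  #   ('seq', 'none', 'none')   -> '_seq'
  #   ('seq', 'gzip', 'record') -> '_seq_record_gzip'
  #   ('seq', 'snap', 'block')  -> '_seq_snap'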

def remove_comments(section_text):
  return '\n'.join([l for l in section_text.split('\n') if not l.strip().startswith('#')])

def parse_query_test_file(file_name):
  """
  Reads the specified query test file.
  Returns the result as a list of dictionaries. Each dictionary in the list corresponds
  to a test case and each key in the dictionary maps to a section in that test case.
  """
  # Update the valid section names as we support other test types
  # (ex. planner, data error)
  VALID_SECTION_NAMES = ['QUERY', 'RESULTS', 'TYPES', 'PARTITIONS', 'SETUP']
  return parse_test_file(file_name, VALID_SECTION_NAMES)

def parse_table_constraints(constraints_file):
  """
  Reads a table constraints file, if one exists.
  TODO: once the python test framework changes are committed this can be moved to a
  common utility function so the tests themselves can make use of this code.
  """
  schema_include = defaultdict(list)
  schema_exclude = defaultdict(list)
  if not isfile(constraints_file):
    LOG.info('No schema constraints file found')
  else:
    with open(constraints_file, 'rb') as constraints_file:
      for line in constraints_file.readlines():
        line = line.strip()
        if not line or line.startswith('#'):
          continue
        # Format: table_name:<name>, constraint_type:<type>, file_format:<t1>,<t2>,...
        table_name, constraint_type, file_types =\
            [value.split(':')[1].strip() for value in line.split(',', 2)]
        if constraint_type == 'restrict_to':
          schema_include[table_name.lower()] += file_types.split(',')
        elif constraint_type == 'exclude':
          schema_exclude[table_name.lower()] += file_types.split(',')
        else:
          raise ValueError('Unknown constraint type: %s' % constraint_type)
  return schema_include, schema_exclude
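
# Example (hypothetical constraints file line; the table and format names are made up):
#   table_name: alltypes, constraint_type: restrict_to, file_format: text,seq
# Parsing a file containing only the line above would yield:
#   schema_include == {'alltypes': ['text', 'seq']} and an empty schema_exclude.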

def parse_test_file(test_file_name, valid_section_names, skip_unknown_sections=True):
  """
  Parses an Impala test file.
  Test files have the format:
    ==== <- Section
    ---- [Name] <- Named subsection
    // some text
    ---- [Name2] <- Named subsection
    ...
    ====
  The valid section names are passed in to this function.
  """
  test_file = open(test_file_name, 'rb')
  sections = list()
  # Read the test file, stripping out all comments
  file_lines = [l for l in test_file.read().split('\n') if not l.strip().startswith('#')]
  # Split the test file up into sections. For each section, parse all subsections.
  for section in '\n'.join(file_lines).split('===='):
    parsed_sections = collections.defaultdict(str)
    for sub_section in section.split('----')[1:]:
      lines = sub_section.split('\n')
      subsection_name = lines[0].strip()
      subsection_comment = None
      subsection_info = [s.strip() for s in subsection_name.split(':')]
      if len(subsection_info) == 2:
        subsection_name, subsection_comment = subsection_info
      if subsection_name not in valid_section_names:
        if skip_unknown_sections or not subsection_name:
          print 'Unknown section %s' % subsection_name
          continue
        else:
          raise RuntimeError('Unknown subsection: %s' % subsection_name)
      if subsection_name == 'QUERY' and subsection_comment:
        parsed_sections['QUERY_NAME'] = subsection_comment
      parsed_sections[subsection_name] = '\n'.join([line for line in lines[1:-1]])
    if parsed_sections:
      sections.append(parsed_sections)
  test_file.close()
  return sections
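
# Example (hypothetical test file content):
#   ====
#   ---- QUERY : sample_query_name
#   select count(*) from alltypes$TABLE
#   ---- RESULTS
#   7300
#   ---- TYPES
#   bigint
#   ====
# With the section names used by parse_query_test_file, parse_test_file() on the above
# would return a list containing a single dictionary roughly like:
#   {'QUERY_NAME': 'sample_query_name',
#    'QUERY': 'select count(*) from alltypes$TABLE',
#    'RESULTS': '7300',
#    'TYPES': 'bigint'}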

def write_test_file(test_file_name, test_file_sections):
  """
  Given a list of test file sections, writes out the corresponding test file.
  This is useful when updating the results of a test.
  """
  with open(test_file_name, 'w') as test_file:
    for test_case in test_file_sections:
      test_file.write("====\n")
      for section_name, section_value in test_case.items():
        # Have to special case the query name because it is a section name annotation
        if section_name == 'QUERY_NAME':
          continue
        full_section_name = section_name
        if section_name == 'QUERY' and 'QUERY_NAME' in test_case:
          full_section_name = '%s : %s' % (section_name, test_case['QUERY_NAME'])
        test_file.write('---- %s\n' % full_section_name)
        test_file.write(test_case[section_name] + '\n')
    test_file.write('====')
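
# A minimal round-trip sketch (the file name 'my_test.test' is hypothetical):
#   sections = parse_query_test_file('my_test.test')
#   for section in sections:
#     section['RESULTS'] = '...updated results...'
#   write_test_file('my_test.test', sections)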