From 461a48df2bfcfc324fac27c1f937993fd108bbc0 Mon Sep 17 00:00:00 2001 From: Skye Wanderman-Milne Date: Wed, 27 Feb 2013 18:26:31 -0800 Subject: [PATCH] Refactor testing framework to generate Avro tables. --- bin/load-data.py | 32 +- testdata/bin/generate-schema-statements.py | 219 ++++++-- testdata/bin/generate-test-vectors.py | 4 +- .../functional/functional_schema_template.sql | 526 ++++++++---------- .../datasets/tpcds/tpcds_schema_template.sql | 452 +++++++-------- .../datasets/tpch/tpch_schema_template.sql | 340 +++++------ .../functional-query_core.csv | 2 + .../functional-query_dimensions.csv | 2 +- .../functional-query_exhaustive.csv | 2 + .../functional-query_pairwise.csv | 7 +- .../hive-benchmark_dimensions.csv | 2 +- .../hive-benchmark_exhaustive.csv | 2 + .../hive-benchmark_pairwise.csv | 7 +- .../targeted-perf/targeted-perf_core.csv | 2 + .../targeted-stress/targeted-stress_core.csv | 2 + testdata/workloads/tpcds/tpcds_dimensions.csv | 2 +- testdata/workloads/tpcds/tpcds_exhaustive.csv | 2 + testdata/workloads/tpcds/tpcds_pairwise.csv | 7 +- testdata/workloads/tpch/tpch_core.csv | 2 + tests/common/impala_test_suite.py | 3 +- tests/common/test_result_verifier.py | 25 +- tests/query_test/test_aggregation.py | 4 + tests/query_test/test_queries.py | 5 + 23 files changed, 850 insertions(+), 801 deletions(-) diff --git a/bin/load-data.py b/bin/load-data.py index d7efa513a..b658a2b98 100755 --- a/bin/load-data.py +++ b/bin/load-data.py @@ -19,6 +19,7 @@ parser.add_option("-e", "--exploration_strategy", dest="exploration_strategy", help="The exploration strategy for schema gen: 'core', "\ "'pairwise', or 'exhaustive'") parser.add_option("--hive_warehouse_dir", dest="hive_warehouse_dir", + default="/test-warehouse", help="The HDFS path to the base Hive test warehouse directory") parser.add_option("-w", "--workloads", dest="workloads", help="Comma-separated list of workloads to load data for. If 'all' is "\ @@ -38,19 +39,21 @@ parser.add_option("--table_names", dest="table_names", default=None, parser.add_option("--table_formats", dest="table_formats", default=None, help="Override the test vectors and load using the specified table "\ "formats. Ex. 
--table_formats=seq/snap/block,text/none") - +parser.add_option("--hdfs_namenode", dest="hdfs_namenode", default="localhost:20500", + help="HDFS name node for Avro schema URLs, default localhost:20500") (options, args) = parser.parse_args() WORKLOAD_DIR = os.environ['IMPALA_WORKLOAD_DIR'] DATASET_DIR = os.environ['IMPALA_DATASET_DIR'] TESTDATA_BIN_DIR = os.path.join(os.environ['IMPALA_HOME'], 'testdata/bin') +AVRO_SCHEMA_DIR = "avro_schemas" GENERATE_SCHEMA_CMD = "generate-schema-statements.py --exploration_strategy=%s "\ "--workload=%s --scale_factor=%s --verbose" HIVE_CMD = os.path.join(os.environ['HIVE_HOME'], 'bin/hive') HIVE_ARGS = "-hiveconf hive.root.logger=WARN,console -v" - IMPALA_SHELL_CMD = os.path.join(os.environ['IMPALA_HOME'], 'bin/impala-shell.sh') +HADOOP_CMD = os.path.join(os.environ['HADOOP_HOME'], 'bin/hadoop') def available_workloads(workload_dir): return [subdir for subdir in os.listdir(workload_dir) @@ -99,6 +102,8 @@ def generate_schema_statements(workload): generate_cmd += " --table_formats=%s" % options.table_formats if options.hive_warehouse_dir is not None: generate_cmd += " --hive_warehouse_dir=%s" % options.hive_warehouse_dir + if options.hdfs_namenode is not None: + generate_cmd += " --hdfs_namenode=%s" % options.hdfs_namenode print 'Executing Generate Schema Command: ' + generate_cmd ret_val = subprocess.call(os.path.join(TESTDATA_BIN_DIR, generate_cmd), shell = True) if ret_val != 0: @@ -119,6 +124,15 @@ def get_dataset_for_workload(workload): print 'Dimension file does not contain dataset for workload \'%s\'' % (workload) sys.exit(1) +def copy_avro_schemas_to_hdfs(schemas_dir): + """Recursively copies all of schemas_dir to the test warehouse.""" + cmd = "%s fs -put -f %s /" % (HADOOP_CMD, schemas_dir) + print "Executing HDFS copy command: " + cmd + ret_val = subprocess.call(cmd, shell=True) + if ret_val != 0: + print "Error copying Avro schemas to HDFS, exiting" + sys.exit(ret_val) + if __name__ == "__main__": all_workloads = available_workloads(WORKLOAD_DIR) workloads = [] @@ -142,8 +156,18 @@ if __name__ == "__main__": dataset_dir = os.path.join(DATASET_DIR, dataset) os.chdir(dataset_dir) generate_schema_statements(workload) - exec_hive_query_from_file(os.path.join(dataset_dir, - 'load-%s-%s-generated.sql' % (workload, options.exploration_strategy))) + # We load Avro tables separately due to bugs in the Avro SerDe. + # generate-schema-statements.py separates the avro statements into a + # separate file to get around this. + # See https://issues.apache.org/jira/browse/HIVE-4195. 
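+  # For example, with workload 'tpch' and exploration strategy 'core', the
+  # generated files are load-tpch-core-generated.sql and
+  # load-tpch-core-avro-generated.sql (the Avro file is only written when
+  # Avro table formats were generated).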
+ generated_file = 'load-%s-%s-generated.sql' % (workload, options.exploration_strategy) + if os.path.exists(generated_file): + exec_hive_query_from_file(os.path.join(dataset_dir, generated_file)) + generated_avro_file = \ + 'load-%s-%s-avro-generated.sql' % (workload, options.exploration_strategy) + if os.path.exists(generated_avro_file): + copy_avro_schemas_to_hdfs(AVRO_SCHEMA_DIR) + exec_hive_query_from_file(os.path.join(dataset_dir, generated_avro_file)) loading_time_map[workload] = time.time() - start_time total_time = 0.0 diff --git a/testdata/bin/generate-schema-statements.py b/testdata/bin/generate-schema-statements.py index 993a5ed5b..6be097e72 100755 --- a/testdata/bin/generate-schema-statements.py +++ b/testdata/bin/generate-schema-statements.py @@ -25,6 +25,7 @@ import collections import csv import math +import json import os import random import subprocess @@ -58,6 +59,8 @@ parser.add_option("--table_names", dest="table_names", default=None, parser.add_option("--table_formats", dest="table_formats", default=None, help="Override the test vectors and load using the specified table "\ "formats. Ex. --table_formats=seq/snap/block,text/none") +parser.add_option("--hdfs_namenode", dest="hdfs_namenode", default="localhost:20500", + help="HDFS name node for Avro schema URLs, default localhost:20500") (options, args) = parser.parse_args() if options.workload is None: @@ -67,10 +70,12 @@ if options.workload is None: WORKLOAD_DIR = os.environ['IMPALA_HOME'] + '/testdata/workloads' DATASET_DIR = os.environ['IMPALA_HOME'] + '/testdata/datasets' +AVRO_SCHEMA_DIR = "avro_schemas" COMPRESSION_TYPE = "SET mapred.output.compression.type=%s;" COMPRESSION_ENABLED = "SET hive.exec.compress.output=%s;" COMPRESSION_CODEC = "SET mapred.output.compression.codec=%s;" +AVRO_COMPRESSION_CODEC = "SET avro.output.codec=%s;" SET_DYNAMIC_PARTITION_STATEMENT = "SET hive.exec.dynamic.partition=true;" SET_PARTITION_MODE_NONSTRICT_STATEMENT = "SET hive.exec.dynamic.partition.mode=nonstrict;" SET_HIVE_INPUT_FORMAT = "SET mapred.max.split.size=256000000;\n"\ @@ -88,17 +93,42 @@ COMPRESSION_MAP = {'def': 'org.apache.hadoop.io.compress.DefaultCodec', 'none': '' } -FILE_FORMAT_MAP = {'text': 'TEXTFILE', - 'seq': 'SEQUENCEFILE', - 'rc': 'RCFILE', - 'parquet': '\n' + - 'INPUTFORMAT \'com.cloudera.impala.hive.serde.ParquetInputFormat\'\n' + - 'OUTPUTFORMAT \'com.cloudera.impala.hive.serde.ParquetOutputFormat\'', - 'text_lzo': '\n' + - 'INPUTFORMAT \'com.hadoop.mapred.DeprecatedLzoTextInputFormat\'\n' + - 'OUTPUTFORMAT ' + - '\'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat\'\n' - } +AVRO_COMPRESSION_MAP = { + 'def': 'deflate', + 'snap': 'snappy', + 'none': '', + } + +FILE_FORMAT_MAP = { + 'text': 'TEXTFILE', + 'seq': 'SEQUENCEFILE', + 'rc': 'RCFILE', + 'parquet': + "\nINPUTFORMAT 'com.cloudera.impala.hive.serde.ParquetInputFormat'" + + "\nOUTPUTFORMAT 'com.cloudera.impala.hive.serde.ParquetOutputFormat'", + 'text_lzo': + "\nINPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'" + + "\nOUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'", + 'avro': + "\nINPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'" + + "\nOUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'" + } + +HIVE_TO_AVRO_TYPE_MAP = { + 'STRING': 'string', + 'INT': 'int', + 'TINYINT': 'int', + 'SMALLINT': 'int', + 'BIGINT': 'long', + 'BOOLEAN': 'boolean', + 'FLOAT': 'float', + 'DOUBLE': 'double', + # Avro has no timestamp type, so convert to string + # TODO: this allows us to 
create our Avro test tables, but any tests that use + # a timestamp column will fail. We probably want to convert back to timestamps + # in our tests. + 'TIMESTAMP': 'string', + } PARQUET_ALTER_STATEMENT = "ALTER TABLE %(table_name)s SET\n\ SERDEPROPERTIES ('blocksize' = '1073741824', 'compression' = '%(compression)s');" @@ -117,12 +147,66 @@ def build_create_statement(table_template, table_name, db_name, db_suffix, hdfs_location=hdfs_location) return create_statement -def build_compression_codec_statement(codec, compression_type): - compression_codec = COMPRESSION_MAP[codec] - if compression_codec: - return COMPRESSION_TYPE % compression_type.upper() + '\n' +\ - COMPRESSION_CODEC % compression_codec - return '' +def build_table_template(file_format, columns, partition_columns, row_format, + avro_schema_dir): + partitioned_by = str() + if partition_columns: + partitioned_by = 'PARTITIONED BY (%s)' % \ + ', '.join(partition_columns.split('\n')) + + row_format_stmt = str() + if row_format: + row_format_stmt = 'ROW FORMAT ' + row_format + + tblproperties = str() + if file_format == 'avro': + tblproperties = "TBLPROPERTIES ('avro.schema.url'=" \ + "'hdfs://%s/%s/{table_name}.json')" \ + % (options.hdfs_namenode, avro_schema_dir) + # Override specified row format + row_format_stmt = "ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'" + + # Note: columns are ignored but allowed if a custom serde is specified + # (e.g. Avro) + return """ +CREATE EXTERNAL TABLE {{db_name}}{{db_suffix}}.{{table_name}} ( +{columns}) +{partitioned_by} +{row_format} +STORED AS {{file_format}} +LOCATION '{hive_warehouse_dir}/{{hdfs_location}}' +{tblproperties}; +""".format( + row_format=row_format_stmt, + columns=',\n'.join(columns.split('\n')), + partitioned_by=partitioned_by, + hive_warehouse_dir=options.hive_warehouse_dir, + tblproperties=tblproperties + ).strip() + +def avro_schema(columns): + record = { + "name": "a", # doesn't matter + "type": "record", + "fields": list() + } + for column_spec in columns.strip().split('\n'): + # column_spec looks something like "col_name col_type COMMENT comment" + # (comment may be omitted, we don't use it) + name = column_spec.split()[0] + type = column_spec.split()[1] + assert type.upper() in HIVE_TO_AVRO_TYPE_MAP, "Cannot convert to Avro type: %s" % type + record["fields"].append( + {'name': name, + 'type': [HIVE_TO_AVRO_TYPE_MAP[type.upper()], "null"]}) # all columns nullable + return json.dumps(record) + +def build_compression_codec_statement(codec, compression_type, file_format): + codec = AVRO_COMPRESSION_MAP[codec] if file_format == 'avro' else COMPRESSION_MAP[codec] + if not codec: + return str() + return (AVRO_COMPRESSION_CODEC % codec) if file_format == 'avro' else ( + COMPRESSION_TYPE % compression_type.upper() + '\n' + COMPRESSION_CODEC % codec) def build_codec_enabled_statement(codec): compression_enabled = 'false' if codec == 'none' else 'true' @@ -138,9 +222,10 @@ def build_insert_into_statement(insert, db_name, db_suffix, table_name, file_for statement = SET_PARTITION_MODE_NONSTRICT_STATEMENT + "\n" statement += SET_DYNAMIC_PARTITION_STATEMENT + "\n" - # For some reason (hive bug?) we need to have the CombineHiveInputFormat set for cases - # where we are compressing in bzip on certain tables that have multiple files. - if 'bzip' in db_suffix and 'multi' in table_name: + # For some reason (hive bug?) we need to have the CombineHiveInputFormat set + # for cases where we are compressing in bzip or lzo on certain tables that + # have multiple files. 
+ if 'multi' in table_name and ('bzip' in db_suffix or 'lzo' in db_suffix): statement += SET_HIVE_INPUT_FORMAT % "CombineHiveInputFormat" else: statement += SET_HIVE_INPUT_FORMAT % "HiveInputFormat" @@ -149,7 +234,7 @@ def build_insert_into_statement(insert, db_name, db_suffix, table_name, file_for def build_insert(insert, db_name, db_suffix, file_format, codec, compression_type, table_name): output = build_codec_enabled_statement(codec) + "\n" - output += build_compression_codec_statement(codec, compression_type) + "\n" + output += build_compression_codec_statement(codec, compression_type, file_format) + "\n" output += build_insert_into_statement(insert, db_name, db_suffix, table_name, file_format) + "\n" return output @@ -180,15 +265,6 @@ def build_db_suffix(file_format, codec, compression_type): else: return '_%s_%s' % (file_format, codec) -def write_parquet_to_file(file_name, array): - # Strip out all the hive SET statements - array.insert(0, 'refresh;\n') - write_array_to_file(file_name, 'w', array) - -def write_array_to_file(file_name, mode, array): - with open(file_name, mode) as f: - f.write('\n\n'.join(array)) - # Does a hdfs directory listing and returns array with all the subdir names. def get_hdfs_subdirs_with_data(path): tmp_file = tempfile.TemporaryFile("w+") @@ -201,13 +277,34 @@ def get_hdfs_subdirs_with_data(path): # So to get subdirectory names just return everything after the last '/' return [line[line.rfind('/') + 1:].strip() for line in tmp_file.readlines()] +class Statements(object): + """Simple container object for storing SQL statements to be output to a + file. Useful for ordering the statements correctly.""" + def __init__(self): + self.create = list() + self.load = list() + self.load_base = list() + + def write_to_file(self, filename): + # Only write to file if there's something to actually write + if self.create or self.load_base or self.load: + # Make sure we create the base tables first + output = self.create + self.load_base + self.load + with open(filename, 'w') as f: + f.write('\n\n'.join(output)) + def generate_statements(output_name, test_vectors, sections, schema_include_constraints, schema_exclude_constraints): - output_stats = [SET_HIVE_INPUT_FORMAT % "HiveInputFormat"] - output_create = [] - output_load = [] - output_load_base = [] - output_parquet = [] + # The Avro SerDe causes strange problems with other unrelated tables (e.g., + # Avro files will be written to LZO-compressed text tables). We generate + # separate schema statement files for Avro tables so we can invoke Hive + # completely separately for them. + # See https://issues.apache.org/jira/browse/HIVE-4195. 
+ avro_output = Statements() + # Parquet statements to be executed separately by Impala + parquet_output = Statements() + default_output = Statements() + table_names = None if options.table_names: table_names = [name.lower() for name in options.table_names.split(',')] @@ -216,6 +313,7 @@ def generate_statements(output_name, test_vectors, sections, file_format, data_set, codec, compression_type =\ [row.file_format, row.dataset, row.compression_codec, row.compression_type] table_format = '%s/%s/%s' % (file_format, codec, compression_type) + output = default_output if 'avro' not in table_format else avro_output for section in sections: alter = section.get('ALTER') @@ -223,6 +321,9 @@ def generate_statements(output_name, test_vectors, sections, insert = section['DEPENDENT_LOAD'] load_local = section['LOAD'] base_table_name = section['BASE_TABLE_NAME'] + columns = section['COLUMNS'] + partition_columns = section['PARTITION_COLUMNS'] + row_format = section['ROW_FORMAT'] table_name = base_table_name db_suffix = build_db_suffix(file_format, codec, compression_type) db_name = '{0}{1}'.format(data_set, options.scale_factor) @@ -253,14 +354,36 @@ def generate_statements(output_name, test_vectors, sections, print 'Skipping \'%s\' due to exclude constraint match' % table_name continue - output_create.append(build_create_statement(create, table_name, db_name, db_suffix, - file_format, codec, hdfs_location)) + # If a CREATE section is provided, use that. Otherwise a COLUMNS section + # must be provided (and optionally PARTITION_COLUMNS and ROW_FORMAT + # sections), which is used to generate the create table statement. + if create: + table_template = create + if file_format == 'avro': + # We don't know how to generalize CREATE sections to Avro. + print "CREATE section not supported with Avro, skipping: '%s'" % table_name + continue + else: + assert columns, "No CREATE or COLUMNS section defined for table " + table_name + avro_schema_dir = "%s/%s" % (AVRO_SCHEMA_DIR, data_set) + table_template = build_table_template( + file_format, columns, partition_columns, row_format, avro_schema_dir) + # Write Avro schema to local file + if not os.path.exists(avro_schema_dir): + os.makedirs(avro_schema_dir) + with open("%s/%s.json" % (avro_schema_dir, table_name),"w") as f: + f.write(avro_schema(columns)) + + output.create.append( + build_create_statement(table_template, table_name, db_name, db_suffix, + file_format, codec, hdfs_location)) + # The ALTER statement in hive does not accept fully qualified table names. # We need the use statement. if alter: use_table = 'USE {db_name}{db_suffix};\n'.format(db_name=db_name, db_suffix=db_suffix) - output_create.append(use_table + alter.format(table_name=table_name)) + output.create.append(use_table + alter.format(table_name=table_name)) # If the directory already exists in HDFS, assume that data files already exist # and skip loading the data. Otherwise, the data is generated using either an @@ -271,35 +394,35 @@ def generate_statements(output_name, test_vectors, sections, print 'HDFS path:', data_path, 'does not exists or is empty. Data will be loaded.' if not db_suffix: if load_local: - output_load_base.append(build_load_statement(load_local, db_name, + output.load_base.append(build_load_statement(load_local, db_name, db_suffix, table_name)) else: print 'Empty base table load for %s. 
Skipping load generation' % table_name elif file_format == 'parquet': if insert: - # In most cases the same load logic can be used for the parquet and + # In most cases the same load logic can be used for the parquet and # non-parquet case, but sometimes it needs to be special cased. insert = insert if 'LOAD_PARQUET' not in section else section['LOAD_PARQUET'] - output_parquet.append(build_insert_into_statement( + parquet_output.load.append(build_insert_into_statement( insert, db_name, db_suffix, table_name, 'parquet', for_impala=True)) else: print \ 'Empty parquet load for table %s. Skipping insert generation' % table_name else: if insert: - output_load.append(build_insert(insert, db_name, db_suffix, file_format, + output.load.append(build_insert(insert, db_name, db_suffix, file_format, codec, compression_type, table_name)) else: print 'Empty insert for table %s. Skipping insert generation' % table_name - # Make sure we create the base tables first - output_load = output_create + output_load_base + output_load - write_array_to_file('load-' + output_name + '-generated.sql', 'w', output_load) - write_parquet_to_file('load-parquet-' + output_name + '-generated.sql', output_parquet); + avro_output.write_to_file('load-' + output_name + '-avro-generated.sql') + parquet_output.write_to_file('load-' + output_name + '-parquet-generated.sql') + default_output.write_to_file('load-' + output_name + '-generated.sql') def parse_schema_template_file(file_name): - VALID_SECTION_NAMES = ['DATASET', 'BASE_TABLE_NAME', 'CREATE', 'DEPENDENT_LOAD', - 'LOAD', 'ALTER', 'LOAD_PARQUET'] + VALID_SECTION_NAMES = ['DATASET', 'BASE_TABLE_NAME', 'COLUMNS', 'PARTITION_COLUMNS', + 'ROW_FORMAT', 'CREATE', 'DEPENDENT_LOAD', 'LOAD', 'ALTER', + 'LOAD_PARQUET'] return parse_test_file(file_name, VALID_SECTION_NAMES, skip_unknown_sections=False) if __name__ == "__main__": diff --git a/testdata/bin/generate-test-vectors.py b/testdata/bin/generate-test-vectors.py index a82acb9b7..945d3b3c1 100755 --- a/testdata/bin/generate-test-vectors.py +++ b/testdata/bin/generate-test-vectors.py @@ -82,7 +82,9 @@ def is_valid_combination(vector): (vector[COMPRESSION_IDX] != 'none' and vector[COMPRESSION_TYPE_IDX] == 'none') or (vector[FILE_FORMAT_IDX] != 'seq' and vector[COMPRESSION_TYPE_IDX] == 'record') or (vector[FILE_FORMAT_IDX] == 'parquet' and - (vector[COMPRESSION_IDX] == 'gzip' or vector[COMPRESSION_IDX] == 'bzip'))) + (vector[COMPRESSION_IDX] == 'gzip' or vector[COMPRESSION_IDX] == 'bzip')) or + (vector[FILE_FORMAT_IDX] == 'avro' and + vector[COMPRESSION_IDX] not in ['none', 'snap', 'def'])) # The pairwise generator may call this with different vector lengths. In that case this # should always return true. 
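Note: the avro_schema() helper added in generate-schema-statements.py above maps each
line of a COLUMNS section to a nullable Avro field and serializes the record with
json.dumps. A minimal sketch of its expected output for a hypothetical two-column
table (the column names are illustrative, not taken from this patch; json.dumps
emits a single line, reformatted here for readability):

    # COLUMNS section:
    #   id int
    #   name string
    # avro_schema(columns) returns:
    {"name": "a",
     "type": "record",
     "fields": [{"name": "id", "type": ["int", "null"]},
                {"name": "name", "type": ["string", "null"]}]}
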
diff --git a/testdata/datasets/functional/functional_schema_template.sql b/testdata/datasets/functional/functional_schema_template.sql index bfc7781c3..7a478e5e7 100644 --- a/testdata/datasets/functional/functional_schema_template.sql +++ b/testdata/datasets/functional/functional_schema_template.sql @@ -3,23 +3,23 @@ functional ---- BASE_TABLE_NAME alltypes ----- CREATE -CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} ( - id int COMMENT 'Add a comment', - bool_col boolean, - tinyint_col tinyint, - smallint_col smallint, - int_col int, - bigint_col bigint, - float_col float, - double_col double, - date_string_col string, - string_col string, - timestamp_col timestamp) -partitioned by (year int, month int) -row format delimited fields terminated by ',' escaped by '\\' -stored as {file_format} -LOCATION '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}'; +---- PARTITION_COLUMNS +year int +month int +---- COLUMNS +id int COMMENT 'Add a comment' +bool_col boolean +tinyint_col tinyint +smallint_col smallint +int_col int +bigint_col bigint +float_col float +double_col double +date_string_col string +string_col string +timestamp_col timestamp +---- ROW_FORMAT +delimited fields terminated by ',' escaped by '\\' ---- ALTER ALTER TABLE {table_name} ADD PARTITION(year=2009, month=1); ALTER TABLE {table_name} ADD PARTITION(year=2009, month=2); @@ -81,44 +81,42 @@ LOAD DATA LOCAL INPATH '{impala_home}/testdata/target/AllTypes/101201.txt' OVERW functional ---- BASE_TABLE_NAME alltypesnopart ----- CREATE -CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} ( - id int, - bool_col boolean, - tinyint_col tinyint, - smallint_col smallint, - int_col int, - bigint_col bigint, - float_col float, - double_col double, - date_string_col string, - string_col string, - timestamp_col timestamp) -row format delimited fields terminated by ',' escaped by '\\' -stored as {file_format} -LOCATION '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}'; +---- COLUMNS +id int +bool_col boolean +tinyint_col tinyint +smallint_col smallint +int_col int +bigint_col bigint +float_col float +double_col double +date_string_col string +string_col string +timestamp_col timestamp +---- ROW_FORMAT +delimited fields terminated by ',' escaped by '\\' ==== ---- DATASET functional ---- BASE_TABLE_NAME alltypessmall ----- CREATE -CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} ( - id int, - bool_col boolean, - tinyint_col tinyint, - smallint_col smallint, - int_col int, - bigint_col bigint, - float_col float, - double_col double, - date_string_col string, - string_col string, - timestamp_col timestamp) -partitioned by (year int, month int) -row format delimited fields terminated by ',' escaped by '\\' -stored as {file_format} -LOCATION '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}'; +---- PARTITION_COLUMNS +year int +month int +---- COLUMNS +id int +bool_col boolean +tinyint_col tinyint +smallint_col smallint +int_col int +bigint_col bigint +float_col float +double_col double +date_string_col string +string_col string +timestamp_col timestamp +---- ROW_FORMAT +delimited fields terminated by ',' escaped by '\\' ---- ALTER ALTER TABLE {table_name} ADD PARTITION(year=2009, month=1); ALTER TABLE {table_name} ADD PARTITION(year=2009, month=2); @@ -138,23 +136,23 @@ LOAD DATA LOCAL INPATH '{impala_home}/testdata/target/AllTypesSmall/090401.txt' functional ---- BASE_TABLE_NAME alltypestiny ----- CREATE -CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} ( - id int, - bool_col boolean, - 
tinyint_col tinyint, - smallint_col smallint, - int_col int, - bigint_col bigint, - float_col float, - double_col double, - date_string_col string, - string_col string, - timestamp_col timestamp) -partitioned by (year int, month int) -row format delimited fields terminated by ',' escaped by '\\' -stored as {file_format} -LOCATION '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}'; +---- PARTITION_COLUMNS +year int +month int +---- COLUMNS +id int +bool_col boolean +tinyint_col tinyint +smallint_col smallint +int_col int +bigint_col bigint +float_col float +double_col double +date_string_col string +string_col string +timestamp_col timestamp +---- ROW_FORMAT +delimited fields terminated by ',' escaped by '\\' ---- ALTER ALTER TABLE {table_name} ADD PARTITION(year=2009, month=1); ALTER TABLE {table_name} ADD PARTITION(year=2009, month=2); @@ -387,23 +385,24 @@ LOAD DATA LOCAL INPATH '{impala_home}/testdata/AllTypesErrorNoNulls/0903.txt' OV functional ---- BASE_TABLE_NAME alltypesagg ----- CREATE -CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} ( - id int, - bool_col boolean, - tinyint_col tinyint, - smallint_col smallint, - int_col int, - bigint_col bigint, - float_col float, - double_col double, - date_string_col string, - string_col string, - timestamp_col timestamp) -partitioned by (year int, month int, day int) -row format delimited fields terminated by ',' escaped by '\\' -stored as {file_format} -LOCATION '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}'; +---- PARTITION_COLUMNS +year int +month int +day int +---- COLUMNS +id int +bool_col boolean +tinyint_col tinyint +smallint_col smallint +int_col int +bigint_col bigint +float_col float +double_col double +date_string_col string +string_col string +timestamp_col timestamp +---- ROW_FORMAT +delimited fields terminated by ',' escaped by '\\' ---- ALTER ALTER TABLE {table_name} ADD PARTITION(year=2010, month=1, day=1); ALTER TABLE {table_name} ADD PARTITION(year=2010, month=1, day=2); @@ -435,23 +434,24 @@ LOAD DATA LOCAL INPATH '{impala_home}/testdata/target/AllTypesAgg/100110.txt' OV functional ---- BASE_TABLE_NAME alltypesaggnonulls ----- CREATE -CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} ( - id int, - bool_col boolean, - tinyint_col tinyint, - smallint_col smallint, - int_col int, - bigint_col bigint, - float_col float, - double_col double, - date_string_col string, - string_col string, - timestamp_col timestamp) -partitioned by (year int, month int, day int) -row format delimited fields terminated by ',' escaped by '\\' -stored as {file_format} -LOCATION '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}'; +---- PARTITION_COLUMNS +year int +month int +day int +---- COLUMNS +id int +bool_col boolean +tinyint_col tinyint +smallint_col smallint +int_col int +bigint_col bigint +float_col float +double_col double +date_string_col string +string_col string +timestamp_col timestamp +---- ROW_FORMAT +delimited fields terminated by ',' escaped by '\\' ---- ALTER ALTER TABLE {table_name} ADD PARTITION(year=2010, month=1, day=1); ALTER TABLE {table_name} ADD PARTITION(year=2010, month=1, day=2); @@ -483,28 +483,23 @@ LOAD DATA LOCAL INPATH '{impala_home}/testdata/target/AllTypesAggNoNulls/100110. 
functional ---- BASE_TABLE_NAME testtbl ----- CREATE --- testtbl is empty -CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} ( - id bigint, - name string, - zip int) -row format delimited fields terminated by ',' escaped by '\\' -stored as {file_format} -LOCATION '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}'; +---- COLUMNS +id bigint +name string +zip int +---- ROW_FORMAT +delimited fields terminated by ',' escaped by '\\' ==== ---- DATASET functional ---- BASE_TABLE_NAME dimtbl ----- CREATE -CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} ( - id bigint, - name string, - zip int) -row format delimited fields terminated by ',' escaped by '\\' -stored as {file_format} -LOCATION '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}'; +---- COLUMNS +id bigint +name string +zip int +---- ROW_FORMAT +delimited fields terminated by ',' escaped by '\\' ---- DEPENDENT_LOAD INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} SELECT * FROM {db_name}.{table_name}; ---- LOAD @@ -514,15 +509,13 @@ LOAD DATA LOCAL INPATH '{impala_home}/testdata/DimTbl/data.csv' OVERWRITE INTO T functional ---- BASE_TABLE_NAME jointbl ----- CREATE -CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} ( - test_id bigint, - test_name string, - test_zip int, - alltypes_id int) -row format delimited fields terminated by ',' escaped by '\\' -stored as {file_format} -LOCATION '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}'; +---- COLUMNS +test_id bigint +test_name string +test_zip int +alltypes_id int +---- ROW_FORMAT +delimited fields terminated by ',' escaped by '\\' ---- DEPENDENT_LOAD INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} SELECT * FROM {db_name}.{table_name}; ---- LOAD @@ -532,16 +525,14 @@ LOAD DATA LOCAL INPATH '{impala_home}/testdata/JoinTbl/data.csv' OVERWRITE INTO functional ---- BASE_TABLE_NAME liketbl ----- CREATE -CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} ( - str_col string, - match_like_col string, - no_match_like_col string, - match_regex_col string, - no_match_regex_col string) -row format delimited fields terminated by ',' escaped by '\\' -stored as {file_format} -LOCATION '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}'; +---- COLUMNS +str_col string +match_like_col string +no_match_like_col string +match_regex_col string +no_match_regex_col string +---- ROW_FORMAT +delimited fields terminated by ',' escaped by '\\' ---- DEPENDENT_LOAD INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} SELECT * FROM {db_name}.{table_name}; ---- LOAD @@ -679,15 +670,13 @@ TBLPROPERTIES("hbase.table.name" = "hbasealltypesagg"); functional ---- BASE_TABLE_NAME escapenoquotes ----- CREATE -CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} ( - col1 string, - col2 string, - col3 int, - col4 int) -row format delimited fields terminated by ',' escaped by '\\' -stored as {file_format} -LOCATION '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}'; +---- COLUMNS +col1 string +col2 string +col3 int +col4 int +---- ROW_FORMAT +delimited fields terminated by ',' escaped by '\\' ---- DEPENDENT_LOAD INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} SELECT * FROM {db_name}.{table_name}; ---- LOAD @@ -697,17 +686,15 @@ LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/escape-no-quotes.txt' OVERWR functional ---- BASE_TABLE_NAME overflow ----- CREATE -CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} ( - tinyint_col tinyint, - smallint_col smallint, - int_col int, - bigint_col bigint, - float_col float, - 
double_col double) -row format delimited fields terminated by ',' escaped by '\\' -stored as {file_format} -LOCATION '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}'; +---- COLUMNS +tinyint_col tinyint +smallint_col smallint +int_col int +bigint_col bigint +float_col float +double_col double +---- ROW_FORMAT +delimited fields terminated by ',' escaped by '\\' ---- DEPENDENT_LOAD INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} SELECT * FROM {db_name}.{table_name}; ---- LOAD @@ -717,11 +704,8 @@ LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/overflow.txt' OVERWRITE INTO functional ---- BASE_TABLE_NAME greptiny ----- CREATE -CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} ( - field string) -stored as {file_format} -LOCATION '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}'; +---- COLUMNS +field string ---- DEPENDENT_LOAD INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} SELECT * FROM {db_name}.{table_name}; ---- LOAD @@ -731,14 +715,12 @@ LOAD DATA LOCAL INPATH '{impala_home}/testdata/hive_benchmark/grepTiny/part-0000 functional ---- BASE_TABLE_NAME rankingssmall ----- CREATE -CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} ( - pageRank int, - pageURL string, - avgDuration int) -row format delimited fields terminated by '|' -stored as {file_format} -LOCATION '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}'; +---- COLUMNS +pageRank int +pageURL string +avgDuration int +---- ROW_FORMAT +delimited fields terminated by '|' ---- DEPENDENT_LOAD INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} SELECT * FROM {db_name}.{table_name}; ---- LOAD @@ -748,20 +730,18 @@ LOAD DATA LOCAL INPATH '{impala_home}/testdata/hive_benchmark/htmlTiny/Rankings. functional ---- BASE_TABLE_NAME uservisitssmall ----- CREATE -CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} ( - sourceIP string, - destURL string, - visitDate string, - adRevenue float, - userAgent string, - cCode string, - lCode string, - sKeyword string, - avgTimeOnSite int) -row format delimited fields terminated by '|' -stored as {file_format} -LOCATION '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}'; +---- COLUMNS +sourceIP string +destURL string +visitDate string +adRevenue float +userAgent string +cCode string +lCode string +sKeyword string +avgTimeOnSite int +---- ROW_FORMAT +delimited fields terminated by '|' ---- DEPENDENT_LOAD INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} SELECT * FROM {db_name}.{table_name}; ---- LOAD @@ -771,31 +751,31 @@ LOAD DATA LOCAL INPATH '{impala_home}/testdata/hive_benchmark/htmlTiny/UserVisit functional ---- BASE_TABLE_NAME emptytable ----- CREATE -CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} ( - field string) -partitioned by (f2 int); +---- PARTITION_COLUMNS +f2 int +---- COLUMNS +field string ==== ---- DATASET functional ---- BASE_TABLE_NAME alltypesaggmultifiles ----- CREATE -CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} ( - id int, - bool_col boolean, - tinyint_col tinyint, - smallint_col smallint, - int_col int, - bigint_col bigint, - float_col float, - double_col double, - date_string_col string, - string_col string, - timestamp_col timestamp) -partitioned by (year int, month int, day int) -stored as {file_format} -LOCATION '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}'; +---- PARTITION_COLUMNS +year int +month int +day int +---- COLUMNS +id int +bool_col boolean +tinyint_col tinyint +smallint_col smallint +int_col int +bigint_col bigint +float_col float +double_col 
double +date_string_col string +string_col string +timestamp_col timestamp ---- ALTER ALTER TABLE {table_name} ADD PARTITION(year=2010, month=1, day=1); ALTER TABLE {table_name} ADD PARTITION(year=2010, month=1, day=2); @@ -824,22 +804,18 @@ insert into table {db_name}{db_suffix}.{table_name} partition (year, month, day) functional ---- BASE_TABLE_NAME alltypesaggmultifilesnopart ----- CREATE -DROP TABLE if EXISTS {db_name}{db_suffix}.{table_name}; -CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} ( - id int, - bool_col boolean, - tinyint_col tinyint, - smallint_col smallint, - int_col int, - bigint_col bigint, - float_col float, - double_col double, - date_string_col string, - string_col string, - timestamp_col timestamp) -stored as {file_format} -LOCATION '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}'; +---- COLUMNS +id int +bool_col boolean +tinyint_col tinyint +smallint_col smallint +int_col int +bigint_col bigint +float_col float +double_col double +date_string_col string +string_col string +timestamp_col timestamp ---- DEPENDENT_LOAD insert overwrite table {db_name}{db_suffix}.{table_name} SELECT id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col FROM {db_name}.{table_name} where id % 4 = 0; insert into table {db_name}{db_suffix}.{table_name} SELECT id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col FROM {db_name}.{table_name} where id % 4 = 1; @@ -858,13 +834,10 @@ insert into table {db_name}{db_suffix}.{table_name} SELECT id, bool_col, tinyint functional ---- BASE_TABLE_NAME stringpartitionkey ----- CREATE --- Regression for IMP-163, failure to load tables partitioned by string column -CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} ( - id int) -PARTITIONED BY (string_col string) -STORED AS {file_format} -LOCATION '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}'; +---- PARTITION_COLUMNS +string_col string +---- COLUMNS +id int ---- ALTER ALTER TABLE {table_name} ADD PARTITION (string_col = "partition1"); ==== @@ -872,13 +845,11 @@ ALTER TABLE {table_name} ADD PARTITION (string_col = "partition1"); functional ---- BASE_TABLE_NAME tinytable ----- CREATE -CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} ( - a string, - b string) -row format delimited fields terminated by ',' -stored as {file_format} -LOCATION '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}'; +---- COLUMNS +a string +b string +---- ROW_FORMAT +delimited fields terminated by ',' ---- DEPENDENT_LOAD INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} SELECT * FROM {db_name}.{table_name}; ---- LOAD @@ -888,12 +859,10 @@ LOAD DATA LOCAL INPATH '{impala_home}/testdata/TinyTable/data.csv' OVERWRITE INT functional ---- BASE_TABLE_NAME tinyinttable ----- CREATE -CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} ( - int_col int) -row format delimited fields terminated by ',' -stored as {file_format} -LOCATION '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}'; +---- COLUMNS +int_col int +---- ROW_FORMAT +delimited fields terminated by ',' ---- DEPENDENT_LOAD INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} SELECT * FROM {db_name}.{table_name}; ---- LOAD @@ -903,15 +872,14 @@ LOAD DATA LOCAL INPATH '{impala_home}/testdata/TinyIntTable/data.csv' OVERWRITE functional ---- BASE_TABLE_NAME nulltable ----- CREATE -CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} ( - a string, - b string, 
- c string, - d int, - e double) -row format delimited fields terminated by ',' -stored as {file_format}; +---- COLUMNS +a string +b string +c string +d int +e double +---- ROW_FORMAT +delimited fields terminated by ',' ---- DEPENDENT_LOAD INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} select 'a', '', NULL, NULL, NULL from {db_name}.alltypes limit 1; ---- LOAD @@ -921,15 +889,14 @@ INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} select 'a', '', NULL, N functional ---- BASE_TABLE_NAME nullescapedtable ----- CREATE -CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} ( - a string, - b string, - c string, - d int, - e double) -row format delimited fields terminated by ',' escaped by '\\' -stored as {file_format}; +---- COLUMNS +a string +b string +c string +d int +e double +---- ROW_FORMAT +delimited fields terminated by ',' escaped by '\\' ---- DEPENDENT_LOAD INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} select 'a', '', NULL, NULL, NULL from {db_name}.alltypes limit 1; ---- LOAD @@ -939,13 +906,12 @@ INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} select 'a', '', NULL, N functional ---- BASE_TABLE_NAME escapechartesttable ----- CREATE --- Create a test data with the escape character as the same as the tuple delimiter -CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} (bool_col boolean) -partitioned by (id int) -row format delimited fields terminated by ',' escaped by '\n' -stored as {file_format} -LOCATION '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}'; +---- PARTITION_COLUMNS +id int +---- COLUMNS +bool_col boolean +---- ROW_FORMAT +delimited fields terminated by ',' escaped by '\n' ---- ALTER ALTER TABLE {table_name} ADD PARTITION(id=0); ALTER TABLE {table_name} ADD PARTITION(id=1); @@ -971,13 +937,11 @@ select bool_col,id FROM {db_name}.alltypesagg where id < 10; functional ---- BASE_TABLE_NAME TblWithRaggedColumns ----- CREATE -CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} ( - str_col string, - int_col int) -row format delimited fields terminated by ',' escaped by '\\' -stored as {file_format} -LOCATION '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}'; +---- COLUMNS +str_col string +int_col int +---- ROW_FORMAT +delimited fields terminated by ',' escaped by '\\' ---- DEPENDENT_LOAD INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} SELECT * FROM {db_name}.{table_name}; ---- LOAD @@ -1011,16 +975,14 @@ LOCATION '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}'; functional ---- BASE_TABLE_NAME zipcode_incomes ----- CREATE -CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} ( - id STRING, - zip STRING, - description1 STRING, - description2 STRING, - income int) -ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' -STORED AS {file_format} -LOCATION '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}'; +---- COLUMNS +id STRING +zip STRING +description1 STRING +description2 STRING +income int +---- ROW_FORMAT +DELIMITED FIELDS TERMINATED BY ',' ---- DEPENDENT_LOAD INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} SELECT * FROM {db_name}.{table_name}; ---- LOAD diff --git a/testdata/datasets/tpcds/tpcds_schema_template.sql b/testdata/datasets/tpcds/tpcds_schema_template.sql index c714ab2fc..84545269f 100644 --- a/testdata/datasets/tpcds/tpcds_schema_template.sql +++ b/testdata/datasets/tpcds/tpcds_schema_template.sql @@ -5,36 +5,32 @@ tpcds ---- BASE_TABLE_NAME store_sales ----- CREATE -create external table {db_name}{db_suffix}.{table_name} -( - ss_sold_date_sk int, - 
ss_sold_time_sk int, - ss_item_sk int, - ss_customer_sk int, - ss_cdemo_sk int, - ss_hdemo_sk int, - ss_addr_sk int, - ss_store_sk int, - ss_promo_sk int, - ss_ticket_number int, - ss_quantity int, - ss_wholesale_cost float, - ss_list_price float, - ss_sales_price float, - ss_ext_discount_amt float, - ss_ext_sales_price float, - ss_ext_wholesale_cost float, - ss_ext_list_price float, - ss_ext_tax float, - ss_coupon_amt float, - ss_net_paid float, - ss_net_paid_inc_tax float, - ss_net_profit float -) -row format delimited fields terminated by '|' -STORED AS {file_format} -location '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}'; +---- COLUMNS +ss_sold_date_sk int +ss_sold_time_sk int +ss_item_sk int +ss_customer_sk int +ss_cdemo_sk int +ss_hdemo_sk int +ss_addr_sk int +ss_store_sk int +ss_promo_sk int +ss_ticket_number int +ss_quantity int +ss_wholesale_cost float +ss_list_price float +ss_sales_price float +ss_ext_discount_amt float +ss_ext_sales_price float +ss_ext_wholesale_cost float +ss_ext_list_price float +ss_ext_tax float +ss_coupon_amt float +ss_net_paid float +ss_net_paid_inc_tax float +ss_net_profit float +---- ROW_FORMAT +delimited fields terminated by '|' ---- DEPENDENT_LOAD INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} SELECT * FROM {db_name}.{table_name}; ---- LOAD @@ -45,22 +41,18 @@ OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; tpcds ---- BASE_TABLE_NAME customer_demographics ----- CREATE -create external table {db_name}{db_suffix}.{table_name} -( - cd_demo_sk int, - cd_gender string, - cd_marital_status string, - cd_education_status string, - cd_purchase_estimate int, - cd_credit_rating string, - cd_dep_count int, - cd_dep_employed_count int, - cd_dep_college_count int -) -row format delimited fields terminated by '|' -STORED AS {file_format} -location '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}'; +---- COLUMNS +cd_demo_sk int +cd_gender string +cd_marital_status string +cd_education_status string +cd_purchase_estimate int +cd_credit_rating string +cd_dep_count int +cd_dep_employed_count int +cd_dep_college_count int +---- ROW_FORMAT +delimited fields terminated by '|' ---- DEPENDENT_LOAD INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} SELECT * FROM {db_name}.{table_name}; ---- LOAD @@ -71,41 +63,37 @@ OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; tpcds ---- BASE_TABLE_NAME date_dim ----- CREATE -create external table {db_name}{db_suffix}.{table_name} -( - d_date_sk int, - d_date_id string, - d_date string, - d_month_seq int, - d_week_seq int, - d_quarter_seq int, - d_year int, - d_dow int, - d_moy int, - d_dom int, - d_qoy int, - d_fy_year int, - d_fy_quarter_seq int, - d_fy_week_seq int, - d_day_name string, - d_quarter_name string, - d_holiday string, - d_weekend string, - d_following_holiday string, - d_first_dom int, - d_last_dom int, - d_same_day_ly int, - d_same_day_lq int, - d_current_day string, - d_current_week string, - d_current_month string, - d_current_quarter string, - d_current_year string -) -row format delimited fields terminated by '|' -STORED AS {file_format} -location '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}'; +---- COLUMNS +d_date_sk int +d_date_id string +d_date string +d_month_seq int +d_week_seq int +d_quarter_seq int +d_year int +d_dow int +d_moy int +d_dom int +d_qoy int +d_fy_year int +d_fy_quarter_seq int +d_fy_week_seq int +d_day_name string +d_quarter_name string +d_holiday string +d_weekend string +d_following_holiday string +d_first_dom int +d_last_dom 
int +d_same_day_ly int +d_same_day_lq int +d_current_day string +d_current_week string +d_current_month string +d_current_quarter string +d_current_year string +---- ROW_FORMAT +delimited fields terminated by '|' ---- DEPENDENT_LOAD INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} SELECT * FROM {db_name}.{table_name}; ---- LOAD @@ -116,23 +104,19 @@ OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; tpcds ---- BASE_TABLE_NAME time_dim ----- CREATE -create external table {db_name}{db_suffix}.{table_name} -( - t_time_sk int, - t_time_id string, - t_time int, - t_hour int, - t_minute int, - t_second int, - t_am_pm string, - t_shift string, - t_sub_shift string, - t_meal_time string -) -row format delimited fields terminated by '|' -STORED AS {file_format} -location '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}'; +---- COLUMNS +t_time_sk int +t_time_id string +t_time int +t_hour int +t_minute int +t_second int +t_am_pm string +t_shift string +t_sub_shift string +t_meal_time string +---- ROW_FORMAT +delimited fields terminated by '|' ---- DEPENDENT_LOAD INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} SELECT * FROM {db_name}.{table_name}; ---- LOAD @@ -143,35 +127,31 @@ OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; tpcds ---- BASE_TABLE_NAME item ----- CREATE -create external table {db_name}{db_suffix}.{table_name} -( - i_item_sk int, - i_item_id string, - i_rec_start_date string, - i_rec_end_date string, - i_item_desc string, - i_current_price float, - i_wholesale_cost float, - i_brand_id int, - i_brand string, - i_class_id int, - i_class string, - i_category_id int, - i_category string, - i_manufact_id int, - i_manufact string, - i_size string, - i_formulation string, - i_color string, - i_units string, - i_container string, - i_manager_id int, - i_product_name string -) -row format delimited fields terminated by '|' -STORED AS {file_format} -location '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}'; +---- COLUMNS +i_item_sk int +i_item_id string +i_rec_start_date string +i_rec_end_date string +i_item_desc string +i_current_price float +i_wholesale_cost float +i_brand_id int +i_brand string +i_class_id int +i_class string +i_category_id int +i_category string +i_manufact_id int +i_manufact string +i_size string +i_formulation string +i_color string +i_units string +i_container string +i_manager_id int +i_product_name string +---- ROW_FORMAT +delimited fields terminated by '|' ---- DEPENDENT_LOAD INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} SELECT * FROM {db_name}.{table_name}; ---- LOAD @@ -182,42 +162,38 @@ OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; tpcds ---- BASE_TABLE_NAME store ----- CREATE -create external table {db_name}{db_suffix}.{table_name} -( - s_store_sk int, - s_store_id string, - s_rec_start_date string, - s_rec_end_date string, - s_closed_date_sk int, - s_store_name string, - s_number_employees int, - s_floor_space int, - s_hours string, - s_manager string, - s_market_id int, - s_geography_class string, - s_market_desc string, - s_market_manager string, - s_division_id int, - s_division_name string, - s_company_id int, - s_company_name string, - s_street_number string, - s_street_name string, - s_street_type string, - s_suite_number string, - s_city string, - s_county string, - s_state string, - s_zip string, - s_country string, - s_gmt_offset float, - s_tax_precentage float -) -row format delimited fields terminated by '|' -STORED AS {file_format} -location 
'${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}'; +---- COLUMNS +s_store_sk int +s_store_id string +s_rec_start_date string +s_rec_end_date string +s_closed_date_sk int +s_store_name string +s_number_employees int +s_floor_space int +s_hours string +s_manager string +s_market_id int +s_geography_class string +s_market_desc string +s_market_manager string +s_division_id int +s_division_name string +s_company_id int +s_company_name string +s_street_number string +s_street_name string +s_street_type string +s_suite_number string +s_city string +s_county string +s_state string +s_zip string +s_country string +s_gmt_offset float +s_tax_precentage float +---- ROW_FORMAT +delimited fields terminated by '|' ---- DEPENDENT_LOAD INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} SELECT * FROM {db_name}.{table_name}; ---- LOAD @@ -228,31 +204,27 @@ OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; tpcds ---- BASE_TABLE_NAME customer ----- CREATE -create external table {db_name}{db_suffix}.{table_name} -( - c_customer_sk int, - c_customer_id string, - c_current_cdemo_sk int, - c_current_hdemo_sk int, - c_current_addr_sk int, - c_first_shipto_date_sk int, - c_first_sales_date_sk int, - c_salutation string, - c_first_name string, - c_last_name string, - c_preferred_cust_flag string, - c_birth_day int, - c_birth_month int, - c_birth_year int, - c_birth_country string, - c_login string, - c_email_address string, - c_last_review_date string -) -row format delimited fields terminated by '|' -STORED AS {file_format} -location '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}'; +---- COLUMNS +c_customer_sk int +c_customer_id string +c_current_cdemo_sk int +c_current_hdemo_sk int +c_current_addr_sk int +c_first_shipto_date_sk int +c_first_sales_date_sk int +c_salutation string +c_first_name string +c_last_name string +c_preferred_cust_flag string +c_birth_day int +c_birth_month int +c_birth_year int +c_birth_country string +c_login string +c_email_address string +c_last_review_date string +---- ROW_FORMAT +delimited fields terminated by '|' ---- DEPENDENT_LOAD INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} SELECT * FROM {db_name}.{table_name}; ---- LOAD @@ -263,32 +235,28 @@ OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; tpcds ---- BASE_TABLE_NAME promotion ----- CREATE -create external table {db_name}{db_suffix}.{table_name} -( - p_promo_sk int, - p_promo_id string, - p_start_date_sk int, - p_end_date_sk int, - p_item_sk int, - p_cost float, - p_response_target int, - p_promo_name string, - p_channel_dmail string, - p_channel_email string, - p_channel_catalog string, - p_channel_tv string, - p_channel_radio string, - p_channel_press string, - p_channel_event string, - p_channel_demo string, - p_channel_details string, - p_purpose string, - p_discount_active string -) -row format delimited fields terminated by '|' -STORED AS {file_format} -location '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}'; +---- COLUMNS +p_promo_sk int +p_promo_id string +p_start_date_sk int +p_end_date_sk int +p_item_sk int +p_cost float +p_response_target int +p_promo_name string +p_channel_dmail string +p_channel_email string +p_channel_catalog string +p_channel_tv string +p_channel_radio string +p_channel_press string +p_channel_event string +p_channel_demo string +p_channel_details string +p_purpose string +p_discount_active string +---- ROW_FORMAT +delimited fields terminated by '|' ---- DEPENDENT_LOAD INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} SELECT 
* FROM {db_name}.{table_name}; ---- LOAD @@ -299,18 +267,14 @@ OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; tpcds ---- BASE_TABLE_NAME household_demographics ----- CREATE -create external table {db_name}{db_suffix}.{table_name} -( - hd_demo_sk int, - hd_income_band_sk int, - hd_buy_potential string, - hd_dep_count int, - hd_vehicle_count int -) -row format delimited fields terminated by '|' -STORED AS {file_format} -location '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}'; +---- COLUMNS +hd_demo_sk int +hd_income_band_sk int +hd_buy_potential string +hd_dep_count int +hd_vehicle_count int +---- ROW_FORMAT +delimited fields terminated by '|' ---- DEPENDENT_LOAD INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} SELECT * FROM {db_name}.{table_name}; ---- LOAD @@ -321,26 +285,22 @@ OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; tpcds ---- BASE_TABLE_NAME customer_address ----- CREATE -create external table {db_name}{db_suffix}.{table_name} -( - ca_address_sk int, - ca_address_id string, - ca_street_number string, - ca_street_name string, - ca_street_type string, - ca_suite_number string, - ca_city string, - ca_county string, - ca_state string, - ca_zip string, - ca_country string, - ca_gmt_offset float, - ca_location_type string -) -row format delimited fields terminated by '|' -STORED AS {file_format} -location '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}'; +---- COLUMNS +ca_address_sk int +ca_address_id string +ca_street_number string +ca_street_name string +ca_street_type string +ca_suite_number string +ca_city string +ca_county string +ca_state string +ca_zip string +ca_country string +ca_gmt_offset float +ca_location_type string +---- ROW_FORMAT +delimited fields terminated by '|' ---- DEPENDENT_LOAD INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} SELECT * FROM {db_name}.{table_name}; ---- LOAD diff --git a/testdata/datasets/tpch/tpch_schema_template.sql b/testdata/datasets/tpch/tpch_schema_template.sql index 81b6a486e..e8a703f54 100644 --- a/testdata/datasets/tpch/tpch_schema_template.sql +++ b/testdata/datasets/tpch/tpch_schema_template.sql @@ -5,27 +5,25 @@ tpch ---- BASE_TABLE_NAME lineitem ----- CREATE -CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} ( -L_ORDERKEY BIGINT, -L_PARTKEY BIGINT, -L_SUPPKEY BIGINT, -L_LINENUMBER INT, -L_QUANTITY DOUBLE, -L_EXTENDEDPRICE DOUBLE, -L_DISCOUNT DOUBLE, -L_TAX DOUBLE, -L_RETURNFLAG STRING, -L_LINESTATUS STRING, -L_SHIPDATE STRING, -L_COMMITDATE STRING, -L_RECEIPTDATE STRING, -L_SHIPINSTRUCT STRING, -L_SHIPMODE STRING, -L_COMMENT STRING) -ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' -STORED AS {file_format} -LOCATION '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}'; +---- COLUMNS +L_ORDERKEY BIGINT +L_PARTKEY BIGINT +L_SUPPKEY BIGINT +L_LINENUMBER INT +L_QUANTITY DOUBLE +L_EXTENDEDPRICE DOUBLE +L_DISCOUNT DOUBLE +L_TAX DOUBLE +L_RETURNFLAG STRING +L_LINESTATUS STRING +L_SHIPDATE STRING +L_COMMITDATE STRING +L_RECEIPTDATE STRING +L_SHIPINSTRUCT STRING +L_SHIPMODE STRING +L_COMMENT STRING +---- ROW_FORMAT +DELIMITED FIELDS TERMINATED BY '|' ---- DEPENDENT_LOAD INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} SELECT * FROM {db_name}.{table_name}; ---- LOAD @@ -36,21 +34,18 @@ OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; tpch ---- BASE_TABLE_NAME part ----- CREATE -CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} ( -P_PARTKEY BIGINT, -P_NAME STRING, -P_MFGR STRING, -P_BRAND STRING, -P_TYPE -STRING, -P_SIZE INT, -P_CONTAINER STRING, 
-P_RETAILPRICE DOUBLE, -P_COMMENT STRING) -ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' -STORED AS {file_format} -LOCATION '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}'; +---- COLUMNS +P_PARTKEY BIGINT +P_NAME STRING +P_MFGR STRING +P_BRAND STRING +P_TYPE STRING +P_SIZE INT +P_CONTAINER STRING +P_RETAILPRICE DOUBLE +P_COMMENT STRING +---- ROW_FORMAT +DELIMITED FIELDS TERMINATED BY '|' ---- DEPENDENT_LOAD INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} SELECT * FROM {db_name}.{table_name}; ---- LOAD @@ -61,16 +56,14 @@ OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; tpch ---- BASE_TABLE_NAME partsupp ----- CREATE -CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} ( -PS_PARTKEY BIGINT, -PS_SUPPKEY BIGINT, -PS_AVAILQTY INT, -PS_SUPPLYCOST DOUBLE, -PS_COMMENT STRING) -ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' -STORED AS {file_format} -LOCATION '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}'; +---- COLUMNS +PS_PARTKEY BIGINT +PS_SUPPKEY BIGINT +PS_AVAILQTY INT +PS_SUPPLYCOST DOUBLE +PS_COMMENT STRING +---- ROW_FORMAT +DELIMITED FIELDS TERMINATED BY '|' ---- DEPENDENT_LOAD INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} SELECT * FROM {db_name}.{table_name}; ---- LOAD @@ -81,18 +74,16 @@ OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; tpch ---- BASE_TABLE_NAME supplier ----- CREATE -CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} ( -S_SUPPKEY BIGINT, -S_NAME STRING, -S_ADDRESS STRING, -S_NATIONKEY SMALLINT, -S_PHONE STRING, -S_ACCTBAL DOUBLE, -S_COMMENT STRING) -ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' -STORED AS {file_format} -LOCATION '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}'; +---- COLUMNS +S_SUPPKEY BIGINT +S_NAME STRING +S_ADDRESS STRING +S_NATIONKEY SMALLINT +S_PHONE STRING +S_ACCTBAL DOUBLE +S_COMMENT STRING +---- ROW_FORMAT +DELIMITED FIELDS TERMINATED BY '|' ---- DEPENDENT_LOAD INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} SELECT * FROM {db_name}.{table_name}; ---- LOAD @@ -103,15 +94,13 @@ OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; tpch ---- BASE_TABLE_NAME nation ----- CREATE -CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} ( -N_NATIONKEY SMALLINT, -N_NAME STRING, -N_REGIONKEY SMALLINT, -N_COMMENT STRING) -ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' -STORED AS {file_format} -LOCATION '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}'; +---- COLUMNS +N_NATIONKEY SMALLINT +N_NAME STRING +N_REGIONKEY SMALLINT +N_COMMENT STRING +---- ROW_FORMAT +DELIMITED FIELDS TERMINATED BY '|' ---- DEPENDENT_LOAD INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} SELECT * FROM {db_name}.{table_name}; ---- LOAD @@ -122,14 +111,12 @@ OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; tpch ---- BASE_TABLE_NAME region ----- CREATE -CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} ( -R_REGIONKEY SMALLINT, -R_NAME STRING, -R_COMMENT STRING) -ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' -STORED AS {file_format} -LOCATION '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}'; +---- COLUMNS +R_REGIONKEY SMALLINT +R_NAME STRING +R_COMMENT STRING +---- ROW_FORMAT +DELIMITED FIELDS TERMINATED BY '|' ---- DEPENDENT_LOAD INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} SELECT * FROM {db_name}.{table_name}; ---- LOAD @@ -140,20 +127,18 @@ OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; tpch ---- BASE_TABLE_NAME orders ----- CREATE -CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} ( -O_ORDERKEY BIGINT, -O_CUSTKEY BIGINT, 
-O_ORDERSTATUS STRING,
-O_TOTALPRICE DOUBLE,
-O_ORDERDATE STRING,
-O_ORDERPRIORITY STRING,
-O_CLERK STRING,
-O_SHIPPRIORITY INT,
-O_COMMENT STRING)
-ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
-STORED AS {file_format}
-LOCATION '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}';
+---- COLUMNS
+O_ORDERKEY BIGINT
+O_CUSTKEY BIGINT
+O_ORDERSTATUS STRING
+O_TOTALPRICE DOUBLE
+O_ORDERDATE STRING
+O_ORDERPRIORITY STRING
+O_CLERK STRING
+O_SHIPPRIORITY INT
+O_COMMENT STRING
+---- ROW_FORMAT
+DELIMITED FIELDS TERMINATED BY '|'
 ---- DEPENDENT_LOAD
 INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} SELECT * FROM {db_name}.{table_name};
 ---- LOAD
@@ -164,19 +149,17 @@ OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
 tpch
 ---- BASE_TABLE_NAME
 customer
----- CREATE
-CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} (
-C_CUSTKEY BIGINT,
-C_NAME STRING,
-C_ADDRESS STRING,
-C_NATIONKEY SMALLINT,
-C_PHONE STRING,
-C_ACCTBAL DOUBLE,
-C_MKTSEGMENT STRING,
-C_COMMENT STRING)
-ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
-STORED AS {file_format}
-LOCATION '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}';
+---- COLUMNS
+C_CUSTKEY BIGINT
+C_NAME STRING
+C_ADDRESS STRING
+C_NATIONKEY SMALLINT
+C_PHONE STRING
+C_ACCTBAL DOUBLE
+C_MKTSEGMENT STRING
+C_COMMENT STRING
+---- ROW_FORMAT
+DELIMITED FIELDS TERMINATED BY '|'
 ---- DEPENDENT_LOAD
 INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} SELECT * FROM {db_name}.{table_name};
 ---- LOAD
@@ -187,176 +170,135 @@ OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
 tpch
 ---- BASE_TABLE_NAME
 q2_minimum_cost_supplier_tmp1
----- CREATE
-CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} (
-s_acctbal double,
-s_name string,
-n_name string,
-p_partkey bigint,
-ps_supplycost double,
-p_mfgr string,
-s_address string,
-s_phone string,
-s_comment string)
-STORED AS {file_format}
-LOCATION '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}';
+---- COLUMNS
+s_acctbal double
+s_name string
+n_name string
+p_partkey bigint
+ps_supplycost double
+p_mfgr string
+s_address string
+s_phone string
+s_comment string
 ====
 ---- DATASET
 tpch
 ---- BASE_TABLE_NAME
 q2_minimum_cost_supplier_tmp2
----- CREATE
-CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} (
-p_partkey bigint,
-ps_min_supplycost double)
-STORED AS {file_format}
-LOCATION '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}';
+---- COLUMNS
+p_partkey bigint
+ps_min_supplycost double
 ====
 ---- DATASET
 tpch
 ---- BASE_TABLE_NAME
 q7_volume_shipping_tmp
----- CREATE
-CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} (
-supp_nation string,
-cust_nation string,
-s_nationkey smallint,
-c_nationkey smallint)
-STORED AS {file_format}
-LOCATION '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}';
+---- COLUMNS
+supp_nation string
+cust_nation string
+s_nationkey smallint
+c_nationkey smallint
 ====
 ---- DATASET
 tpch
 ---- BASE_TABLE_NAME
 q11_part_tmp
----- CREATE
-CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} (
-ps_partkey bigint,
-part_value double)
-STORED AS {file_format}
-LOCATION '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}';
+---- COLUMNS
+ps_partkey bigint
+part_value double
 ====
 ---- DATASET
 tpch
 ---- BASE_TABLE_NAME
 q11_sum_tmp
----- CREATE
-CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} (total_value double)
-STORED AS {file_format}
-LOCATION '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}';
+---- COLUMNS
+total_value double
 ====
 ---- DATASET
 tpch
 ---- BASE_TABLE_NAME
 revenue
----- CREATE
-CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} (
-supplier_no bigint,
-total_revenue double)
-STORED AS {file_format}
-LOCATION '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}';
+---- COLUMNS
+supplier_no bigint
+total_revenue double
 ====
 ---- DATASET
 tpch
 ---- BASE_TABLE_NAME
 max_revenue
----- CREATE
-CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} (max_revenue double)
-STORED AS {file_format}
-LOCATION '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}';
+---- COLUMNS
+max_revenue double
 ====
 ---- DATASET
 tpch
 ---- BASE_TABLE_NAME
 supplier_tmp
----- CREATE
-CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} (s_suppkey bigint)
-STORED AS {file_format}
-LOCATION '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}';
+---- COLUMNS
+s_suppkey bigint
 ====
 ---- DATASET
 tpch
 ---- BASE_TABLE_NAME
 q16_tmp
----- CREATE
-CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} (
-p_brand string,
-p_type string,
-p_size int,
-ps_suppkey bigint)
-STORED AS {file_format}
-LOCATION '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}';
+---- COLUMNS
+p_brand string
+p_type string
+p_size int
+ps_suppkey bigint
 ====
 ---- DATASET
 tpch
 ---- BASE_TABLE_NAME
 lineitem_tmp
----- CREATE
-CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} (
-t_partkey bigint,
-t_avg_quantity double)
-STORED AS {file_format}
-LOCATION '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}';
+---- COLUMNS
+t_partkey bigint
+t_avg_quantity double
 ====
 ---- DATASET
 tpch
 ---- BASE_TABLE_NAME
 q18_tmp
----- CREATE
-CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} (
-l_orderkey bigint,
-t_sum_quantity double)
-STORED AS {file_format}
-LOCATION '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}';
+---- COLUMNS
+l_orderkey bigint
+t_sum_quantity double
 ====
 ---- DATASET
 tpch
 ---- BASE_TABLE_NAME
 q20_tmp1
----- CREATE
-CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} (p_partkey bigint)
-STORED AS {file_format}
-LOCATION '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}';
+---- COLUMNS
+p_partkey bigint
 ====
 ---- DATASET
 tpch
 ---- BASE_TABLE_NAME
 q20_tmp2
----- CREATE
-CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} (
-l_partkey bigint,
-l_suppkey bigint,
-sum_quantity double)
-STORED AS {file_format}
-LOCATION '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}';
+---- COLUMNS
+l_partkey bigint
+l_suppkey bigint
+sum_quantity double
 ====
 ---- DATASET
 tpch
 ---- BASE_TABLE_NAME
 q20_tmp3
----- CREATE
-CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} (
-ps_suppkey bigint,
-ps_availqty int,
-sum_quantity double)
-STORED AS {file_format}
-LOCATION '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}';
+---- COLUMNS
+ps_suppkey bigint
+ps_availqty int
+sum_quantity double
 ====
 ---- DATASET
 tpch
 ---- BASE_TABLE_NAME
 q20_tmp4
----- CREATE
-CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} (ps_suppkey bigint)
-STORED AS {file_format}
-LOCATION '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}';
+---- COLUMNS
+ps_suppkey bigint
 ====
 ---- DATASET
 tpch
 ---- BASE_TABLE_NAME
 q22_customer_tmp1
----- CREATE
-CREATE EXTERNAL TABLE {db_name}{db_suffix}.{table_name} (avg_acctbal double, cust_name_char string)
-STORED AS {file_format}
-LOCATION '${{hiveconf:hive.metastore.warehouse.dir}}/{hdfs_location}';
+---- COLUMNS
+avg_acctbal double
+cust_name_char string
 ====
diff --git a/testdata/workloads/functional-query/functional-query_core.csv b/testdata/workloads/functional-query/functional-query_core.csv
index d03a4e154..1e8ed780c 100644
--- a/testdata/workloads/functional-query/functional-query_core.csv
+++ b/testdata/workloads/functional-query/functional-query_core.csv
@@ -3,3 +3,5 @@ file_format:text, dataset:functional, compression_codec:none, compression_type:n
 file_format:seq, dataset:functional, compression_codec:none, compression_type:none
 file_format:seq, dataset:functional, compression_codec:snap, compression_type:block
 file_format:rc, dataset: functional, compression_codec: snap, compression_type: block
+file_format:avro, dataset: functional, compression_codec: none, compression_type: none
+file_format:avro, dataset: functional, compression_codec: snap, compression_type: block
diff --git a/testdata/workloads/functional-query/functional-query_dimensions.csv b/testdata/workloads/functional-query/functional-query_dimensions.csv
index 8ec8a8225..62fd498f8 100644
--- a/testdata/workloads/functional-query/functional-query_dimensions.csv
+++ b/testdata/workloads/functional-query/functional-query_dimensions.csv
@@ -1,4 +1,4 @@
-file_format: text,seq,rc
+file_format: text,seq,rc,avro
 dataset: functional
 compression_codec: none,def,gzip,bzip,snap,lzo
 compression_type: none,block,record
diff --git a/testdata/workloads/functional-query/functional-query_exhaustive.csv b/testdata/workloads/functional-query/functional-query_exhaustive.csv
index b520ed279..ed6166013 100644
--- a/testdata/workloads/functional-query/functional-query_exhaustive.csv
+++ b/testdata/workloads/functional-query/functional-query_exhaustive.csv
@@ -15,3 +15,5 @@ file_format: rc, dataset: functional, compression_codec: def, compression_type:
 file_format: rc, dataset: functional, compression_codec: gzip, compression_type: block
 file_format: rc, dataset: functional, compression_codec: bzip, compression_type: block
 file_format: rc, dataset: functional, compression_codec: snap, compression_type: block
+file_format: avro, dataset: functional, compression_codec: none, compression_type: none
+file_format: avro, dataset: functional, compression_codec: snap, compression_type: block
diff --git a/testdata/workloads/functional-query/functional-query_pairwise.csv b/testdata/workloads/functional-query/functional-query_pairwise.csv
index 27efc876d..f0aef5a6b 100644
--- a/testdata/workloads/functional-query/functional-query_pairwise.csv
+++ b/testdata/workloads/functional-query/functional-query_pairwise.csv
@@ -2,8 +2,9 @@
 file_format: text, dataset: functional, compression_codec: none, compression_type: none
 file_format: seq, dataset: functional, compression_codec: def, compression_type: block
 file_format: rc, dataset: functional, compression_codec: gzip, compression_type: block
+file_format: avro, dataset: functional, compression_codec: snap, compression_type: block
+file_format: avro, dataset: functional, compression_codec: none, compression_type: none
 file_format: rc, dataset: functional, compression_codec: bzip, compression_type: block
-file_format: seq, dataset: functional, compression_codec: snap, compression_type: record
+file_format: seq, dataset: functional, compression_codec: none, compression_type: none
 file_format: text, dataset: functional, compression_codec: lzo, compression_type: block
-file_format: rc, dataset: functional, compression_codec: snap, compression_type: block
-file_format: rc, dataset: functional, compression_codec: def, compression_type: block
+file_format: rc, dataset: functional, compression_codec: none, compression_type: none
diff --git a/testdata/workloads/hive-benchmark/hive-benchmark_dimensions.csv b/testdata/workloads/hive-benchmark/hive-benchmark_dimensions.csv
index 368dc90cc..7a32ba2eb 100644
--- a/testdata/workloads/hive-benchmark/hive-benchmark_dimensions.csv
+++ b/testdata/workloads/hive-benchmark/hive-benchmark_dimensions.csv
@@ -1,4 +1,4 @@
-file_format: text,seq,rc
+file_format: text,seq,rc,avro
 dataset: hive-benchmark
 compression_codec: none,def,gzip,bzip,snap,lzo
 compression_type: none,block,record
diff --git a/testdata/workloads/hive-benchmark/hive-benchmark_exhaustive.csv b/testdata/workloads/hive-benchmark/hive-benchmark_exhaustive.csv
index 4a6f02322..48836e6bb 100644
--- a/testdata/workloads/hive-benchmark/hive-benchmark_exhaustive.csv
+++ b/testdata/workloads/hive-benchmark/hive-benchmark_exhaustive.csv
@@ -15,3 +15,5 @@ file_format: rc, dataset: hive-benchmark, compression_codec: def, compression_ty
 file_format: rc, dataset: hive-benchmark, compression_codec: gzip, compression_type: block
 file_format: rc, dataset: hive-benchmark, compression_codec: bzip, compression_type: block
 file_format: rc, dataset: hive-benchmark, compression_codec: snap, compression_type: block
+file_format: avro, dataset: hive-benchmark, compression_codec: none, compression_type: none
+file_format: avro, dataset: hive-benchmark, compression_codec: snap, compression_type: block
diff --git a/testdata/workloads/hive-benchmark/hive-benchmark_pairwise.csv b/testdata/workloads/hive-benchmark/hive-benchmark_pairwise.csv
index e3946a4e7..edeaf8e98 100644
--- a/testdata/workloads/hive-benchmark/hive-benchmark_pairwise.csv
+++ b/testdata/workloads/hive-benchmark/hive-benchmark_pairwise.csv
@@ -2,8 +2,9 @@
 file_format: text, dataset: hive-benchmark, compression_codec: none, compression_type: none
 file_format: seq, dataset: hive-benchmark, compression_codec: def, compression_type: block
 file_format: rc, dataset: hive-benchmark, compression_codec: gzip, compression_type: block
+file_format: avro, dataset: hive-benchmark, compression_codec: snap, compression_type: block
+file_format: avro, dataset: hive-benchmark, compression_codec: none, compression_type: none
 file_format: rc, dataset: hive-benchmark, compression_codec: bzip, compression_type: block
-file_format: seq, dataset: hive-benchmark, compression_codec: snap, compression_type: record
+file_format: seq, dataset: hive-benchmark, compression_codec: none, compression_type: none
 file_format: text, dataset: hive-benchmark, compression_codec: lzo, compression_type: block
-file_format: rc, dataset: hive-benchmark, compression_codec: snap, compression_type: block
-file_format: rc, dataset: hive-benchmark, compression_codec: def, compression_type: block
+file_format: rc, dataset: hive-benchmark, compression_codec: none, compression_type: none
diff --git a/testdata/workloads/targeted-perf/targeted-perf_core.csv b/testdata/workloads/targeted-perf/targeted-perf_core.csv
index d2060e5f3..dcf4bc6fb 100644
--- a/testdata/workloads/targeted-perf/targeted-perf_core.csv
+++ b/testdata/workloads/targeted-perf/targeted-perf_core.csv
@@ -3,3 +3,5 @@ file_format:text, dataset:tpch, compression_codec:none, compression_type:none
 file_format:seq, dataset:tpch, compression_codec:gzip, compression_type:block
 file_format:seq, dataset:tpch, compression_codec:snap, compression_type:block
 file_format:rc, dataset:tpch, compression_codec:none, compression_type:none
+file_format:avro, dataset:tpch, compression_codec: none, compression_type: none
+file_format:avro, dataset:tpch, compression_codec: snap, compression_type: block
diff --git a/testdata/workloads/targeted-stress/targeted-stress_core.csv b/testdata/workloads/targeted-stress/targeted-stress_core.csv
index d2060e5f3..dcf4bc6fb 100644
--- a/testdata/workloads/targeted-stress/targeted-stress_core.csv
+++ b/testdata/workloads/targeted-stress/targeted-stress_core.csv
@@ -3,3 +3,5 @@ file_format:text, dataset:tpch, compression_codec:none, compression_type:none
 file_format:seq, dataset:tpch, compression_codec:gzip, compression_type:block
 file_format:seq, dataset:tpch, compression_codec:snap, compression_type:block
 file_format:rc, dataset:tpch, compression_codec:none, compression_type:none
+file_format:avro, dataset:tpch, compression_codec: none, compression_type: none
+file_format:avro, dataset:tpch, compression_codec: snap, compression_type: block
diff --git a/testdata/workloads/tpcds/tpcds_dimensions.csv b/testdata/workloads/tpcds/tpcds_dimensions.csv
index f90082679..a01357f60 100644
--- a/testdata/workloads/tpcds/tpcds_dimensions.csv
+++ b/testdata/workloads/tpcds/tpcds_dimensions.csv
@@ -1,4 +1,4 @@
-file_format: text,seq,rc
+file_format: text,seq,rc,avro
 dataset: tpcds
 compression_codec: none,def,gzip,bzip,snap,lzo
 compression_type: none,block,record
diff --git a/testdata/workloads/tpcds/tpcds_exhaustive.csv b/testdata/workloads/tpcds/tpcds_exhaustive.csv
index e79552a62..e060811b0 100644
--- a/testdata/workloads/tpcds/tpcds_exhaustive.csv
+++ b/testdata/workloads/tpcds/tpcds_exhaustive.csv
@@ -15,3 +15,5 @@ file_format: rc, dataset: tpcds, compression_codec: def, compression_type: block
 file_format: rc, dataset: tpcds, compression_codec: gzip, compression_type: block
 file_format: rc, dataset: tpcds, compression_codec: bzip, compression_type: block
 file_format: rc, dataset: tpcds, compression_codec: snap, compression_type: block
+file_format: avro, dataset: tpcds, compression_codec: none, compression_type: none
+file_format: avro, dataset: tpcds, compression_codec: snap, compression_type: block
diff --git a/testdata/workloads/tpcds/tpcds_pairwise.csv b/testdata/workloads/tpcds/tpcds_pairwise.csv
index d3feca315..aea395180 100644
--- a/testdata/workloads/tpcds/tpcds_pairwise.csv
+++ b/testdata/workloads/tpcds/tpcds_pairwise.csv
@@ -2,8 +2,9 @@
 file_format: text, dataset: tpcds, compression_codec: none, compression_type: none
 file_format: seq, dataset: tpcds, compression_codec: def, compression_type: block
 file_format: rc, dataset: tpcds, compression_codec: gzip, compression_type: block
+file_format: avro, dataset: tpcds, compression_codec: snap, compression_type: block
+file_format: avro, dataset: tpcds, compression_codec: none, compression_type: none
 file_format: rc, dataset: tpcds, compression_codec: bzip, compression_type: block
-file_format: seq, dataset: tpcds, compression_codec: snap, compression_type: record
+file_format: seq, dataset: tpcds, compression_codec: none, compression_type: none
 file_format: text, dataset: tpcds, compression_codec: lzo, compression_type: block
-file_format: rc, dataset: tpcds, compression_codec: snap, compression_type: block
-file_format: rc, dataset: tpcds, compression_codec: def, compression_type: block
+file_format: rc, dataset: tpcds, compression_codec: none, compression_type: none
diff --git a/testdata/workloads/tpch/tpch_core.csv b/testdata/workloads/tpch/tpch_core.csv
index d2060e5f3..dcf4bc6fb 100644
--- a/testdata/workloads/tpch/tpch_core.csv
+++ b/testdata/workloads/tpch/tpch_core.csv
@@ -3,3 +3,5 @@ file_format:text, dataset:tpch, compression_codec:none, compression_type:none
 file_format:seq, dataset:tpch, compression_codec:gzip, compression_type:block
 file_format:seq, dataset:tpch, compression_codec:snap, compression_type:block
 file_format:rc, dataset:tpch, compression_codec:none, compression_type:none
+file_format:avro, dataset:tpch, compression_codec: none, compression_type: none
+file_format:avro, dataset:tpch, compression_codec: snap, compression_type: block
diff --git a/tests/common/impala_test_suite.py b/tests/common/impala_test_suite.py
index b8bcb02ff..49a2b9543 100644
--- a/tests/common/impala_test_suite.py
+++ b/tests/common/impala_test_suite.py
@@ -117,7 +117,8 @@ class ImpalaTestSuite(BaseTestSuite):
         updated_sections.append(
           self.__update_results(test_file_name, test_section, result))
       else:
-        verify_raw_results(test_section, result)
+        verify_raw_results(test_section, result,
+            vector.get_value('table_format').file_format)
 
       if pytest.config.option.update_results:
         output_file = os.path.join('/tmp', test_file_name.replace('/','_') + ".test")
diff --git a/tests/common/test_result_verifier.py b/tests/common/test_result_verifier.py
index 9200b1680..82dba1c90 100644
--- a/tests/common/test_result_verifier.py
+++ b/tests/common/test_result_verifier.py
@@ -148,12 +148,7 @@ def verify_results(expected_results, actual_results, order_matters):
   assert expected_results == actual_results, failure_str
 
-def verify_column_types(actual_col_types, exec_result_schema):
-  actual_col_types = [c.strip().upper() for c in actual_col_types.split(',')]
-  expected_col_types = parse_column_types(exec_result_schema)
-  verify_results(actual_col_types, expected_col_types, order_matters=True)
-
-def verify_raw_results(test_section, exec_result):
+def verify_raw_results(test_section, exec_result, file_format):
   """
   Accepts a raw exec_result object and verifies it matches the expected results
@@ -169,9 +164,23 @@ def verify_raw_results(test_section, exec_result):
     return
 
   if 'TYPES' in test_section:
-    verify_column_types(test_section['TYPES'], exec_result.schema)
     expected_types = [c.strip().upper() for c in test_section['TYPES'].split(',')]
-    actual_types = parse_column_types(exec_result.schema)
+
+    # Avro does not support as many types as Hive, so the Avro test tables may
+    # have different column types than we expect (e.g., INT instead of
+    # TINYINT). We represent TIMESTAMP columns as strings in Avro, so we bail in
+    # this case since the results will be wrong. Otherwise we bypass the type
+    # checking by ignoring the actual types of the Avro table.
+    if file_format == 'avro':
+      if 'TIMESTAMP' in expected_types:
+        LOG.info("TIMESTAMP columns unsupported in Avro, skipping verification.")
+        return
+      LOG.info("Skipping type verification of Avro-format table.")
+      actual_types = expected_types
+    else:
+      actual_types = parse_column_types(exec_result.schema)
+
+    verify_results(expected_types, actual_types, order_matters=True)
   else:
     # This is an insert, so we are comparing the number of rows inserted
     expected_types = ['BIGINT']
diff --git a/tests/query_test/test_aggregation.py b/tests/query_test/test_aggregation.py
index 0ccdac87a..d2cda0caf 100644
--- a/tests/query_test/test_aggregation.py
+++ b/tests/query_test/test_aggregation.py
@@ -40,6 +40,10 @@ class TestAggregation(ImpalaTestSuite):
     cls.TestMatrix.add_constraint(lambda v: v.get_value('exec_option')['batch_size'] == 0)
     cls.TestMatrix.add_constraint(lambda v: v.get_value('agg_func') in ['min', 'max'] if\
       v.get_value('data_type') == 'bool' else True)
+    # Avro doesn't have timestamp type
+    cls.TestMatrix.add_constraint(
+        lambda v: not (v.get_value('table_format').file_format == 'avro' and
+                       v.get_value('data_type') == 'timestamp'))
 
   def test_aggregation(self, vector):
     data_type, agg_func = (vector.get_value('data_type'), vector.get_value('agg_func'))
diff --git a/tests/query_test/test_queries.py b/tests/query_test/test_queries.py
index ea98b685c..4185e070f 100644
--- a/tests/query_test/test_queries.py
+++ b/tests/query_test/test_queries.py
@@ -19,6 +19,11 @@ class TestQueries(ImpalaTestSuite):
     self.run_test_case('QueryTest/aggregation', vector)
 
   def test_exprs(self, vector):
+    # Don't attempt to evaluate timestamp expressions with Avro tables (which
+    # don't support a timestamp type)
+    # TODO: Enable some of these tests for Avro if possible
+    if vector.get_value('table_format').file_format == 'avro':
+      pytest.skip()
     self.run_test_case('QueryTest/exprs', vector)
 
   def test_hdfs_scan_node(self, vector):
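
Editor's note on the schema templates: the COLUMNS/ROW_FORMAT sections introduced above replace hard-coded CREATE statements so that the generator can derive per-format DDL, including a JSON record schema for the new Avro tables, from a single column list. Below is a minimal sketch of what that derivation could look like; it is not the patch's actual implementation. The helper name columns_to_avro_schema is hypothetical, and the type mapping is an assumption inferred from the test_result_verifier.py comments above (narrow integer types surface as INT, and TIMESTAMP is represented as a string in Avro).

    # Hypothetical sketch: turn a "---- COLUMNS" section into an Avro schema.
    # Names and the exact type mapping here are illustrative assumptions, not
    # the code from generate-schema-statements.py.
    import json

    # Assumed Hive-to-Avro mapping: Avro lacks TINYINT/SMALLINT and TIMESTAMP,
    # so narrow integers widen to "int" and timestamps become "string".
    HIVE_TO_AVRO = {
      'TINYINT': 'int', 'SMALLINT': 'int', 'INT': 'int', 'BIGINT': 'long',
      'FLOAT': 'float', 'DOUBLE': 'double', 'BOOLEAN': 'boolean',
      'STRING': 'string', 'TIMESTAMP': 'string',
    }

    def columns_to_avro_schema(table_name, columns_section):
      # Each line of a COLUMNS section is "<name> <hive_type>".
      fields = []
      for line in columns_section.strip().splitlines():
        name, hive_type = line.split(None, 1)
        fields.append({'name': name, 'type': HIVE_TO_AVRO[hive_type.strip().upper()]})
      return json.dumps({'type': 'record', 'name': table_name, 'fields': fields})

    # Example: the nation table's COLUMNS section from the template above.
    print(columns_to_avro_schema('nation',
        "N_NATIONKEY SMALLINT\nN_NAME STRING\nN_REGIONKEY SMALLINT\nN_COMMENT STRING"))

A schema generated this way is what makes the widened column types visible to the tests, which is why verify_raw_results skips strict type checking for Avro tables and bails entirely when TIMESTAMP columns are expected.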