From 831ee529becb61ba35dc82d8ffa899f5bc8186a2 Mon Sep 17 00:00:00 2001 From: Lenni Kuff Date: Wed, 20 Mar 2013 13:45:12 -0700 Subject: [PATCH] Fixed data loading bugs, moved most tables out of load-dependent-tables --- bin/load-data.py | 4 ++ testdata/bin/create-load-data.sh | 12 ++-- testdata/bin/generate-schema-statements.py | 23 ++++---- testdata/bin/load-dependent-tables.sql | 37 ------------- .../functional/functional_schema_template.sql | 55 +++++++++++++++++++ .../functional/schema_constraints.csv | 15 +++-- .../DataErrorsTest/hdfs-scan-node-errors.test | 2 +- .../hdfs-sequence-scan-errors.test | 2 +- tests/util/test_file_parser.py | 11 +++- 9 files changed, 99 insertions(+), 62 deletions(-) diff --git a/bin/load-data.py b/bin/load-data.py index bab332b33..c31c4494a 100755 --- a/bin/load-data.py +++ b/bin/load-data.py @@ -137,6 +137,10 @@ def get_dataset_for_workload(workload): def copy_avro_schemas_to_hdfs(schemas_dir): """Recursively copies all of schemas_dir to the test warehouse.""" + if not os.path.exists(schemas_dir): + print 'Avro schema dir (%s) does not exist. Skipping copy to HDFS.' % schemas_dir + return + # Create warehouse directory if it doesn't already exist if exec_hadoop_fs_cmd("-test -d " + options.hive_warehouse_dir, expect_success=False): exec_hadoop_fs_cmd("-mkdir -p " + options.hive_warehouse_dir) diff --git a/testdata/bin/create-load-data.sh b/testdata/bin/create-load-data.sh index c44ab3f3a..2d9434b4f 100755 --- a/testdata/bin/create-load-data.sh +++ b/testdata/bin/create-load-data.sh @@ -81,17 +81,19 @@ if [ $? != 0 ]; then fi # Load the index files for corrupted lzo data. 
-hadoop fs -rm -f /test-warehouse/bad_text_lzo/bad_text.lzo.index +hadoop fs -rm -f /test-warehouse/bad_text_lzo_text_lzo/bad_text.lzo.index hadoop fs -put ${IMPALA_HOME}/testdata/bad_text_lzo/bad_text.lzo.index \ - /test-warehouse/bad_text_lzo/ + /test-warehouse/bad_text_lzo_text_lzo/ -hadoop fs -rm -r -f /bad_text_lzo/ -hadoop fs -mv /test-warehouse/bad_text_lzo/ / +hadoop fs -rm -r -f /bad_text_lzo_text_lzo/ +hadoop fs -mv /test-warehouse/bad_text_lzo_text_lzo/ / +# Cleanup the old bad_text_lzo files, if they exist. +hadoop fs -rm -r -f /test-warehouse/bad_text_lzo/ # Index all lzo files in HDFS under /test-warehouse ${IMPALA_HOME}/testdata/bin/lzo_indexer.sh /test-warehouse -hadoop fs -mv /bad_text_lzo/ /test-warehouse/ +hadoop fs -mv /bad_text_lzo_text_lzo/ /test-warehouse/ # Run compute stats over as many of the tables used in the Planner tests as possible. # Due to Hive bugs HIVE-4119 and HIVE-4122, these tables need to be chosen carefully or diff --git a/testdata/bin/generate-schema-statements.py b/testdata/bin/generate-schema-statements.py index 3f2f530b2..53733941d 100755 --- a/testdata/bin/generate-schema-statements.py +++ b/testdata/bin/generate-schema-statements.py @@ -291,12 +291,12 @@ class Statements(object): self.load_base = list() def write_to_file(self, filename): - # Only write to file if there's something to actually write - if self.create or self.load_base or self.load: - # Make sure we create the base tables first - output = self.create + self.load_base + self.load - with open(filename, 'w') as f: - f.write('\n\n'.join(output)) + # Make sure we create the base tables first. It is important that we always write + # to the output file, even if there are no statements to generate. This makes sure + # the output file is empty and the user doesn't unexpectedly load some stale data. 
+ output = self.create + self.load_base + self.load + with open(filename, 'w') as f: + f.write('\n\n'.join(output)) def generate_statements(output_name, test_vectors, sections, schema_include_constraints, schema_exclude_constraints): @@ -346,21 +346,21 @@ def generate_statements(output_name, test_vectors, sections, # hive does not allow hyphenated table names. if data_set == 'hive-benchmark': db_name = '{0}{1}'.format('hivebenchmark', options.scale_factor) - + db = '{0}{1}'.format(db_name, db_suffix) data_path = os.path.join(options.hive_warehouse_dir, hdfs_location) if table_names and (table_name.lower() not in table_names): - print 'Skipping table: %s' % table_name + print 'Skipping table: %s.%s' % (db, table_name) continue if schema_include_constraints[table_name.lower()] and \ table_format not in schema_include_constraints[table_name.lower()]: - print 'Skipping \'%s\' due to include constraint match' % table_name + print 'Skipping \'%s.%s\' due to include constraint match' % (db, table_name) continue if schema_exclude_constraints[base_table_name.lower()] and\ table_format in schema_exclude_constraints[base_table_name.lower()]: - print 'Skipping \'%s\' due to exclude constraint match' % table_name + print 'Skipping \'%s.%s\' due to exclude constraint match' % (db, table_name) continue # If a CREATE section is provided, use that. Otherwise a COLUMNS section @@ -390,8 +390,7 @@ def generate_statements(output_name, test_vectors, sections, # The ALTER statement in hive does not accept fully qualified table names. # We need the use statement. 
if alter: - use_table = 'USE {db_name}{db_suffix};\n'.format(db_name=db_name, - db_suffix=db_suffix) + use_table = 'USE {db_name};\n'.format(db_name=db) output.create.append(use_table + alter.format(table_name=table_name)) # If the directory already exists in HDFS, assume that data files already exist diff --git a/testdata/bin/load-dependent-tables.sql b/testdata/bin/load-dependent-tables.sql index 750c44f0a..00d407779 100644 --- a/testdata/bin/load-dependent-tables.sql +++ b/testdata/bin/load-dependent-tables.sql @@ -49,26 +49,6 @@ ALTER TABLE alltypesmixedformat PARTITION (year=2009, month=2) ALTER TABLE alltypesmixedformat PARTITION (year=2009, month=3) SET FILEFORMAT RCFILE; --- Not really dependent: this table contains format errors and --- is accessed by the unit test: sequence-file-recover-test. -CREATE DATABASE IF NOT EXISTS functional_seq_snap; -USE functional_seq_snap; -DROP TABLE IF EXISTS bad_seq_snap; -CREATE EXTERNAL TABLE bad_seq_snap (field string) stored as SEQUENCEFILE -LOCATION '${hiveconf:hive.metastore.warehouse.dir}/bad_seq_snap'; -LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/bad_seq_snap/bad_file' OVERWRITE INTO TABLE bad_seq_snap; - ---- Error recovery test data for LZO compression. 
-CREATE DATABASE IF NOT EXISTS functional_text_lzo; -USE functional_text_lzo; -DROP TABLE IF EXISTS bad_text; -CREATE EXTERNAL TABLE bad_text (field string) stored as -INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat' -OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' -LOCATION '${hiveconf:hive.metastore.warehouse.dir}/bad_text_lzo'; - -LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/bad_text_lzo/bad_text.lzo' OVERWRITE INTO TABLE bad_text; - ---- -- Used by CatalogTest to confirm that non-external HBase tables are identified -- correctly (IMP-581) @@ -78,23 +58,6 @@ USE functional; CREATE TABLE internal_hbase_table(key int, value string) STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler' WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key,cf1:val"); ----- --- For structured-type testing -DROP TABLE IF EXISTS map_table; -CREATE TABLE map_table(map_col map<int, int>); -DROP TABLE IF EXISTS array_table; -CREATE TABLE array_table(array_col array<int>); - ----- --- Create a table to test older rc files (pre hive9). The header for those files are --- different.
-CREATE DATABASE IF NOT EXISTS functional_rc; -USE functional_rc; -DROP TABLE IF EXISTS old_rcfile_table; -CREATE EXTERNAL TABLE old_rcfile_table(key int, value string) -STORED AS RCFILE -LOCATION '${hiveconf:hive.metastore.warehouse.dir}/old_rcfile'; -LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/data/oldrcfile.rc' OVERWRITE into table old_rcfile_table; ---- Unsupported Impala table types USE functional; diff --git a/testdata/datasets/functional/functional_schema_template.sql b/testdata/datasets/functional/functional_schema_template.sql index c710525e9..6be08ce07 100644 --- a/testdata/datasets/functional/functional_schema_template.sql +++ b/testdata/datasets/functional/functional_schema_template.sql @@ -1020,3 +1020,58 @@ INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} SELECT * FROM {db_name} ---- LOAD LOAD DATA LOCAL INPATH '{impala_home}/testdata/UnsupportedTypes/data.csv' OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; ==== +---- DATASET +functional +---- BASE_TABLE_NAME +old_rcfile_table +---- COLUMNS +key INT +value STRING +---- DEPENDENT_LOAD +LOAD DATA LOCAL INPATH '${{env:IMPALA_HOME}}/testdata/data/oldrcfile.rc' OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; +==== +---- DATASET +functional +---- BASE_TABLE_NAME +bad_text_lzo +---- COLUMNS +field STRING +---- DEPENDENT_LOAD +-- Error recovery test data for LZO compression. +LOAD DATA LOCAL INPATH '${{env:IMPALA_HOME}}/testdata/bad_text_lzo/bad_text.lzo' OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; +==== +---- DATASET +functional +---- BASE_TABLE_NAME +bad_seq_snap +---- COLUMNS +field STRING +---- DEPENDENT_LOAD +-- This data file contains format errors and is accessed by the unit test: sequence-file-recover-test. 
+LOAD DATA LOCAL INPATH '${{env:IMPALA_HOME}}/testdata/bad_seq_snap/bad_file' OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; +==== +---- DATASET +functional +---- BASE_TABLE_NAME +map_table +---- CREATE +-- For structured-type testing +DROP TABLE IF EXISTS map_table; +CREATE TABLE {db_name}{db_suffix}.{table_name} (map_col map<int, int>); +==== +---- DATASET +functional +---- BASE_TABLE_NAME +array_table +---- CREATE +-- For structured-type testing +CREATE TABLE {db_name}{db_suffix}.{table_name} (array_col array<int>); +==== +---- DATASET +functional +---- BASE_TABLE_NAME +array_table +---- CREATE +-- For structured-type testing +CREATE TABLE {db_name}{db_suffix}.{table_name} (array_col array<int>); +==== diff --git a/testdata/datasets/functional/schema_constraints.csv b/testdata/datasets/functional/schema_constraints.csv index 2196a4422..7771db03d 100644 --- a/testdata/datasets/functional/schema_constraints.csv +++ b/testdata/datasets/functional/schema_constraints.csv @@ -7,7 +7,14 @@ table_name:hbasealltypesagg, constraint:restrict_to, table_format:text/none/none table_name:hbasealltypeserror, constraint:restrict_to, table_format:text/none/none table_name:hbasealltypeserrornonulls, constraint:restrict_to, table_format:text/none/none table_name:hbasestringids, constraint:restrict_to, table_format:text/none/none -table_name:escapechartesttable, constraint:exclude, file_format:parquet -table_name:nulltable, constraint:exclude, file_format:parquet -table_name:nullescapedtable, constraint:exclude, file_format:parquet -table_name:TblWithRaggedColumns, constraint:exclude, file_format:parquet +table_name:old_rcfile_table, constraint:restrict_to, table_format:rc/none/none +table_name:bad_text_lzo, constraint:restrict_to, table_format:text/lzo/block +table_name:bad_seq_snap, constraint:restrict_to, table_format:seq/snap/block + +table_name:map_table, constraint:restrict_to, table_format:text/none/none +table_name:array_table, constraint:restrict_to, table_format:text/none/none +
+table_name:escapechartesttable, constraint:exclude, table_format:parquet +table_name:nulltable, constraint:exclude, table_format:parquet +table_name:nullescapedtable, constraint:exclude, table_format:parquet +table_name:TblWithRaggedColumns, constraint:exclude, table_format:parquet diff --git a/testdata/workloads/functional-query/queries/DataErrorsTest/hdfs-scan-node-errors.test b/testdata/workloads/functional-query/queries/DataErrorsTest/hdfs-scan-node-errors.test index c8bb59a20..900c6c5dd 100644 --- a/testdata/workloads/functional-query/queries/DataErrorsTest/hdfs-scan-node-errors.test +++ b/testdata/workloads/functional-query/queries/DataErrorsTest/hdfs-scan-node-errors.test @@ -174,7 +174,7 @@ int, int, int, boolean, tinyint, smallint, int, bigint, float, double, string, s 2009,3,29,false,9,9,NULL,90,9,90.90000000000001,'03/01/09','9',2012-03-22 00:00:00 ==== ---- QUERY -select count(*) from functional_text_lzo.bad_text +select count(*) from functional_text_lzo.bad_text_lzo ---- ERRORS Blocksize: 536870911 is greater than MAX_BLOCK_SIZE: 67108864 ---- TYPES diff --git a/testdata/workloads/functional-query/queries/DataErrorsTest/hdfs-sequence-scan-errors.test b/testdata/workloads/functional-query/queries/DataErrorsTest/hdfs-sequence-scan-errors.test index 4b93a2629..2d2bf6101 100644 --- a/testdata/workloads/functional-query/queries/DataErrorsTest/hdfs-sequence-scan-errors.test +++ b/testdata/workloads/functional-query/queries/DataErrorsTest/hdfs-sequence-scan-errors.test @@ -10,7 +10,7 @@ Format error in record or block header at offset: 1784325 Format error in record or block header at offset: 1790563 Format error in record or block header at offset: 1791244 Format error in record or block header at end of file. 
-First error while processing: hdfs: test-warehouse/bad_seq_snap/bad_file at offset: 899514 +First error while processing: hdfs: test-warehouse/bad_seq_snap_seq_snap/bad_file at offset: 899514 ---- TYPES bigint ---- RESULTS diff --git a/tests/util/test_file_parser.py b/tests/util/test_file_parser.py index 7cb848230..6260a3b5c 100644 --- a/tests/util/test_file_parser.py +++ b/tests/util/test_file_parser.py @@ -83,13 +83,20 @@ def parse_table_constraints(constraints_file): table_name, constraint_type, table_formats =\ [value.split(':')[1].strip() for value in line.split(',', 2)] if constraint_type == 'restrict_to': - schema_include[table_name.lower()] += table_formats.split(',') + schema_include[table_name.lower()] +=\ + map(parse_table_format_constraint, table_formats.split(',')) elif constraint_type == 'exclude': - schema_exclude[table_name.lower()] += table_formats.split(',') + schema_exclude[table_name.lower()] +=\ + map(parse_table_format_constraint, table_formats.split(',')) else: raise ValueError, 'Unknown constraint type: %s' % constraint_type return schema_include, schema_exclude +def parse_table_format_constraint(table_format_constraint): + # TODO: Expand how we parse table format constraints to support syntax such as + # a table format string with a wildcard character. Right now we don't do anything. + return table_format_constraint + def parse_test_file(test_file_name, valid_section_names, skip_unknown_sections=True): """ Parses an Impala test file