From 831ee529becb61ba35dc82d8ffa899f5bc8186a2 Mon Sep 17 00:00:00 2001
From: Lenni Kuff <lskuff@cloudera.com>
Date: Wed, 20 Mar 2013 13:45:12 -0700
Subject: [PATCH] Fixed data loading bugs, moved most tables out of
 load-dependent-tables

---
 bin/load-data.py                              |  4 ++
 testdata/bin/create-load-data.sh              | 12 ++--
 testdata/bin/generate-schema-statements.py    | 23 ++++----
 testdata/bin/load-dependent-tables.sql        | 37 -------------
 .../functional/functional_schema_template.sql | 55 +++++++++++++++++++
 .../functional/schema_constraints.csv         | 15 +++--
 .../DataErrorsTest/hdfs-scan-node-errors.test |  2 +-
 .../hdfs-sequence-scan-errors.test            |  2 +-
 tests/util/test_file_parser.py                | 11 +++-
 9 files changed, 99 insertions(+), 62 deletions(-)

diff --git a/bin/load-data.py b/bin/load-data.py
index bab332b33..c31c4494a 100755
--- a/bin/load-data.py
+++ b/bin/load-data.py
@@ -137,6 +137,10 @@ def get_dataset_for_workload(workload):
 
 def copy_avro_schemas_to_hdfs(schemas_dir):
   """Recursively copies all of schemas_dir to the test warehouse."""
+  if not os.path.exists(schemas_dir):
+    print 'Avro schema dir (%s) does not exist. Skipping copy to HDFS.' % schemas_dir
+    return
+
   # Create warehouse directory if it doesn't already exist
   if exec_hadoop_fs_cmd("-test -d " + options.hive_warehouse_dir, expect_success=False):
     exec_hadoop_fs_cmd("-mkdir -p " + options.hive_warehouse_dir)
diff --git a/testdata/bin/create-load-data.sh b/testdata/bin/create-load-data.sh
index c44ab3f3a..2d9434b4f 100755
--- a/testdata/bin/create-load-data.sh
+++ b/testdata/bin/create-load-data.sh
@@ -81,17 +81,19 @@ if [ $? != 0 ]; then
 fi
 
 # Load the index files for corrupted lzo data.
-hadoop fs -rm -f /test-warehouse/bad_text_lzo/bad_text.lzo.index
+hadoop fs -rm -f /test-warehouse/bad_text_lzo_text_lzo/bad_text.lzo.index
 hadoop fs -put ${IMPALA_HOME}/testdata/bad_text_lzo/bad_text.lzo.index \
-      /test-warehouse/bad_text_lzo/
+    /test-warehouse/bad_text_lzo_text_lzo/
 
-hadoop fs -rm -r -f /bad_text_lzo/
-hadoop fs -mv /test-warehouse/bad_text_lzo/ /
+hadoop fs -rm -r -f /bad_text_lzo_text_lzo/
+hadoop fs -mv /test-warehouse/bad_text_lzo_text_lzo/ /
+# Clean up the old bad_text_lzo files, if they exist.
+hadoop fs -rm -r -f /test-warehouse/bad_text_lzo/
 
 # Index all lzo files in HDFS under /test-warehouse
 ${IMPALA_HOME}/testdata/bin/lzo_indexer.sh /test-warehouse
 
-hadoop fs -mv /bad_text_lzo/ /test-warehouse/
+hadoop fs -mv /bad_text_lzo_text_lzo/ /test-warehouse/
 
 # Run compute stats over as many of the tables used in the Planner tests as possible.
 # Due to Hive bugs HIVE-4119 and HIVE-4122, these tables need to be chosen carefully or
diff --git a/testdata/bin/generate-schema-statements.py b/testdata/bin/generate-schema-statements.py
index 3f2f530b2..53733941d 100755
--- a/testdata/bin/generate-schema-statements.py
+++ b/testdata/bin/generate-schema-statements.py
@@ -291,12 +291,12 @@ class Statements(object):
     self.load_base = list()
 
   def write_to_file(self, filename):
-    # Only write to file if there's something to actually write
-    if self.create or self.load_base or self.load:
-      # Make sure we create the base tables first
-      output = self.create + self.load_base + self.load
-      with open(filename, 'w') as f:
-        f.write('\n\n'.join(output))
+    # Make sure we create the base tables first. It is important that we always write
+    # to the output file, even if there are no statements to generate. That truncates
+    # any file left by a previous run, so the user doesn't unexpectedly load stale data.
+    output = self.create + self.load_base + self.load
+    with open(filename, 'w') as f:
+      f.write('\n\n'.join(output))
 
 def generate_statements(output_name, test_vectors, sections,
                         schema_include_constraints, schema_exclude_constraints):
@@ -346,21 +346,21 @@ def generate_statements(output_name, test_vectors, sections,
       # hive does not allow hyphenated table names.
       if data_set == 'hive-benchmark':
         db_name = '{0}{1}'.format('hivebenchmark', options.scale_factor)
-
+      db = '{0}{1}'.format(db_name, db_suffix)
       data_path = os.path.join(options.hive_warehouse_dir, hdfs_location)
 
       if table_names and (table_name.lower() not in table_names):
-        print 'Skipping table: %s' % table_name
+        print 'Skipping table: %s.%s' % (db, table_name)
         continue
 
       if schema_include_constraints[table_name.lower()] and \
          table_format not in schema_include_constraints[table_name.lower()]:
-        print 'Skipping \'%s\' due to include constraint match' % table_name
+        print 'Skipping \'%s.%s\' due to include constraint match' % (db, table_name)
         continue
 
       if schema_exclude_constraints[base_table_name.lower()] and\
          table_format in schema_exclude_constraints[base_table_name.lower()]:
-        print 'Skipping \'%s\' due to exclude constraint match' % table_name
+        print 'Skipping \'%s.%s\' due to exclude constraint match' % (db, table_name)
         continue
 
       # If a CREATE section is provided, use that. Otherwise a COLUMNS section
@@ -390,8 +390,7 @@ def generate_statements(output_name, test_vectors, sections,
       # The ALTER statement in hive does not accept fully qualified table names.
       # We need the use statement.
       if alter:
-        use_table = 'USE {db_name}{db_suffix};\n'.format(db_name=db_name,
-                                                         db_suffix=db_suffix)
+        use_table = 'USE {db_name};\n'.format(db_name=db)
         output.create.append(use_table + alter.format(table_name=table_name))
 
       # If the directory already exists in HDFS, assume that data files already exist
diff --git a/testdata/bin/load-dependent-tables.sql b/testdata/bin/load-dependent-tables.sql
index 750c44f0a..00d407779 100644
--- a/testdata/bin/load-dependent-tables.sql
+++ b/testdata/bin/load-dependent-tables.sql
@@ -49,26 +49,6 @@ ALTER TABLE alltypesmixedformat PARTITION (year=2009, month=2)
 ALTER TABLE alltypesmixedformat PARTITION (year=2009, month=3)
   SET FILEFORMAT RCFILE;
 
--- Not really dependent: this table contains format errors and
--- is accessed by the unit test: sequence-file-recover-test.
-CREATE DATABASE IF NOT EXISTS functional_seq_snap;
-USE functional_seq_snap;
-DROP TABLE IF EXISTS bad_seq_snap;
-CREATE EXTERNAL TABLE bad_seq_snap (field string) stored as SEQUENCEFILE
-LOCATION '${hiveconf:hive.metastore.warehouse.dir}/bad_seq_snap';
-LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/bad_seq_snap/bad_file' OVERWRITE INTO TABLE bad_seq_snap;
-
---- Error recovery test data for LZO compression.
-CREATE DATABASE IF NOT EXISTS functional_text_lzo;
-USE functional_text_lzo;
-DROP TABLE IF EXISTS bad_text;
-CREATE EXTERNAL TABLE bad_text (field string) stored as
-INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
-OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
-LOCATION '${hiveconf:hive.metastore.warehouse.dir}/bad_text_lzo';
-
-LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/bad_text_lzo/bad_text.lzo' OVERWRITE INTO TABLE bad_text;
-
 ----
 -- Used by CatalogTest to confirm that non-external HBase tables are identified
 -- correctly (IMP-581) 
@@ -78,23 +58,6 @@ USE functional;
 CREATE TABLE internal_hbase_table(key int, value string)
 STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
 WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key,cf1:val");
-----
--- For structured-type testing
-DROP TABLE IF EXISTS map_table;
-CREATE TABLE map_table(map_col map<int, string>);
-DROP TABLE IF EXISTS array_table;
-CREATE TABLE array_table(array_col array<int>);
-
-----
--- Create a table to test older rc files (pre hive9).  The header for those files are
--- different.
-CREATE DATABASE IF NOT EXISTS functional_rc;
-USE functional_rc;
-DROP TABLE IF EXISTS old_rcfile_table;
-CREATE EXTERNAL TABLE old_rcfile_table(key int, value string)
-STORED AS RCFILE
-LOCATION '${hiveconf:hive.metastore.warehouse.dir}/old_rcfile';
-LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/data/oldrcfile.rc' OVERWRITE into table old_rcfile_table;
 
 ---- Unsupported Impala table types
 USE functional;
diff --git a/testdata/datasets/functional/functional_schema_template.sql b/testdata/datasets/functional/functional_schema_template.sql
index c710525e9..6be08ce07 100644
--- a/testdata/datasets/functional/functional_schema_template.sql
+++ b/testdata/datasets/functional/functional_schema_template.sql
@@ -1020,3 +1020,58 @@ INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} SELECT * FROM {db_name}
 ---- LOAD
 LOAD DATA LOCAL INPATH '{impala_home}/testdata/UnsupportedTypes/data.csv' OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
 ====
+---- DATASET
+functional
+---- BASE_TABLE_NAME
+old_rcfile_table
+---- COLUMNS
+key INT
+value STRING
+---- DEPENDENT_LOAD
+LOAD DATA LOCAL INPATH '${{env:IMPALA_HOME}}/testdata/data/oldrcfile.rc' OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
+====
+---- DATASET
+functional
+---- BASE_TABLE_NAME
+bad_text_lzo
+---- COLUMNS
+field STRING
+---- DEPENDENT_LOAD
+-- Error recovery test data for LZO compression.
+LOAD DATA LOCAL INPATH '${{env:IMPALA_HOME}}/testdata/bad_text_lzo/bad_text.lzo' OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
+====
+---- DATASET
+functional
+---- BASE_TABLE_NAME
+bad_seq_snap
+---- COLUMNS
+field STRING
+---- DEPENDENT_LOAD
+-- This data file contains format errors and is accessed by the unit test: sequence-file-recover-test.
+LOAD DATA LOCAL INPATH '${{env:IMPALA_HOME}}/testdata/bad_seq_snap/bad_file' OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
+====
+---- DATASET
+functional
+---- BASE_TABLE_NAME
+map_table
+---- CREATE
+-- For structured-type testing
+DROP TABLE IF EXISTS map_table;
+CREATE TABLE {db_name}{db_suffix}.{table_name} (map_col map<int, string>);
+====
+---- DATASET
+functional
+---- BASE_TABLE_NAME
+array_table
+---- CREATE
+-- For structured-type testing
+CREATE TABLE {db_name}{db_suffix}.{table_name} (array_col array<int>);
+====
+---- DATASET
+functional
+---- BASE_TABLE_NAME
+array_table
+---- CREATE
+-- For structured-type testing
+CREATE TABLE {db_name}{db_suffix}.{table_name} (array_col array<int>);
+====
diff --git a/testdata/datasets/functional/schema_constraints.csv b/testdata/datasets/functional/schema_constraints.csv
index 2196a4422..7771db03d 100644
--- a/testdata/datasets/functional/schema_constraints.csv
+++ b/testdata/datasets/functional/schema_constraints.csv
@@ -7,7 +7,14 @@ table_name:hbasealltypesagg, constraint:restrict_to, table_format:text/none/none
 table_name:hbasealltypeserror, constraint:restrict_to, table_format:text/none/none
 table_name:hbasealltypeserrornonulls, constraint:restrict_to, table_format:text/none/none
 table_name:hbasestringids, constraint:restrict_to, table_format:text/none/none
-table_name:escapechartesttable, constraint:exclude, file_format:parquet
-table_name:nulltable, constraint:exclude, file_format:parquet
-table_name:nullescapedtable, constraint:exclude, file_format:parquet
-table_name:TblWithRaggedColumns, constraint:exclude, file_format:parquet
+table_name:old_rcfile_table, constraint:restrict_to, table_format:rc/none/none
+table_name:bad_text_lzo, constraint:restrict_to, table_format:text/lzo/block
+table_name:bad_seq_snap, constraint:restrict_to, table_format:seq/snap/block
+
+table_name:map_table, constraint:restrict_to, table_format:text/none/none
+table_name:array_table, constraint:restrict_to, table_format:text/none/none
+
+table_name:escapechartesttable, constraint:exclude, table_format:parquet
+table_name:nulltable, constraint:exclude, table_format:parquet
+table_name:nullescapedtable, constraint:exclude, table_format:parquet
+table_name:TblWithRaggedColumns, constraint:exclude, table_format:parquet
diff --git a/testdata/workloads/functional-query/queries/DataErrorsTest/hdfs-scan-node-errors.test b/testdata/workloads/functional-query/queries/DataErrorsTest/hdfs-scan-node-errors.test
index c8bb59a20..900c6c5dd 100644
--- a/testdata/workloads/functional-query/queries/DataErrorsTest/hdfs-scan-node-errors.test
+++ b/testdata/workloads/functional-query/queries/DataErrorsTest/hdfs-scan-node-errors.test
@@ -174,7 +174,7 @@ int, int, int, boolean, tinyint, smallint, int, bigint, float, double, string, s
 2009,3,29,false,9,9,NULL,90,9,90.90000000000001,'03/01/09','9',2012-03-22 00:00:00
 ====
 ---- QUERY
-select count(*) from functional_text_lzo.bad_text
+select count(*) from functional_text_lzo.bad_text_lzo
 ---- ERRORS
 Blocksize: 536870911 is greater than MAX_BLOCK_SIZE: 67108864
 ---- TYPES
diff --git a/testdata/workloads/functional-query/queries/DataErrorsTest/hdfs-sequence-scan-errors.test b/testdata/workloads/functional-query/queries/DataErrorsTest/hdfs-sequence-scan-errors.test
index 4b93a2629..2d2bf6101 100644
--- a/testdata/workloads/functional-query/queries/DataErrorsTest/hdfs-sequence-scan-errors.test
+++ b/testdata/workloads/functional-query/queries/DataErrorsTest/hdfs-sequence-scan-errors.test
@@ -10,7 +10,7 @@ Format error in record or block header at offset: 1784325
 Format error in record or block header at offset: 1790563
 Format error in record or block header at offset: 1791244
 Format error in record or block header at end of file.
-First error while processing: hdfs: test-warehouse/bad_seq_snap/bad_file at offset: 899514
+First error while processing: hdfs: test-warehouse/bad_seq_snap_seq_snap/bad_file at offset: 899514
 ---- TYPES
 bigint
 ---- RESULTS
diff --git a/tests/util/test_file_parser.py b/tests/util/test_file_parser.py
index 7cb848230..6260a3b5c 100644
--- a/tests/util/test_file_parser.py
+++ b/tests/util/test_file_parser.py
@@ -83,13 +83,20 @@ def parse_table_constraints(constraints_file):
         table_name, constraint_type, table_formats =\
             [value.split(':')[1].strip() for value in line.split(',', 2)]
         if constraint_type == 'restrict_to':
-          schema_include[table_name.lower()] += table_formats.split(',')
+          schema_include[table_name.lower()] +=\
+              map(parse_table_format_constraint, table_formats.split(','))
         elif constraint_type == 'exclude':
-          schema_exclude[table_name.lower()] += table_formats.split(',')
+          schema_exclude[table_name.lower()] +=\
+              map(parse_table_format_constraint, table_formats.split(','))
         else:
           raise ValueError, 'Unknown constraint type: %s' % constraint_type
   return schema_include, schema_exclude
 
+def parse_table_format_constraint(table_format_constraint):
+  # TODO: Expand table format constraint parsing to support syntax such as wildcard
+  # characters in the table format string. For now the input is returned unchanged.
+  return table_format_constraint
+
 def parse_test_file(test_file_name, valid_section_names, skip_unknown_sections=True):
   """
   Parses an Impala test file