diff --git a/fe/src/main/java/com/cloudera/impala/catalog/HBaseTable.java b/fe/src/main/java/com/cloudera/impala/catalog/HBaseTable.java
index 118d3910f..3a583089e 100644
--- a/fe/src/main/java/com/cloudera/impala/catalog/HBaseTable.java
+++ b/fe/src/main/java/com/cloudera/impala/catalog/HBaseTable.java
@@ -88,6 +88,9 @@ public class HBaseTable extends Table {
   private static final String HBASE_STORAGE_HANDLER =
       "org.apache.hadoop.hive.hbase.HBaseStorageHandler";
 
+  // Column family of HBase row key
+  private static final String ROW_KEY_COLUMN_FAMILY = ":key";
+
   // Keep the conf around
   private final static Configuration hbaseConf_ = HBaseConfiguration.create();
 
@@ -279,6 +282,9 @@ public class HBaseTable extends Table {
     // Populate tmp cols in the order they appear in the Hive metastore.
     // We will reorder the cols below.
     List<HBaseColumn> tmpCols = new ArrayList<HBaseColumn>();
+    // Store the key column separately.
+    // TODO: Change this to an ArrayList once we support composite row keys.
+    HBaseColumn keyCol = null;
     for (int i = 0; i < fieldSchemas.size(); ++i) {
       FieldSchema s = fieldSchemas.get(i);
       ColumnType t = ColumnType.INVALID;
@@ -291,20 +297,32 @@ public class HBaseTable extends Table {
       HBaseColumn col = new HBaseColumn(s.getName(), hbaseColumnFamilies.get(i),
           hbaseColumnQualifiers.get(i), hbaseColumnBinaryEncodings.get(i),
           t, s.getComment(), -1);
-      tmpCols.add(col);
       // Load column stats from the Hive metastore into col.
       loadColumnStats(col, client);
+      if (col.getColumnFamily().equals(ROW_KEY_COLUMN_FAMILY)) {
+        // Store the row key column separately from the rest
+        keyCol = col;
+      } else {
+        tmpCols.add(col);
+      }
     }
+    Preconditions.checkState(keyCol != null);
 
-    // HBase columns are ordered by columnFamily,columnQualifier,
+    // The backend assumes that the row key column is always first and
+    // that the remaining HBase columns are ordered by columnFamily,columnQualifier,
     // so the final position depends on the other mapped HBase columns.
     // Sort columns and update positions.
     Collections.sort(tmpCols);
     colsByPos_.clear();
     colsByName_.clear();
+
+    keyCol.setPosition(0);
+    colsByPos_.add(keyCol);
+    colsByName_.put(keyCol.getName(), keyCol);
+    // Update the positions of the remaining columns
     for (int i = 0; i < tmpCols.size(); ++i) {
       HBaseColumn col = tmpCols.get(i);
-      col.setPosition(i);
+      col.setPosition(i + 1);
       colsByPos_.add(col);
       colsByName_.put(col.getName(), col);
     }
diff --git a/testdata/bin/generate-schema-statements.py b/testdata/bin/generate-schema-statements.py
index c2cb38780..04ed9e24d 100755
--- a/testdata/bin/generate-schema-statements.py
+++ b/testdata/bin/generate-schema-statements.py
@@ -335,13 +335,14 @@ def build_load_statement(load_template, db_name, db_suffix, table_name):
                                       impala_home = os.environ['IMPALA_HOME'])
   return load_template
 
-def build_hbase_create_stmt(db_name, table_name):
+def build_hbase_create_stmt(db_name, table_name, column_families):
   hbase_table_name = "{db_name}_hbase.{table_name}".format(db_name=db_name,
                                                            table_name=table_name)
   create_stmt = list()
   create_stmt.append("disable '%s'" % hbase_table_name)
   create_stmt.append("drop '%s'" % hbase_table_name)
-  create_stmt.append("create '%s', 'd'" % hbase_table_name)
+  column_families = ','.join(["'{0}'".format(cf) for cf in column_families.splitlines()])
+  create_stmt.append("create '%s', %s" % (hbase_table_name, column_families))
   return create_stmt
 
 def build_db_suffix(file_format, codec, compression_type):
@@ -524,7 +525,10 @@ def generate_statements(output_name, test_vectors, sections,
                                db_suffix, create_file_format, create_codec, data_path))
     # HBASE create table
     if file_format == 'hbase':
-      hbase_output.create.extend(build_hbase_create_stmt(db_name, table_name))
+      # If the HBASE_COLUMN_FAMILIES section does not exist, default to 'd'
+      column_families = section.get('HBASE_COLUMN_FAMILIES', 'd')
+      hbase_output.create.extend(build_hbase_create_stmt(db_name, table_name,
+                                                         column_families))
     # The ALTER statement in hive does not accept fully qualified table names so
     # insert a use statement. The ALTER statement is skipped for HBASE as it's
@@ -576,7 +580,7 @@ def generate_statements(output_name, test_vectors, sections,
 def parse_schema_template_file(file_name):
   VALID_SECTION_NAMES = ['DATASET', 'BASE_TABLE_NAME', 'COLUMNS', 'PARTITION_COLUMNS',
                          'ROW_FORMAT', 'CREATE', 'CREATE_HIVE', 'DEPENDENT_LOAD', 'LOAD',
-                         'LOAD_LOCAL', 'ALTER']
+                         'LOAD_LOCAL', 'ALTER', 'HBASE_COLUMN_FAMILIES']
   return parse_test_file(file_name, VALID_SECTION_NAMES, skip_unknown_sections=False)
 
 if __name__ == "__main__":
diff --git a/testdata/datasets/functional/functional_schema_template.sql b/testdata/datasets/functional/functional_schema_template.sql
index 1168d58c5..9628ad666 100644
--- a/testdata/datasets/functional/functional_schema_template.sql
+++ b/testdata/datasets/functional/functional_schema_template.sql
@@ -301,6 +301,41 @@ TBLPROPERTIES("hbase.table.name" = "functional_hbase.hbasealltypeserror");
 ---- DATASET
 functional
 ---- BASE_TABLE_NAME
+hbasecolumnfamilies
+---- HBASE_COLUMN_FAMILIES
+0
+1
+2
+3
+d
+---- CREATE_HIVE
+-- Create an HBase table with multiple column families
+CREATE EXTERNAL TABLE IF NOT EXISTS {db_name}{db_suffix}.{table_name} (
+  id int,
+  bool_col boolean,
+  tinyint_col tinyint,
+  smallint_col smallint,
+  int_col int,
+  bigint_col bigint,
+  float_col float,
+  double_col double,
+  date_string_col string,
+  string_col string,
+  timestamp_col timestamp)
+STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
+WITH SERDEPROPERTIES (
+  "hbase.columns.mapping" =
+  ":key,0:bool_col,1:tinyint_col,2:smallint_col,3:int_col,d:bigint_col,d:float_col,d:double_col,d:date_string_col,d:string_col,d:timestamp_col"
+)
+TBLPROPERTIES("hbase.table.name" = "functional_hbase.hbasecolumnfamilies");
+---- DEPENDENT_LOAD
+INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name}
+SELECT id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col,
+date_string_col, string_col, timestamp_col FROM functional.alltypestiny;
+====
+---- DATASET
+functional
+---- BASE_TABLE_NAME
 alltypeserrornonulls
 ---- CREATE
 CREATE EXTERNAL TABLE IF NOT EXISTS {db_name}{db_suffix}.{table_name} (
diff --git a/testdata/datasets/functional/schema_constraints.csv b/testdata/datasets/functional/schema_constraints.csv
index 660c78768..568c25d32 100644
--- a/testdata/datasets/functional/schema_constraints.csv
+++ b/testdata/datasets/functional/schema_constraints.csv
@@ -3,6 +3,7 @@
 # table. The table name should match the base table name defined in the schema template
 # file.
 table_name:stringids, constraint:restrict_to, table_format:hbase/none/none
+table_name:hbasecolumnfamilies, constraint:restrict_to, table_format:hbase/none/none
 table_name:insertalltypesagg, constraint:restrict_to, table_format:hbase/none/none
 table_name:alltypessmallbinary, constraint:restrict_to, table_format:hbase/none/none
 table_name:insertalltypesaggbinary, constraint:restrict_to, table_format:hbase/none/none
diff --git a/testdata/workloads/functional-query/queries/QueryTest/hbase-scan-node.test b/testdata/workloads/functional-query/queries/QueryTest/hbase-scan-node.test
index 2fe9fef54..5f2505be4 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/hbase-scan-node.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/hbase-scan-node.test
@@ -636,3 +636,18 @@ select * from alltypessmallbinary
 ---- TYPES
 INT, BIGINT, BOOLEAN, STRING, DOUBLE, FLOAT, INT, INT, SMALLINT, STRING, TIMESTAMP, TINYINT, INT
 ====
+---- QUERY
+# Scan an HBase table with multiple column families (CDH-18969)
+select * from hbasecolumnfamilies
+---- RESULTS
+0,true,0,0,0,0,'01/01/09',0,0,'0',2009-01-01 00:00:00
+1,false,1,1,1,10,'01/01/09',10.1,1.100000023841858,'1',2009-01-01 00:01:00
+2,true,0,0,0,0,'02/01/09',0,0,'0',2009-02-01 00:00:00
+3,false,1,1,1,10,'02/01/09',10.1,1.100000023841858,'1',2009-02-01 00:01:00
+4,true,0,0,0,0,'03/01/09',0,0,'0',2009-03-01 00:00:00
+5,false,1,1,1,10,'03/01/09',10.1,1.100000023841858,'1',2009-03-01 00:01:00
+6,true,0,0,0,0,'04/01/09',0,0,'0',2009-04-01 00:00:00
+7,false,1,1,1,10,'04/01/09',10.1,1.100000023841858,'1',2009-04-01 00:01:00
+---- TYPES
+INT, BOOLEAN, TINYINT, SMALLINT, INT, BIGINT, STRING, DOUBLE, FLOAT, STRING, TIMESTAMP
+====
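
For illustration, a minimal standalone sketch of how the new HBASE_COLUMN_FAMILIES section drives the generated HBase shell statements. The build_hbase_create_stmt body is copied from the patch above; the sample invocation (the db name 'functional' and the newline-separated family list) mirrors the hbasecolumnfamilies template section and is illustrative only:

def build_hbase_create_stmt(db_name, table_name, column_families):
  hbase_table_name = "{db_name}_hbase.{table_name}".format(db_name=db_name,
                                                           table_name=table_name)
  create_stmt = list()
  create_stmt.append("disable '%s'" % hbase_table_name)
  create_stmt.append("drop '%s'" % hbase_table_name)
  # Each line of the HBASE_COLUMN_FAMILIES section becomes one quoted family name.
  column_families = ','.join(["'{0}'".format(cf) for cf in column_families.splitlines()])
  create_stmt.append("create '%s', %s" % (hbase_table_name, column_families))
  return create_stmt

# The hbasecolumnfamilies section above ("0", "1", "2", "3", "d") produces:
#   disable 'functional_hbase.hbasecolumnfamilies'
#   drop 'functional_hbase.hbasecolumnfamilies'
#   create 'functional_hbase.hbasecolumnfamilies', '0','1','2','3','d'
print('\n'.join(build_hbase_create_stmt('functional', 'hbasecolumnfamilies',
                                        '0\n1\n2\n3\nd')))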