From cb496104d98e8cbd87acf25277f2648cffaac42a Mon Sep 17 00:00:00 2001 From: Riza Suminto Date: Sat, 3 May 2025 16:45:26 -0700 Subject: [PATCH] IMPALA-14027: Implement HS2 NULL_TYPE using TStringValue HS2 NULL_TYPE should be implemented using TStringValue. However, due to incompatibility with Hive JDBC driver implementation then, Impala choose to implement NULL type using TBoolValue (see IMPALA-914, IMPALA-1370). HIVE-4172 might be the root cause for such decision. Today, the Hive JDBC (org.apache.hive.jdbc.HiveDriver) does not have that issue anymore, as shown in this reproduction after applying this patch: ./bin/run-jdbc-client.sh -q "select null" -t NOSASL Using JDBC Driver Name: org.apache.hive.jdbc.HiveDriver Connecting to: jdbc:hive2://localhost:21050/;auth=noSasl Executing: select null ----[START]---- NULL ----[END]---- Returned 1 row(s) in 0.343s Thus, we can reimplement NULL_TYPE using TStringValue to match HiveServer2 behavior. Testing: - Pass core tests. Change-Id: I354110164b360013d9893f1eb4398c3418f80472 Reviewed-on: http://gerrit.cloudera.org:8080/22852 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- be/src/service/hs2-util.cc | 34 +++++++++++++++---- be/src/service/query-result-set.cc | 4 +-- .../org/apache/impala/service/JdbcTest.java | 6 ++-- .../queries/QueryTest/except.test | 4 +-- .../queries/QueryTest/intersect.test | 2 +- .../queries/QueryTest/joins.test | 2 +- .../queries/QueryTest/top-n.test | 2 +- .../queries/QueryTest/union.test | 8 ++--- tests/hs2/test_fetch.py | 10 +++--- 9 files changed, 46 insertions(+), 26 deletions(-) diff --git a/be/src/service/hs2-util.cc b/be/src/service/hs2-util.cc index ce6654db3..8424e6996 100644 --- a/be/src/service/hs2-util.cc +++ b/be/src/service/hs2-util.cc @@ -85,7 +85,6 @@ void impala::TColumnValueToHS2TColumn(const TColumnValue& col_val, string* nulls; bool is_null; switch (type.types[0].scalar_type.type) { - case TPrimitiveType::NULL_TYPE: case TPrimitiveType::BOOLEAN: is_null = !col_val.__isset.bool_val; column->boolVal.values.push_back(col_val.bool_val); @@ -117,6 +116,7 @@ void impala::TColumnValueToHS2TColumn(const TColumnValue& col_val, column->doubleVal.values.push_back(col_val.double_val); nulls = &column->doubleVal.nulls; break; + case TPrimitiveType::NULL_TYPE: case TPrimitiveType::TIMESTAMP: case TPrimitiveType::DATE: case TPrimitiveType::STRING: @@ -152,6 +152,25 @@ void ReserveSpace(int reserve_count, T* hs2Vals) { hs2Vals->nulls.reserve(BitUtil::RoundUpToPowerOfTwo(num_null_bytes)); } +// Implementation for NULL. +// Internally, Impala implement NULL expession using nullable-BooleanVal (IMPALA-914). +// To match with HiveServer2 behavior, IMPALA-14027 change the result mapping to use +// TColumn.stringVal rather than TColumn.boolVal. +static void NullExprValuesToHS2TColumn(ScalarExprEvaluator* expr_eval, RowBatch* batch, + int start_idx, int num_rows, uint32_t output_row_idx, + apache::hive::service::cli::thrift::TColumn* column) { + FOREACH_ROW_LIMIT(batch, start_idx, num_rows, it) { + // It is actually not necessary to evaluate expr_eval here. But we choose to do it + // and DCHECK the result to be consistent with other functions. + BooleanVal val = expr_eval->GetBooleanVal(it.Get()); + DCHECK(val.is_null); + // emplace empty string and set null bit. + column->stringVal.values.emplace_back(); + SetNullBit(output_row_idx, val.is_null, &column->stringVal.nulls); + ++output_row_idx; + } +} + // Implementation for BOOL. static void BoolExprValuesToHS2TColumn(ScalarExprEvaluator* expr_eval, RowBatch* batch, int start_idx, int num_rows, uint32_t output_row_idx, @@ -454,6 +473,9 @@ void impala::ExprValuesToHS2TColumn(ScalarExprEvaluator* expr_eval, switch (type.types[0].scalar_type.type) { case TPrimitiveType::NULL_TYPE: + ReserveSpace(expected_result_count, &column->stringVal); + NullExprValuesToHS2TColumn( + expr_eval, batch, start_idx, num_rows, output_row_idx, column); case TPrimitiveType::BOOLEAN: ReserveSpace(expected_result_count, &column->boolVal); BoolExprValuesToHS2TColumn( @@ -598,9 +620,9 @@ void impala::ExprValueToHS2TColumnValue(const void* value, const TColumnType& ty DCHECK_EQ(1, type.types[0].__isset.scalar_type); switch (type.types[0].scalar_type.type) { case TPrimitiveType::NULL_TYPE: - // Set NULLs in the bool_val. - hs2_col_val->__isset.boolVal = true; - hs2_col_val->boolVal.__isset.value = false; + // Set NULLs in the stringVal, but don't set the value itself. + hs2_col_val->__isset.stringVal = true; + hs2_col_val->stringVal.__isset.value = false; break; case TPrimitiveType::BOOLEAN: hs2_col_val->__isset.boolVal = true; @@ -868,10 +890,8 @@ thrift::TTypeEntry impala::ColumnToHs2Type( const ColumnType& type = ColumnType::FromThrift(columnType); thrift::TPrimitiveTypeEntry type_entry; switch (type.type) { - // Map NULL_TYPE to BOOLEAN, otherwise Hive's JDBC driver won't - // work for queries like "SELECT NULL" (IMPALA-914). case TYPE_NULL: - type_entry.__set_type(thrift::TTypeId::BOOLEAN_TYPE); + type_entry.__set_type(thrift::TTypeId::NULL_TYPE); break; case TYPE_BOOLEAN: type_entry.__set_type(thrift::TTypeId::BOOLEAN_TYPE); diff --git a/be/src/service/query-result-set.cc b/be/src/service/query-result-set.cc index 97d97d97e..4fe411350 100644 --- a/be/src/service/query-result-set.cc +++ b/be/src/service/query-result-set.cc @@ -410,7 +410,6 @@ int HS2ColumnarResultSet::AddRows( primitiveType = TPrimitiveType::STRING; } switch (primitiveType) { - case TPrimitiveType::NULL_TYPE: case TPrimitiveType::BOOLEAN: StitchNulls( num_rows_, rows_added, start_idx, from->boolVal.nulls, &(to->boolVal.nulls)); @@ -454,6 +453,7 @@ int HS2ColumnarResultSet::AddRows( from->doubleVal.values.begin() + start_idx, from->doubleVal.values.begin() + start_idx + rows_added); break; + case TPrimitiveType::NULL_TYPE: case TPrimitiveType::TIMESTAMP: case TPrimitiveType::DATE: case TPrimitiveType::DECIMAL: @@ -509,7 +509,6 @@ void HS2ColumnarResultSet::InitColumns() { DCHECK(type_nodes[0].__isset.scalar_type); TPrimitiveType::type input_type = type_nodes[0].scalar_type.type; switch (input_type) { - case TPrimitiveType::NULL_TYPE: case TPrimitiveType::BOOLEAN: col_output.__isset.boolVal = true; break; @@ -529,6 +528,7 @@ void HS2ColumnarResultSet::InitColumns() { case TPrimitiveType::DOUBLE: col_output.__isset.doubleVal = true; break; + case TPrimitiveType::NULL_TYPE: case TPrimitiveType::TIMESTAMP: case TPrimitiveType::DATE: case TPrimitiveType::DECIMAL: diff --git a/fe/src/test/java/org/apache/impala/service/JdbcTest.java b/fe/src/test/java/org/apache/impala/service/JdbcTest.java index 43086caec..5a48a4435 100644 --- a/fe/src/test/java/org/apache/impala/service/JdbcTest.java +++ b/fe/src/test/java/org/apache/impala/service/JdbcTest.java @@ -599,10 +599,10 @@ public class JdbcTest extends JdbcTestBase { @Test public void testSelectNull() throws SQLException { - // Regression test for IMPALA-914. + // Regression test for IMPALA-914 / IMPALA-1370 / IMPALA-14027. ResultSet rs = con_.createStatement().executeQuery("select NULL"); - // Expect the column to be of type BOOLEAN to be compatible with Hive. - assertEquals(rs.getMetaData().getColumnType(1), Types.BOOLEAN); + // Expect the column to be of type NULL to be compatible with HiveServer2. + assertEquals(rs.getMetaData().getColumnType(1), Types.NULL); try { // We expect exactly one result row with a NULL inside the first column. assertTrue(rs.next()); diff --git a/testdata/workloads/functional-query/queries/QueryTest/except.test b/testdata/workloads/functional-query/queries/QueryTest/except.test index 51dcd0f2a..7e22b8df6 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/except.test +++ b/testdata/workloads/functional-query/queries/QueryTest/except.test @@ -519,7 +519,7 @@ select 3, 'c', NULL, 30.0 ---- TYPES tinyint, string, null, decimal ---- HS2_TYPES -tinyint, string, boolean, decimal +tinyint, string, null, decimal ---- RESULTS: VERIFY_IS_EQUAL_SORTED 1,'a',NULL,10.0 ==== @@ -533,7 +533,7 @@ values(3, 'c', NULL, 30.0) ---- TYPES tinyint, string, null, decimal ---- HS2_TYPES -tinyint, string, boolean, decimal +tinyint, string, null, decimal ---- RESULTS: VERIFY_IS_EQUAL_SORTED 1,'a',NULL,10.0 ==== diff --git a/testdata/workloads/functional-query/queries/QueryTest/intersect.test b/testdata/workloads/functional-query/queries/QueryTest/intersect.test index 9751ae41a..61eef15c6 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/intersect.test +++ b/testdata/workloads/functional-query/queries/QueryTest/intersect.test @@ -463,7 +463,7 @@ values(1, 'a', NULL, 10.0) ---- TYPES tinyint, string, null, decimal ---- HS2_TYPES -tinyint, string, boolean, decimal +tinyint, string, null, decimal ---- RESULTS: VERIFY_IS_EQUAL_SORTED 1,'a',NULL,10.0 ==== diff --git a/testdata/workloads/functional-query/queries/QueryTest/joins.test b/testdata/workloads/functional-query/queries/QueryTest/joins.test index 0027b9e69..19b1b483b 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/joins.test +++ b/testdata/workloads/functional-query/queries/QueryTest/joins.test @@ -195,7 +195,7 @@ limit 6 ---- TYPES int, tinyint, null ---- HS2_TYPES -int, tinyint, boolean +int, tinyint, null ==== ---- QUERY # check cross joins within a subquery diff --git a/testdata/workloads/functional-query/queries/QueryTest/top-n.test b/testdata/workloads/functional-query/queries/QueryTest/top-n.test index 0992559c4..f5df9a0ea 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/top-n.test +++ b/testdata/workloads/functional-query/queries/QueryTest/top-n.test @@ -903,7 +903,7 @@ limit 6 int, tinyint, null ---- HS2_TYPES # HS2 maps NULL to BOOLEAN -int, tinyint, boolean +int, tinyint, null ==== ---- QUERY # check cross joins within a subquery diff --git a/testdata/workloads/functional-query/queries/QueryTest/union.test b/testdata/workloads/functional-query/queries/QueryTest/union.test index fc5927ebb..6ac61474a 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/union.test +++ b/testdata/workloads/functional-query/queries/QueryTest/union.test @@ -741,7 +741,7 @@ select 3, 'c', NULL, 30.0 ---- TYPES tinyint, string, null, decimal ---- HS2_TYPES -tinyint, string, boolean, decimal +tinyint, string, null, decimal ---- RESULTS: VERIFY_IS_EQUAL_SORTED 1,'a',NULL,10.0 2,'b',NULL,20.0 @@ -757,7 +757,7 @@ select 1, 'a', NULL, 10.0 ---- TYPES tinyint, string, null, decimal ---- HS2_TYPES -tinyint, string, boolean, decimal +tinyint, string, null, decimal ---- RESULTS: VERIFY_IS_EQUAL_SORTED 1,'a',NULL,10.0 2,'b',NULL,20.0 @@ -772,7 +772,7 @@ values(3, 'c', NULL, 30.0) ---- TYPES tinyint, string, null, decimal ---- HS2_TYPES -tinyint, string, boolean, decimal +tinyint, string, null, decimal ---- RESULTS: VERIFY_IS_EQUAL_SORTED 1,'a',NULL,10.0 2,'b',NULL,20.0 @@ -788,7 +788,7 @@ values(1, 'a', NULL, 10.0) ---- TYPES tinyint, string, null, decimal ---- HS2_TYPES -tinyint, string, boolean, decimal +tinyint, string, null, decimal ---- RESULTS: VERIFY_IS_EQUAL_SORTED 1,'a',NULL,10.0 2,'b',NULL,20.0 diff --git a/tests/hs2/test_fetch.py b/tests/hs2/test_fetch.py index 7b40ef502..f52f57010 100644 --- a/tests/hs2/test_fetch.py +++ b/tests/hs2/test_fetch.py @@ -242,21 +242,21 @@ class TestFetch(HS2TestSuite): execute_statement_resp = self.hs2_client.ExecuteStatement(execute_statement_req) HS2TestSuite.check_response(execute_statement_resp) - # Check that the expected type is boolean (for compatibility with Hive, see also - # IMPALA-914) + # Check that the expected type is NULL_TYPE (for compatibility with HiveServer2, + # see also IMPALA-914, IMPALA-1370, and IMPALA-14027 for history). get_result_metadata_req = TCLIService.TGetResultSetMetadataReq() get_result_metadata_req.operationHandle = execute_statement_resp.operationHandle get_result_metadata_resp = \ self.hs2_client.GetResultSetMetadata(get_result_metadata_req) col = get_result_metadata_resp.schema.columns[0] - assert col.typeDesc.types[0].primitiveEntry.type == TTypeId.BOOLEAN_TYPE + assert col.typeDesc.types[0].primitiveEntry.type == TTypeId.NULL_TYPE - # Check that the actual type is boolean + # Check that the actual type is string fetch_results_req = TCLIService.TFetchResultsReq() fetch_results_req.operationHandle = execute_statement_resp.operationHandle fetch_results_req.maxRows = 1 fetch_results_resp = self.fetch(fetch_results_req) - assert fetch_results_resp.results.columns[0].boolVal is not None + assert fetch_results_resp.results.columns[0].stringVal is not None assert self.column_results_to_string( fetch_results_resp.results.columns) == (1, "NULL\n")