From d1c263402ee893ef4bca72ff4474d35a59d0aa11 Mon Sep 17 00:00:00 2001 From: Juan Yu Date: Fri, 1 May 2015 12:10:19 -0700 Subject: [PATCH] IMPALA-1973: Fixing crash when uninitialized, empty row is added in HdfsTextScanner This patch fixes an issue where an uninitialized, empty row is erroneously added to the row batch. The uninitialized data inside this row later causes a crash when the null byte is checked together with the offsets (which contain garbage). The fix is to check not only the number of materialized columns but also the number of materialized partition key columns. The empty row is added only if both are zero and the parser has an unfinished tuple. To handle the last row, FinishScanRange() now checks whether there is an unfinished tuple with materialized slots or materialized partition keys, and writes the fields if necessary. Change-Id: I2808cc228e62d048d917d3a6352d869d117597ab (cherry picked from commit c1795a8b40d10fbb32d9051a0e7de5ebffc8a6bd) Reviewed-on: http://gerrit.cloudera.org:8080/364 Reviewed-by: Juan Yu Tested-by: Internal Jenkins --- be/src/exec/hdfs-text-scanner.cc | 10 +++- testdata/data/table_missing_columns.csv | 6 ++ .../functional/functional_schema_template.sql | 19 ++++++ .../functional/schema_constraints.csv | 1 + .../queries/QueryTest/hdfs-text-scan.test | 59 +++++++++++++++++++ tests/query_test/test_scanners.py | 4 ++ 6 files changed, 96 insertions(+), 3 deletions(-) create mode 100644 testdata/data/table_missing_columns.csv diff --git a/be/src/exec/hdfs-text-scanner.cc b/be/src/exec/hdfs-text-scanner.cc index 9977256de..349d56352 100644 --- a/be/src/exec/hdfs-text-scanner.cc +++ b/be/src/exec/hdfs-text-scanner.cc @@ -278,7 +278,10 @@ Status HdfsTextScanner::FinishScanRange() { } if (state_->abort_on_error()) return Status(ss.str()); } else if (!partial_tuple_empty_ || !boundary_column_.Empty() || - !boundary_row_.Empty()) { + !boundary_row_.Empty() || + (delimited_text_parser_->HasUnfinishedTuple() && + 
(!scan_node_->materialized_slots().empty() || + scan_node_->num_materialized_partition_keys() > 0))) { // Missing columns or row delimiter at end of the file is ok, fill the row in. char* col = boundary_column_.str().ptr; int num_fields = 0; @@ -297,8 +300,9 @@ Status HdfsTextScanner::FinishScanRange() { DCHECK_GE(num_tuples, 0); COUNTER_ADD(scan_node_->rows_read_counter(), num_tuples); RETURN_IF_ERROR(CommitRows(num_tuples)); - } else if (delimited_text_parser_->HasUnfinishedTuple() && - scan_node_->materialized_slots().empty()) { + } else if (delimited_text_parser_->HasUnfinishedTuple()) { + DCHECK(scan_node_->materialized_slots().empty()); + DCHECK_EQ(scan_node_->num_materialized_partition_keys(), 0); // If no fields are materialized we do not update partial_tuple_empty_, // boundary_column_, or boundary_row_. However, we still need to handle the case // of partial tuple due to missing tuple delimiter at the end of file. diff --git a/testdata/data/table_missing_columns.csv b/testdata/data/table_missing_columns.csv new file mode 100644 index 000000000..4676e3379 --- /dev/null +++ b/testdata/data/table_missing_columns.csv @@ -0,0 +1,6 @@ +1,true,123.123,2012-10-24 08:55:00 +2,false +3,false,24453.325,2008-08-22 09:33:21.123 + +4,false,243423.325,2007-05-12 22:32:21.33454 +5,true,243.325 \ No newline at end of file diff --git a/testdata/datasets/functional/functional_schema_template.sql b/testdata/datasets/functional/functional_schema_template.sql index 7865da6d2..f2329b52f 100644 --- a/testdata/datasets/functional/functional_schema_template.sql +++ b/testdata/datasets/functional/functional_schema_template.sql @@ -1464,6 +1464,25 @@ hadoop fs -put -f ${IMPALA_HOME}/testdata/data/table_no_newline.csv /test-wareho ---- DATASET functional ---- BASE_TABLE_NAME +table_no_newline_part +---- CREATE +CREATE EXTERNAL TABLE IF NOT EXISTS {db_name}{db_suffix}.{table_name} ( +id INT, col_1 BOOLEAN, col_2 DOUBLE, col_3 TIMESTAMP) +partitioned by (year INT, month INT) +row 
format delimited fields terminated by ',' +LOCATION '/test-warehouse/{table_name}'; +ALTER TABLE {db_name}{db_suffix}.{table_name} ADD PARTITION (year=2015, month=3); +ALTER TABLE {db_name}{db_suffix}.{table_name} ADD PARTITION (year=2010, month=3); +---- LOAD +`hadoop fs -mkdir -p /test-warehouse/table_no_newline_part && \ +hadoop fs -mkdir -p /test-warehouse/table_no_newline_part/year=2010/month=3 && \ +hadoop fs -mkdir -p /test-warehouse/table_no_newline_part/year=2015/month=3 && \ +hadoop fs -put -f ${IMPALA_HOME}/testdata/data/table_no_newline.csv /test-warehouse/table_no_newline_part/year=2010/month=3 && \ +hadoop fs -put -f ${IMPALA_HOME}/testdata/data/table_missing_columns.csv /test-warehouse/table_no_newline_part/year=2015/month=3 +==== +---- DATASET +functional +---- BASE_TABLE_NAME testescape_16_lf ---- CREATE CREATE EXTERNAL TABLE IF NOT EXISTS {db_name}{db_suffix}.{table_name} ( diff --git a/testdata/datasets/functional/schema_constraints.csv b/testdata/datasets/functional/schema_constraints.csv index 0321bc404..b3981919f 100644 --- a/testdata/datasets/functional/schema_constraints.csv +++ b/testdata/datasets/functional/schema_constraints.csv @@ -122,6 +122,7 @@ table_name:avro_decimal_tbl, constraint:restrict_to, table_format:avro/snap/bloc # testescape tables are used for testing text scanner delimiter handling table_name:table_no_newline, constraint:restrict_to, table_format:text/none/none +table_name:table_no_newline_part, constraint:restrict_to, table_format:text/none/none table_name:testescape_16_lf, constraint:restrict_to, table_format:text/none/none table_name:testescape_16_crlf, constraint:restrict_to, table_format:text/none/none table_name:testescape_17_lf, constraint:restrict_to, table_format:text/none/none diff --git a/testdata/workloads/functional-query/queries/QueryTest/hdfs-text-scan.test b/testdata/workloads/functional-query/queries/QueryTest/hdfs-text-scan.test index 1b1eaddec..a99049622 100644 --- 
a/testdata/workloads/functional-query/queries/QueryTest/hdfs-text-scan.test +++ b/testdata/workloads/functional-query/queries/QueryTest/hdfs-text-scan.test @@ -12,3 +12,62 @@ select count(col_3) from functional.table_no_newline ---- TYPES BIGINT ==== +---- QUERY +select count(*) from functional.table_no_newline_part +---- RESULTS +11 +---- TYPES +BIGINT +==== +---- QUERY +select count(year) from functional.table_no_newline_part +---- RESULTS +11 +---- TYPES +BIGINT +==== +---- QUERY +select count(year) from functional.table_no_newline_part where year=2015 +---- RESULTS +6 +---- TYPES +BIGINT +==== +---- QUERY +select count(col_3) from functional.table_no_newline_part where year=2015 +---- RESULTS +3 +---- TYPES +BIGINT +==== +---- QUERY +select distinct year from functional.table_no_newline_part order by year +---- RESULTS +2010 +2015 +---- TYPES +INT +==== +---- QUERY +select count(id), count(col_1), count(col_2), count(col_3), count(year), count(month) from functional.table_no_newline_part +---- RESULTS +10,10,9,8,11,11 +---- TYPES +BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT +==== +---- QUERY +select year, count(*) from functional.table_no_newline_part group by year order by year +---- RESULTS +2010,5 +2015,6 +---- TYPES +INT,BIGINT +==== +---- QUERY +select year, count(col_3) from functional.table_no_newline_part group by year order by year +---- RESULTS +2010,5 +2015,3 +---- TYPES +INT,BIGINT +==== diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py index 53ba1ce10..62f5d3b0d 100644 --- a/tests/query_test/test_scanners.py +++ b/tests/query_test/test_scanners.py @@ -364,6 +364,10 @@ class TestTextScanRangeLengths(ImpalaTestSuite): def test_text_scanner(self, vector): vector.get_value('exec_option')['max_scan_range_length'] =\ vector.get_value('max_scan_range_length') + self.execute_query_expect_success(self.client, "drop stats " + "functional.table_no_newline_part") + self.execute_query_expect_success(self.client, "compute stats " + 
"functional.table_no_newline_part") self.run_test_case('QueryTest/hdfs-text-scan', vector) # Test various escape char cases. We have to check the count(*) result against