diff --git a/be/src/exec/hdfs-text-scanner.cc b/be/src/exec/hdfs-text-scanner.cc
index 9977256de..349d56352 100644
--- a/be/src/exec/hdfs-text-scanner.cc
+++ b/be/src/exec/hdfs-text-scanner.cc
@@ -278,7 +278,10 @@ Status HdfsTextScanner::FinishScanRange() {
       }
       if (state_->abort_on_error()) return Status(ss.str());
     } else if (!partial_tuple_empty_ || !boundary_column_.Empty() ||
-        !boundary_row_.Empty()) {
+        !boundary_row_.Empty() ||
+        (delimited_text_parser_->HasUnfinishedTuple() &&
+            (!scan_node_->materialized_slots().empty() ||
+                scan_node_->num_materialized_partition_keys() > 0))) {
       // Missing columns or row delimiter at end of the file is ok, fill the row in.
       char* col = boundary_column_.str().ptr;
       int num_fields = 0;
@@ -297,8 +300,9 @@ Status HdfsTextScanner::FinishScanRange() {
       DCHECK_GE(num_tuples, 0);
       COUNTER_ADD(scan_node_->rows_read_counter(), num_tuples);
       RETURN_IF_ERROR(CommitRows(num_tuples));
-    } else if (delimited_text_parser_->HasUnfinishedTuple() &&
-        scan_node_->materialized_slots().empty()) {
+    } else if (delimited_text_parser_->HasUnfinishedTuple()) {
+      DCHECK(scan_node_->materialized_slots().empty());
+      DCHECK_EQ(scan_node_->num_materialized_partition_keys(), 0);
       // If no fields are materialized we do not update partial_tuple_empty_,
       // boundary_column_, or boundary_row_. However, we still need to handle the case
       // of partial tuple due to missing tuple delimiter at the end of file.
diff --git a/testdata/data/table_missing_columns.csv b/testdata/data/table_missing_columns.csv
new file mode 100644
index 000000000..4676e3379
--- /dev/null
+++ b/testdata/data/table_missing_columns.csv
@@ -0,0 +1,6 @@
+1,true,123.123,2012-10-24 08:55:00
+2,false
+3,false,24453.325,2008-08-22 09:33:21.123
+
+4,false,243423.325,2007-05-12 22:32:21.33454
+5,true,243.325
\ No newline at end of file
diff --git a/testdata/datasets/functional/functional_schema_template.sql b/testdata/datasets/functional/functional_schema_template.sql
index 7865da6d2..f2329b52f 100644
--- a/testdata/datasets/functional/functional_schema_template.sql
+++ b/testdata/datasets/functional/functional_schema_template.sql
@@ -1464,6 +1464,25 @@ hadoop fs -put -f ${IMPALA_HOME}/testdata/data/table_no_newline.csv /test-wareho
 ---- DATASET
 functional
 ---- BASE_TABLE_NAME
+table_no_newline_part
+---- CREATE
+CREATE EXTERNAL TABLE IF NOT EXISTS {db_name}{db_suffix}.{table_name} (
+id INT, col_1 BOOLEAN, col_2 DOUBLE, col_3 TIMESTAMP)
+partitioned by (year INT, month INT)
+row format delimited fields terminated by ','
+LOCATION '/test-warehouse/{table_name}';
+ALTER TABLE {db_name}{db_suffix}.{table_name} ADD PARTITION (year=2015, month=3);
+ALTER TABLE {db_name}{db_suffix}.{table_name} ADD PARTITION (year=2010, month=3);
+---- LOAD
+`hadoop fs -mkdir -p /test-warehouse/table_no_newline_part && \
+hadoop fs -mkdir -p /test-warehouse/table_no_newline_part/year=2010/month=3 && \
+hadoop fs -mkdir -p /test-warehouse/table_no_newline_part/year=2015/month=3 && \
+hadoop fs -put -f ${IMPALA_HOME}/testdata/data/table_no_newline.csv /test-warehouse/table_no_newline_part/year=2010/month=3 && \
+hadoop fs -put -f ${IMPALA_HOME}/testdata/data/table_missing_columns.csv /test-warehouse/table_no_newline_part/year=2015/month=3
+====
+---- DATASET
+functional
+---- BASE_TABLE_NAME
 testescape_16_lf
 ---- CREATE
 CREATE EXTERNAL TABLE IF NOT EXISTS {db_name}{db_suffix}.{table_name} (
diff --git a/testdata/datasets/functional/schema_constraints.csv b/testdata/datasets/functional/schema_constraints.csv
index 0321bc404..b3981919f 100644
--- a/testdata/datasets/functional/schema_constraints.csv
+++ b/testdata/datasets/functional/schema_constraints.csv
@@ -122,6 +122,7 @@ table_name:avro_decimal_tbl, constraint:restrict_to, table_format:avro/snap/bloc
 
 # testescape tables are used for testing text scanner delimiter handling
 table_name:table_no_newline, constraint:restrict_to, table_format:text/none/none
+table_name:table_no_newline_part, constraint:restrict_to, table_format:text/none/none
 table_name:testescape_16_lf, constraint:restrict_to, table_format:text/none/none
 table_name:testescape_16_crlf, constraint:restrict_to, table_format:text/none/none
 table_name:testescape_17_lf, constraint:restrict_to, table_format:text/none/none
diff --git a/testdata/workloads/functional-query/queries/QueryTest/hdfs-text-scan.test b/testdata/workloads/functional-query/queries/QueryTest/hdfs-text-scan.test
index 1b1eaddec..a99049622 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/hdfs-text-scan.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/hdfs-text-scan.test
@@ -12,3 +12,62 @@ select count(col_3) from functional.table_no_newline
 ---- TYPES
 BIGINT
 ====
+---- QUERY
+select count(*) from functional.table_no_newline_part
+---- RESULTS
+11
+---- TYPES
+BIGINT
+====
+---- QUERY
+select count(year) from functional.table_no_newline_part
+---- RESULTS
+11
+---- TYPES
+BIGINT
+====
+---- QUERY
+select count(year) from functional.table_no_newline_part where year=2015
+---- RESULTS
+6
+---- TYPES
+BIGINT
+====
+---- QUERY
+select count(col_3) from functional.table_no_newline_part where year=2015
+---- RESULTS
+3
+---- TYPES
+BIGINT
+====
+---- QUERY
+select distinct year from functional.table_no_newline_part order by year
+---- RESULTS
+2010
+2015
+---- TYPES
+INT
+====
+---- QUERY
+select count(id), count(col_1), count(col_2), count(col_3), count(year), count(month) from functional.table_no_newline_part
+---- RESULTS
+10,10,9,8,11,11
+---- TYPES
+BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT
+====
+---- QUERY
+select year, count(*) from functional.table_no_newline_part group by year order by year
+---- RESULTS
+2010,5
+2015,6
+---- TYPES
+INT,BIGINT
+====
+---- QUERY
+select year, count(col_3) from functional.table_no_newline_part group by year order by year
+---- RESULTS
+2010,5
+2015,3
+---- TYPES
+INT,BIGINT
+====
diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py
index 53ba1ce10..62f5d3b0d 100644
--- a/tests/query_test/test_scanners.py
+++ b/tests/query_test/test_scanners.py
@@ -364,6 +364,10 @@ class TestTextScanRangeLengths(ImpalaTestSuite):
   def test_text_scanner(self, vector):
     vector.get_value('exec_option')['max_scan_range_length'] =\
         vector.get_value('max_scan_range_length')
+    self.execute_query_expect_success(self.client, "drop stats "
+        "functional.table_no_newline_part")
+    self.execute_query_expect_success(self.client, "compute stats "
+        "functional.table_no_newline_part")
     self.run_test_case('QueryTest/hdfs-text-scan', vector)
 
     # Test various escape char cases. We have to check the count(*) result against