From d1c263402ee893ef4bca72ff4474d35a59d0aa11 Mon Sep 17 00:00:00 2001
From: Juan Yu <jyu@cloudera.com>
Date: Fri, 1 May 2015 12:10:19 -0700
Subject: [PATCH] IMPALA-1973: Fixing crash when uninitialized, empty row is
 added in HdfsTextScanner

This patch fixes an issue where an uninitialized, empty row is erroneously
added to the row batch. The uninitialized data in this row later leads to
a crash when the null byte is checked together with the offsets (which
contain garbage).

The fix is to check not only the number of materialized columns but also
the number of materialized partition key columns. The empty row is added
only if both are zero and the parser has an unfinished tuple.

To accommodate the last row, check in FinishScanRange() whether there is an
unfinished tuple with materialized slots or materialized partition keys, and
write the fields if necessary.

Change-Id: I2808cc228e62d048d917d3a6352d869d117597ab
(cherry picked from commit c1795a8b40d10fbb32d9051a0e7de5ebffc8a6bd)
Reviewed-on: http://gerrit.cloudera.org:8080/364
Reviewed-by: Juan Yu <jyu@cloudera.com>
Tested-by: Internal Jenkins
---
 be/src/exec/hdfs-text-scanner.cc              | 10 +++-
 testdata/data/table_missing_columns.csv       |  6 ++
 .../functional/functional_schema_template.sql | 19 ++++++
 .../functional/schema_constraints.csv         |  1 +
 .../queries/QueryTest/hdfs-text-scan.test     | 59 +++++++++++++++++++
 tests/query_test/test_scanners.py             |  4 ++
 6 files changed, 96 insertions(+), 3 deletions(-)
 create mode 100644 testdata/data/table_missing_columns.csv

diff --git a/be/src/exec/hdfs-text-scanner.cc b/be/src/exec/hdfs-text-scanner.cc
index 9977256de..349d56352 100644
--- a/be/src/exec/hdfs-text-scanner.cc
+++ b/be/src/exec/hdfs-text-scanner.cc
@@ -278,7 +278,10 @@ Status HdfsTextScanner::FinishScanRange() {
         }
         if (state_->abort_on_error()) return Status(ss.str());
       } else if (!partial_tuple_empty_ || !boundary_column_.Empty() ||
-          !boundary_row_.Empty()) {
+          !boundary_row_.Empty() ||
+          (delimited_text_parser_->HasUnfinishedTuple() &&
+              (!scan_node_->materialized_slots().empty() ||
+                  scan_node_->num_materialized_partition_keys() > 0))) {
         // Missing columns or row delimiter at end of the file is ok, fill the row in.
         char* col = boundary_column_.str().ptr;
         int num_fields = 0;
@@ -297,8 +300,9 @@ Status HdfsTextScanner::FinishScanRange() {
         DCHECK_GE(num_tuples, 0);
         COUNTER_ADD(scan_node_->rows_read_counter(), num_tuples);
         RETURN_IF_ERROR(CommitRows(num_tuples));
-      } else if (delimited_text_parser_->HasUnfinishedTuple() &&
-          scan_node_->materialized_slots().empty()) {
+      } else if (delimited_text_parser_->HasUnfinishedTuple()) {
+        DCHECK(scan_node_->materialized_slots().empty());
+        DCHECK_EQ(scan_node_->num_materialized_partition_keys(), 0);
         // If no fields are materialized we do not update partial_tuple_empty_,
         // boundary_column_, or boundary_row_. However, we still need to handle the case
         // of partial tuple due to missing tuple delimiter at the end of file.
diff --git a/testdata/data/table_missing_columns.csv b/testdata/data/table_missing_columns.csv
new file mode 100644
index 000000000..4676e3379
--- /dev/null
+++ b/testdata/data/table_missing_columns.csv
@@ -0,0 +1,6 @@
+1,true,123.123,2012-10-24 08:55:00
+2,false
+3,false,24453.325,2008-08-22 09:33:21.123
+
+4,false,243423.325,2007-05-12 22:32:21.33454
+5,true,243.325
\ No newline at end of file
diff --git a/testdata/datasets/functional/functional_schema_template.sql b/testdata/datasets/functional/functional_schema_template.sql
index 7865da6d2..f2329b52f 100644
--- a/testdata/datasets/functional/functional_schema_template.sql
+++ b/testdata/datasets/functional/functional_schema_template.sql
@@ -1464,6 +1464,25 @@ hadoop fs -put -f ${IMPALA_HOME}/testdata/data/table_no_newline.csv /test-wareho
 ---- DATASET
 functional
 ---- BASE_TABLE_NAME
+table_no_newline_part
+---- CREATE
+CREATE EXTERNAL TABLE IF NOT EXISTS {db_name}{db_suffix}.{table_name} (
+id INT, col_1 BOOLEAN, col_2 DOUBLE, col_3 TIMESTAMP)
+partitioned by (year INT, month INT)
+row format delimited fields terminated by ','
+LOCATION '/test-warehouse/{table_name}';
+ALTER TABLE {db_name}{db_suffix}.{table_name} ADD PARTITION (year=2015, month=3);
+ALTER TABLE {db_name}{db_suffix}.{table_name} ADD PARTITION (year=2010, month=3);
+---- LOAD
+`hadoop fs -mkdir -p /test-warehouse/table_no_newline_part && \
+hadoop fs -mkdir -p /test-warehouse/table_no_newline_part/year=2010/month=3 && \
+hadoop fs -mkdir -p /test-warehouse/table_no_newline_part/year=2015/month=3 && \
+hadoop fs -put -f ${IMPALA_HOME}/testdata/data/table_no_newline.csv /test-warehouse/table_no_newline_part/year=2010/month=3 && \
+hadoop fs -put -f ${IMPALA_HOME}/testdata/data/table_missing_columns.csv /test-warehouse/table_no_newline_part/year=2015/month=3
+====
+---- DATASET
+functional
+---- BASE_TABLE_NAME
 testescape_16_lf
 ---- CREATE
 CREATE EXTERNAL TABLE IF NOT EXISTS {db_name}{db_suffix}.{table_name} (
diff --git a/testdata/datasets/functional/schema_constraints.csv b/testdata/datasets/functional/schema_constraints.csv
index 0321bc404..b3981919f 100644
--- a/testdata/datasets/functional/schema_constraints.csv
+++ b/testdata/datasets/functional/schema_constraints.csv
@@ -122,6 +122,7 @@ table_name:avro_decimal_tbl, constraint:restrict_to, table_format:avro/snap/bloc
 
 # testescape tables are used for testing text scanner delimiter handling
 table_name:table_no_newline, constraint:restrict_to, table_format:text/none/none
+table_name:table_no_newline_part, constraint:restrict_to, table_format:text/none/none
 table_name:testescape_16_lf, constraint:restrict_to, table_format:text/none/none
 table_name:testescape_16_crlf, constraint:restrict_to, table_format:text/none/none
 table_name:testescape_17_lf, constraint:restrict_to, table_format:text/none/none
diff --git a/testdata/workloads/functional-query/queries/QueryTest/hdfs-text-scan.test b/testdata/workloads/functional-query/queries/QueryTest/hdfs-text-scan.test
index 1b1eaddec..a99049622 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/hdfs-text-scan.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/hdfs-text-scan.test
@@ -12,3 +12,62 @@ select count(col_3) from functional.table_no_newline
 ---- TYPES
 BIGINT
 ====
+---- QUERY
+select count(*) from functional.table_no_newline_part
+---- RESULTS
+11
+---- TYPES
+BIGINT
+====
+---- QUERY
+select count(year) from functional.table_no_newline_part
+---- RESULTS
+11
+---- TYPES
+BIGINT
+====
+---- QUERY
+select count(year) from functional.table_no_newline_part where year=2015
+---- RESULTS
+6
+---- TYPES
+BIGINT
+====
+---- QUERY
+select count(col_3) from functional.table_no_newline_part where year=2015
+---- RESULTS
+3
+---- TYPES
+BIGINT
+====
+---- QUERY
+select distinct year from functional.table_no_newline_part order by year
+---- RESULTS
+2010
+2015
+---- TYPES
+INT
+====
+---- QUERY
+select count(id), count(col_1), count(col_2), count(col_3), count(year), count(month) from functional.table_no_newline_part
+---- RESULTS
+10,10,9,8,11,11
+---- TYPES
+BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT
+====
+---- QUERY
+select year, count(*) from functional.table_no_newline_part group by year order by year
+---- RESULTS
+2010,5
+2015,6
+---- TYPES
+INT,BIGINT
+====
+---- QUERY
+select year, count(col_3) from functional.table_no_newline_part group by year order by year
+---- RESULTS
+2010,5
+2015,3
+---- TYPES
+INT,BIGINT
+====
diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py
index 53ba1ce10..62f5d3b0d 100644
--- a/tests/query_test/test_scanners.py
+++ b/tests/query_test/test_scanners.py
@@ -364,6 +364,10 @@ class TestTextScanRangeLengths(ImpalaTestSuite):
   def test_text_scanner(self, vector):
     vector.get_value('exec_option')['max_scan_range_length'] =\
         vector.get_value('max_scan_range_length')
+    self.execute_query_expect_success(self.client, "drop stats "
+        "functional.table_no_newline_part")
+    self.execute_query_expect_success(self.client, "compute stats "
+        "functional.table_no_newline_part")
     self.run_test_case('QueryTest/hdfs-text-scan', vector)
 
     # Test various escape char cases. We have to check the count(*) result against