diff --git a/be/src/exec/hdfs-parquet-scanner.cc b/be/src/exec/hdfs-parquet-scanner.cc
index d86263fce..11c7d47fc 100644
--- a/be/src/exec/hdfs-parquet-scanner.cc
+++ b/be/src/exec/hdfs-parquet-scanner.cc
@@ -730,38 +730,65 @@ Status HdfsParquetScanner::ProcessSplit() {
 // specific to type and encoding and then inlined into AssembleRows().
 Status HdfsParquetScanner::AssembleRows() {
   assemble_rows_timer_.Start();
-  while (!scan_node_->ReachedLimit() && !context_->cancelled()) {
+  int64_t rows_read = 0;
+  int64_t rows_in_file = file_metadata_.num_rows;
+  while (!scan_node_->ReachedLimit() && !context_->cancelled() &&
+      rows_read < rows_in_file) {
     MemPool* pool;
     Tuple* tuple;
     TupleRow* row;
-    int num_rows = GetMemory(&pool, &tuple, &row);
-    int num_to_commit = 0;
+    int64_t num_rows = std::min(rows_in_file,
+        static_cast<int64_t>(GetMemory(&pool, &tuple, &row)));
+    if (num_rows == 0) return parse_status_;
 
-    for (int i = 0; i < num_rows; ++i) {
-      bool conjuncts_failed = false;
-      InitTuple(template_tuple_, tuple);
-      for (int c = 0; c < column_readers_.size(); ++c) {
-        if (!column_readers_[c]->ReadValue(pool, tuple, &conjuncts_failed)) {
-          assemble_rows_timer_.Stop();
-          // This column is complete and has no more data. This indicates
-          // we are done with this row group.
-          // For correctly formed files, this should be the first column we
-          // are reading.
-          DCHECK(c == 0 || !parse_status_.ok())
+    int num_to_commit = 0;
+    int num_column_readers = column_readers_.size();
+    if (num_column_readers > 0) {
+      for (int i = 0; i < num_rows; ++i) {
+        bool conjuncts_failed = false;
+        InitTuple(template_tuple_, tuple);
+        for (int c = 0; c < num_column_readers; ++c) {
+          if (!column_readers_[c]->ReadValue(pool, tuple, &conjuncts_failed)) {
+            assemble_rows_timer_.Stop();
+            // This column is complete and has no more data. This indicates
+            // we are done with this row group.
+            // For correctly formed files, this should be the first column we
+            // are reading.
+            DCHECK(c == 0 || !parse_status_.ok())
               << "c=" << c << " " << parse_status_.GetErrorMsg();;
-          COUNTER_UPDATE(scan_node_->rows_read_counter(), i);
-          RETURN_IF_ERROR(CommitRows(num_to_commit));
-          return parse_status_;
+            COUNTER_UPDATE(scan_node_->rows_read_counter(), i);
+            RETURN_IF_ERROR(CommitRows(num_to_commit));
+            return parse_status_;
+          }
+        }
+        if (conjuncts_failed) continue;
+        row->SetTuple(scan_node_->tuple_idx(), tuple);
+        if (ExecNode::EvalConjuncts(&(*conjuncts_)[0], num_conjuncts_, row)) {
+          row = next_row(row);
+          tuple = next_tuple(tuple);
+          ++num_to_commit;
         }
       }
-      if (conjuncts_failed) continue;
+    } else {
+      // Special case when there is no data for the accessed column(s) in the file.
+      // This can happen, for example, due to schema evolution (alter table add column).
+      // Since all the tuples are the same, evaluate the conjuncts only for the first tuple.
+      InitTuple(template_tuple_, tuple);
       row->SetTuple(scan_node_->tuple_idx(), tuple);
       if (ExecNode::EvalConjuncts(&(*conjuncts_)[0], num_conjuncts_, row)) {
         row = next_row(row);
         tuple = next_tuple(tuple);
-        ++num_to_commit;
-      }
+
+        for (int i = 1; i < num_rows; ++i) {
+          InitTuple(template_tuple_, tuple);
+          row->SetTuple(scan_node_->tuple_idx(), tuple);
+          row = next_row(row);
+          tuple = next_tuple(tuple);
+        }
+        num_to_commit += num_rows;
+      }
     }
+    rows_read += num_rows;
     COUNTER_UPDATE(scan_node_->rows_read_counter(), num_rows);
     RETURN_IF_ERROR(CommitRows(num_to_commit));
   }
diff --git a/be/src/exec/hdfs-scanner.h b/be/src/exec/hdfs-scanner.h
index 80a366e6b..3cba67677 100644
--- a/be/src/exec/hdfs-scanner.h
+++ b/be/src/exec/hdfs-scanner.h
@@ -320,6 +320,7 @@ class HdfsScanner {
 
   // Initialize a tuple.
   // TODO: only copy over non-null slots.
+  // TODO: InitTuple() is called frequently; avoid the branch, perhaps via templatization.
   void InitTuple(Tuple* template_tuple, Tuple* tuple) {
     if (template_tuple != NULL) {
       memcpy(tuple, template_tuple, tuple_byte_size_);
diff --git a/testdata/workloads/functional-query/queries/QueryTest/alter-table.test b/testdata/workloads/functional-query/queries/QueryTest/alter-table.test
index 12f76fa49..b2ef2e3f9 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/alter-table.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/alter-table.test
@@ -567,3 +567,111 @@ Total,,200,2,regex:.+KB,'0B',''
 ---- TYPES
 INT, INT, BIGINT, BIGINT, STRING, STRING, STRING
 ====
+---- QUERY
+# IMPALA-1016: Testing scanning newly added columns
+DROP TABLE IF EXISTS imp1016
+====
+---- QUERY
+CREATE TABLE imp1016 (string1 string)
+---- RESULTS
+====
+---- QUERY
+INSERT INTO imp1016 VALUES ('test')
+---- RESULTS
+: 1
+====
+---- QUERY
+ALTER TABLE imp1016 ADD COLUMNS (string2 string)
+---- RESULTS
+====
+---- QUERY
+DESCRIBE imp1016
+---- RESULTS
+'string1','string',''
+'string2','string',''
+---- TYPES
+string,string,string
+====
+---- QUERY
+SELECT * FROM imp1016
+---- RESULTS
+'test','NULL'
+---- TYPES
+string,string
+====
+---- QUERY
+SELECT string1 FROM imp1016
+---- RESULTS
+'test'
+---- TYPES
+string
+====
+---- QUERY
+SELECT string2 FROM imp1016
+---- RESULTS
+'NULL'
+---- TYPES
+string
+====
+---- QUERY
+SELECT COUNT(DISTINCT string1) FROM imp1016
+---- RESULTS
+1
+---- TYPES
+bigint
+====
+---- QUERY
+SELECT COUNT(DISTINCT string2) FROM imp1016
+---- RESULTS
+0
+---- TYPES
+bigint
+====
+---- QUERY
+# Create a larger table to test scanning newly added columns
+DROP TABLE IF EXISTS imp1016Large
+====
+---- QUERY
+CREATE TABLE imp1016Large (string1 string)
+---- RESULTS
+====
+---- QUERY
+# There is a codepath that operates on chunks of 1024 tuples, so insert
+# more than 1024 tuples
+INSERT INTO imp1016Large SELECT 'test' FROM functional.alltypes LIMIT 2000
+---- RESULTS
+: 2000
+====
+---- QUERY
+ALTER TABLE imp1016Large ADD COLUMNS (string2 string)
+---- RESULTS
+====
+---- QUERY
+DESCRIBE imp1016Large
+---- RESULTS
+'string1','string',''
+'string2','string',''
+---- TYPES
+string,string,string
+====
+---- QUERY
+SELECT COUNT(string2) FROM imp1016Large
+---- RESULTS
+0
+---- TYPES
+bigint
+====
+---- QUERY
+SELECT COUNT(*), COUNT(DISTINCT string1) FROM imp1016Large
+---- RESULTS
+2000,1
+---- TYPES
+bigint,bigint
+====
+---- QUERY
+SELECT COUNT(*), COUNT(DISTINCT string2) FROM imp1016Large
+---- RESULTS
+2000,0
+---- TYPES
+bigint,bigint
+====
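
A minimal sketch of the templatization suggested by the new TODO in hdfs-scanner.h: the NULL test on template_tuple becomes a compile-time parameter, so each per-row call is branch-free. This assumes the no-template path zero-fills the tuple, as the existing InitTuple() else branch is expected to do; Tuple, InitBatch, and the fixed 16-byte tuple size are illustrative stand-ins, not Impala's actual types or API.

// Sketch: make "is there a template tuple?" a compile-time constant so the
// hot loop carries no branch. Names and sizes here are assumptions.
#include <cstdio>
#include <cstring>

struct Tuple { char slots[16]; };  // stand-in for a tuple_byte_size_ tuple

template <bool HAS_TEMPLATE>
inline void InitTuple(const Tuple* template_tuple, Tuple* tuple) {
  if (HAS_TEMPLATE) {
    memcpy(tuple, template_tuple, sizeof(Tuple));  // copy preset slot values
  } else {
    memset(tuple, 0, sizeof(Tuple));  // assumed no-template path: zero-fill
  }
}

// Hypothetical batch driver: the NULL test happens once per batch, and each
// loop body calls an instantiation whose branch was resolved at compile time.
void InitBatch(const Tuple* template_tuple, Tuple* tuples, int num_tuples) {
  if (template_tuple != NULL) {
    for (int i = 0; i < num_tuples; ++i) InitTuple<true>(template_tuple, &tuples[i]);
  } else {
    for (int i = 0; i < num_tuples; ++i) InitTuple<false>(NULL, &tuples[i]);
  }
}

int main() {
  Tuple tmpl;
  memset(tmpl.slots, 7, sizeof(tmpl.slots));
  Tuple rows[4];
  InitBatch(&tmpl, rows, 4);  // template path: every row copies tmpl
  InitBatch(NULL, rows, 2);   // zero-fill path overwrites the first two rows
  printf("%d %d\n", rows[0].slots[0], rows[3].slots[0]);  // prints "0 7"
  return 0;
}

Whether the saved branch is measurable would need verifying against the scanner's real batch sizes; the sketch only shows the shape of the change the TODO hints at.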