From 0c8b2d3dbeff6440cdb0223680f77f8daa10a4a7 Mon Sep 17 00:00:00 2001 From: Lars Volker Date: Sat, 13 May 2017 20:27:50 +0200 Subject: [PATCH] IMPALA-5144: Remove sortby() hint The sortby() hint is superseded by the SORT BY SQL clause, which has been introduced in IMPALA-4166. This change removes the hint. Change-Id: I83e1cd6fa7039035973676322deefbce00d3f594 Reviewed-on: http://gerrit.cloudera.org:8080/6885 Reviewed-by: Lars Volker Tested-by: Impala Public Jenkins --- be/src/exec/hdfs-table-sink.h | 4 +- common/thrift/DataSinks.thrift | 4 +- .../apache/impala/analysis/InsertStmt.java | 49 +------------ .../apache/impala/planner/HdfsTableSink.java | 4 +- .../org/apache/impala/planner/Planner.java | 17 +++-- .../org/apache/impala/planner/TableSink.java | 6 +- .../impala/analysis/AnalyzeDDLTest.java | 4 +- .../impala/analysis/AnalyzeStmtsTest.java | 33 --------- .../apache/impala/analysis/ParserTest.java | 37 +++++----- .../org/apache/impala/analysis/ToSqlTest.java | 12 ++- .../queries/PlannerTest/insert-sort-by.test | 51 ------------- .../queries/PlannerTest/insert.test | 73 ------------------- .../queries/QueryTest/insert.test | 26 ------- tests/query_test/test_insert_parquet.py | 21 +++--- 14 files changed, 62 insertions(+), 279 deletions(-) diff --git a/be/src/exec/hdfs-table-sink.h b/be/src/exec/hdfs-table-sink.h index f6db86855..b928a2083 100644 --- a/be/src/exec/hdfs-table-sink.h +++ b/be/src/exec/hdfs-table-sink.h @@ -276,8 +276,8 @@ class HdfsTableSink : public DataSink { bool input_is_clustered_; // Stores the indices into the list of non-clustering columns of the target table that - // are mentioned in the 'sortby()' hint. This is used in the backend to populate the - // RowGroup::sorting_columns list in parquet files. + // are stored in the 'sort.columns' table property. This is used in the backend to // populate the RowGroup::sorting_columns list in parquet files. 
const std::vector& sort_columns_; /// Stores the current partition during clustered inserts across subsequent row batches. diff --git a/common/thrift/DataSinks.thrift b/common/thrift/DataSinks.thrift index cce797102..1b26c92b8 100644 --- a/common/thrift/DataSinks.thrift +++ b/common/thrift/DataSinks.thrift @@ -72,8 +72,8 @@ struct THdfsTableSink { 4: required bool input_is_clustered // Stores the indices into the list of non-clustering columns of the target table that - // are mentioned in the 'sortby()' hint. This is used in the backend to populate the - // RowGroup::sorting_columns list in parquet files. + // are stored in the 'sort.columns' table property. This is used in the backend to + // populate the RowGroup::sorting_columns list in parquet files. 5: optional list sort_columns } diff --git a/fe/src/main/java/org/apache/impala/analysis/InsertStmt.java b/fe/src/main/java/org/apache/impala/analysis/InsertStmt.java index 412586cb7..c25f484d8 100644 --- a/fe/src/main/java/org/apache/impala/analysis/InsertStmt.java +++ b/fe/src/main/java/org/apache/impala/analysis/InsertStmt.java @@ -136,7 +136,7 @@ public class InsertStmt extends StatementBase { private boolean hasNoClusteredHint_ = false; // For every column of the target table that is referenced in the optional - // 'sort.columns' table property or in the optional 'sortby()' hint, this list will + // 'sort.columns' table property, this list will // contain the corresponding result expr from 'resultExprs_'. Before insertion, all rows // will be sorted by these exprs. If the list is empty, no additional sorting by // non-partitioning columns will be performed. The column list must not contain @@ -144,11 +144,8 @@ public class InsertStmt extends StatementBase { private List sortExprs_ = Lists.newArrayList(); // Stores the indices into the list of non-clustering columns of the target table that - // are mentioned in the 'sort.columns' table property or the 'sortby()' hint. 
This is + // are mentioned in the 'sort.columns' table property. This is // sent to the backend to populate the RowGroup::sorting_columns list in parquet files. - // Sort columns supersede the sortby() hint, which we will remove in a subsequent change - // (IMPALA-5144). Until then, it is possible to specify sort columns using both ways at - // the same time and the column lists will be concatenated. private List sortColumns_ = Lists.newArrayList(); // Output expressions that produce the final results to write to the target table. May @@ -820,9 +817,6 @@ public class InsertStmt extends StatementBase { } else if (hint.is("NOCLUSTERED")) { hasNoClusteredHint_ = true; analyzer.setHasPlanHints(); - } else if (hint.is("SORTBY")) { - analyzeSortByHint(hint); - analyzer.setHasPlanHints(); } else { analyzer.addWarning("INSERT hint not recognized: " + hint); } @@ -836,45 +830,6 @@ public class InsertStmt extends StatementBase { } } - private void analyzeSortByHint(PlanHint hint) throws AnalysisException { - // HBase and Kudu tables don't support insert hints at all (must be enforced by the caller). - Preconditions.checkState(!(table_ instanceof HBaseTable || table_ instanceof KuduTable)); - - List columnNames = hint.getArgs(); - Preconditions.checkState(!columnNames.isEmpty()); - for (String columnName: columnNames) { - // Make sure it's not an Hdfs partition column. - for (Column tableColumn: table_.getClusteringColumns()) { - if (tableColumn.getName().equals(columnName)) { - throw new AnalysisException(String.format("SORTBY hint column list must " + - "not contain Hdfs partition column: '%s'", columnName)); - } - } - - // Find the matching column in the target table's column list (by name) and store - // the corresponding result expr in sortExprs_. - boolean foundColumn = false; - List columns = table_.getNonClusteringColumns(); - if (!columns.isEmpty()) { - // We need to make a copy to make the sortColumns_ list mutable. 
- // TODO: Remove this when removing the sortby() hint (IMPALA-5157). - sortColumns_ = Lists.newArrayList(sortColumns_); - } - for (int i = 0; i < columns.size(); ++i) { - if (columns.get(i).getName().equals(columnName)) { - sortExprs_.add(resultExprs_.get(i)); - sortColumns_.add(i); - foundColumn = true; - break; - } - } - if (!foundColumn) { - throw new AnalysisException(String.format("Could not find SORTBY hint column " + - "'%s' in table.", columnName)); - } - } - } - @Override public ArrayList getResultExprs() { return resultExprs_; } diff --git a/fe/src/main/java/org/apache/impala/planner/HdfsTableSink.java b/fe/src/main/java/org/apache/impala/planner/HdfsTableSink.java index 996388f45..5ff8a1b94 100644 --- a/fe/src/main/java/org/apache/impala/planner/HdfsTableSink.java +++ b/fe/src/main/java/org/apache/impala/planner/HdfsTableSink.java @@ -53,8 +53,8 @@ public class HdfsTableSink extends TableSink { protected final boolean inputIsClustered_; // Stores the indices into the list of non-clustering columns of the target table that - // are mentioned in the 'sortby()' hint. This is sent to the backend to populate the - // RowGroup::sorting_columns list in parquet files. + // are stored in the 'sort.columns' table property. This is sent to the backend to + // populate the RowGroup::sorting_columns list in parquet files. private List sortColumns_ = Lists.newArrayList(); public HdfsTableSink(Table targetTable, List partitionKeyExprs, diff --git a/fe/src/main/java/org/apache/impala/planner/Planner.java b/fe/src/main/java/org/apache/impala/planner/Planner.java index d6b0814ad..b6fa54aa9 100644 --- a/fe/src/main/java/org/apache/impala/planner/Planner.java +++ b/fe/src/main/java/org/apache/impala/planner/Planner.java @@ -484,14 +484,15 @@ public class Planner { } /** - * Insert a sort node on top of the plan, depending on the clustered/noclustered/sortby - * plan hint. 
If clustering is enabled in insertStmt or additional columns are specified - * in the 'sort.columns' table property, then the ordering columns will start with the - * clustering columns (key columns for Kudu tables), so that partitions can be written - * sequentially in the table sink. Any additional non-clustering columns specified by - * the 'sort.columns' property will be added to the ordering columns and after any - * clustering columns. If no clustering is requested and the table does not contain - * columns in the 'sort.columns' property, then no sort node will be added to the plan. + * Insert a sort node on top of the plan, depending on the clustered/noclustered + * plan hint and on the 'sort.columns' table property. If clustering is enabled in + * insertStmt or additional columns are specified in the 'sort.columns' table property, + * then the ordering columns will start with the clustering columns (key columns for + * Kudu tables), so that partitions can be written sequentially in the table sink. Any + * additional non-clustering columns specified by the 'sort.columns' property will be + * added to the ordering columns and after any clustering columns. If no clustering is + * requested and the table does not contain columns in the 'sort.columns' property, then + * no sort node will be added to the plan. */ public void createPreInsertSort(InsertStmt insertStmt, PlanFragment inputFragment, Analyzer analyzer) throws ImpalaException { diff --git a/fe/src/main/java/org/apache/impala/planner/TableSink.java b/fe/src/main/java/org/apache/impala/planner/TableSink.java index 1c27360ec..4c4b35ef6 100644 --- a/fe/src/main/java/org/apache/impala/planner/TableSink.java +++ b/fe/src/main/java/org/apache/impala/planner/TableSink.java @@ -88,7 +88,7 @@ public abstract class TableSink extends DataSink { * All parameters must be non-null, the lists in particular need to be empty if they * don't make sense for a certain table type. 
* For HDFS tables 'sortColumns' specifies the indices into the list of non-clustering - * columns of the target table that are mentioned in the 'sortby()' hint. + * columns of the target table that are stored in the 'sort.columns' table property. */ public static TableSink create(Table table, Op sinkAction, List partitionKeyExprs, List referencedColumns, @@ -112,14 +112,14 @@ public abstract class TableSink extends DataSink { Preconditions.checkState(overwrite == false); // Referenced columns don't make sense for an HBase table. Preconditions.checkState(referencedColumns.isEmpty()); - // sortby() hint is not supported for HBase tables. + // Sort columns are not supported for HBase tables. Preconditions.checkState(sortColumns.isEmpty()); // Create the HBaseTableSink and return it. return new HBaseTableSink(table); } else if (table instanceof KuduTable) { // Kudu doesn't have a way to perform INSERT OVERWRITE. Preconditions.checkState(overwrite == false); - // sortby() hint is not supported for Kudu tables. + // Sort columns are not supported for Kudu tables. Preconditions.checkState(sortColumns.isEmpty()); return new KuduTableSink(table, sinkAction, referencedColumns); } else { diff --git a/fe/src/test/java/org/apache/impala/analysis/AnalyzeDDLTest.java b/fe/src/test/java/org/apache/impala/analysis/AnalyzeDDLTest.java index 085877404..0fe60a385 100644 --- a/fe/src/test/java/org/apache/impala/analysis/AnalyzeDDLTest.java +++ b/fe/src/test/java/org/apache/impala/analysis/AnalyzeDDLTest.java @@ -1952,14 +1952,14 @@ public class AnalyzeDDLTest extends FrontendTestBase { "tblproperties ('sort.columns'='i')", "Table definition must not contain the " + "sort.columns table property. Use SORT BY (...) instead."); - // Column in sortby hint must exist. + // Column in sort by list must exist. 
AnalysisError("create table functional.new_table (i int) sort by (j)", "Could not " + "find SORT BY column 'j' in table."); // Partitioned HDFS table AnalyzesOk("create table functional.new_table (i int) PARTITIONED BY (d decimal)" + "SORT BY (i)"); - // Column in sortby hint must not be a Hdfs partition column. + // Column in sort by list must not be a Hdfs partition column. AnalysisError("create table functional.new_table (i int) PARTITIONED BY (d decimal)" + "SORT BY (d)", "SORT BY column list must not contain partition column: 'd'"); } diff --git a/fe/src/test/java/org/apache/impala/analysis/AnalyzeStmtsTest.java b/fe/src/test/java/org/apache/impala/analysis/AnalyzeStmtsTest.java index e4a688cdf..35acda967 100644 --- a/fe/src/test/java/org/apache/impala/analysis/AnalyzeStmtsTest.java +++ b/fe/src/test/java/org/apache/impala/analysis/AnalyzeStmtsTest.java @@ -1779,39 +1779,6 @@ public class AnalyzeStmtsTest extends AnalyzerTest { "functional.alltypes", prefix, suffix), "Insert statement has 'noclustered' hint, but table has 'sort.columns' " + "property. The 'noclustered' hint will be ignored."); - - - // Below are tests for hints that are not supported by the legacy syntax. 
- if (prefix == "[") continue; - - // Tests for sortby hint - AnalyzesOk(String.format("insert into functional.alltypessmall " + - "partition (year, month) %ssortby(int_col)%s select * from functional.alltypes", - prefix, suffix)); - AnalyzesOk(String.format("insert into functional.alltypessmall " + - "partition (year, month) %sshuffle,clustered,sortby(int_col)%s select * from " + - "functional.alltypes", prefix, suffix)); - AnalyzesOk(String.format("insert into functional.alltypessmall " + - "partition (year, month) %ssortby(int_col, bool_col)%s select * from " + - "functional.alltypes", prefix, suffix)); - AnalyzesOk(String.format("insert into functional.alltypessmall " + - "partition (year, month) %sshuffle,clustered,sortby(int_col,bool_col)%s " + - "select * from functional.alltypes", prefix, suffix)); - AnalyzesOk(String.format("insert into functional.alltypessmall " + - "partition (year, month) %sshuffle,sortby(int_col,bool_col),clustered%s " + - "select * from functional.alltypes", prefix, suffix)); - AnalyzesOk(String.format("insert into functional.alltypessmall " + - "partition (year, month) %ssortby(int_col,bool_col),shuffle,clustered%s " + - "select * from functional.alltypes", prefix, suffix)); - // Column in sortby hint must exist. - AnalysisError(String.format("insert into functional.alltypessmall " + - "partition (year, month) %ssortby(foo)%s select * from functional.alltypes", - prefix, suffix), "Could not find SORTBY hint column 'foo' in table."); - // Column in sortby hint must not be a Hdfs partition column. - AnalysisError(String.format("insert into functional.alltypessmall " + - "partition (year, month) %ssortby(year)%s select * from " + - "functional.alltypes", prefix, suffix), - "SORTBY hint column list must not contain Hdfs partition column: 'year'"); } // Multiple non-conflicting hints and case insensitivity of hints. 
diff --git a/fe/src/test/java/org/apache/impala/analysis/ParserTest.java b/fe/src/test/java/org/apache/impala/analysis/ParserTest.java index d35b9b312..9fb1e2a04 100644 --- a/fe/src/test/java/org/apache/impala/analysis/ParserTest.java +++ b/fe/src/test/java/org/apache/impala/analysis/ParserTest.java @@ -209,13 +209,14 @@ public class ParserTest extends FrontendTestBase { ParsesOk("/* select 1; */ select 1"); ParsesOk("/** select 1; */ select 1"); ParsesOk("/* select */ select 1 /* 1 */"); - ParsesOk("select 1 /* sortby(() */"); - // Empty columns list in sortby hint - ParserError("select 1 /*+ sortby() */"); + // Test hint with arguments + ParsesOk("select 1 /* hint_with_args(() */"); + // Empty argument list in hint_with_args hint is not allowed + ParserError("select 1 /*+ hint_with_args() */"); // Mismatching parentheses - ParserError("select 1 /*+ sortby(() */"); - ParserError("select 1 /*+ sortby(a) \n"); - ParserError("select 1 --+ sortby(a) */\n from t"); + ParserError("select 1 /*+ hint_with_args(() */"); + ParserError("select 1 /*+ hint_with_args(a) \n"); + ParserError("select 1 --+ hint_with_args(a) */\n from t"); } /** @@ -237,9 +238,10 @@ public class ParserTest extends FrontendTestBase { ParserError("-- baz /*\nselect 1*/"); ParsesOk("select -- blah\n 1"); ParsesOk("select -- select 1\n 1"); - ParsesOk("select 1 -- sortby(()"); + // Test hint with arguments + ParsesOk("select 1 -- hint_with_args(()"); // Mismatching parentheses - ParserError("select 1 -- +sortby(()\n"); + ParserError("select 1 -- +hint_with_args(()\n"); } /** @@ -460,19 +462,20 @@ public class ParserTest extends FrontendTestBase { // Tests for hints with arguments. 
TestInsertHints(String.format( - "insert into t %ssortby(a)%s select * from t", prefix, suffix), - "sortby(a)"); + "insert into t %shint_with_args(a)%s select * from t", prefix, suffix), + "hint_with_args(a)"); TestInsertHints(String.format( - "insert into t %sclustered,shuffle,sortby(a)%s select * from t", prefix, - suffix), "clustered", "shuffle", "sortby(a)"); + "insert into t %sclustered,shuffle,hint_with_args(a)%s select * from t", prefix, + suffix), "clustered", "shuffle", "hint_with_args(a)"); TestInsertHints(String.format( - "insert into t %ssortby(a,b)%s select * from t", prefix, suffix), - "sortby(a,b)"); + "insert into t %shint_with_args(a,b)%s select * from t", prefix, suffix), + "hint_with_args(a,b)"); TestInsertHints(String.format( - "insert into t %ssortby(a , b)%s select * from t", prefix, suffix), - "sortby(a,b)"); + "insert into t %shint_with_args(a , b)%s select * from t", prefix, suffix), + "hint_with_args(a,b)"); ParserError(String.format( - "insert into t %ssortby( a , , ,,, b )%s select * from t", prefix, suffix)); + "insert into t %shint_with_args( a , , ,,, b )%s select * from t", + prefix, suffix)); } // No "+" at the beginning so the comment is not recognized as a hint. TestJoinHints("select * from functional.alltypes a join /* comment */" + diff --git a/fe/src/test/java/org/apache/impala/analysis/ToSqlTest.java b/fe/src/test/java/org/apache/impala/analysis/ToSqlTest.java index 26def5b45..cd14d252b 100644 --- a/fe/src/test/java/org/apache/impala/analysis/ToSqlTest.java +++ b/fe/src/test/java/org/apache/impala/analysis/ToSqlTest.java @@ -505,11 +505,19 @@ public class ToSqlTest extends FrontendTestBase { // Insert hint. 
testToSql(String.format( "insert into functional.alltypes(int_col, bool_col) " + - "partition(year, month) %snoshuffle,sortby(int_col)%s " + + "partition(year, month) %snoshuffle%s " + "select int_col, bool_col, year, month from functional.alltypes", prefix, suffix), "INSERT INTO TABLE functional.alltypes(int_col, bool_col) " + - "PARTITION (year, month) \n-- +noshuffle,sortby(int_col)\n " + + "PARTITION (year, month) \n-- +noshuffle\n " + + "SELECT int_col, bool_col, year, month FROM functional.alltypes"); + testToSql(String.format( + "insert into functional.alltypes(int_col, bool_col) " + + "partition(year, month) %sshuffle,clustered%s " + + "select int_col, bool_col, year, month from functional.alltypes", + prefix, suffix), + "INSERT INTO TABLE functional.alltypes(int_col, bool_col) " + + "PARTITION (year, month) \n-- +shuffle,clustered\n " + "SELECT int_col, bool_col, year, month FROM functional.alltypes"); // Table hint diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/insert-sort-by.test b/testdata/workloads/functional-planner/queries/PlannerTest/insert-sort-by.test index 2a9d272c0..f2ecb2c57 100644 --- a/testdata/workloads/functional-planner/queries/PlannerTest/insert-sort-by.test +++ b/testdata/workloads/functional-planner/queries/PlannerTest/insert-sort-by.test @@ -403,54 +403,3 @@ WRITE TO HDFS [test_sort_by.t_nopart, OVERWRITE=false] partitions=1/0 files=0 size=0B runtime filters: RF000 -> id ==== -# Test that using both the sortby hint and sort columns work. -# TODO: Remove this in IMPALA-5157. 
-insert into table test_sort_by.t partition(year, month) /*+ shuffle,sortby(id) */ -select id, int_col, bool_col, year, month from functional.alltypes ----- PLAN -WRITE TO HDFS [test_sort_by.t, OVERWRITE=false, PARTITION-KEYS=(year,month)] -| partitions=24 -| -01:SORT -| order by: year ASC NULLS LAST, month ASC NULLS LAST, int_col ASC NULLS LAST, bool_col ASC NULLS LAST, id ASC NULLS LAST -| -00:SCAN HDFS [functional.alltypes] - partitions=24/24 files=24 size=478.45KB ----- DISTRIBUTEDPLAN -WRITE TO HDFS [test_sort_by.t, OVERWRITE=false, PARTITION-KEYS=(year,month)] -| partitions=24 -| -02:SORT -| order by: year ASC NULLS LAST, month ASC NULLS LAST, int_col ASC NULLS LAST, bool_col ASC NULLS LAST, id ASC NULLS LAST -| -01:EXCHANGE [HASH(year,month)] -| -00:SCAN HDFS [functional.alltypes] - partitions=24/24 files=24 size=478.45KB -==== -# Test that using both the sortby hint and sort columns work when the column lists -# overlap. -# TODO: Remove this in IMPALA-5157. -insert into table test_sort_by.t partition(year, month) /*+ shuffle,sortby(id, bool_col) */ -select id, int_col, bool_col, year, month from functional.alltypes ----- PLAN -WRITE TO HDFS [test_sort_by.t, OVERWRITE=false, PARTITION-KEYS=(year,month)] -| partitions=24 -| -01:SORT -| order by: year ASC NULLS LAST, month ASC NULLS LAST, int_col ASC NULLS LAST, bool_col ASC NULLS LAST, id ASC NULLS LAST, bool_col ASC NULLS LAST -| -00:SCAN HDFS [functional.alltypes] - partitions=24/24 files=24 size=478.45KB ----- DISTRIBUTEDPLAN -WRITE TO HDFS [test_sort_by.t, OVERWRITE=false, PARTITION-KEYS=(year,month)] -| partitions=24 -| -02:SORT -| order by: year ASC NULLS LAST, month ASC NULLS LAST, int_col ASC NULLS LAST, bool_col ASC NULLS LAST, id ASC NULLS LAST, bool_col ASC NULLS LAST -| -01:EXCHANGE [HASH(year,month)] -| -00:SCAN HDFS [functional.alltypes] - partitions=24/24 files=24 size=478.45KB -==== diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/insert.test 
b/testdata/workloads/functional-planner/queries/PlannerTest/insert.test index e3edae030..bdbc92fba 100644 --- a/testdata/workloads/functional-planner/queries/PlannerTest/insert.test +++ b/testdata/workloads/functional-planner/queries/PlannerTest/insert.test @@ -705,76 +705,3 @@ WRITE TO HDFS [functional.alltypesnopart, OVERWRITE=false] 00:SCAN HDFS [functional.alltypesnopart] partitions=1/1 files=0 size=0B ==== -# IMPALA-4163: sortby hint in insert statement adds sort node. -insert into table functional.alltypes partition(year, month) - /*+ sortby(int_col, bool_col),shuffle */ - select * from functional.alltypes ----- PLAN -WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(year,month)] -| partitions=24 -| -01:SORT -| order by: year ASC NULLS LAST, month ASC NULLS LAST, int_col ASC NULLS LAST, bool_col ASC NULLS LAST -| -00:SCAN HDFS [functional.alltypes] - partitions=24/24 files=24 size=478.45KB ----- DISTRIBUTEDPLAN -WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(year,month)] -| partitions=24 -| -02:SORT -| order by: year ASC NULLS LAST, month ASC NULLS LAST, int_col ASC NULLS LAST, bool_col ASC NULLS LAST -| -01:EXCHANGE [HASH(functional.alltypes.year,functional.alltypes.month)] -| -00:SCAN HDFS [functional.alltypes] - partitions=24/24 files=24 size=478.45KB -==== -# IMPALA-4163: sortby hint in insert statement with noshuffle adds sort node. 
-insert into table functional.alltypes partition(year, month) - /*+ sortby(int_col, bool_col),noshuffle */ - select * from functional.alltypes ----- PLAN -WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(year,month)] -| partitions=24 -| -01:SORT -| order by: year ASC NULLS LAST, month ASC NULLS LAST, int_col ASC NULLS LAST, bool_col ASC NULLS LAST -| -00:SCAN HDFS [functional.alltypes] - partitions=24/24 files=24 size=478.45KB ----- DISTRIBUTEDPLAN -WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(year,month)] -| partitions=24 -| -01:SORT -| order by: year ASC NULLS LAST, month ASC NULLS LAST, int_col ASC NULLS LAST, bool_col ASC NULLS LAST -| -00:SCAN HDFS [functional.alltypes] - partitions=24/24 files=24 size=478.45KB -==== -# IMPALA-4163: sortby hint in insert statement adds ordering columns to clustering sort. -insert into table functional.alltypes partition(year, month) - /*+ sortby(int_col, bool_col),clustered */ - select * from functional.alltypes ----- PLAN -WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(year,month)] -| partitions=24 -| -01:SORT -| order by: year ASC NULLS LAST, month ASC NULLS LAST, int_col ASC NULLS LAST, bool_col ASC NULLS LAST -| -00:SCAN HDFS [functional.alltypes] - partitions=24/24 files=24 size=478.45KB ----- DISTRIBUTEDPLAN -WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(year,month)] -| partitions=24 -| -02:SORT -| order by: year ASC NULLS LAST, month ASC NULLS LAST, int_col ASC NULLS LAST, bool_col ASC NULLS LAST -| -01:EXCHANGE [HASH(functional.alltypes.year,functional.alltypes.month)] -| -00:SCAN HDFS [functional.alltypes] - partitions=24/24 files=24 size=478.45KB -==== diff --git a/testdata/workloads/functional-query/queries/QueryTest/insert.test b/testdata/workloads/functional-query/queries/QueryTest/insert.test index 9dfaf3469..5601b3513 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/insert.test +++ 
b/testdata/workloads/functional-query/queries/QueryTest/insert.test @@ -945,29 +945,3 @@ RESET alltypesnopart_insert ---- RESULTS : 100 ==== ----- QUERY -# IMPALA-4163: insert into table sortby() plan hint -insert into table alltypesinsert -partition (year, month) /*+ clustered,shuffle,sortby(int_col, bool_col) */ -select * from alltypestiny; ----- SETUP -DROP PARTITIONS alltypesinsert -RESET alltypesinsert ----- RESULTS -year=2009/month=1/: 2 -year=2009/month=2/: 2 -year=2009/month=3/: 2 -year=2009/month=4/: 2 -==== ----- QUERY -# IMPALA-4163: insert into table sortby() plan hint -insert into table alltypesnopart_insert -/*+ clustered,shuffle,sortby(int_col, bool_col) */ -select id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, -double_col, date_string_col, string_col, timestamp_col from alltypestiny; ----- SETUP -DROP PARTITIONS alltypesinsert -RESET alltypesinsert ----- RESULTS -: 8 -==== diff --git a/tests/query_test/test_insert_parquet.py b/tests/query_test/test_insert_parquet.py index 447778608..9d9eb7b2b 100644 --- a/tests/query_test/test_insert_parquet.py +++ b/tests/query_test/test_insert_parquet.py @@ -232,8 +232,8 @@ class TestHdfsParquetTableWriter(ImpalaTestSuite): rmtree(tmp_dir) def test_sorting_columns(self, vector, unique_database, tmpdir): - """Tests that RowGroup::sorting_columns gets populated when specifying a sortby() - insert hint.""" + """Tests that RowGroup::sorting_columns gets populated when the table has SORT BY + columns.""" source_table = "functional_parquet.alltypessmall" target_table = "test_write_sorting_columns" qualified_target_table = "{0}.{1}".format(unique_database, target_table) @@ -241,14 +241,13 @@ class TestHdfsParquetTableWriter(ImpalaTestSuite): target_table)) # Create table - # TODO: Simplify once IMPALA-4167 (insert hints in CTAS) has been fixed. 
- query = "create table {0} like {1} stored as parquet".format(qualified_target_table, - source_table) + query = "create table {0} sort by (int_col, id) like {1} stored as parquet".format( + qualified_target_table, source_table) self.execute_query(query) # Insert data - query = ("insert into {0} partition(year, month) /* +sortby(int_col, id) */ " - "select * from {1}").format(qualified_target_table, source_table) + query = ("insert into {0} partition(year, month) select * from {1}").format( + qualified_target_table, source_table) self.execute_query(query) # Download hdfs files and extract rowgroup metadata @@ -540,7 +539,7 @@ class TestHdfsParquetTableStatsWriter(ImpalaTestSuite): def test_write_statistics_multiple_row_groups(self, vector, unique_database): """Test that writing multiple row groups works as expected. This is done by inserting - into a table using the sortby() hint and then making sure that the min and max values + into a table using the SORT BY clause and then making sure that the min and max values of row groups don't overlap.""" source_table = "tpch_parquet.orders" target_table = "test_hdfs_parquet_table_writer" @@ -551,10 +550,10 @@ class TestHdfsParquetTableStatsWriter(ImpalaTestSuite): # Insert a large amount of data on a single backend with a limited parquet file size. # This will result in several files being written, exercising code that tracks # statistics for row groups. 
- query = "create table {0} like {1} stored as parquet".format(qualified_target_table, - source_table) + query = "create table {0} sort by (o_orderkey) like {1} stored as parquet".format( + qualified_target_table, source_table) self.execute_query(query, vector.get_value('exec_option')) - query = ("insert into {0} /* +sortby(o_orderkey) */ select * from {1}").format( + query = ("insert into {0} select * from {1}").format( qualified_target_table, source_table) vector.get_value('exec_option')['num_nodes'] = 1 vector.get_value('exec_option')['parquet_file_size'] = 8 * 1024 * 1024