diff --git a/be/src/exec/aggregation-node.cc b/be/src/exec/aggregation-node.cc index d19acd0bf..5871ac24a 100644 --- a/be/src/exec/aggregation-node.cc +++ b/be/src/exec/aggregation-node.cc @@ -144,8 +144,8 @@ Status AggregationNode::Prepare(RuntimeState* state) { } // TODO: how many buckets? - hash_tbl_.reset(new OldHashTable(state, build_expr_ctxs_, probe_expr_ctxs_, 1, - true, true, id(), mem_tracker(), true)); + hash_tbl_.reset(new OldHashTable(state, build_expr_ctxs_, probe_expr_ctxs_, 1, true, + std::vector(build_expr_ctxs_.size(), true), id(), mem_tracker(), true)); if (probe_expr_ctxs_.empty()) { // create single intermediate tuple now; we need to output something diff --git a/be/src/exec/hash-join-node.cc b/be/src/exec/hash-join-node.cc index 65d933097..7a1570470 100644 --- a/be/src/exec/hash-join-node.cc +++ b/be/src/exec/hash-join-node.cc @@ -14,6 +14,8 @@ #include "exec/hash-join-node.h" +#include +#include #include #include "codegen/llvm-codegen.h" @@ -40,6 +42,7 @@ const char* HashJoinNode::LLVM_CLASS_NAME = "class.impala::HashJoinNode"; HashJoinNode::HashJoinNode( ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs) : BlockingJoinNode("HashJoinNode", tnode.hash_join_node.join_op, pool, tnode, descs), + is_not_distinct_from_(), codegen_process_build_batch_fn_(NULL), process_build_batch_fn_(NULL), process_probe_batch_fn_(NULL) { @@ -70,6 +73,7 @@ Status HashJoinNode::Init(const TPlanNode& tnode) { probe_expr_ctxs_.push_back(ctx); RETURN_IF_ERROR(Expr::CreateExprTree(pool_, eq_join_conjuncts[i].right, &ctx)); build_expr_ctxs_.push_back(ctx); + is_not_distinct_from_.push_back(eq_join_conjuncts[i].is_not_distinct_from); } RETURN_IF_ERROR( Expr::CreateExprTrees(pool_, tnode.hash_join_node.other_join_conjuncts, @@ -104,11 +108,13 @@ Status HashJoinNode::Prepare(RuntimeState* state) { AddExprCtxsToFree(other_join_conjunct_ctxs_); // TODO: default buckets - bool stores_nulls = - join_op_ == TJoinOp::RIGHT_OUTER_JOIN || join_op_ == TJoinOp::FULL_OUTER_JOIN; + const bool stores_nulls = join_op_ == TJoinOp::RIGHT_OUTER_JOIN || + join_op_ == TJoinOp::FULL_OUTER_JOIN || + std::accumulate(is_not_distinct_from_.begin(), is_not_distinct_from_.end(), false, + std::logical_or()); hash_tbl_.reset(new OldHashTable(state, build_expr_ctxs_, probe_expr_ctxs_, child(1)->row_desc().tuple_descriptors().size(), stores_nulls, - false, state->fragment_hash_seed(), mem_tracker())); + is_not_distinct_from_, state->fragment_hash_seed(), mem_tracker())); bool build_codegen_enabled = false; bool probe_codegen_enabled = false; diff --git a/be/src/exec/hash-join-node.h b/be/src/exec/hash-join-node.h index ec947ed9f..b2cd77983 100644 --- a/be/src/exec/hash-join-node.h +++ b/be/src/exec/hash-join-node.h @@ -72,6 +72,10 @@ class HashJoinNode : public BlockingJoinNode { std::vector probe_expr_ctxs_; std::vector build_expr_ctxs_; + /// is_not_distinct_from_[i] is true if and only if the ith equi-join predicate is IS + /// NOT DISTINCT FROM, rather than equality. + std::vector is_not_distinct_from_; + /// non-equi-join conjuncts from the JOIN clause std::vector other_join_conjunct_ctxs_; diff --git a/be/src/exec/hash-table-test.cc b/be/src/exec/hash-table-test.cc index 012711199..7e332ae0d 100644 --- a/be/src/exec/hash-table-test.cc +++ b/be/src/exec/hash-table-test.cc @@ -237,7 +237,8 @@ class HashTableTest : public testing::Test { // Create the hash table and insert the build rows scoped_ptr hash_table; ASSERT_TRUE(CreateHashTable(quadratic, initial_num_buckets, &hash_table)); - HashTableCtx ht_ctx(build_expr_ctxs_, probe_expr_ctxs_, false, false, 1, 0, 1); + HashTableCtx ht_ctx(build_expr_ctxs_, probe_expr_ctxs_, false, + std::vector(build_expr_ctxs_.size(), false), 1, 0, 1); uint32_t hash = 0; bool success = hash_table->CheckAndResize(5, &ht_ctx); @@ -287,7 +288,8 @@ class HashTableTest : public testing::Test { ASSERT_TRUE(CreateHashTable(quadratic, initial_size, &hash_table)); int total_rows = rows_to_insert + additional_rows; - HashTableCtx ht_ctx(build_expr_ctxs_, probe_expr_ctxs_, false, false, 1, 0, 1); + HashTableCtx ht_ctx(build_expr_ctxs_, probe_expr_ctxs_, false, + std::vector(build_expr_ctxs_.size(), false), 1, 0, 1); // Add 1 row with val 1, 2 with val 2, etc. vector build_rows; @@ -340,7 +342,8 @@ class HashTableTest : public testing::Test { MemTracker tracker(100 * 1024 * 1024); scoped_ptr hash_table; ASSERT_TRUE(CreateHashTable(quadratic, num_to_add, &hash_table)); - HashTableCtx ht_ctx(build_expr_ctxs_, probe_expr_ctxs_, false, false, 1, 0, 1); + HashTableCtx ht_ctx(build_expr_ctxs_, probe_expr_ctxs_, false, + std::vector(build_expr_ctxs_.size(), false), 1, 0, 1); // Inserts num_to_add + (num_to_add^2) + (num_to_add^4) + ... + (num_to_add^20) // entries. When num_to_add == 4, then the total number of inserts is 4194300. @@ -389,7 +392,8 @@ class HashTableTest : public testing::Test { void InsertFullTest(bool quadratic, int table_size) { scoped_ptr hash_table; ASSERT_TRUE(CreateHashTable(quadratic, table_size, &hash_table)); - HashTableCtx ht_ctx(build_expr_ctxs_, probe_expr_ctxs_, false, false, 1, 0, 1); + HashTableCtx ht_ctx(build_expr_ctxs_, probe_expr_ctxs_, false, + std::vector(build_expr_ctxs_.size(), false), 1, 0, 1); EXPECT_EQ(hash_table->EmptyBuckets(), table_size); // Insert and probe table_size different tuples. All of them are expected to be @@ -454,7 +458,8 @@ class HashTableTest : public testing::Test { scoped_ptr hash_table; ASSERT_FALSE(CreateHashTable(quadratic, table_size, &hash_table, block_size, max_num_blocks, reserved_blocks)); - HashTableCtx ht_ctx(build_expr_ctxs_, probe_expr_ctxs_, false, false, 1, 0, 1); + HashTableCtx ht_ctx(build_expr_ctxs_, probe_expr_ctxs_, false, + std::vector(build_expr_ctxs_.size(), false), 1, 0, 1); HashTable::Iterator iter = hash_table->Begin(&ht_ctx); EXPECT_TRUE(iter.AtEnd()); @@ -533,7 +538,8 @@ TEST_F(HashTableTest, QuadraticInsertFullTest) { // Test that hashing empty string updates hash value. TEST_F(HashTableTest, HashEmpty) { - HashTableCtx ht_ctx(build_expr_ctxs_, probe_expr_ctxs_, false, false, 1, 2, 1); + HashTableCtx ht_ctx(build_expr_ctxs_, probe_expr_ctxs_, false, + std::vector(build_expr_ctxs_.size(), false), 1, 2, 1); uint32_t seed = 9999; ht_ctx.set_level(0); EXPECT_NE(seed, ht_ctx.Hash(NULL, 0, seed)); diff --git a/be/src/exec/hash-table.cc b/be/src/exec/hash-table.cc index b56099c92..c1493b7a0 100644 --- a/be/src/exec/hash-table.cc +++ b/be/src/exec/hash-table.cc @@ -14,6 +14,9 @@ #include "exec/hash-table.inline.h" +#include +#include + #include "codegen/codegen-anyval.h" #include "codegen/llvm-codegen.h" #include "exprs/expr.h" @@ -77,17 +80,21 @@ static int64_t NULL_VALUE[] = { HashUtil::FNV_SEED, HashUtil::FNV_SEED, static const int64_t INITIAL_DATA_PAGE_SIZES[] = { 64 * 1024, 512 * 1024 }; static const int NUM_SMALL_DATA_PAGES = sizeof(INITIAL_DATA_PAGE_SIZES) / sizeof(int64_t); -HashTableCtx::HashTableCtx(const vector& build_expr_ctxs, - const vector& probe_expr_ctxs, bool stores_nulls, bool finds_nulls, - int32_t initial_seed, int max_levels, int num_build_tuples) +HashTableCtx::HashTableCtx(const std::vector& build_expr_ctxs, + const std::vector& probe_expr_ctxs, bool stores_nulls, + const std::vector& finds_nulls, int32_t initial_seed, + int max_levels, int num_build_tuples) : build_expr_ctxs_(build_expr_ctxs), probe_expr_ctxs_(probe_expr_ctxs), stores_nulls_(stores_nulls), finds_nulls_(finds_nulls), + finds_some_nulls_(std::accumulate( + finds_nulls_.begin(), finds_nulls_.end(), false, std::logical_or())), level_(0), row_(reinterpret_cast(malloc(sizeof(Tuple*) * num_build_tuples))) { // Compute the layout and buffer size to store the evaluated expr results DCHECK_EQ(build_expr_ctxs_.size(), probe_expr_ctxs_.size()); + DCHECK_EQ(build_expr_ctxs_.size(), finds_nulls_.size()); DCHECK(!build_expr_ctxs_.empty()); results_buffer_size_ = Expr::ComputeResultsLayout(build_expr_ctxs_, &expr_values_buffer_offsets_, &var_result_begin_); @@ -170,7 +177,7 @@ bool HashTableCtx::Equals(TupleRow* build_row) { for (int i = 0; i < build_expr_ctxs_.size(); ++i) { void* val = build_expr_ctxs_[i]->GetValue(build_row); if (val == NULL) { - if (!stores_nulls_) return false; + if (!(stores_nulls_ && finds_nulls_[i])) return false; if (!expr_value_null_bits_[i]) return false; continue; } else { @@ -830,7 +837,7 @@ Function* HashTableCtx::CodegenEquals(RuntimeState* state) { // the case where the hash table does not store nulls, this is always false. Value* probe_is_null = codegen->false_value(); uint8_t* null_byte_loc = &expr_value_null_bits_[i]; - if (stores_nulls_) { + if (stores_nulls_ && finds_nulls_[i]) { Value* llvm_null_byte_loc = codegen->CastPtrToLlvmPtr(codegen->ptr_type(), null_byte_loc); Value* null_byte = builder.CreateLoad(llvm_null_byte_loc); diff --git a/be/src/exec/hash-table.h b/be/src/exec/hash-table.h index 595f30068..f27b709d8 100644 --- a/be/src/exec/hash-table.h +++ b/be/src/exec/hash-table.h @@ -108,14 +108,18 @@ class HashTableCtx { /// - build_exprs are the exprs that should be used to evaluate rows during Insert(). /// - probe_exprs are used during Find() /// - stores_nulls: if false, TupleRows with nulls are ignored during Insert - /// - finds_nulls: if false, Find() returns End() for TupleRows with nulls - /// even if stores_nulls is true + /// - finds_nulls: if finds_nulls[i] is false, Find() returns End() for TupleRows with + /// nulls in position i even if stores_nulls is true. /// - initial_seed: Initial seed value to use when computing hashes for rows with /// level 0. Other levels have their seeds derived from this seed. /// - The max levels we will hash with. + /// TODO: stores_nulls is too coarse: for a hash table in which some columns are joined + /// with '<=>' and others with '=', stores_nulls could distinguish between columns + /// in which nulls are stored and columns in which they are not, which could save + /// space by not storing some rows we know will never match. HashTableCtx(const std::vector& build_expr_ctxs, const std::vector& probe_expr_ctxs, bool stores_nulls, - bool finds_nulls, int32_t initial_seed, int max_levels, + const std::vector& finds_nulls, int32_t initial_seed, int max_levels, int num_build_tuples); /// Call to cleanup any resources. @@ -235,7 +239,10 @@ class HashTableCtx { /// TODO: these constants are an ideal candidate to be removed with codegen. /// TODO: ..or with template-ization const bool stores_nulls_; - const bool finds_nulls_; + const std::vector finds_nulls_; + + /// finds_some_nulls_ is just the logical OR of finds_nulls_. + const bool finds_some_nulls_; /// The current level this context is working on. Each level needs to use a /// different seed. diff --git a/be/src/exec/hash-table.inline.h b/be/src/exec/hash-table.inline.h index dd6ead902..27e72f2d2 100644 --- a/be/src/exec/hash-table.inline.h +++ b/be/src/exec/hash-table.inline.h @@ -29,7 +29,7 @@ inline bool HashTableCtx::EvalAndHashBuild(TupleRow* row, uint32_t* hash) { inline bool HashTableCtx::EvalAndHashProbe(TupleRow* row, uint32_t* hash) { bool has_null = EvalProbeRow(row); - if ((!stores_nulls_ || !finds_nulls_) && has_null) return false; + if (has_null && !(stores_nulls_ && finds_some_nulls_)) return false; *hash = HashCurrentRow(); return true; } diff --git a/be/src/exec/old-hash-table-test.cc b/be/src/exec/old-hash-table-test.cc index 499ccabc6..a46f7b015 100644 --- a/be/src/exec/old-hash-table-test.cc +++ b/be/src/exec/old-hash-table-test.cc @@ -198,8 +198,8 @@ TEST_F(OldHashTableTest, BasicTest) { // Create the hash table and insert the build rows MemTracker tracker; - OldHashTable hash_table(NULL, build_expr_ctxs_, probe_expr_ctxs_, - 1, false, false, 0, &tracker); + OldHashTable hash_table(NULL, build_expr_ctxs_, probe_expr_ctxs_, 1, false, + std::vector(build_expr_ctxs_.size(), false), 0, &tracker); for (int i = 0; i < 5; ++i) { hash_table.Insert(build_rows[i]); } @@ -240,8 +240,8 @@ TEST_F(OldHashTableTest, BasicTest) { // This tests makes sure we can scan ranges of buckets TEST_F(OldHashTableTest, ScanTest) { MemTracker tracker; - OldHashTable hash_table(NULL, build_expr_ctxs_, probe_expr_ctxs_, - 1, false, false, 0, &tracker); + OldHashTable hash_table(NULL, build_expr_ctxs_, probe_expr_ctxs_, 1, false, + std::vector(build_expr_ctxs_.size(), false), 0, &tracker); // Add 1 row with val 1, 2 with val 2, etc vector build_rows; ProbeTestData probe_rows[15]; @@ -286,8 +286,8 @@ TEST_F(OldHashTableTest, GrowTableTest) { int num_to_add = 4; int expected_size = 0; MemTracker tracker(100 * 1024 * 1024); - OldHashTable hash_table(NULL, build_expr_ctxs_, probe_expr_ctxs_, - 1, false, false, 0, &tracker, false, num_to_add); + OldHashTable hash_table(NULL, build_expr_ctxs_, probe_expr_ctxs_, 1, false, + std::vector(build_expr_ctxs_.size(), false), 0, &tracker, false, num_to_add); EXPECT_FALSE(hash_table.mem_limit_exceeded()); EXPECT_TRUE(!tracker.LimitExceeded()); diff --git a/be/src/exec/old-hash-table.cc b/be/src/exec/old-hash-table.cc index 5bab1e429..ebec5e736 100644 --- a/be/src/exec/old-hash-table.cc +++ b/be/src/exec/old-hash-table.cc @@ -14,6 +14,9 @@ #include "exec/old-hash-table.inline.h" +#include +#include + #include "codegen/codegen-anyval.h" #include "codegen/llvm-codegen.h" #include "exprs/expr.h" @@ -53,14 +56,16 @@ static int64_t NULL_VALUE[] = { HashUtil::FNV_SEED, HashUtil::FNV_SEED, OldHashTable::OldHashTable(RuntimeState* state, const vector& build_expr_ctxs, const vector& probe_expr_ctxs, int num_build_tuples, bool stores_nulls, - bool finds_nulls, int32_t initial_seed, MemTracker* mem_tracker, bool stores_tuples, - int64_t num_buckets) + const std::vector& finds_nulls, int32_t initial_seed, MemTracker* mem_tracker, + bool stores_tuples, int64_t num_buckets) : state_(state), build_expr_ctxs_(build_expr_ctxs), probe_expr_ctxs_(probe_expr_ctxs), num_build_tuples_(num_build_tuples), stores_nulls_(stores_nulls), finds_nulls_(finds_nulls), + finds_some_nulls_(std::accumulate( + finds_nulls_.begin(), finds_nulls_.end(), false, std::logical_or())), stores_tuples_(stores_tuples), initial_seed_(initial_seed), num_filled_buckets_(0), @@ -73,7 +78,7 @@ OldHashTable::OldHashTable(RuntimeState* state, const vector& buil mem_limit_exceeded_(false) { DCHECK(mem_tracker != NULL); DCHECK_EQ(build_expr_ctxs_.size(), probe_expr_ctxs_.size()); - + DCHECK_EQ(build_expr_ctxs_.size(), finds_nulls_.size()); DCHECK_EQ((num_buckets & (num_buckets-1)), 0) << "num_buckets must be a power of 2"; buckets_.resize(num_buckets); num_buckets_ = num_buckets; @@ -508,7 +513,7 @@ bool OldHashTable::Equals(TupleRow* build_row) { for (int i = 0; i < build_expr_ctxs_.size(); ++i) { void* val = build_expr_ctxs_[i]->GetValue(build_row); if (val == NULL) { - if (!stores_nulls_) return false; + if (!(stores_nulls_ && finds_nulls_[i])) return false; if (!expr_value_null_bits_[i]) return false; continue; } else { @@ -630,7 +635,7 @@ Function* OldHashTable::CodegenEquals(RuntimeState* state) { // the case where the hash table does not store nulls, this is always false. Value* probe_is_null = codegen->false_value(); uint8_t* null_byte_loc = &expr_value_null_bits_[i]; - if (stores_nulls_) { + if (stores_nulls_ && finds_nulls_[i]) { Value* llvm_null_byte_loc = codegen->CastPtrToLlvmPtr(codegen->ptr_type(), null_byte_loc); Value* null_byte = builder.CreateLoad(llvm_null_byte_loc); diff --git a/be/src/exec/old-hash-table.h b/be/src/exec/old-hash-table.h index d2612c58c..0bf4da99a 100644 --- a/be/src/exec/old-hash-table.h +++ b/be/src/exec/old-hash-table.h @@ -93,17 +93,21 @@ class OldHashTable { /// - probe_exprs are used during Find() /// - num_build_tuples: number of Tuples in the build tuple row /// - stores_nulls: if false, TupleRows with nulls are ignored during Insert - /// - finds_nulls: if false, Find() returns End() for TupleRows with nulls - /// even if stores_nulls is true + /// - finds_nulls: if finds_nulls[i] is false, Find() returns End() for TupleRows with + /// nulls in position i even if stores_nulls is true. /// - num_buckets: number of buckets that the hash table should be initialized to /// - mem_tracker: if non-empty, all memory allocations for nodes and for buckets are /// tracked; the tracker must be valid until the d'tor is called /// - initial_seed: Initial seed value to use when computing hashes for rows /// - stores_tuples: If true, the hash table stores tuples, otherwise it stores tuple /// rows. + /// TODO: stores_nulls is too coarse: for a hash table in which some columns are joined + /// with '<=>' and others with '=', stores_nulls could distinguish between columns + /// in which nulls are stored and columns in which they are not, which could save + /// space by not storing some rows we know will never match. OldHashTable(RuntimeState* state, const std::vector& build_expr_ctxs, const std::vector& probe_expr_ctxs, int num_build_tuples, - bool stores_nulls, bool finds_nulls, int32_t initial_seed, + bool stores_nulls, const std::vector& finds_nulls, int32_t initial_seed, MemTracker* mem_tracker, bool stores_tuples = false, int64_t num_buckets = 1024); /// Call to cleanup any resources. Must be called once. @@ -435,7 +439,11 @@ class OldHashTable { /// different behavior. /// TODO: these constants are an ideal candidate to be removed with codegen. const bool stores_nulls_; - const bool finds_nulls_; + const std::vector finds_nulls_; + + /// finds_some_nulls_ is just the logical OR of finds_nulls_. + const bool finds_some_nulls_; + const bool stores_tuples_; const int32_t initial_seed_; diff --git a/be/src/exec/old-hash-table.inline.h b/be/src/exec/old-hash-table.inline.h index c4c361004..811dae2de 100644 --- a/be/src/exec/old-hash-table.inline.h +++ b/be/src/exec/old-hash-table.inline.h @@ -102,7 +102,7 @@ inline bool OldHashTable::EvalAndHashBuild(TupleRow* row, uint32_t* hash) { inline bool OldHashTable::EvalAndHashProbe(TupleRow* row, uint32_t* hash) { bool has_null = EvalProbeRow(row); - if ((!stores_nulls_ || !finds_nulls_) && has_null) return false; + if (has_null && !(stores_nulls_ && finds_some_nulls_)) return false; *hash = HashCurrentRow(); return true; } diff --git a/be/src/exec/partitioned-aggregation-node.cc b/be/src/exec/partitioned-aggregation-node.cc index ca749dcb4..b66753175 100644 --- a/be/src/exec/partitioned-aggregation-node.cc +++ b/be/src/exec/partitioned-aggregation-node.cc @@ -189,8 +189,9 @@ Status PartitionedAggregationNode::Prepare(RuntimeState* state) { RETURN_IF_ERROR(state_->GetQueryStatus()); singleton_output_tuple_returned_ = false; } else { - ht_ctx_.reset(new HashTableCtx(build_expr_ctxs_, probe_expr_ctxs_, true, true, - state->fragment_hash_seed(), MAX_PARTITION_DEPTH, 1)); + ht_ctx_.reset(new HashTableCtx(build_expr_ctxs_, probe_expr_ctxs_, true, + std::vector(build_expr_ctxs_.size(), true), state->fragment_hash_seed(), + MAX_PARTITION_DEPTH, 1)); RETURN_IF_ERROR(state_->block_mgr()->RegisterClient( MinRequiredBuffers(), true, mem_tracker(), state, &block_mgr_client_)); RETURN_IF_ERROR(CreateHashPartitions(0)); diff --git a/be/src/exec/partitioned-hash-join-node.cc b/be/src/exec/partitioned-hash-join-node.cc index 4e06ab373..e7a633235 100644 --- a/be/src/exec/partitioned-hash-join-node.cc +++ b/be/src/exec/partitioned-hash-join-node.cc @@ -14,6 +14,8 @@ #include "exec/partitioned-hash-join-node.inline.h" +#include +#include #include #include @@ -44,6 +46,7 @@ PartitionedHashJoinNode::PartitionedHashJoinNode( ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs) : BlockingJoinNode("PartitionedHashJoinNode", tnode.hash_join_node.join_op, pool, tnode, descs), + is_not_distinct_from_(), block_mgr_client_(NULL), partition_build_timer_(NULL), null_aware_eval_timer_(NULL), @@ -79,6 +82,7 @@ Status PartitionedHashJoinNode::Init(const TPlanNode& tnode) { probe_expr_ctxs_.push_back(ctx); RETURN_IF_ERROR(Expr::CreateExprTree(pool_, eq_join_conjuncts[i].right, &ctx)); build_expr_ctxs_.push_back(ctx); + is_not_distinct_from_.push_back(eq_join_conjuncts[i].is_not_distinct_from); } RETURN_IF_ERROR( Expr::CreateExprTrees(pool_, tnode.hash_join_node.other_join_conjuncts, @@ -130,10 +134,12 @@ Status PartitionedHashJoinNode::Prepare(RuntimeState* state) { RETURN_IF_ERROR(state->block_mgr()->RegisterClient( MinRequiredBuffers(), true, mem_tracker(), state, &block_mgr_client_)); - bool should_store_nulls = join_op_ == TJoinOp::RIGHT_OUTER_JOIN || - join_op_ == TJoinOp::RIGHT_ANTI_JOIN || join_op_ == TJoinOp::FULL_OUTER_JOIN; - ht_ctx_.reset(new HashTableCtx(build_expr_ctxs_, probe_expr_ctxs_, - should_store_nulls, false, state->fragment_hash_seed(), MAX_PARTITION_DEPTH, + const bool should_store_nulls = join_op_ == TJoinOp::RIGHT_OUTER_JOIN || + join_op_ == TJoinOp::RIGHT_ANTI_JOIN || join_op_ == TJoinOp::FULL_OUTER_JOIN || + std::accumulate(is_not_distinct_from_.begin(), is_not_distinct_from_.end(), false, + std::logical_or()); + ht_ctx_.reset(new HashTableCtx(build_expr_ctxs_, probe_expr_ctxs_, should_store_nulls, + is_not_distinct_from_, state->fragment_hash_seed(), MAX_PARTITION_DEPTH, child(1)->row_desc().tuple_descriptors().size())); if (join_op_ == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN) { diff --git a/be/src/exec/partitioned-hash-join-node.h b/be/src/exec/partitioned-hash-join-node.h index 78ff440a9..ac6884c0c 100644 --- a/be/src/exec/partitioned-hash-join-node.h +++ b/be/src/exec/partitioned-hash-join-node.h @@ -296,6 +296,10 @@ class PartitionedHashJoinNode : public BlockingJoinNode { std::vector probe_expr_ctxs_; std::vector build_expr_ctxs_; + /// is_not_distinct_from_[i] is true if and only if the ith equi-join predicate is IS + /// NOT DISTINCT FROM, rather than equality. + std::vector is_not_distinct_from_; + /// Non-equi-join conjuncts from the ON clause. std::vector other_join_conjunct_ctxs_; diff --git a/be/src/exprs/decimal-operators.cc b/be/src/exprs/decimal-operators.cc index ef2d169d0..16911c388 100644 --- a/be/src/exprs/decimal-operators.cc +++ b/be/src/exprs/decimal-operators.cc @@ -647,41 +647,53 @@ BooleanVal DecimalOperators::CastToBooleanVal( return DecimalVal::null(); \ } +#define DECIMAL_BINARY_OP_NONNULL(OP_FN, X, Y) \ + bool dummy = false; \ + const FunctionContext::TypeDesc& x_type = *context->GetArgType(0); \ + const FunctionContext::TypeDesc& y_type = *context->GetArgType(1); \ + int byte_size = ::max(ColumnType::GetDecimalByteSize(x_type.precision), \ + ColumnType::GetDecimalByteSize(y_type.precision)); \ + switch (byte_size) { \ + case 4: { \ + Decimal4Value x_val = GetDecimal4Value(X, x_type, &dummy); \ + Decimal4Value y_val = GetDecimal4Value(Y, y_type, &dummy); \ + bool result = x_val.OP_FN(x_type.scale, y_val, y_type.scale); \ + return BooleanVal(result); \ + } \ + case 8: { \ + Decimal8Value x_val = GetDecimal8Value(X, x_type, &dummy); \ + Decimal8Value y_val = GetDecimal8Value(Y, y_type, &dummy); \ + bool result = x_val.OP_FN(x_type.scale, y_val, y_type.scale); \ + return BooleanVal(result); \ + } \ + case 16: { \ + Decimal16Value x_val = GetDecimal16Value(X, x_type, &dummy); \ + Decimal16Value y_val = GetDecimal16Value(Y, y_type, &dummy); \ + bool result = x_val.OP_FN(x_type.scale, y_val, y_type.scale); \ + return BooleanVal(result); \ + } \ + default: \ + DCHECK(false); \ + break; \ + } \ + return BooleanVal::null(); + #define DECIMAL_BINARY_OP(FN_NAME, OP_FN) \ BooleanVal DecimalOperators::FN_NAME( \ FunctionContext* context, const DecimalVal& x, const DecimalVal& y) { \ if (x.is_null || y.is_null) return BooleanVal::null(); \ - bool dummy = false; \ - const FunctionContext::TypeDesc& x_type = *context->GetArgType(0); \ - const FunctionContext::TypeDesc& y_type = *context->GetArgType(1); \ - int byte_size = ::max(ColumnType::GetDecimalByteSize(x_type.precision), \ - ColumnType::GetDecimalByteSize(y_type.precision)); \ - switch (byte_size) { \ - case 4: { \ - Decimal4Value x_val = GetDecimal4Value(x, x_type, &dummy); \ - Decimal4Value y_val = GetDecimal4Value(y, y_type, &dummy); \ - bool result = x_val.OP_FN(x_type.scale, y_val, y_type.scale); \ - return BooleanVal(result); \ - } \ - case 8: { \ - Decimal8Value x_val = GetDecimal8Value(x, x_type, &dummy); \ - Decimal8Value y_val = GetDecimal8Value(y, y_type, &dummy); \ - bool result = x_val.OP_FN(x_type.scale, y_val, y_type.scale); \ - return BooleanVal(result); \ - } \ - case 16: { \ - Decimal16Value x_val = GetDecimal16Value(x, x_type, &dummy); \ - Decimal16Value y_val = GetDecimal16Value(y, y_type, &dummy); \ - bool result = x_val.OP_FN(x_type.scale, y_val, y_type.scale); \ - return BooleanVal(result); \ - } \ - default: \ - DCHECK(false); \ - break; \ - } \ - return BooleanVal::null(); \ + DECIMAL_BINARY_OP_NONNULL(OP_FN, x, y) \ } +#define NULLSAFE_DECIMAL_BINARY_OP(FN_NAME, OP_FN, IS_EQUAL) \ + BooleanVal DecimalOperators::FN_NAME( \ + FunctionContext* context, const DecimalVal& x, const DecimalVal& y) { \ + if (x.is_null) return BooleanVal(IS_EQUAL ? y.is_null : !y.is_null); \ + if (y.is_null) return BooleanVal(!IS_EQUAL); \ + DECIMAL_BINARY_OP_NONNULL(OP_FN, x, y) \ + } + + DECIMAL_ARITHMETIC_OP(Add_DecimalVal_DecimalVal, Add) DECIMAL_ARITHMETIC_OP(Subtract_DecimalVal_DecimalVal, Subtract) DECIMAL_ARITHMETIC_OP(Multiply_DecimalVal_DecimalVal, Multiply) @@ -694,5 +706,6 @@ DECIMAL_BINARY_OP(Ge_DecimalVal_DecimalVal, Ge) DECIMAL_BINARY_OP(Gt_DecimalVal_DecimalVal, Gt) DECIMAL_BINARY_OP(Le_DecimalVal_DecimalVal, Le) DECIMAL_BINARY_OP(Lt_DecimalVal_DecimalVal, Lt) - +NULLSAFE_DECIMAL_BINARY_OP(DistinctFrom_DecimalVal_DecimalVal, Ne, false) +NULLSAFE_DECIMAL_BINARY_OP(NotDistinct_DecimalVal_DecimalVal, Eq, true) } diff --git a/be/src/exprs/decimal-operators.h b/be/src/exprs/decimal-operators.h index f32590e1e..0d5568da3 100644 --- a/be/src/exprs/decimal-operators.h +++ b/be/src/exprs/decimal-operators.h @@ -75,6 +75,10 @@ class DecimalOperators { FunctionContext*, const DecimalVal&, const DecimalVal&); static BooleanVal Lt_DecimalVal_DecimalVal( FunctionContext*, const DecimalVal&, const DecimalVal&); + static BooleanVal DistinctFrom_DecimalVal_DecimalVal( + FunctionContext*, const DecimalVal&, const DecimalVal&); + static BooleanVal NotDistinct_DecimalVal_DecimalVal( + FunctionContext*, const DecimalVal&, const DecimalVal&); /// The rounding rule when converting decimals. These only apply going from a higher /// scale to a lower one. diff --git a/be/src/exprs/expr-test.cc b/be/src/exprs/expr-test.cc index 038be44be..f10b28aec 100644 --- a/be/src/exprs/expr-test.cc +++ b/be/src/exprs/expr-test.cc @@ -537,7 +537,31 @@ class ExprTest : public testing::Test { TestValue("1.23 < 1.234", TYPE_BOOLEAN, true); TestValue("1.23 > 1.234", TYPE_BOOLEAN, false); TestValue("1.23 = 1.230000000000000000000", TYPE_BOOLEAN, true); - TestValue("1.2300 != 1.230000000000000000001", TYPE_BOOLEAN, true); + + // Some values are too precise to be stored to full precision as doubles, but not too + // precise to be stored as decimals. + static const string not_too_precise = "1.25"; + // The closest double to 'too_precise' is 1.25 - the string as written cannot be + // preresented exactly as a double. + static const string too_precise = "1.250000000000000000001"; + TestValue( + "cast(" + not_too_precise + " as double) != cast(" + too_precise + " as double)", + TYPE_BOOLEAN, false); + TestValue(not_too_precise + " != " + too_precise, TYPE_BOOLEAN, true); + TestValue("cast(" + not_too_precise + " as double) IS DISTINCT FROM cast(" + + too_precise + " as double)", + TYPE_BOOLEAN, false); + TestValue(not_too_precise + " IS DISTINCT FROM " + too_precise, TYPE_BOOLEAN, true); + TestValue("cast(" + not_too_precise + " as double) IS NOT DISTINCT FROM cast(" + + too_precise + " as double)", + TYPE_BOOLEAN, true); + TestValue( + not_too_precise + " IS NOT DISTINCT FROM " + too_precise, TYPE_BOOLEAN, false); + TestValue( + "cast(" + not_too_precise + " as double) <=> cast(" + too_precise + " as double)", + TYPE_BOOLEAN, true); + TestValue(not_too_precise + " <=> " + too_precise, TYPE_BOOLEAN, false); + TestValue("cast(1 as decimal(38,0)) = cast(1 as decimal(38,37))", TYPE_BOOLEAN, true); TestValue("cast(1 as decimal(38,0)) = cast(0.1 as decimal(38,38))", TYPE_BOOLEAN, false); @@ -568,6 +592,47 @@ class ExprTest : public testing::Test { TestIsNull("NULL >= " + op, TYPE_BOOLEAN); } + // Test IS DISTINCT FROM operator and its variants + void TestDistinctFrom() { + static const string operators[] = {"<=>", "IS DISTINCT FROM", "IS NOT DISTINCT FROM"}; + static const string types[] = {"Boolean", "TinyInt", "SmallInt", "Int", "BigInt", + "Float", "Double", "String", "Timestamp", "Decimal"}; + static const string operands1[] = { + "true", "cast(1 as TinyInt)", "cast(1 as SmallInt)", "cast(1 as Int)", + "cast(1 as BigInt)", "cast(1 as Float)", "cast(1 as Double)", + "'this is a string'", "cast(1 as TimeStamp)", "cast(1 as Decimal)" + }; + static const string operands2[] = { + "false", "cast(2 as TinyInt)", "cast(2 as SmallInt)", "cast(2 as Int)", + "cast(2 as BigInt)", "cast(2 as Float)", "cast(2 as Double)", + "'this is ALSO a string'", "cast(2 as TimeStamp)", "cast(2 as Decimal)" + }; + for (int i = 0; i < sizeof(operators) / sizeof(string); ++i) { + // "IS DISTINCT FROM" and "<=>" are generalized equality, and + // this fact is recorded in is_equal. + const bool is_equal = operators[i] != "IS DISTINCT FROM"; + // Everything IS NOT DISTINCT FROM itself. + for (int j = 0; j < sizeof(types) / sizeof(string); ++j) { + const string operand = "cast(NULL as " + types[j] + ")"; + TestValue(operand + ' ' + operators[i] + ' ' + operand, TYPE_BOOLEAN, is_equal); + } + for (int j = 0; j < sizeof(operands1) / sizeof(string); ++j) { + TestValue(operands1[j] + ' ' + operators[i] + ' ' + operands1[j], TYPE_BOOLEAN, + is_equal); + } + // NULL IS DISTINCT FROM all non-null things. + for (int j = 0; j < sizeof(operands1) / sizeof(string); ++j) { + TestValue("NULL " + operators[i] + ' ' + operands1[j], TYPE_BOOLEAN, !is_equal); + TestValue(operands1[j] + ' ' + operators[i] + " NULL", TYPE_BOOLEAN, !is_equal); + } + // Non-null values can be DISTINCT. + for (int j = 0; j < sizeof(operands1) / sizeof(string); ++j) { + TestValue(operands1[j] + ' ' + operators[i] + ' ' + operands2[j], TYPE_BOOLEAN, + !is_equal); + } + } + } + // Test comparison operators with a left or right NULL operand on all types. void TestNullComparisons() { unordered_map::iterator def_iter; @@ -1181,6 +1246,7 @@ TEST_F(ExprTest, BinaryPredicates) { TestStringComparisons(); TestDecimalComparisons(); TestNullComparisons(); + TestDistinctFrom(); } // Test casting from all types to all other types diff --git a/be/src/exprs/operators.cc b/be/src/exprs/operators.cc index 1ceb8ebc4..42b3c7648 100644 --- a/be/src/exprs/operators.cc +++ b/be/src/exprs/operators.cc @@ -54,31 +54,64 @@ using strings::Substitute; return BigIntVal(fact); \ } +#define BINARY_PREDICATE_NUMERIC_NONNULL(OP, V1, V2) \ + return BooleanVal(V1.val OP V2.val) + +#define BINARY_PREDICATE_NONNUMERIC_NONNULL(TYPE, IMPALA_TYPE, OP, V1, V2) \ + IMPALA_TYPE iv1 = IMPALA_TYPE::From##TYPE(V1);\ + IMPALA_TYPE iv2 = IMPALA_TYPE::From##TYPE(V2);\ + return BooleanVal(iv1 OP iv2) + +#define BINARY_PREDICATE_CHAR_NONNULL(OP, V1, V2) \ + StringValue iv1 = StringValue::FromStringVal(V1);\ + StringValue iv2 = StringValue::FromStringVal(V2);\ + iv1.len = StringValue::UnpaddedCharLength(iv1.ptr, c->GetArgType(0)->len); \ + iv2.len = StringValue::UnpaddedCharLength(iv2.ptr, c->GetArgType(1)->len); \ + return BooleanVal(iv1 OP iv2) + #define BINARY_PREDICATE_NUMERIC_FN(NAME, TYPE, OP) \ BooleanVal Operators::NAME##_##TYPE##_##TYPE(\ FunctionContext* c, const TYPE& v1, const TYPE& v2) {\ if (v1.is_null || v2.is_null) return BooleanVal::null();\ - return BooleanVal(v1.val OP v2.val);\ + BINARY_PREDICATE_NUMERIC_NONNULL(OP, v1, v2);\ } #define BINARY_PREDICATE_NONNUMERIC_FN(NAME, TYPE, IMPALA_TYPE, OP) \ BooleanVal Operators::NAME##_##TYPE##_##TYPE(\ FunctionContext* c, const TYPE& v1, const TYPE& v2) {\ if (v1.is_null || v2.is_null) return BooleanVal::null();\ - IMPALA_TYPE iv1 = IMPALA_TYPE::From##TYPE(v1);\ - IMPALA_TYPE iv2 = IMPALA_TYPE::From##TYPE(v2);\ - return BooleanVal(iv1 OP iv2);\ + BINARY_PREDICATE_NONNUMERIC_NONNULL(TYPE, IMPALA_TYPE, OP, v1, v2);\ } #define BINARY_PREDICATE_CHAR(NAME, OP) \ BooleanVal Operators::NAME##_Char_Char(\ FunctionContext* c, const StringVal& v1, const StringVal& v2) {\ if (v1.is_null || v2.is_null) return BooleanVal::null();\ - StringValue iv1 = StringValue::FromStringVal(v1);\ - StringValue iv2 = StringValue::FromStringVal(v2);\ - iv1.len = StringValue::UnpaddedCharLength(iv1.ptr, c->GetArgType(0)->len); \ - iv2.len = StringValue::UnpaddedCharLength(iv2.ptr, c->GetArgType(1)->len); \ - return BooleanVal(iv1 OP iv2);\ + BINARY_PREDICATE_CHAR_NONNULL(OP, v1, v2);\ + } + +#define NULLSAFE_NUMERIC_DISTINCTION(NAME, TYPE, OP, IS_EQUAL) \ + BooleanVal Operators::NAME##_##TYPE##_##TYPE(\ + FunctionContext* c, const TYPE& v1, const TYPE& v2) {\ + if (v1.is_null) return BooleanVal(IS_EQUAL ? v2.is_null : !v2.is_null); \ + if (v2.is_null) return BooleanVal(!IS_EQUAL);\ + BINARY_PREDICATE_NUMERIC_NONNULL(OP, v1, v2);\ + } + +#define NULLSAFE_NONNUMERIC_DISTINCTION(NAME, TYPE, IMPALA_TYPE, OP, IS_EQUAL) \ + BooleanVal Operators::NAME##_##TYPE##_##TYPE(\ + FunctionContext* c, const TYPE& v1, const TYPE& v2) {\ + if (v1.is_null) return BooleanVal(IS_EQUAL ? v2.is_null : !v2.is_null); \ + if (v2.is_null) return BooleanVal(!IS_EQUAL);\ + BINARY_PREDICATE_NONNUMERIC_NONNULL(TYPE, IMPALA_TYPE, OP, v1, v2);\ + } + +#define NULLSAFE_CHAR_DISTINCTION(NAME, OP, IS_EQUAL) \ + BooleanVal Operators::NAME##_Char_Char(\ + FunctionContext* c, const StringVal& v1, const StringVal& v2) {\ + if (v1.is_null) return BooleanVal(IS_EQUAL ? v2.is_null : !v2.is_null); \ + if (v2.is_null) return BooleanVal(!IS_EQUAL);\ + BINARY_PREDICATE_CHAR_NONNULL(OP, v1, v2);\ } #define BINARY_OP_NUMERIC_TYPES(NAME, OP) \ @@ -113,6 +146,18 @@ using strings::Substitute; BINARY_PREDICATE_NONNUMERIC_FN(NAME, TimestampVal, TimestampValue, OP);\ BINARY_PREDICATE_CHAR(NAME, OP); +#define NULLSAFE_DISTINCTION(NAME, OP, IS_EQUAL) \ + NULLSAFE_NUMERIC_DISTINCTION(NAME, BooleanVal, OP, IS_EQUAL); \ + NULLSAFE_NUMERIC_DISTINCTION(NAME, TinyIntVal, OP, IS_EQUAL); \ + NULLSAFE_NUMERIC_DISTINCTION(NAME, SmallIntVal, OP, IS_EQUAL); \ + NULLSAFE_NUMERIC_DISTINCTION(NAME, IntVal, OP, IS_EQUAL); \ + NULLSAFE_NUMERIC_DISTINCTION(NAME, BigIntVal, OP, IS_EQUAL); \ + NULLSAFE_NUMERIC_DISTINCTION(NAME, FloatVal, OP, IS_EQUAL); \ + NULLSAFE_NUMERIC_DISTINCTION(NAME, DoubleVal, OP, IS_EQUAL); \ + NULLSAFE_NONNUMERIC_DISTINCTION(NAME, StringVal, StringValue, OP, IS_EQUAL);\ + NULLSAFE_NONNUMERIC_DISTINCTION(NAME, TimestampVal, TimestampValue, OP, IS_EQUAL);\ + NULLSAFE_CHAR_DISTINCTION(NAME, OP, IS_EQUAL); + namespace impala { BINARY_OP_NUMERIC_TYPES(Add, +); @@ -182,4 +227,7 @@ BINARY_PREDICATE_ALL_TYPES(Lt, <); BINARY_PREDICATE_ALL_TYPES(Ge, >=); BINARY_PREDICATE_ALL_TYPES(Le, <=); +NULLSAFE_DISTINCTION(DistinctFrom, !=, false); +NULLSAFE_DISTINCTION(NotDistinct, ==, true); + } // namespace impala diff --git a/be/src/exprs/operators.h b/be/src/exprs/operators.h index ee1fc152e..1d79cf138 100644 --- a/be/src/exprs/operators.h +++ b/be/src/exprs/operators.h @@ -151,6 +151,48 @@ class Operators { static BooleanVal Ne_TimestampVal_TimestampVal( FunctionContext*, const TimestampVal&, const TimestampVal&); + static BooleanVal DistinctFrom_BooleanVal_BooleanVal( + FunctionContext*, const BooleanVal&, const BooleanVal&); + static BooleanVal DistinctFrom_TinyIntVal_TinyIntVal( + FunctionContext*, const TinyIntVal&, const TinyIntVal&); + static BooleanVal DistinctFrom_SmallIntVal_SmallIntVal( + FunctionContext*, const SmallIntVal&, const SmallIntVal&); + static BooleanVal DistinctFrom_IntVal_IntVal( + FunctionContext*, const IntVal&, const IntVal&); + static BooleanVal DistinctFrom_BigIntVal_BigIntVal( + FunctionContext*, const BigIntVal&, const BigIntVal&); + static BooleanVal DistinctFrom_FloatVal_FloatVal( + FunctionContext*, const FloatVal&, const FloatVal&); + static BooleanVal DistinctFrom_DoubleVal_DoubleVal( + FunctionContext*, const DoubleVal&, const DoubleVal&); + static BooleanVal DistinctFrom_StringVal_StringVal( + FunctionContext*, const StringVal&, const StringVal&); + static BooleanVal DistinctFrom_Char_Char( + FunctionContext*, const StringVal&, const StringVal&); + static BooleanVal DistinctFrom_TimestampVal_TimestampVal( + FunctionContext*, const TimestampVal&, const TimestampVal&); + + static BooleanVal NotDistinct_BooleanVal_BooleanVal( + FunctionContext*, const BooleanVal&, const BooleanVal&); + static BooleanVal NotDistinct_TinyIntVal_TinyIntVal( + FunctionContext*, const TinyIntVal&, const TinyIntVal&); + static BooleanVal NotDistinct_SmallIntVal_SmallIntVal( + FunctionContext*, const SmallIntVal&, const SmallIntVal&); + static BooleanVal NotDistinct_IntVal_IntVal( + FunctionContext*, const IntVal&, const IntVal&); + static BooleanVal NotDistinct_BigIntVal_BigIntVal( + FunctionContext*, const BigIntVal&, const BigIntVal&); + static BooleanVal NotDistinct_FloatVal_FloatVal( + FunctionContext*, const FloatVal&, const FloatVal&); + static BooleanVal NotDistinct_DoubleVal_DoubleVal( + FunctionContext*, const DoubleVal&, const DoubleVal&); + static BooleanVal NotDistinct_StringVal_StringVal( + FunctionContext*, const StringVal&, const StringVal&); + static BooleanVal NotDistinct_Char_Char( + FunctionContext*, const StringVal&, const StringVal&); + static BooleanVal NotDistinct_TimestampVal_TimestampVal( + FunctionContext*, const TimestampVal&, const TimestampVal&); + static BooleanVal Gt_BooleanVal_BooleanVal( FunctionContext*, const BooleanVal&, const BooleanVal&); static BooleanVal Gt_TinyIntVal_TinyIntVal( diff --git a/common/function-registry/impala_functions.py b/common/function-registry/impala_functions.py index 0c5bebe4e..0d0988c4e 100644 --- a/common/function-registry/impala_functions.py +++ b/common/function-registry/impala_functions.py @@ -658,4 +658,28 @@ invisible_functions = [ '_ZN6impala18TimestampFunctions6AddSubILb0EN10impala_udf6IntValEN5boost9date_time15months_durationINS4_9gregorian21greg_durations_configEEELb0EEENS2_12TimestampValEPNS2_15FunctionContextERKSA_RKT0_'], [['months_sub_interval'], 'TIMESTAMP', ['TIMESTAMP', 'BIGINT'], '_ZN6impala18TimestampFunctions6AddSubILb0EN10impala_udf9BigIntValEN5boost9date_time15months_durationINS4_9gregorian21greg_durations_configEEELb0EEENS2_12TimestampValEPNS2_15FunctionContextERKSA_RKT0_'], + + [['distinctfrom'], 'BOOLEAN', ['BOOLEAN', 'BOOLEAN'], 'impala::Operators::DistinctFrom_BooleanVal_BooleanVal'], + [['distinctfrom'], 'BOOLEAN', ['TINYINT', 'TINYINT'], 'impala::Operators::DistinctFrom_TinyIntVal_TinyIntVal'], + [['distinctfrom'], 'BOOLEAN', ['SMALLINT', 'SMALLINT'], 'impala::Operators::DistinctFrom_SmallIntVal_SmallIntVal'], + [['distinctfrom'], 'BOOLEAN', ['INT', 'INT'], 'impala::Operators::DistinctFrom_IntVal_IntVal'], + [['distinctfrom'], 'BOOLEAN', ['BIGINT', 'BIGINT'], 'impala::Operators::DistinctFrom_BigIntVal_BigIntVal'], + [['distinctfrom'], 'BOOLEAN', ['FLOAT', 'FLOAT'], 'impala::Operators::DistinctFrom_FloatVal_FloatVal'], + [['distinctfrom'], 'BOOLEAN', ['DOUBLE', 'DOUBLE'], 'impala::Operators::DistinctFrom_DoubleVal_DoubleVal'], + [['distinctfrom'], 'BOOLEAN', ['STRING', 'STRING'], 'impala::Operators::DistinctFrom_StringVal_StringVal'], + [['distinctfrom'], 'BOOLEAN', ['TIMESTAMP', 'TIMESTAMP'], 'impala::Operators::DistinctFrom_TimestampVal_TimestampVal'], + [['distinctfrom'], 'BOOLEAN', ['CHAR', 'CHAR'], 'impala::Operators::DistinctFrom_Char_Char'], + [['distinctfrom'], 'BOOLEAN', ['DECIMAL', 'DECIMAL'], 'impala::DecimalOperators::DistinctFrom_DecimalVal_DecimalVal'], + + [['notdistinct'], 'BOOLEAN', ['BOOLEAN', 'BOOLEAN'], 'impala::Operators::NotDistinct_BooleanVal_BooleanVal'], + [['notdistinct'], 'BOOLEAN', ['TINYINT', 'TINYINT'], 'impala::Operators::NotDistinct_TinyIntVal_TinyIntVal'], + [['notdistinct'], 'BOOLEAN', ['SMALLINT', 'SMALLINT'], 'impala::Operators::NotDistinct_SmallIntVal_SmallIntVal'], + [['notdistinct'], 'BOOLEAN', ['INT', 'INT'], 'impala::Operators::NotDistinct_IntVal_IntVal'], + [['notdistinct'], 'BOOLEAN', ['BIGINT', 'BIGINT'], 'impala::Operators::NotDistinct_BigIntVal_BigIntVal'], + [['notdistinct'], 'BOOLEAN', ['FLOAT', 'FLOAT'], 'impala::Operators::NotDistinct_FloatVal_FloatVal'], + [['notdistinct'], 'BOOLEAN', ['DOUBLE', 'DOUBLE'], 'impala::Operators::NotDistinct_DoubleVal_DoubleVal'], + [['notdistinct'], 'BOOLEAN', ['STRING', 'STRING'], 'impala::Operators::NotDistinct_StringVal_StringVal'], + [['notdistinct'], 'BOOLEAN', ['TIMESTAMP', 'TIMESTAMP'], 'impala::Operators::NotDistinct_TimestampVal_TimestampVal'], + [['notdistinct'], 'BOOLEAN', ['CHAR', 'CHAR'], 'impala::Operators::NotDistinct_Char_Char'], + [['notdistinct'], 'BOOLEAN', ['DECIMAL', 'DECIMAL'], 'impala::DecimalOperators::NotDistinct_DecimalVal_DecimalVal'], ] diff --git a/common/thrift/ExternalDataSource.thrift b/common/thrift/ExternalDataSource.thrift index ee550e762..824184d61 100644 --- a/common/thrift/ExternalDataSource.thrift +++ b/common/thrift/ExternalDataSource.thrift @@ -50,7 +50,7 @@ struct TRowBatch { // Comparison operators used in predicates. enum TComparisonOp { - LT, LE, EQ, NE, GE, GT, + LT, LE, EQ, NE, GE, GT, DISTINCT_FROM, NOT_DISTINCT } // Binary predicates that can be pushed to the external data source and diff --git a/common/thrift/PlanNodes.thrift b/common/thrift/PlanNodes.thrift index 8ec6bdd07..f194a0278 100644 --- a/common/thrift/PlanNodes.thrift +++ b/common/thrift/PlanNodes.thrift @@ -162,6 +162,8 @@ struct TEqJoinCondition { 1: required Exprs.TExpr left; // right-hand side of " = " 2: required Exprs.TExpr right; + // true if and only if operator is "<=>", also known as "IS NOT DISTINCT FROM" + 3: required bool is_not_distinct_from; } enum TJoinOp { diff --git a/fe/src/main/cup/sql-parser.y b/fe/src/main/cup/sql-parser.y index 5aa59d8d2..fa51595fd 100644 --- a/fe/src/main/cup/sql-parser.y +++ b/fe/src/main/cup/sql-parser.y @@ -458,7 +458,7 @@ precedence left KW_AND; precedence right KW_NOT, NOT; precedence left KW_BETWEEN, KW_IN, KW_IS, KW_EXISTS; precedence left KW_LIKE, KW_RLIKE, KW_REGEXP; -precedence left EQUAL, NOTEQUAL, LESSTHAN, GREATERTHAN; +precedence left EQUAL, NOTEQUAL, LESSTHAN, GREATERTHAN, KW_FROM, KW_DISTINCT; precedence left ADD, SUBTRACT; precedence left STAR, DIVIDE, MOD, KW_DIV; precedence left BITAND, BITOR, BITXOR, BITNOT; @@ -2406,6 +2406,12 @@ comparison_predicate ::= {: RESULT = new BinaryPredicate(BinaryPredicate.Operator.LT, e1, e2); :} | expr:e1 GREATERTHAN expr:e2 {: RESULT = new BinaryPredicate(BinaryPredicate.Operator.GT, e1, e2); :} + | expr:e1 LESSTHAN EQUAL GREATERTHAN expr:e2 + {: RESULT = new BinaryPredicate(BinaryPredicate.Operator.NOT_DISTINCT, e1, e2); :} + | expr:e1 KW_IS KW_DISTINCT KW_FROM expr:e2 + {: RESULT = new BinaryPredicate(BinaryPredicate.Operator.DISTINCT_FROM, e1, e2); :} + | expr:e1 KW_IS KW_NOT KW_DISTINCT KW_FROM expr:e2 + {: RESULT = new BinaryPredicate(BinaryPredicate.Operator.NOT_DISTINCT, e1, e2); :} ; like_predicate ::= diff --git a/fe/src/main/java/com/cloudera/impala/analysis/Analyzer.java b/fe/src/main/java/com/cloudera/impala/analysis/Analyzer.java index 933c9b7fe..9ea4dfd2e 100644 --- a/fe/src/main/java/com/cloudera/impala/analysis/Analyzer.java +++ b/fe/src/main/java/com/cloudera/impala/analysis/Analyzer.java @@ -1009,7 +1009,8 @@ public class Analyzer { // form = where at least one of the exprs is bound by // exactly one tuple id if (binaryPred.getOp() != BinaryPredicate.Operator.EQ && - binaryPred.getOp() != BinaryPredicate.Operator.NULL_MATCHING_EQ) { + binaryPred.getOp() != BinaryPredicate.Operator.NULL_MATCHING_EQ && + binaryPred.getOp() != BinaryPredicate.Operator.NOT_DISTINCT) { return; } // the binary predicate must refer to at least two tuples to be an eqJoinConjunct diff --git a/fe/src/main/java/com/cloudera/impala/analysis/BinaryPredicate.java b/fe/src/main/java/com/cloudera/impala/analysis/BinaryPredicate.java index b2bbc07a0..293faf410 100644 --- a/fe/src/main/java/com/cloudera/impala/analysis/BinaryPredicate.java +++ b/fe/src/main/java/com/cloudera/impala/analysis/BinaryPredicate.java @@ -49,6 +49,8 @@ public class BinaryPredicate extends Predicate { GE(">=", "ge", TComparisonOp.GE), LT("<", "lt", TComparisonOp.LT), GT(">", "gt", TComparisonOp.GT), + DISTINCT_FROM("IS DISTINCT FROM", "distinctfrom", TComparisonOp.DISTINCT_FROM), + NOT_DISTINCT("IS NOT DISTINCT FROM", "notdistinct", TComparisonOp.NOT_DISTINCT), // Same as EQ, except it returns True if the rhs is NULL. There is no backend // function for this. The functionality is embedded in the hash-join // implementation. @@ -68,6 +70,7 @@ public class BinaryPredicate extends Predicate { public String toString() { return description_; } public String getName() { return name_; } public TComparisonOp getThriftOp() { return thriftOp_; } + public boolean isEquivalence() { return this == EQ || this == NOT_DISTINCT; } public Operator converse() { switch (this) { @@ -77,6 +80,8 @@ public class BinaryPredicate extends Predicate { case GE: return LE; case LT: return GT; case GT: return LT; + case DISTINCT_FROM: return DISTINCT_FROM; + case NOT_DISTINCT: return NOT_DISTINCT; case NULL_MATCHING_EQ: throw new IllegalStateException("Not implemented"); default: throw new IllegalStateException("Invalid operator"); @@ -207,7 +212,7 @@ public class BinaryPredicate extends Predicate { // determine selectivity // TODO: Compute selectivity for nested predicates Reference slotRefRef = new Reference(); - if (op_ == Operator.EQ + if ((op_ == Operator.EQ || op_ == Operator.NOT_DISTINCT) && isSingleColumnPredicate(slotRefRef, null) && slotRefRef.getRef().getNumDistinctValues() > 0) { Preconditions.checkState(slotRefRef.getRef() != null); @@ -293,6 +298,12 @@ public class BinaryPredicate extends Predicate { case GT: newOp = Operator.LE; break; + case DISTINCT_FROM: + newOp = Operator.NOT_DISTINCT; + break; + case NOT_DISTINCT: + newOp = Operator.DISTINCT_FROM; + break; case NULL_MATCHING_EQ: throw new IllegalStateException("Not implemented"); } diff --git a/fe/src/main/java/com/cloudera/impala/analysis/StmtRewriter.java b/fe/src/main/java/com/cloudera/impala/analysis/StmtRewriter.java index 67ac6908c..518442d78 100644 --- a/fe/src/main/java/com/cloudera/impala/analysis/StmtRewriter.java +++ b/fe/src/main/java/com/cloudera/impala/analysis/StmtRewriter.java @@ -465,10 +465,9 @@ public class StmtRewriter { // Check if we have a valid ON clause for an equi-join. boolean hasEqJoinPred = false; for (Expr conjunct: onClausePredicate.getConjuncts()) { - if (!(conjunct instanceof BinaryPredicate) || - ((BinaryPredicate)conjunct).getOp() != BinaryPredicate.Operator.EQ) { - continue; - } + if (!(conjunct instanceof BinaryPredicate)) continue; + BinaryPredicate.Operator operator = ((BinaryPredicate) conjunct).getOp(); + if (!operator.isEquivalence()) continue; List lhsTupleIds = Lists.newArrayList(); conjunct.getChild(0).getIds(lhsTupleIds, null); if (lhsTupleIds.isEmpty()) continue; @@ -524,9 +523,9 @@ public class StmtRewriter { for (Expr conjunct: onClausePredicate.getConjuncts()) { if (conjunct.equals(joinConjunct)) { Preconditions.checkState(conjunct instanceof BinaryPredicate); - Preconditions.checkState(((BinaryPredicate)conjunct).getOp() == - BinaryPredicate.Operator.EQ); - ((BinaryPredicate)conjunct).setOp(BinaryPredicate.Operator.NULL_MATCHING_EQ); + BinaryPredicate binaryPredicate = (BinaryPredicate)conjunct; + Preconditions.checkState(binaryPredicate.getOp().isEquivalence()); + binaryPredicate.setOp(BinaryPredicate.Operator.NULL_MATCHING_EQ); break; } } diff --git a/fe/src/main/java/com/cloudera/impala/planner/HashJoinNode.java b/fe/src/main/java/com/cloudera/impala/planner/HashJoinNode.java index 8305d60ba..878c5dae0 100644 --- a/fe/src/main/java/com/cloudera/impala/planner/HashJoinNode.java +++ b/fe/src/main/java/com/cloudera/impala/planner/HashJoinNode.java @@ -127,9 +127,11 @@ public class HashJoinNode extends JoinNode { msg.hash_join_node = new THashJoinNode(); msg.hash_join_node.join_op = joinOp_.toThrift(); for (Expr entry: eqJoinConjuncts_) { + BinaryPredicate bp = (BinaryPredicate)entry; TEqJoinCondition eqJoinCondition = - new TEqJoinCondition(entry.getChild(0).treeToThrift(), - entry.getChild(1).treeToThrift()); + new TEqJoinCondition(bp.getChild(0).treeToThrift(), + bp.getChild(1).treeToThrift(), + bp.getOp() == BinaryPredicate.Operator.NOT_DISTINCT); msg.hash_join_node.addToEq_join_conjuncts(eqJoinCondition); } for (Expr e: otherJoinConjuncts_) { diff --git a/fe/src/main/java/com/cloudera/impala/planner/HdfsScanNode.java b/fe/src/main/java/com/cloudera/impala/planner/HdfsScanNode.java index 2070531c6..29ac7c6bf 100644 --- a/fe/src/main/java/com/cloudera/impala/planner/HdfsScanNode.java +++ b/fe/src/main/java/com/cloudera/impala/planner/HdfsScanNode.java @@ -399,7 +399,11 @@ public class HdfsScanNode extends ScanNode { Preconditions.checkNotNull(bindingExpr); Preconditions.checkState(bindingExpr.isLiteral()); LiteralExpr literal = (LiteralExpr)bindingExpr; - if (literal instanceof NullLiteral) return Sets.newHashSet(); + Operator op = bp.getOp(); + if ((literal instanceof NullLiteral) && (op != Operator.NOT_DISTINCT) + && (op != Operator.DISTINCT_FROM)) { + return Sets.newHashSet(); + } // Get the partition column position and retrieve the associated partition // value metadata. @@ -410,13 +414,36 @@ public class HdfsScanNode extends ScanNode { HashSet matchingIds = Sets.newHashSet(); // Compute the matching partition ids - Operator op = bp.getOp(); + if (op == Operator.NOT_DISTINCT) { + // Case: SlotRef <=> Literal + if (literal instanceof NullLiteral) { + HashSet ids = tbl_.getNullPartitionIds(partitionPos); + if (ids != null) matchingIds.addAll(ids); + return matchingIds; + } + // Punt to equality case: + op = Operator.EQ; + } if (op == Operator.EQ) { // Case: SlotRef = Literal HashSet ids = partitionValueMap.get(literal); if (ids != null) matchingIds.addAll(ids); return matchingIds; } + if (op == Operator.DISTINCT_FROM) { + // Case: SlotRef IS DISTINCT FROM Literal + if (literal instanceof NullLiteral) { + matchingIds.addAll(tbl_.getPartitionIds()); + HashSet nullIds = tbl_.getNullPartitionIds(partitionPos); + matchingIds.removeAll(nullIds); + return matchingIds; + } else { + matchingIds.addAll(tbl_.getPartitionIds()); + HashSet ids = partitionValueMap.get(literal); + if (ids != null) matchingIds.removeAll(ids); + return matchingIds; + } + } if (op == Operator.NE) { // Case: SlotRef != Literal matchingIds.addAll(tbl_.getPartitionIds()); diff --git a/fe/src/main/java/com/cloudera/impala/planner/SingleNodePlanner.java b/fe/src/main/java/com/cloudera/impala/planner/SingleNodePlanner.java index d9710aeca..43f449afa 100644 --- a/fe/src/main/java/com/cloudera/impala/planner/SingleNodePlanner.java +++ b/fe/src/main/java/com/cloudera/impala/planner/SingleNodePlanner.java @@ -994,7 +994,11 @@ public class SingleNodePlanner { Expr e = i.next(); if (!(e instanceof BinaryPredicate)) continue; BinaryPredicate comp = (BinaryPredicate) e; - if (comp.getOp() == BinaryPredicate.Operator.NE) continue; + if ((comp.getOp() == BinaryPredicate.Operator.NE) + || (comp.getOp() == BinaryPredicate.Operator.DISTINCT_FROM) + || (comp.getOp() == BinaryPredicate.Operator.NOT_DISTINCT)) { + continue; + } Expr slotBinding = comp.getSlotBinding(d.getId()); if (slotBinding == null || !slotBinding.isConstant() || !slotBinding.getType().equals(Type.STRING)) { @@ -1168,18 +1172,13 @@ public class SingleNodePlanner { Predicate isIdentityPredicate = new Predicate() { @Override public boolean apply(Expr expr) { - if (!(expr instanceof BinaryPredicate) - || ((BinaryPredicate) expr).getOp() != BinaryPredicate.Operator.EQ) { - return false; - } - if (!expr.isRegisteredPredicate() + return (expr instanceof BinaryPredicate) + && (((BinaryPredicate) expr).getOp().isEquivalence()) + && !expr.isRegisteredPredicate() && expr.getChild(0) instanceof SlotRef && expr.getChild(1) instanceof SlotRef - && (((SlotRef) expr.getChild(0)).getSlotId() == - ((SlotRef) expr.getChild(1)).getSlotId())) { - return true; - } - return false; + && (((SlotRef) expr.getChild(0)).getSlotId() + == ((SlotRef) expr.getChild(1)).getSlotId()); } }; Iterables.removeIf(viewPredicates, isIdentityPredicate); diff --git a/fe/src/test/java/com/cloudera/impala/analysis/AnalyzeExprsTest.java b/fe/src/test/java/com/cloudera/impala/analysis/AnalyzeExprsTest.java index 9b222f4c7..35229dfec 100644 --- a/fe/src/test/java/com/cloudera/impala/analysis/AnalyzeExprsTest.java +++ b/fe/src/test/java/com/cloudera/impala/analysis/AnalyzeExprsTest.java @@ -20,6 +20,7 @@ import static org.junit.Assert.assertTrue; import java.math.BigInteger; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import org.junit.Assert; @@ -160,33 +161,83 @@ public class AnalyzeExprsTest extends AnalyzerTest { @Test public void TestBinaryPredicates() throws AnalysisException { - AnalyzesOk("select * from functional.alltypes where bool_col != true"); - AnalyzesOk("select * from functional.alltypes where tinyint_col <> 1"); - AnalyzesOk("select * from functional.alltypes where smallint_col <= 23"); - AnalyzesOk("select * from functional.alltypes where int_col > 15"); - AnalyzesOk("select * from functional.alltypes where bigint_col >= 17"); - AnalyzesOk("select * from functional.alltypes where float_col < 15.0"); - AnalyzesOk("select * from functional.alltypes where double_col > 7.7"); - // automatic type cast if compatible - AnalyzesOk("select * from functional.alltypes where 1 = 0"); - AnalyzesOk("select * from functional.alltypes where int_col = smallint_col"); - AnalyzesOk("select * from functional.alltypes where bigint_col = float_col"); - AnalyzesOk("select * from functional.alltypes where bool_col = 0"); - AnalyzesOk("select * from functional.alltypes where int_col = cast('0' as int)"); - AnalyzesOk("select * from functional.alltypes where cast(string_col as int) = 15"); - // tests with NULL - AnalyzesOk("select * from functional.alltypes where bool_col != NULL"); - AnalyzesOk("select * from functional.alltypes where tinyint_col <> NULL"); - AnalyzesOk("select * from functional.alltypes where smallint_col <= NULL"); - AnalyzesOk("select * from functional.alltypes where int_col > NULL"); - AnalyzesOk("select * from functional.alltypes where bigint_col >= NULL"); - AnalyzesOk("select * from functional.alltypes where float_col < NULL"); - AnalyzesOk("select * from functional.alltypes where double_col > NULL"); - AnalyzesOk("select * from functional.alltypes where string_col = NULL"); - AnalyzesOk("select * from functional.alltypes where timestamp_col = NULL"); + for (String operator: new String[]{"<=>", "IS DISTINCT FROM", + "IS NOT DISTINCT FROM", "<", ">", ">=", "<=", "!=", "=", "<>"}) { + // Operator can compare numeric values (literals, casts, and columns), even ones of + // different types. + ArrayList numericValues = + new ArrayList(Arrays.asList("0", "1", "1.1", "-7", "-7.7", "1.2e99", + "false", "1234567890123456789012345678901234567890", "tinyint_col", + "smallint_col", "int_col", "bigint_col", "float_col", "double_col")); + String numericTypes[] = new String[] { + "TINYINT", "SMALLINT", "INT", "BIGINT", "FLOAT", "DOUBLE", "DECIMAL"}; + for (String numericType : numericTypes) { + numericValues.add("cast(NULL as " + numericType + ")"); + } + for (String lhs : numericValues) { + for (String rhs : numericValues) { + AnalyzesOk("select * from functional.alltypes where " + lhs + " " + + operator + " " + rhs); + } + } - AnalyzesOk("select cast('hi' as CHAR(2)) = cast('hi' as CHAR(3))"); - AnalyzesOk("select cast('hi' as CHAR(2)) = 'hi'"); + // Operator can compare identical non-numeric types + for (String operand : + new String[] {"bool_col", "string_col", "timestamp_col", "NULL"}) { + AnalyzesOk("select * from functional.alltypes where " + operand + " " + + operator + " " + operand); + AnalyzesOk("select * from functional.alltypes where " + operand + " " + + operator + " NULL"); + AnalyzesOk( + "select * from functional.alltypes where NULL " + operator + " " + operand); + } + + // Operator can compare string column and literals + AnalyzesOk( + "select * from functional.alltypes where string_col " + operator + " 'hi'"); + // Operator can compare timestamp column and literals + AnalyzesOk("select * from functional.alltypes where timestamp_col " + + operator + " '1993-01-21 02:00:00'"); + // Operator can compare bool column and literals + AnalyzesOk("select * from functional.alltypes where bool_col " + operator + + " true"); + + // Decimal types of different precisions and scales are comparable + String decimalColumns[] = new String[]{"d1", "d2", "d3", "d4", "d5", "NULL"}; + for (String operand1 : decimalColumns) { + for (String operand2 : decimalColumns) { + AnalyzesOk("select * from functional.decimal_tbl where " + operand1 + " " + + operator + " " + operand2); + } + } + + // Chars of different length are comparable + for (int i = 1; i < 16; ++i) { + AnalyzesOk("select cast('hi' as char(" + i + ")) " + operator + + " 'hi'"); + AnalyzesOk("select cast('hi' as char(" + i + ")) " + operator + + " NULL"); + for (int j = 1; j < 16; ++j) { + AnalyzesOk("select cast('hi' as char(" + i + ")) " + operator + + " cast('hi' as char(" + j + "))"); + } + } + + // Binary operators do not operate on expression with incompatible types + for (String numeric_type: new String[]{"BOOLEAN", "TINYINT", "SMALLINT", "INT", + "BIGINT", "FLOAT", "DOUBLE", "DECIMAL(9,0)"}) { + for (String string_type: new String[]{"STRING", "TIMESTAMP"}) { + AnalysisError("select cast(NULL as " + numeric_type + ") " + + operator + " cast(NULL as " + string_type + ")", + "operands of type " + numeric_type + " and " + string_type + + " are not comparable:"); + AnalysisError("select cast(NULL as " + string_type + ") " + + operator + " cast(NULL as " + numeric_type + ")", + "operands of type " + string_type + " and " + numeric_type + + " are not comparable:"); + } + } + } // invalid casts AnalysisError("select * from functional.alltypes where bool_col = '15'", @@ -2028,13 +2079,6 @@ public class AnalyzeExprsTest extends AnalyzerTest { "'~' operation only allowed on integer types: ~d1"); AnalysisError("select d1! from functional.decimal_tbl", "'!' operation only allowed on integer types: d1!"); - - AnalyzesOk("select d3 = d4 from functional.decimal_tbl"); - AnalyzesOk("select d5 != d1 from functional.decimal_tbl"); - AnalyzesOk("select d2 > d2 from functional.decimal_tbl"); - AnalyzesOk("select d4 >= d1 from functional.decimal_tbl"); - AnalyzesOk("select d2 < d5 from functional.decimal_tbl"); - AnalyzesOk("select d2 <= d5 from functional.decimal_tbl"); } @Test diff --git a/fe/src/test/java/com/cloudera/impala/analysis/ParserTest.java b/fe/src/test/java/com/cloudera/impala/analysis/ParserTest.java index f86d8e3fc..7b9e0fc9e 100644 --- a/fe/src/test/java/com/cloudera/impala/analysis/ParserTest.java +++ b/fe/src/test/java/com/cloudera/impala/analysis/ParserTest.java @@ -2759,7 +2759,8 @@ public class ParserTest { public void TestSubqueries() { // Binary nested predicates String subquery = "(SELECT count(*) FROM bar)"; - String[] operators = {"=", "!=", "<>", ">", ">=", "<", "<="}; + String[] operators = {"=", "!=", "<>", ">", ">=", "<", "<=", "<=>", + "IS DISTINCT FROM", "IS NOT DISTINCT FROM"}; for (String op: operators) { ParsesOk(String.format("SELECT * FROM foo WHERE a %s %s", op, subquery)); ParsesOk(String.format("SELECT * FROM foo WHERE %s %s a", subquery, op)); diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/data-source-tables.test b/testdata/workloads/functional-planner/queries/PlannerTest/data-source-tables.test index e4f10b77c..c23a24ba4 100644 --- a/testdata/workloads/functional-planner/queries/PlannerTest/data-source-tables.test +++ b/testdata/workloads/functional-planner/queries/PlannerTest/data-source-tables.test @@ -59,3 +59,17 @@ where a.tinyint_col = a.smallint_col and a.int_col = a.bigint_col 00:SCAN DATA SOURCE [functional.alltypes_datasource a] predicates: a.tinyint_col = a.smallint_col, a.int_col = a.bigint_col, a.id = a.tinyint_col, a.id = a.int_col ==== +# Tests that <=>, IS DISTINCT FROM, and IS NOT DISTINCT FROM all can be offered to the +# data source. +select * from functional.alltypes_datasource +where id <=> 1 +and bool_col <=> true +and tinyint_col IS DISTINCT FROM 2 +and smallint_col IS DISTINCT FROM 3 +and int_col is not distinct from 4 +and bigint_col is not distinct from 5 +---- PLAN +00:SCAN DATA SOURCE [functional.alltypes_datasource] +data source predicates: id IS NOT DISTINCT FROM 1, tinyint_col IS DISTINCT FROM 2, int_col IS NOT DISTINCT FROM 4 +predicates: bool_col IS NOT DISTINCT FROM TRUE, smallint_col IS DISTINCT FROM 3, bigint_col IS NOT DISTINCT FROM 5 +==== diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/hdfs.test b/testdata/workloads/functional-planner/queries/PlannerTest/hdfs.test index 222707c96..fc841566f 100644 --- a/testdata/workloads/functional-planner/queries/PlannerTest/hdfs.test +++ b/testdata/workloads/functional-planner/queries/PlannerTest/hdfs.test @@ -113,6 +113,11 @@ select * from functional.alltypes where 2009 = year 00:SCAN HDFS [functional.alltypes] partitions=12/24 files=12 size=238.68KB ==== +select * from functional.alltypes where 2009 <=> year +---- PLAN +00:SCAN HDFS [functional.alltypes] + partitions=12/24 files=12 size=238.68KB +==== # compound predicate on the second partition key select * from functional.alltypes where !(month > 2) ---- PLAN @@ -125,6 +130,11 @@ select * from functional.alltypes where !(!(month=1)) 00:SCAN HDFS [functional.alltypes] partitions=2/24 files=2 size=40.32KB ==== +select * from functional.alltypes where !(!(month<=>1)) +---- PLAN +00:SCAN HDFS [functional.alltypes] + partitions=2/24 files=2 size=40.32KB +==== # predicates on both partition keys one of which is a compound predicate with NOT select * from functional.alltypes where year=2009 and !(month < 6) ---- PLAN @@ -143,30 +153,63 @@ select * from functional.alltypes where !(year = 2009 and month > 6) 00:SCAN HDFS [functional.alltypes] partitions=18/24 files=18 size=357.58KB ==== +select * from functional.alltypes where !(year <=> 2009 and month > 6) +---- PLAN +00:SCAN HDFS [functional.alltypes] + partitions=18/24 files=18 size=357.58KB +==== +select * from functional.alltypes where !(year <=> 2009) or !(month > 6) +---- PLAN +00:SCAN HDFS [functional.alltypes] + partitions=18/24 files=18 size=357.58KB +==== # compound predicate on a disjunct select * from functional.alltypes where !(month = 6 or month = 8) ---- PLAN 00:SCAN HDFS [functional.alltypes] partitions=20/24 files=20 size=398.31KB ==== +select * from functional.alltypes where !(month <=> 6 or month <=> 8) +---- PLAN +00:SCAN HDFS [functional.alltypes] + partitions=20/24 files=20 size=398.31KB +==== # not predicate with is null select * from functional.alltypes where not (year = 2009 or month is null) ---- PLAN 00:SCAN HDFS [functional.alltypes] partitions=12/24 files=12 size=239.77KB ==== +# not predicate with "<=> null" as a synonym of "is null" +select * from functional.alltypes where not (year = 2009 or month <=> null) +---- PLAN +00:SCAN HDFS [functional.alltypes] + partitions=12/24 files=12 size=239.77KB +==== # nested not predicates with is null select * from functional.alltypes where not (not (month is null)) ---- PLAN 00:SCAN HDFS [functional.alltypes] partitions=0/24 files=0 size=0B ==== +# nested not predicates with "<=> null" as a synonym of "is null" +select * from functional.alltypes where not (not (month <=> null)) +---- PLAN +00:SCAN HDFS [functional.alltypes] + partitions=0/24 files=0 size=0B +==== # nested not predicates with disjunct select * from functional.alltypes where not (not (month is null or year = 2009)) ---- PLAN 00:SCAN HDFS [functional.alltypes] partitions=12/24 files=12 size=238.68KB ==== +# nested not predicates with disjunct and "<=> null" as a synonym of "is null" +select * from functional.alltypes where not (not (month <=> null or year = 2009)) +---- PLAN +00:SCAN HDFS [functional.alltypes] + partitions=12/24 files=12 size=238.68KB +==== # predicate on second partition key select * from functional.alltypes where month=1 ---- PLAN @@ -195,6 +238,11 @@ select * from functional.alltypes where year=2009 and month in (1, 3, 5, 7) 00:SCAN HDFS [functional.alltypes] partitions=4/24 files=4 size=80.74KB ==== +select * from functional.alltypes where year<=>2009 and month in (1, 3, 5, 7) +---- PLAN +00:SCAN HDFS [functional.alltypes] + partitions=4/24 files=4 size=80.74KB +==== # adding a predicate that always evaluates to true should not change anything select * from functional.alltypes where year=2009 and month in (1, 3, 5, 7) and month is not null @@ -239,12 +287,25 @@ where year < 2010 and (month > 6 or month = 1 or month in (3, 4)) 00:SCAN HDFS [functional.alltypes] partitions=9/24 files=9 size=180.49KB ==== +# multiple predicates on second key +select * from functional.alltypes +where year < 2010 and (month > 6 or month <=> 1 or month in (3, 4)) +---- PLAN +00:SCAN HDFS [functional.alltypes] + partitions=9/24 files=9 size=180.49KB +==== # between predicate on second key select * from functional.alltypes where year = 2009 and month between 6 and 8 ---- PLAN 00:SCAN HDFS [functional.alltypes] partitions=3/24 files=3 size=60.43KB ==== +# between predicate on second key +select * from functional.alltypes where year <=> 2009 and month between 6 and 8 +---- PLAN +00:SCAN HDFS [functional.alltypes] + partitions=3/24 files=3 size=60.43KB +==== # between predicates on first and second keys select * from functional.alltypes where year between 2009 and 2009 and month between 6 and 8 @@ -270,6 +331,11 @@ select * from functional.alltypes where year - 1 = 2009 00:SCAN HDFS [functional.alltypes] partitions=12/24 files=12 size=239.77KB ==== +select * from functional.alltypes where year - 1 <=> 2009 +---- PLAN +00:SCAN HDFS [functional.alltypes] + partitions=12/24 files=12 size=239.77KB +==== # Predicates on a partition key with nulls (see IMPALA-887) # IS NULL predicate on a partition key with nulls select * from functional.alltypesagg where day is null @@ -277,36 +343,82 @@ select * from functional.alltypesagg where day is null 00:SCAN HDFS [functional.alltypesagg] partitions=1/11 files=1 size=71.05KB ==== +# <=> null predicate on a partition key with nulls +select * from functional.alltypesagg where day <=> null +---- PLAN +00:SCAN HDFS [functional.alltypesagg] + partitions=1/11 files=1 size=71.05KB +==== # IS NOT NULL predicate on a partition key with nulls select * from functional.alltypesagg where day is not null ---- PLAN 00:SCAN HDFS [functional.alltypesagg] partitions=10/11 files=10 size=743.67KB ==== +# IS DISTINCT FROM NULL predicate on a partition key with nulls +select * from functional.alltypesagg where day is distinct from null +---- PLAN +00:SCAN HDFS [functional.alltypesagg] + partitions=10/11 files=10 size=743.67KB +==== +select * from functional.alltypesagg where day = day +---- PLAN +00:SCAN HDFS [functional.alltypesagg] + partitions=10/11 files=10 size=743.67KB +==== +select * from functional.alltypesagg where day <=> day +---- PLAN +00:SCAN HDFS [functional.alltypesagg] + partitions=11/11 files=11 size=814.73KB +==== # partition key predicates which are in conjunctive normal form (case 1) select * from functional.alltypesagg where day is null and day = 10 ---- PLAN 00:SCAN HDFS [functional.alltypesagg] partitions=0/11 files=0 size=0B ==== +# partition key predicates which are in conjunctive normal form (case 1) +select * from functional.alltypesagg where day <=> null and day = 10 +---- PLAN +00:SCAN HDFS [functional.alltypesagg] + partitions=0/11 files=0 size=0B +==== # partition key predicates which are in conjunctive normal form (case 2) select * from functional.alltypesagg where day is null and month = 1 ---- PLAN 00:SCAN HDFS [functional.alltypesagg] partitions=1/11 files=1 size=71.05KB ==== +# partition key predicates which are in conjunctive normal form (case 2) +select * from functional.alltypesagg where day <=> null and month = 1 +---- PLAN +00:SCAN HDFS [functional.alltypesagg] + partitions=1/11 files=1 size=71.05KB +==== # partition key predicates which are in conjunctive normal form (case 3) select * from functional.alltypesagg where month = 1 and (day is null or day = 10) ---- PLAN 00:SCAN HDFS [functional.alltypesagg] partitions=2/11 files=2 size=145.53KB ==== +# partition key predicates which are in conjunctive normal form (case 3) +select * from functional.alltypesagg where month = 1 and (day <=> null or day = 10) +---- PLAN +00:SCAN HDFS [functional.alltypesagg] + partitions=2/11 files=2 size=145.53KB +==== # partition key predicates which are in conjunctive normal form (case 4) select * from functional.alltypesagg where month = 1 and (day is null or year = 2010) ---- PLAN 00:SCAN HDFS [functional.alltypesagg] partitions=11/11 files=11 size=814.73KB ==== +# partition key predicates which are in conjunctive normal form (case 4) +select * from functional.alltypesagg where month = 1 and (day <=> null or year = 2010) +---- PLAN +00:SCAN HDFS [functional.alltypesagg] + partitions=11/11 files=11 size=814.73KB +==== # partition key predicates which are in conjunctive normal form (case 5) select * from functional.alltypesagg where (year = 2010 or month = 1) and (day is not null or day = 10) @@ -314,24 +426,49 @@ where (year = 2010 or month = 1) and (day is not null or day = 10) 00:SCAN HDFS [functional.alltypesagg] partitions=10/11 files=10 size=743.67KB ==== +# partition key predicates which are in conjunctive normal form (case 5) +select * from functional.alltypesagg +where (year = 2010 or month = 1) and (day is distinct from null or day = 10) +---- PLAN +00:SCAN HDFS [functional.alltypesagg] + partitions=10/11 files=10 size=743.67KB +==== # partition key predicates which are in disjunctive normal form (case 1) select * from functional.alltypesagg where day is null or month = 1 ---- PLAN 00:SCAN HDFS [functional.alltypesagg] partitions=11/11 files=11 size=814.73KB ==== +# partition key predicates which are in disjunctive normal form (case 1) +select * from functional.alltypesagg where day <=> null or month = 1 +---- PLAN +00:SCAN HDFS [functional.alltypesagg] + partitions=11/11 files=11 size=814.73KB +==== # partition key predicates which are in disjunctive normal form (case 2) select * from functional.alltypesagg where day is null or day = 10 ---- PLAN 00:SCAN HDFS [functional.alltypesagg] partitions=2/11 files=2 size=145.53KB ==== +# partition key predicates which are in disjunctive normal form (case 2) +select * from functional.alltypesagg where day <=> null or day = 10 +---- PLAN +00:SCAN HDFS [functional.alltypesagg] + partitions=2/11 files=2 size=145.53KB +==== # partition key predicates which are in disjunctive normal form (case 3) select * from functional.alltypesagg where day = 10 or (day is null and year = 2010) ---- PLAN 00:SCAN HDFS [functional.alltypesagg] partitions=2/11 files=2 size=145.53KB ==== +# partition key predicates which are in disjunctive normal form (case 3) +select * from functional.alltypesagg where day = 10 or (day <=> null and year = 2010) +---- PLAN +00:SCAN HDFS [functional.alltypesagg] + partitions=2/11 files=2 size=145.53KB +==== # partition key predicates which are in disjunctive normal form (case 4) select * from functional.alltypesagg where (month = 1 and day = 1) or (day is null and year = 2010) @@ -339,18 +476,37 @@ where (month = 1 and day = 1) or (day is null and year = 2010) 00:SCAN HDFS [functional.alltypesagg] partitions=2/11 files=2 size=144.45KB ==== +# partition key predicates which are in disjunctive normal form (case 4) +select * from functional.alltypesagg +where (month = 1 and day = 1) or (day <=> null and year = 2010) +---- PLAN +00:SCAN HDFS [functional.alltypesagg] + partitions=2/11 files=2 size=144.45KB +==== # partition key predicates with negation (case 1) select * from functional.alltypesagg where not (day is not null) ---- PLAN 00:SCAN HDFS [functional.alltypesagg] partitions=1/11 files=1 size=71.05KB ==== +# partition key predicates with negation (case 1) +select * from functional.alltypesagg where not (day is distinct from null) +---- PLAN +00:SCAN HDFS [functional.alltypesagg] + partitions=1/11 files=1 size=71.05KB +==== # partition key predicates with negation (case 2) select * from functional.alltypesagg where not (not (day is null)) ---- PLAN 00:SCAN HDFS [functional.alltypesagg] partitions=1/11 files=1 size=71.05KB ==== +# partition key predicates with negation (case 2) +select * from functional.alltypesagg where not (not (day <=> null)) +---- PLAN +00:SCAN HDFS [functional.alltypesagg] + partitions=1/11 files=1 size=71.05KB +==== # partition key predicates with negation (case 3) select * from functional.alltypesagg where not (day is not null and month = 1) ---- PLAN @@ -358,11 +514,23 @@ select * from functional.alltypesagg where not (day is not null and month = 1) partitions=1/11 files=1 size=71.05KB ==== # partition key predicates with negation (case 3) +select * from functional.alltypesagg where not (day is distinct from null and month = 1) +---- PLAN +00:SCAN HDFS [functional.alltypesagg] + partitions=1/11 files=1 size=71.05KB +==== +# partition key predicates with negation (case 3) select * from functional.alltypesagg where not (day is not null or day < 9) ---- PLAN 00:SCAN HDFS [functional.alltypesagg] partitions=0/11 files=0 size=0B ==== +# partition key predicates with negation (case 3) +select * from functional.alltypesagg where not (day is distinct from null or day < 9) +---- PLAN +00:SCAN HDFS [functional.alltypesagg] + partitions=0/11 files=0 size=0B +==== # partition key predicates with negation (case 4) select * from functional.alltypesagg where not (day is not null and (not (day < 9 and month = 1))) @@ -370,6 +538,13 @@ where not (day is not null and (not (day < 9 and month = 1))) 00:SCAN HDFS [functional.alltypesagg] partitions=9/11 files=9 size=665.77KB ==== +# partition key predicates with negation (case 4) +select * from functional.alltypesagg +where not (day is distinct from null and (not (day < 9 and month = 1))) +---- PLAN +00:SCAN HDFS [functional.alltypesagg] + partitions=9/11 files=9 size=665.77KB +==== # partition key predicates with negation (case 5) select * from functional.alltypesagg where not (day is not null or (day = 1 and (not (month = 1 or year = 2010)))) @@ -377,6 +552,13 @@ where not (day is not null or (day = 1 and (not (month = 1 or year = 2010)))) 00:SCAN HDFS [functional.alltypesagg] partitions=1/11 files=1 size=71.05KB ==== +# partition key predicates with negation (case 5) +select * from functional.alltypesagg +where not (day is distinct from null or (day = 1 and (not (month = 1 or year = 2010)))) +---- PLAN +00:SCAN HDFS [functional.alltypesagg] + partitions=1/11 files=1 size=71.05KB +==== # partition key predicates where some are evaluated by the index and others are evaluated in the BE select * from functional.alltypesagg where year + 1 = 2011 and month + 1 <= 3 and day is null @@ -384,6 +566,13 @@ where year + 1 = 2011 and month + 1 <= 3 and day is null 00:SCAN HDFS [functional.alltypesagg] partitions=1/11 files=1 size=71.05KB ==== +# partition key predicates where some are evaluated by the index and others are evaluated in the BE +select * from functional.alltypesagg +where year + 1 = 2011 and month + 1 <= 3 and day <=> null +---- PLAN +00:SCAN HDFS [functional.alltypesagg] + partitions=1/11 files=1 size=71.05KB +==== # all supported predicates that can be evaluated using partition key index select * from functional.alltypesagg where day = 5 or (day >= 1 and day <= 2) or (day > 6 and day < 8) @@ -403,6 +592,25 @@ NODE 0: HDFS SPLIT hdfs://localhost:20500/test-warehouse/alltypesagg/year=2010/month=1/day=8/100108.txt 0:76263 HDFS SPLIT hdfs://localhost:20500/test-warehouse/alltypesagg/year=2010/month=1/day=__HIVE_DEFAULT_PARTITION__/000000_0 0:72759 ==== +# all supported predicates that can be evaluated using partition key index +select * from functional.alltypesagg +where day = 5 or (day >= 1 and day <= 2) or (day > 6 and day < 8) +or day <=> null or day in (4) or not(day is distinct from null) +or not (day not in (10)) or not (day != 8) +---- PLAN +00:SCAN HDFS [functional.alltypesagg] + partitions=8/11 files=8 size=591.30KB +---- SCANRANGELOCATIONS +NODE 0: + HDFS SPLIT hdfs://localhost:20500/test-warehouse/alltypesagg/year=2010/month=1/day=1/100101.txt 0:75153 + HDFS SPLIT hdfs://localhost:20500/test-warehouse/alltypesagg/year=2010/month=1/day=10/100110.txt 0:76263 + HDFS SPLIT hdfs://localhost:20500/test-warehouse/alltypesagg/year=2010/month=1/day=2/100102.txt 0:76263 + HDFS SPLIT hdfs://localhost:20500/test-warehouse/alltypesagg/year=2010/month=1/day=4/100104.txt 0:76263 + HDFS SPLIT hdfs://localhost:20500/test-warehouse/alltypesagg/year=2010/month=1/day=5/100105.txt 0:76263 + HDFS SPLIT hdfs://localhost:20500/test-warehouse/alltypesagg/year=2010/month=1/day=7/100107.txt 0:76263 + HDFS SPLIT hdfs://localhost:20500/test-warehouse/alltypesagg/year=2010/month=1/day=8/100108.txt 0:76263 + HDFS SPLIT hdfs://localhost:20500/test-warehouse/alltypesagg/year=2010/month=1/day=__HIVE_DEFAULT_PARTITION__/000000_0 0:72759 +==== # Predicates on a partition key with no values (see CDH-20089) select * from functional.emptytable where f2 = 10 ---- PLAN @@ -471,24 +679,46 @@ select * from scale_db.num_partitions_1234_blocks_per_partition_1 where j = 1 00:SCAN HDFS [scale_db.num_partitions_1234_blocks_per_partition_1] partitions=1/1234 files=1 size=2B ==== +select * from scale_db.num_partitions_1234_blocks_per_partition_1 where j <=> 1 +---- PLAN +00:SCAN HDFS [scale_db.num_partitions_1234_blocks_per_partition_1] + partitions=1/1234 files=1 size=2B +==== # Test disjunctive predicate on a partition column select * from scale_db.num_partitions_1234_blocks_per_partition_1 where j = 1 or j = 2 ---- PLAN 00:SCAN HDFS [scale_db.num_partitions_1234_blocks_per_partition_1] partitions=2/1234 files=2 size=4B ==== +select * from scale_db.num_partitions_1234_blocks_per_partition_1 where j <=> 1 or j <=> 2 +---- PLAN +00:SCAN HDFS [scale_db.num_partitions_1234_blocks_per_partition_1] + partitions=2/1234 files=2 size=4B +==== # Test conjunctive predicate on a partition column select * from scale_db.num_partitions_1234_blocks_per_partition_1 where j = 1 and j = 2 ---- PLAN 00:SCAN HDFS [scale_db.num_partitions_1234_blocks_per_partition_1] partitions=0/1234 files=0 size=0B ==== +select * from scale_db.num_partitions_1234_blocks_per_partition_1 where j <=> 1 and j <=> 2 +---- PLAN +00:SCAN HDFS [scale_db.num_partitions_1234_blocks_per_partition_1] + partitions=0/1234 files=0 size=0B +==== # Partition pruning when a binary predicate contains a NullLiteral (IMPALA-1535) select * from functional.alltypestiny t1 where t1.year != null or t1.year = null ---- PLAN 00:SCAN HDFS [functional.alltypestiny t1] partitions=0/4 files=0 size=0B ==== +# Partition pruning when a binary predicate contains a NullLiteral and IS DISTINCT FROM +select * from functional.alltypestiny t1 +where t1.year IS DISTINCT FROM null or t1.year = null +---- PLAN +00:SCAN HDFS [functional.alltypestiny t1] + partitions=4/4 files=4 size=460B +==== # Partition pruning when a binary predicate contains a NullLiteral in an arithmetic # expression select * from functional.alltypesagg t1 where t1.year + null != t1.day @@ -496,6 +726,13 @@ select * from functional.alltypesagg t1 where t1.year + null != t1.day 00:SCAN HDFS [functional.alltypesagg t1] partitions=0/11 files=0 size=0B ==== +# Partition pruning when a binary predicate contains a NullLiteral in an arithmetic +# expression and IS DISTINCT FROM +select * from functional.alltypesagg t1 where t1.year + null IS DISTINCT FROM t1.day +---- PLAN +00:SCAN HDFS [functional.alltypesagg t1] + partitions=10/11 files=10 size=743.67KB +==== # Partition pruning when an IN predicate contains a NullLiteral # (a single partition is scanned) select * from functional.alltypesagg t1 where day in (10, null) diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/join-order.test b/testdata/workloads/functional-planner/queries/PlannerTest/join-order.test index 5f04c7512..c6d2a7403 100644 --- a/testdata/workloads/functional-planner/queries/PlannerTest/join-order.test +++ b/testdata/workloads/functional-planner/queries/PlannerTest/join-order.test @@ -1264,3 +1264,82 @@ on (t3.string_col = t1.string_col_1 and t3.date_string_col = t1.string_col_2) 01:SCAN HDFS [functional.alltypes t2] partitions=24/24 files=24 size=478.45KB ==== +# Test that filtering with "<=>" sets selectivity, just as "=" does. First, the +# base case: functional.alltypes.timestamp_col has more distinct vals than +# functional.alltypes.date_string_col. As a result, in a left semi join between +# functional.alltypes and itself, if one side of the join is filtered on timestamp_col and +# the other on date_string_col, the one filtered on timestamp_col is expected by the +# planner to be smaller and becomes the build side of the hash join. +select * from functional.alltypes a +left semi join +(select * from functional.alltypes +where timestamp_col = now()) b +on (a.id = b.id) +and a.date_string_col = '' +---- PLAN +02:HASH JOIN [LEFT SEMI JOIN] +| hash predicates: a.id = functional.alltypes.id +| +|--01:SCAN HDFS [functional.alltypes] +| partitions=24/24 files=24 size=478.45KB +| predicates: timestamp_col = now() +| +00:SCAN HDFS [functional.alltypes a] + partitions=24/24 files=24 size=478.45KB + predicates: a.date_string_col = '' +==== +select * from functional.alltypes a +left semi join +(select * from functional.alltypes +where date_string_col = '') b +on (a.id = b.id) +and a.timestamp_col = now() +---- PLAN +02:HASH JOIN [RIGHT SEMI JOIN] +| hash predicates: functional.alltypes.id = a.id +| +|--00:SCAN HDFS [functional.alltypes a] +| partitions=24/24 files=24 size=478.45KB +| predicates: a.timestamp_col = now() +| +01:SCAN HDFS [functional.alltypes] + partitions=24/24 files=24 size=478.45KB + predicates: date_string_col = '' +==== +# The same should hold true when the filtering is done with "<=>" rather than "=". +select * from functional.alltypes a +left semi join +(select * from functional.alltypes +where timestamp_col <=> now()) b +on (a.id = b.id) +and a.date_string_col <=> '' +---- PLAN +02:HASH JOIN [LEFT SEMI JOIN] +| hash predicates: a.id = functional.alltypes.id +| +|--01:SCAN HDFS [functional.alltypes] +| partitions=24/24 files=24 size=478.45KB +| predicates: timestamp_col IS NOT DISTINCT FROM now() +| +00:SCAN HDFS [functional.alltypes a] + partitions=24/24 files=24 size=478.45KB + predicates: a.date_string_col IS NOT DISTINCT FROM '' +==== +select * from functional.alltypes a +left semi join +(select * from functional.alltypes +where date_string_col <=> '') b +on (a.id = b.id) +and a.timestamp_col <=> now() +---- PLAN +02:HASH JOIN [RIGHT SEMI JOIN] +| hash predicates: functional.alltypes.id = a.id +| +|--00:SCAN HDFS [functional.alltypes a] +| partitions=24/24 files=24 size=478.45KB +| predicates: a.timestamp_col IS NOT DISTINCT FROM now() +| +01:SCAN HDFS [functional.alltypes] + partitions=24/24 files=24 size=478.45KB + predicates: date_string_col IS NOT DISTINCT FROM '' +==== diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/joins.test b/testdata/workloads/functional-planner/queries/PlannerTest/joins.test index 68dfa6df5..a90636247 100644 --- a/testdata/workloads/functional-planner/queries/PlannerTest/joins.test +++ b/testdata/workloads/functional-planner/queries/PlannerTest/joins.test @@ -1906,3 +1906,131 @@ where t2.int_col in (t2.int_col, 10); 00:SCAN HDFS [functional.alltypestiny t1] partitions=4/4 files=4 size=460B ==== +# Test queries that appear earlier in this file, but substitute "<=>" or "IS DISTINCT +# FROM" for "=" in the join predicates. +select * +from functional.testtbl t1 join functional.testtbl t2 +where t1.id <=> t2.id and t1.zip = 94611 +---- PLAN +02:HASH JOIN [INNER JOIN] +| hash predicates: t1.id IS NOT DISTINCT FROM t2.id +| +|--01:SCAN HDFS [functional.testtbl t2] +| partitions=1/1 files=0 size=0B +| +00:SCAN HDFS [functional.testtbl t1] + partitions=1/1 files=0 size=0B + predicates: t1.zip = 94611 +==== +select * +from functional.testtbl t1 join functional.testtbl t2 +where t1.id is not distinct from t2.id and t1.zip = 94611 +---- PLAN +02:HASH JOIN [INNER JOIN] +| hash predicates: t1.id IS NOT DISTINCT FROM t2.id +| +|--01:SCAN HDFS [functional.testtbl t2] +| partitions=1/1 files=0 size=0B +| +00:SCAN HDFS [functional.testtbl t1] + partitions=1/1 files=0 size=0B + predicates: t1.zip = 94611 +==== +select * +from functional.testtbl t1 join functional.testtbl t2 +where (t1.id IS DISTINCT FROM t2.id) and t1.zip = 94611 +---- PLAN +02:NESTED LOOP JOIN [INNER JOIN] +| predicates: (t1.id IS DISTINCT FROM t2.id) +| +|--01:SCAN HDFS [functional.testtbl t2] +| partitions=1/1 files=0 size=0B +| +00:SCAN HDFS [functional.testtbl t1] + partitions=1/1 files=0 size=0B + predicates: t1.zip = 94611 +==== +# Test that "is not distinct from" plans the same as "=" in the same query above. +select t1.* +from (select * from functional.alltypestiny) t1 + join (select * from functional.alltypestiny) t2 on (t1.id is not distinct from t2.id) + join functional.alltypestiny t3 on (coalesce(t1.id, t2.id) is not distinct from t3.id) +---- PLAN +04:HASH JOIN [INNER JOIN] +| hash predicates: coalesce(functional.alltypestiny.id, functional.alltypestiny.id) IS NOT DISTINCT FROM t3.id +| +|--02:SCAN HDFS [functional.alltypestiny t3] +| partitions=4/4 files=4 size=460B +| +03:HASH JOIN [INNER JOIN] +| hash predicates: functional.alltypestiny.id IS NOT DISTINCT FROM functional.alltypestiny.id +| +|--01:SCAN HDFS [functional.alltypestiny] +| partitions=4/4 files=4 size=460B +| +00:SCAN HDFS [functional.alltypestiny] + partitions=4/4 files=4 size=460B +==== +select * +from functional.alltypesagg a +full outer join functional.alltypessmall b on (a.id <=> b.id and a.int_col = b.int_col) +right join functional.alltypesaggnonulls c on (a.id = c.id and b.string_col <=> c.string_col) +where a.day >= 6 +and b.month > 2 +and c.day < 3 +and a.tinyint_col = 15 +and b.string_col = '15' +and a.tinyint_col + b.tinyint_col < 15 +and a.float_col - c.double_col < 0 +and (b.double_col * c.tinyint_col > 1000 or c.tinyint_col < 1000) +---- PLAN +04:HASH JOIN [LEFT OUTER JOIN] +| hash predicates: c.id = a.id, c.string_col IS NOT DISTINCT FROM b.string_col +| other predicates: a.day >= 6, b.month > 2, a.tinyint_col = 15, b.string_col = '15', a.tinyint_col + b.tinyint_col < 15, a.float_col - c.double_col < 0, (b.double_col * c.tinyint_col > 1000 OR c.tinyint_col < 1000) +| +|--03:HASH JOIN [FULL OUTER JOIN] +| | hash predicates: a.id IS NOT DISTINCT FROM b.id, a.int_col = b.int_col +| | +| |--01:SCAN HDFS [functional.alltypessmall b] +| | partitions=2/4 files=2 size=3.17KB +| | predicates: b.string_col = '15' +| | +| 00:SCAN HDFS [functional.alltypesagg a] +| partitions=5/11 files=5 size=372.38KB +| predicates: a.tinyint_col = 15 +| +02:SCAN HDFS [functional.alltypesaggnonulls c] + partitions=2/10 files=2 size=148.10KB +==== +select t1.d, t2.d from functional.nulltable t1, functional.nulltable t2 +where not(t1.d IS DISTINCT FROM t2.d) +---- PLAN +02:NESTED LOOP JOIN [INNER JOIN] +| predicates: NOT (t1.d IS DISTINCT FROM t2.d) +| +|--01:SCAN HDFS [functional.nulltable t2] +| partitions=1/1 files=1 size=18B +| +00:SCAN HDFS [functional.nulltable t1] + partitions=1/1 files=1 size=18B +==== +select t1.d, t2.d +from functional.nulltable t1, functional.nulltable t2, functional.nulltable t3 +where t1.d IS DISTINCT FROM t2.d +and t3.a != t2.g +---- PLAN +04:NESTED LOOP JOIN [INNER JOIN] +| predicates: t3.a != t2.g +| +|--02:SCAN HDFS [functional.nulltable t3] +| partitions=1/1 files=1 size=18B +| +03:NESTED LOOP JOIN [INNER JOIN] +| predicates: t1.d IS DISTINCT FROM t2.d +| +|--01:SCAN HDFS [functional.nulltable t2] +| partitions=1/1 files=1 size=18B +| +00:SCAN HDFS [functional.nulltable t1] + partitions=1/1 files=1 size=18B +==== \ No newline at end of file diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/subquery-rewrite.test b/testdata/workloads/functional-planner/queries/PlannerTest/subquery-rewrite.test index 4e25511a8..167c9a259 100644 --- a/testdata/workloads/functional-planner/queries/PlannerTest/subquery-rewrite.test +++ b/testdata/workloads/functional-planner/queries/PlannerTest/subquery-rewrite.test @@ -1497,3 +1497,152 @@ and not exists partitions=11/11 files=11 size=814.73KB predicates: t3.id > 100 ==== +# Tests for <=> (aka IS NOT DISTINCT FROM) and IS DISTINCT FROM +select * from functional.alltypesagg t1 +where t1.id is not distinct from +(select min(id) from functional.alltypes t2 +where t1.int_col is not distinct from t2.int_col); +---- PLAN +03:HASH JOIN [LEFT SEMI JOIN] +| hash predicates: t1.id IS NOT DISTINCT FROM min(id), t1.int_col IS NOT DISTINCT FROM t2.int_col +| +|--02:AGGREGATE [FINALIZE] +| | output: min(id) +| | group by: t2.int_col +| | +| 01:SCAN HDFS [functional.alltypes t2] +| partitions=24/24 files=24 size=478.45KB +| +00:SCAN HDFS [functional.alltypesagg t1] + partitions=11/11 files=11 size=814.73KB +==== +select * from functional.alltypesagg t1 +where t1.id is distinct from +(select min(id) from functional.alltypes t2 +where t1.int_col is not distinct from t2.int_col); +---- PLAN +03:HASH JOIN [LEFT SEMI JOIN] +| hash predicates: t1.int_col IS NOT DISTINCT FROM t2.int_col +| other join predicates: t1.id IS DISTINCT FROM min(id) +| +|--02:AGGREGATE [FINALIZE] +| | output: min(id) +| | group by: t2.int_col +| | +| 01:SCAN HDFS [functional.alltypes t2] +| partitions=24/24 files=24 size=478.45KB +| +00:SCAN HDFS [functional.alltypesagg t1] + partitions=11/11 files=11 size=814.73KB +==== +select * from functional.alltypesagg t1 +where t1.id = +(select min(id) from functional.alltypes t2 +where t1.int_col is not distinct from t2.int_col); +---- PLAN +03:HASH JOIN [LEFT SEMI JOIN] +| hash predicates: t1.id = min(id), t1.int_col IS NOT DISTINCT FROM t2.int_col +| +|--02:AGGREGATE [FINALIZE] +| | output: min(id) +| | group by: t2.int_col +| | +| 01:SCAN HDFS [functional.alltypes t2] +| partitions=24/24 files=24 size=478.45KB +| +00:SCAN HDFS [functional.alltypesagg t1] + partitions=11/11 files=11 size=814.73KB +==== +select * from functional.alltypesagg t1 +where t1.id != +(select min(id) from functional.alltypes t2 +where t1.int_col is not distinct from t2.int_col); +---- PLAN +03:HASH JOIN [LEFT SEMI JOIN] +| hash predicates: t1.int_col IS NOT DISTINCT FROM t2.int_col +| other join predicates: t1.id != min(id) +| +|--02:AGGREGATE [FINALIZE] +| | output: min(id) +| | group by: t2.int_col +| | +| 01:SCAN HDFS [functional.alltypes t2] +| partitions=24/24 files=24 size=478.45KB +| +00:SCAN HDFS [functional.alltypesagg t1] + partitions=11/11 files=11 size=814.73KB +==== +select * from functional.alltypesagg t1 +where t1.id is not distinct from +(select min(id) from functional.alltypes t2 +where t1.int_col = t2.int_col); +---- PLAN +03:HASH JOIN [LEFT SEMI JOIN] +| hash predicates: t1.id IS NOT DISTINCT FROM min(id), t1.int_col = t2.int_col +| +|--02:AGGREGATE [FINALIZE] +| | output: min(id) +| | group by: t2.int_col +| | +| 01:SCAN HDFS [functional.alltypes t2] +| partitions=24/24 files=24 size=478.45KB +| +00:SCAN HDFS [functional.alltypesagg t1] + partitions=11/11 files=11 size=814.73KB +==== +select * from functional.alltypesagg t1 +where t1.id is distinct from +(select min(id) from functional.alltypes t2 +where t1.int_col = t2.int_col); +---- PLAN +03:HASH JOIN [LEFT SEMI JOIN] +| hash predicates: t1.int_col = t2.int_col +| other join predicates: t1.id IS DISTINCT FROM min(id) +| +|--02:AGGREGATE [FINALIZE] +| | output: min(id) +| | group by: t2.int_col +| | +| 01:SCAN HDFS [functional.alltypes t2] +| partitions=24/24 files=24 size=478.45KB +| +00:SCAN HDFS [functional.alltypesagg t1] + partitions=11/11 files=11 size=814.73KB +==== +select * from functional.alltypesagg t1 +where t1.id = +(select min(id) from functional.alltypes t2 +where t1.int_col = t2.int_col); +---- PLAN +03:HASH JOIN [LEFT SEMI JOIN] +| hash predicates: t1.id = min(id), t1.int_col = t2.int_col +| +|--02:AGGREGATE [FINALIZE] +| | output: min(id) +| | group by: t2.int_col +| | +| 01:SCAN HDFS [functional.alltypes t2] +| partitions=24/24 files=24 size=478.45KB +| +00:SCAN HDFS [functional.alltypesagg t1] + partitions=11/11 files=11 size=814.73KB +==== +select * from functional.alltypesagg t1 +where t1.id != +(select min(id) from functional.alltypes t2 +where t1.int_col = t2.int_col); +---- PLAN +03:HASH JOIN [LEFT SEMI JOIN] +| hash predicates: t1.int_col = t2.int_col +| other join predicates: t1.id != min(id) +| +|--02:AGGREGATE [FINALIZE] +| | output: min(id) +| | group by: t2.int_col +| | +| 01:SCAN HDFS [functional.alltypes t2] +| partitions=24/24 files=24 size=478.45KB +| +00:SCAN HDFS [functional.alltypesagg t1] + partitions=11/11 files=11 size=814.73KB +==== \ No newline at end of file diff --git a/testdata/workloads/functional-query/queries/QueryTest/data-source-tables.test b/testdata/workloads/functional-query/queries/QueryTest/data-source-tables.test index ff97ceb2c..c12968d6e 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/data-source-tables.test +++ b/testdata/workloads/functional-query/queries/QueryTest/data-source-tables.test @@ -63,4 +63,68 @@ where string_col = 'VALIDATE_PREDICATES##id LT 1 && id GT 1 && id LE 1 && id GE 'SUCCESS' ---- TYPES STRING -==== \ No newline at end of file +==== +---- QUERY +# Test that <=>, IS DISTINCT FROM, and IS NOT DISTINCT FROM all can be validated +select string_col from alltypes_datasource +where string_col = 'VALIDATE_PREDICATES##id NOT_DISTINCT 1 && id DISTINCT_FROM 1 && id NOT_DISTINCT 1' + and 1 <=> id and 1 IS DISTINCT FROM id and 1 IS NOT DISTINCT FROM id +---- RESULTS +'SUCCESS' +---- TYPES +STRING +==== +---- QUERY +# Test that <=>, IS DISTINCT FROM, and IS NOT DISTINCT FROM are evaluated just like their +# equality counterparts +select * from +(select count(*) from alltypes_datasource +where tinyint_col = 1 and smallint_col = 11) a +union all +(select count(*) from alltypes_datasource +where tinyint_col <=> 1 and smallint_col <=> 11) +---- RESULTS +50 +50 +---- TYPES +BIGINT +==== +---- QUERY +select * from +(select count(*) from alltypes_datasource +where smallint_col = 11 and tinyint_col = 1) a +union all +(select count(*) from alltypes_datasource +where smallint_col <=> 11 and tinyint_col <=> 1) +---- RESULTS +500 +500 +---- TYPES +BIGINT +==== +---- QUERY +select * from +(select count(*) from alltypes_datasource +where tinyint_col != 1 and smallint_col != 11) a +union all +(select count(*) from alltypes_datasource +where tinyint_col IS DISTINCT FROM 1 and smallint_col IS DISTINCT FROM 11) +---- RESULTS +4950 +4950 +---- TYPES +BIGINT +==== +---- QUERY +select * from +(select count(*) from alltypes_datasource +where smallint_col != 11 and tinyint_col != 1) a +union all +(select count(*) from alltypes_datasource +where smallint_col IS DISTINCT FROM 11 and tinyint_col IS DISTINCT FROM 1) +---- RESULTS +4096 +4096 +---- TYPES +BIGINT +==== diff --git a/testdata/workloads/functional-query/queries/QueryTest/exprs.test b/testdata/workloads/functional-query/queries/QueryTest/exprs.test index b5a5de5af..710eb650a 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/exprs.test +++ b/testdata/workloads/functional-query/queries/QueryTest/exprs.test @@ -2338,3 +2338,61 @@ select regexp_match_count(tmp.str, tmp.pattern, tmp.start_pos, tmp.params) from ('iPhone\niPad\niPod', '^I.*$', 1, 'imn')) as tmp ---- CATCH Illegal match parameter x +==== +---- QUERY +# IMPALA-2147: IS [NOT] DISTINCT FROM and "<=>" +select NULL <=> NULL +---- RESULTS +true +---- TYPES +BOOLEAN +==== +---- QUERY +select NULL <=> 1 +---- RESULTS +false +---- TYPES +BOOLEAN +==== +---- QUERY +select NULL <=> "foo" +---- RESULTS +false +---- TYPES +BOOLEAN +==== +---- QUERY +select NULL IS DISTINCT FROM NULL +---- RESULTS +false +---- TYPES +BOOLEAN +==== +---- QUERY +select NULL IS DISTINCT FROM 3.14 +---- RESULTS +true +---- TYPES +BOOLEAN +==== +---- QUERY +select cast(0 as bigint) IS DISTINCT FROM NULL +---- RESULTS +true +---- TYPES +BOOLEAN +==== +---- QUERY +select 2.78 IS DISTINCT FROM 3.14 +---- RESULTS +true +---- TYPES +BOOLEAN +==== +---- QUERY +select 2.78 IS NOT DISTINCT FROM 3.14 +---- RESULTS +false +---- TYPES +BOOLEAN +==== diff --git a/testdata/workloads/functional-query/queries/QueryTest/joins.test b/testdata/workloads/functional-query/queries/QueryTest/joins.test index c880996a5..e570d3587 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/joins.test +++ b/testdata/workloads/functional-query/queries/QueryTest/joins.test @@ -562,3 +562,151 @@ cross join functional.alltypes t2 where t1.id = 0 limit 1; ---- TYPES INT,BOOLEAN ==== +---- QUERY +# IMPALA-2147: IS [NOT] DISTINCT FROM and "<=>" +select count(*) > 0 +from alltypesagg as a, alltypesagg as b +where (a.tinyint_col IS DISTINCT FROM b.tinyint_col) +and a.tinyint_col is null +---- RESULTS +true +---- TYPES +BOOLEAN +==== +---- QUERY +select count(*) > 0 +from alltypesagg as a, alltypesagg as b +where (a.tinyint_col IS NOT DISTINCT FROM b.tinyint_col) +and a.tinyint_col is null +---- RESULTS +true +---- TYPES +BOOLEAN +==== +---- QUERY +select count(*) > 0 +from alltypesagg as a, alltypesagg as b +where (a.tinyint_col <=> b.tinyint_col) +and a.tinyint_col is null +---- RESULTS +true +---- TYPES +BOOLEAN +==== +---- QUERY +# left joins with <=> are different from left joins with = +select P.d, Q.d, Q.b, Q.b is null +from nulltable P left join nulltable Q +on P.d = Q.d +---- RESULTS +NULL,NULL,'NULL',true +---- TYPES +INT,INT,STRING,BOOLEAN +==== +---- QUERY +select P.d, Q.d, Q.b +from nulltable P left join nulltable Q +on P.d <=> Q.d +---- RESULTS +NULL,NULL,'' +---- TYPES +INT,INT,STRING +==== +---- QUERY +select count(*) +from nulltable P left anti join nulltable Q +on P.d = Q.d +---- RESULTS +1 +---- TYPES +BIGINT +==== +---- QUERY +select count(*) +from nulltable P left anti join nulltable Q +on P.d <=> Q.d +---- RESULTS +0 +---- TYPES +BIGINT +==== +---- QUERY +# Test that 'IS DISTINCT FROM' works in nested loop joins +select count(*) from nulltable t1, nulltable t2 where not(t1.d != t2.d) +---- RESULTS +0 +---- TYPES +BIGINT +==== +---- QUERY +select t1.d, t2.d from nulltable t1, nulltable t2 where not(t1.d IS DISTINCT FROM t2.d) +---- RESULTS +NULL,NULL +---- TYPES +INT,INT +==== +---- QUERY +select count(*) from nulltable t1, nulltable t2 +where t1.d != length(t2.a) +---- RESULTS +0 +---- TYPES +BIGINT +==== +---- QUERY +select t1.d, t2.a from nulltable t1, nulltable t2 +where t1.d IS DISTINCT FROM length(t2.a) +---- RESULTS +NULL,'a' +---- TYPES +INT,STRING +==== +---- QUERY +select t1.a, t1.b, t2.a, t2.b from +(values +(NULL a, NULL b), (NULL, 0), (NULL, 1), +(0, NULL), (0, 0), (0, 1), +(1, NULL), (1, 0), (1, 1)) as t1, +(values +(NULL a, NULL b), (NULL, 0), (NULL, 1), +(0, NULL), (0, 0), (0, 1), +(1, NULL), (1, 0), (1, 1)) as t2 +where t1.a <=> t2.a +and t1.b <=> t2.b +order by t1.a, t1.b, t2.a, t2.b +---- RESULTS +0,0,0,0 +0,1,0,1 +0,NULL,0,NULL +1,0,1,0 +1,1,1,1 +1,NULL,1,NULL +NULL,0,NULL,0 +NULL,1,NULL,1 +NULL,NULL,NULL,NULL +---- TYPES +TINYINT,TINYINT,TINYINT,TINYINT +==== +---- QUERY +select t1.a, t1.b, t2.a, t2.b from +(values +(NULL a, NULL b), (NULL, 0), (NULL, 1), +(0, NULL), (0, 0), (0, 1), +(1, NULL), (1, 0), (1, 1)) as t1, +(values +(NULL a, NULL b), (NULL, 0), (NULL, 1), +(0, NULL), (0, 0), (0, 1), +(1, NULL), (1, 0), (1, 1)) as t2 +where t1.a <=> t2.a +and t1.b = t2.b +order by t1.a, t1.b, t2.a, t2.b +---- RESULTS +0,0,0,0 +0,1,0,1 +1,0,1,0 +1,1,1,1 +NULL,0,NULL,0 +NULL,1,NULL,1 +---- TYPES +TINYINT,TINYINT,TINYINT,TINYINT +==== diff --git a/tests/query_test/test_join_queries.py b/tests/query_test/test_join_queries.py index 582343de0..f617ffdeb 100644 --- a/tests/query_test/test_join_queries.py +++ b/tests/query_test/test_join_queries.py @@ -30,7 +30,7 @@ class TestJoinQueries(ImpalaTestSuite): # Cut down on execution time when not running in exhaustive mode. cls.TestMatrix.add_constraint(lambda v: v.get_value('batch_size') != 1) - def test_joins(self, vector): + def test_basic_joins(self, vector): new_vector = copy(vector) new_vector.get_value('exec_option')['batch_size'] = vector.get_value('batch_size') self.run_test_case('QueryTest/joins', new_vector)