diff --git a/be/src/exprs/expr-test.cc b/be/src/exprs/expr-test.cc index 50bbf7f13..3808b4f8b 100644 --- a/be/src/exprs/expr-test.cc +++ b/be/src/exprs/expr-test.cc @@ -1010,6 +1010,10 @@ TEST_F(ExprTest, LikePredicate) { TestValue("'abxcy1234a' RLIKE 'a.x.y.*a'", TYPE_BOOLEAN, true); TestValue("'axcy1234a' REGEXP 'a.x.y.*a'", TYPE_BOOLEAN, false); TestValue("'axcy1234a' RLIKE 'a.x.y.*a'", TYPE_BOOLEAN, false); + TestValue("'english' REGEXP 'en'", TYPE_BOOLEAN, true); + TestValue("'english' REGEXP 'lis'", TYPE_BOOLEAN, true); + TestValue("'english' REGEXP 'english'", TYPE_BOOLEAN, true); + TestValue("'english' REGEXP 'engilsh'", TYPE_BOOLEAN, false); // regex escape chars; insert special character in the middle to prevent // it from being matched as a substring TestValue("'.[]{}()x\\\\*+?|^$' LIKE '.[]{}()_\\\\\\\\*+?|^$'", TYPE_BOOLEAN, true); diff --git a/be/src/exprs/like-predicate.cc b/be/src/exprs/like-predicate.cc index 03c5b961a..5048b6273 100644 --- a/be/src/exprs/like-predicate.cc +++ b/be/src/exprs/like-predicate.cc @@ -85,6 +85,17 @@ void* LikePredicate::ConstantEqualsFn(Expr* e, TupleRow* row) { return &p->result_.bool_val; } +void* LikePredicate::ConstantRegexFnPartial(Expr* e, TupleRow* row) { + LikePredicate* p = static_cast(e); + DCHECK_EQ(p->GetNumChildren(), 2); + StringValue* operand_val = static_cast(e->GetChild(0)->GetValue(row)); + if (operand_val == NULL) return NULL; + + re2::StringPiece operand_sp(operand_val->ptr, operand_val->len); + p->result_.bool_val = RE2::PartialMatch(operand_sp, *p->regex_); + return &p->result_.bool_val; +} + void* LikePredicate::ConstantRegexFn(Expr* e, TupleRow* row) { LikePredicate* p = static_cast(e); DCHECK_EQ(p->GetNumChildren(), 2); @@ -110,8 +121,13 @@ void* LikePredicate::RegexMatch(Expr* e, TupleRow* row, bool is_like_pattern) { } re2::RE2 re(re_pattern); if (re.ok()) { - p->result_.bool_val = - RE2::FullMatch(re2::StringPiece(operand_value->ptr, operand_value->len), re); + if (is_like_pattern) { + 
p->result_.bool_val = + RE2::FullMatch(re2::StringPiece(operand_value->ptr, operand_value->len), re); + } else { + p->result_.bool_val = + RE2::PartialMatch(re2::StringPiece(operand_value->ptr, operand_value->len), re); + } return &p->result_.bool_val; } else { // TODO: log error in runtime state @@ -128,6 +144,9 @@ void* LikePredicate::RegexFn(Expr* e, TupleRow* row) { return RegexMatch(e, row, false); } +// There is a difference in the semantics of LIKE and REGEXP +// LIKE only requires explicit use of '%' to perform partial matches +// REGEXP does partial matching by default Status LikePredicate::Prepare(RuntimeState* state, const RowDescriptor& row_desc) { RETURN_IF_ERROR(Expr::PrepareChildren(state, row_desc)); DCHECK_EQ(children_.size(), 2); @@ -192,7 +211,12 @@ Status LikePredicate::Prepare(RuntimeState* state, const RowDescriptor& row_desc } regex_.reset(new RE2(re_pattern)); if (!regex_->ok()) return Status("Invalid regular expression: " + pattern_str); - compute_fn_ = ConstantRegexFn; + if (fn_.name.function_name == "regexp" || fn_.name.function_name == "rlike") { + compute_fn_ = ConstantRegexFnPartial; + } else { + compute_fn_ = ConstantRegexFn; + } + } return Status::OK; } diff --git a/be/src/exprs/like-predicate.h b/be/src/exprs/like-predicate.h index 343c162ff..c11d37e7b 100644 --- a/be/src/exprs/like-predicate.h +++ b/be/src/exprs/like-predicate.h @@ -17,7 +17,7 @@ #define IMPALA_EXPRS_LIKE_PREDICATE_H_ #include -#include +#include #include "exprs/predicate.h" #include "gen-cpp/Exprs_types.h" @@ -56,7 +56,7 @@ class LikePredicate: public Predicate { // Handling of like predicates that can be implemented using strncmp static void* ConstantStartsWithFn(Expr* e, TupleRow* row); - + // Handling of like predicates that can be implemented using strncmp static void* ConstantEndsWithFn(Expr* e, TupleRow* row); @@ -64,6 +64,7 @@ class LikePredicate: public Predicate { static void* ConstantEqualsFn(Expr* e, TupleRow* row); static void* ConstantRegexFn(Expr* e, 
TupleRow* row); + static void* ConstantRegexFnPartial(Expr* e, TupleRow* row); static void* LikeFn(Expr* e, TupleRow* row); static void* RegexFn(Expr* e, TupleRow* row); static void* RegexMatch(Expr* e, TupleRow* row, bool is_like_pattern); diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/tpch-all.test b/testdata/workloads/functional-planner/queries/PlannerTest/tpch-all.test index 6f1e8bbfe..7c1746801 100644 --- a/testdata/workloads/functional-planner/queries/PlannerTest/tpch-all.test +++ b/testdata/workloads/functional-planner/queries/PlannerTest/tpch-all.test @@ -1746,28 +1746,40 @@ join tpch.part p where ( p_brand = 'Brand#12' - and p_container REGEXP 'SM CASE||SM BOX||SM PACK||SM PKG' + and (p_container LIKE 'SM CASE' or + p_container LIKE 'SM BOX' or + p_container LIKE 'SM PACK' or + p_container LIKE 'SM PKG') and l_quantity >= 1 and l_quantity <= 11 and p_size >= 1 and p_size <= 5 - and l_shipmode REGEXP 'AIR||AIR REG' + and (l_shipmode LIKE 'AIR' or + l_shipmode LIKE 'AIR REG') and l_shipinstruct = 'DELIVER IN PERSON' ) or ( p_brand = 'Brand#23' - and p_container REGEXP 'MED BAG||MED BOX||MED PKG||MED PACK' + and (p_container LIKE 'MED BAG' or + p_container LIKE 'MED BOX' or + p_container LIKE 'MED PKG' or + p_container LIKE 'MED PACK') and l_quantity >= 10 and l_quantity <= 20 and p_size >= 1 and p_size <= 10 - and l_shipmode REGEXP 'AIR||AIR REG' + and (l_shipmode LIKE 'AIR' or + l_shipmode LIKE 'AIR REG') and l_shipinstruct = 'DELIVER IN PERSON' ) or ( p_brand = 'Brand#34' - and p_container REGEXP 'LG CASE||LG BOX||LG PACK||LG PKG' + and (p_container LIKE 'LG BAG' or + p_container LIKE 'LG BOX' or + p_container LIKE 'LG PKG' or + p_container LIKE 'LG PACK') and l_quantity >= 20 and l_quantity <= 30 and p_size >= 1 and p_size <= 15 - and l_shipmode REGEXP 'AIR||AIR REG' + and (l_shipmode LIKE 'AIR' or + l_shipmode LIKE 'AIR REG') and l_shipinstruct = 'DELIVER IN PERSON' ) ---- PLAN @@ -1776,7 +1788,7 @@ or | 02:HASH JOIN [INNER JOIN] 
| hash predicates: l.l_partkey = p.p_partkey -| other predicates: (p_brand = 'Brand#12' AND p_container REGEXP 'SM CASE||SM BOX||SM PACK||SM PKG' AND l_quantity >= 1.0 AND l_quantity <= 11.0 AND p_size >= 1 AND p_size <= 5 AND l_shipmode REGEXP 'AIR||AIR REG' AND l_shipinstruct = 'DELIVER IN PERSON') OR (p_brand = 'Brand#23' AND p_container REGEXP 'MED BAG||MED BOX||MED PKG||MED PACK' AND l_quantity >= 10.0 AND l_quantity <= 20.0 AND p_size >= 1 AND p_size <= 10 AND l_shipmode REGEXP 'AIR||AIR REG' AND l_shipinstruct = 'DELIVER IN PERSON') OR (p_brand = 'Brand#34' AND p_container REGEXP 'LG CASE||LG BOX||LG PACK||LG PKG' AND l_quantity >= 20.0 AND l_quantity <= 30.0 AND p_size >= 1 AND p_size <= 15 AND l_shipmode REGEXP 'AIR||AIR REG' AND l_shipinstruct = 'DELIVER IN PERSON') +| other predicates: (p_brand = 'Brand#12' AND (p_container LIKE 'SM CASE' OR p_container LIKE 'SM BOX' OR p_container LIKE 'SM PACK' OR p_container LIKE 'SM PKG') AND l_quantity >= 1.0 AND l_quantity <= 11.0 AND p_size >= 1 AND p_size <= 5 AND (l_shipmode LIKE 'AIR' OR l_shipmode LIKE 'AIR REG') AND l_shipinstruct = 'DELIVER IN PERSON') OR (p_brand = 'Brand#23' AND (p_container LIKE 'MED BAG' OR p_container LIKE 'MED BOX' OR p_container LIKE 'MED PKG' OR p_container LIKE 'MED PACK') AND l_quantity >= 10.0 AND l_quantity <= 20.0 AND p_size >= 1 AND p_size <= 10 AND (l_shipmode LIKE 'AIR' OR l_shipmode LIKE 'AIR REG') AND l_shipinstruct = 'DELIVER IN PERSON') OR (p_brand = 'Brand#34' AND (p_container LIKE 'LG BAG' OR p_container LIKE 'LG BOX' OR p_container LIKE 'LG PKG' OR p_container LIKE 'LG PACK') AND l_quantity >= 20.0 AND l_quantity <= 30.0 AND p_size >= 1 AND p_size <= 15 AND (l_shipmode LIKE 'AIR' OR l_shipmode LIKE 'AIR REG') AND l_shipinstruct = 'DELIVER IN PERSON') | |--01:SCAN HDFS [tpch.part p] | partitions=1/1 size=22.83MB compact @@ -1794,7 +1806,7 @@ or | 02:HASH JOIN [INNER JOIN, BROADCAST] | hash predicates: l.l_partkey = p.p_partkey -| other predicates: (p_brand = 'Brand#12' 
AND p_container REGEXP 'SM CASE||SM BOX||SM PACK||SM PKG' AND l_quantity >= 1.0 AND l_quantity <= 11.0 AND p_size >= 1 AND p_size <= 5 AND l_shipmode REGEXP 'AIR||AIR REG' AND l_shipinstruct = 'DELIVER IN PERSON') OR (p_brand = 'Brand#23' AND p_container REGEXP 'MED BAG||MED BOX||MED PKG||MED PACK' AND l_quantity >= 10.0 AND l_quantity <= 20.0 AND p_size >= 1 AND p_size <= 10 AND l_shipmode REGEXP 'AIR||AIR REG' AND l_shipinstruct = 'DELIVER IN PERSON') OR (p_brand = 'Brand#34' AND p_container REGEXP 'LG CASE||LG BOX||LG PACK||LG PKG' AND l_quantity >= 20.0 AND l_quantity <= 30.0 AND p_size >= 1 AND p_size <= 15 AND l_shipmode REGEXP 'AIR||AIR REG' AND l_shipinstruct = 'DELIVER IN PERSON') +| other predicates: (p_brand = 'Brand#12' AND (p_container LIKE 'SM CASE' OR p_container LIKE 'SM BOX' OR p_container LIKE 'SM PACK' OR p_container LIKE 'SM PKG') AND l_quantity >= 1.0 AND l_quantity <= 11.0 AND p_size >= 1 AND p_size <= 5 AND (l_shipmode LIKE 'AIR' OR l_shipmode LIKE 'AIR REG') AND l_shipinstruct = 'DELIVER IN PERSON') OR (p_brand = 'Brand#23' AND (p_container LIKE 'MED BAG' OR p_container LIKE 'MED BOX' OR p_container LIKE 'MED PKG' OR p_container LIKE 'MED PACK') AND l_quantity >= 10.0 AND l_quantity <= 20.0 AND p_size >= 1 AND p_size <= 10 AND (l_shipmode LIKE 'AIR' OR l_shipmode LIKE 'AIR REG') AND l_shipinstruct = 'DELIVER IN PERSON') OR (p_brand = 'Brand#34' AND (p_container LIKE 'LG BAG' OR p_container LIKE 'LG BOX' OR p_container LIKE 'LG PKG' OR p_container LIKE 'LG PACK') AND l_quantity >= 20.0 AND l_quantity <= 30.0 AND p_size >= 1 AND p_size <= 15 AND (l_shipmode LIKE 'AIR' OR l_shipmode LIKE 'AIR REG') AND l_shipinstruct = 'DELIVER IN PERSON') | |--04:EXCHANGE [BROADCAST] | | diff --git a/testdata/workloads/tpch/queries/tpch-q19.test b/testdata/workloads/tpch/queries/tpch-q19.test index d519ff084..851ba0ae2 100644 --- a/testdata/workloads/tpch/queries/tpch-q19.test +++ b/testdata/workloads/tpch/queries/tpch-q19.test @@ -9,28 +9,40 @@ join part p 
where ( p_brand = 'Brand#12' - and p_container REGEXP 'SM CASE||SM BOX||SM PACK||SM PKG' + and (p_container = 'SM CASE' or + p_container = 'SM BOX' or + p_container = 'SM PACK' or + p_container = 'SM PKG') and l_quantity >= 1 and l_quantity <= 11 and p_size >= 1 and p_size <= 5 - and l_shipmode REGEXP 'AIR||AIR REG' + and (l_shipmode = 'AIR' or + l_shipmode = 'AIR REG') and l_shipinstruct = 'DELIVER IN PERSON' ) or ( p_brand = 'Brand#23' - and p_container REGEXP 'MED BAG||MED BOX||MED PKG||MED PACK' + and (p_container = 'MED BAG' or + p_container = 'MED BOX' or + p_container = 'MED PKG' or + p_container = 'MED PACK') and l_quantity >= 10 and l_quantity <= 20 and p_size >= 1 and p_size <= 10 - and l_shipmode REGEXP 'AIR||AIR REG' + and (l_shipmode = 'AIR' or + l_shipmode = 'AIR REG') and l_shipinstruct = 'DELIVER IN PERSON' ) or ( p_brand = 'Brand#34' - and p_container REGEXP 'LG CASE||LG BOX||LG PACK||LG PKG' + and (p_container = 'LG CASE' or + p_container = 'LG BOX' or + p_container = 'LG PKG' or + p_container = 'LG PACK') and l_quantity >= 20 and l_quantity <= 30 and p_size >= 1 and p_size <= 15 - and l_shipmode REGEXP 'AIR||AIR REG' + and (l_shipmode = 'AIR' or + l_shipmode = 'AIR REG') and l_shipinstruct = 'DELIVER IN PERSON' ) ---- TYPES