IMPALA-939: Regex should match anywhere in string.

Change-Id: I8dcd337c3b06b632017270670a4f199ec7ada648
Reviewed-on: http://gerrit.ent.cloudera.com:8080/2296
Reviewed-by: Victor Bittorf <victor.bittorf@cloudera.com>
Tested-by: jenkins
(cherry picked from commit c97f82eaaf0efe9bd4c3da3d005464f425696a62)
Reviewed-on: http://gerrit.ent.cloudera.com:8080/2371
This commit is contained in:
Victor Bittorf
2014-04-14 16:03:12 -07:00
committed by jenkins
parent 46151dc7dd
commit 808f9a661a
5 changed files with 72 additions and 19 deletions

View File

@@ -1010,6 +1010,10 @@ TEST_F(ExprTest, LikePredicate) {
TestValue("'abxcy1234a' RLIKE 'a.x.y.*a'", TYPE_BOOLEAN, true);
TestValue("'axcy1234a' REGEXP 'a.x.y.*a'", TYPE_BOOLEAN, false);
TestValue("'axcy1234a' RLIKE 'a.x.y.*a'", TYPE_BOOLEAN, false);
TestValue("'english' REGEXP 'en'", TYPE_BOOLEAN, true);
TestValue("'english' REGEXP 'lis'", TYPE_BOOLEAN, true);
TestValue("'english' REGEXP 'english'", TYPE_BOOLEAN, true);
TestValue("'english' REGEXP 'engilsh'", TYPE_BOOLEAN, false);
// regex escape chars; insert special character in the middle to prevent
// it from being matched as a substring
TestValue("'.[]{}()x\\\\*+?|^$' LIKE '.[]{}()_\\\\\\\\*+?|^$'", TYPE_BOOLEAN, true);

View File

@@ -85,6 +85,17 @@ void* LikePredicate::ConstantEqualsFn(Expr* e, TupleRow* row) {
return &p->result_.bool_val;
}
// Compute function for predicates whose pattern has already been compiled into
// regex_. Evaluates child 0 against 'row' and tests whether the compiled regex
// matches anywhere inside the resulting string (RE2::PartialMatch), as opposed
// to requiring the whole string to match.
// Returns NULL if the operand evaluates to NULL; otherwise stores the outcome in
// result_.bool_val and returns a pointer to it.
void* LikePredicate::ConstantRegexFnPartial(Expr* e, TupleRow* row) {
  LikePredicate* pred = static_cast<LikePredicate*>(e);
  DCHECK_EQ(pred->GetNumChildren(), 2);

  // Child 0 is the string operand being tested; a NULL operand yields a NULL result.
  StringValue* input = static_cast<StringValue*>(e->GetChild(0)->GetValue(row));
  if (input == NULL) {
    return NULL;
  }

  // Wrap the operand's bytes without copying and search for the pattern within them.
  pred->result_.bool_val =
      RE2::PartialMatch(re2::StringPiece(input->ptr, input->len), *pred->regex_);
  return &pred->result_.bool_val;
}
void* LikePredicate::ConstantRegexFn(Expr* e, TupleRow* row) {
LikePredicate* p = static_cast<LikePredicate*>(e);
DCHECK_EQ(p->GetNumChildren(), 2);
@@ -110,8 +121,13 @@ void* LikePredicate::RegexMatch(Expr* e, TupleRow* row, bool is_like_pattern) {
}
re2::RE2 re(re_pattern);
if (re.ok()) {
p->result_.bool_val =
RE2::FullMatch(re2::StringPiece(operand_value->ptr, operand_value->len), re);
if (is_like_pattern) {
p->result_.bool_val =
RE2::FullMatch(re2::StringPiece(operand_value->ptr, operand_value->len), re);
} else {
p->result_.bool_val =
RE2::PartialMatch(re2::StringPiece(operand_value->ptr, operand_value->len), re);
}
return &p->result_.bool_val;
} else {
// TODO: log error in runtime state
@@ -128,6 +144,9 @@ void* LikePredicate::RegexFn(Expr* e, TupleRow* row) {
return RegexMatch(e, row, false);
}
// LIKE and REGEXP have different matching semantics:
// LIKE must match the entire string; partial matches require explicit use of '%'.
// REGEXP performs partial matching by default (the pattern may match anywhere).
Status LikePredicate::Prepare(RuntimeState* state, const RowDescriptor& row_desc) {
RETURN_IF_ERROR(Expr::PrepareChildren(state, row_desc));
DCHECK_EQ(children_.size(), 2);
@@ -192,7 +211,12 @@ Status LikePredicate::Prepare(RuntimeState* state, const RowDescriptor& row_desc
}
regex_.reset(new RE2(re_pattern));
if (!regex_->ok()) return Status("Invalid regular expression: " + pattern_str);
compute_fn_ = ConstantRegexFn;
if (fn_.name.function_name == "regexp" || fn_.name.function_name == "rlike") {
compute_fn_ = ConstantRegexFnPartial;
} else {
compute_fn_ = ConstantRegexFn;
}
}
return Status::OK;
}

View File

@@ -17,7 +17,7 @@
#define IMPALA_EXPRS_LIKE_PREDICATE_H_
#include <string>
#include <boost/scoped_ptr.hpp>
#include <boost/scoped_ptr.hpp>
#include "exprs/predicate.h"
#include "gen-cpp/Exprs_types.h"
@@ -56,7 +56,7 @@ class LikePredicate: public Predicate {
// Handling of like predicates that can be implemented using strncmp
static void* ConstantStartsWithFn(Expr* e, TupleRow* row);
// Handling of like predicates that can be implemented using strncmp
static void* ConstantEndsWithFn(Expr* e, TupleRow* row);
@@ -64,6 +64,7 @@ class LikePredicate: public Predicate {
static void* ConstantEqualsFn(Expr* e, TupleRow* row);
static void* ConstantRegexFn(Expr* e, TupleRow* row);
static void* ConstantRegexFnPartial(Expr* e, TupleRow* row);
static void* LikeFn(Expr* e, TupleRow* row);
static void* RegexFn(Expr* e, TupleRow* row);
static void* RegexMatch(Expr* e, TupleRow* row, bool is_like_pattern);

View File

@@ -1746,28 +1746,40 @@ join tpch.part p
where
(
p_brand = 'Brand#12'
and p_container REGEXP 'SM CASE||SM BOX||SM PACK||SM PKG'
and (p_container LIKE 'SM CASE' or
p_container LIKE 'SM BOX' or
p_container LIKE 'SM PACK' or
p_container LIKE 'SM PKG')
and l_quantity >= 1 and l_quantity <= 11
and p_size >= 1 and p_size <= 5
and l_shipmode REGEXP 'AIR||AIR REG'
and (l_shipmode LIKE 'AIR' or
l_shipmode LIKE 'AIR REG')
and l_shipinstruct = 'DELIVER IN PERSON'
)
or
(
p_brand = 'Brand#23'
and p_container REGEXP 'MED BAG||MED BOX||MED PKG||MED PACK'
and (p_container LIKE 'MED BAG' or
p_container LIKE 'MED BOX' or
p_container LIKE 'MED PKG' or
p_container LIKE 'MED PACK')
and l_quantity >= 10 and l_quantity <= 20
and p_size >= 1 and p_size <= 10
and l_shipmode REGEXP 'AIR||AIR REG'
and (l_shipmode LIKE 'AIR' or
l_shipmode LIKE 'AIR REG')
and l_shipinstruct = 'DELIVER IN PERSON'
)
or
(
p_brand = 'Brand#34'
and p_container REGEXP 'LG CASE||LG BOX||LG PACK||LG PKG'
and (p_container LIKE 'LG BAG' or
p_container LIKE 'LG BOX' or
p_container LIKE 'LG PKG' or
p_container LIKE 'LG PACK')
and l_quantity >= 20 and l_quantity <= 30
and p_size >= 1 and p_size <= 15
and l_shipmode REGEXP 'AIR||AIR REG'
and (l_shipmode LIKE 'AIR' or
l_shipmode LIKE 'AIR REG')
and l_shipinstruct = 'DELIVER IN PERSON'
)
---- PLAN
@@ -1776,7 +1788,7 @@ or
|
02:HASH JOIN [INNER JOIN]
| hash predicates: l.l_partkey = p.p_partkey
| other predicates: (p_brand = 'Brand#12' AND p_container REGEXP 'SM CASE||SM BOX||SM PACK||SM PKG' AND l_quantity >= 1.0 AND l_quantity <= 11.0 AND p_size >= 1 AND p_size <= 5 AND l_shipmode REGEXP 'AIR||AIR REG' AND l_shipinstruct = 'DELIVER IN PERSON') OR (p_brand = 'Brand#23' AND p_container REGEXP 'MED BAG||MED BOX||MED PKG||MED PACK' AND l_quantity >= 10.0 AND l_quantity <= 20.0 AND p_size >= 1 AND p_size <= 10 AND l_shipmode REGEXP 'AIR||AIR REG' AND l_shipinstruct = 'DELIVER IN PERSON') OR (p_brand = 'Brand#34' AND p_container REGEXP 'LG CASE||LG BOX||LG PACK||LG PKG' AND l_quantity >= 20.0 AND l_quantity <= 30.0 AND p_size >= 1 AND p_size <= 15 AND l_shipmode REGEXP 'AIR||AIR REG' AND l_shipinstruct = 'DELIVER IN PERSON')
| other predicates: (p_brand = 'Brand#12' AND (p_container LIKE 'SM CASE' OR p_container LIKE 'SM BOX' OR p_container LIKE 'SM PACK' OR p_container LIKE 'SM PKG') AND l_quantity >= 1.0 AND l_quantity <= 11.0 AND p_size >= 1 AND p_size <= 5 AND (l_shipmode LIKE 'AIR' OR l_shipmode LIKE 'AIR REG') AND l_shipinstruct = 'DELIVER IN PERSON') OR (p_brand = 'Brand#23' AND (p_container LIKE 'MED BAG' OR p_container LIKE 'MED BOX' OR p_container LIKE 'MED PKG' OR p_container LIKE 'MED PACK') AND l_quantity >= 10.0 AND l_quantity <= 20.0 AND p_size >= 1 AND p_size <= 10 AND (l_shipmode LIKE 'AIR' OR l_shipmode LIKE 'AIR REG') AND l_shipinstruct = 'DELIVER IN PERSON') OR (p_brand = 'Brand#34' AND (p_container LIKE 'LG BAG' OR p_container LIKE 'LG BOX' OR p_container LIKE 'LG PKG' OR p_container LIKE 'LG PACK') AND l_quantity >= 20.0 AND l_quantity <= 30.0 AND p_size >= 1 AND p_size <= 15 AND (l_shipmode LIKE 'AIR' OR l_shipmode LIKE 'AIR REG') AND l_shipinstruct = 'DELIVER IN PERSON')
|
|--01:SCAN HDFS [tpch.part p]
| partitions=1/1 size=22.83MB compact
@@ -1794,7 +1806,7 @@ or
|
02:HASH JOIN [INNER JOIN, BROADCAST]
| hash predicates: l.l_partkey = p.p_partkey
| other predicates: (p_brand = 'Brand#12' AND p_container REGEXP 'SM CASE||SM BOX||SM PACK||SM PKG' AND l_quantity >= 1.0 AND l_quantity <= 11.0 AND p_size >= 1 AND p_size <= 5 AND l_shipmode REGEXP 'AIR||AIR REG' AND l_shipinstruct = 'DELIVER IN PERSON') OR (p_brand = 'Brand#23' AND p_container REGEXP 'MED BAG||MED BOX||MED PKG||MED PACK' AND l_quantity >= 10.0 AND l_quantity <= 20.0 AND p_size >= 1 AND p_size <= 10 AND l_shipmode REGEXP 'AIR||AIR REG' AND l_shipinstruct = 'DELIVER IN PERSON') OR (p_brand = 'Brand#34' AND p_container REGEXP 'LG CASE||LG BOX||LG PACK||LG PKG' AND l_quantity >= 20.0 AND l_quantity <= 30.0 AND p_size >= 1 AND p_size <= 15 AND l_shipmode REGEXP 'AIR||AIR REG' AND l_shipinstruct = 'DELIVER IN PERSON')
| other predicates: (p_brand = 'Brand#12' AND (p_container LIKE 'SM CASE' OR p_container LIKE 'SM BOX' OR p_container LIKE 'SM PACK' OR p_container LIKE 'SM PKG') AND l_quantity >= 1.0 AND l_quantity <= 11.0 AND p_size >= 1 AND p_size <= 5 AND (l_shipmode LIKE 'AIR' OR l_shipmode LIKE 'AIR REG') AND l_shipinstruct = 'DELIVER IN PERSON') OR (p_brand = 'Brand#23' AND (p_container LIKE 'MED BAG' OR p_container LIKE 'MED BOX' OR p_container LIKE 'MED PKG' OR p_container LIKE 'MED PACK') AND l_quantity >= 10.0 AND l_quantity <= 20.0 AND p_size >= 1 AND p_size <= 10 AND (l_shipmode LIKE 'AIR' OR l_shipmode LIKE 'AIR REG') AND l_shipinstruct = 'DELIVER IN PERSON') OR (p_brand = 'Brand#34' AND (p_container LIKE 'LG BAG' OR p_container LIKE 'LG BOX' OR p_container LIKE 'LG PKG' OR p_container LIKE 'LG PACK') AND l_quantity >= 20.0 AND l_quantity <= 30.0 AND p_size >= 1 AND p_size <= 15 AND (l_shipmode LIKE 'AIR' OR l_shipmode LIKE 'AIR REG') AND l_shipinstruct = 'DELIVER IN PERSON')
|
|--04:EXCHANGE [BROADCAST]
| |

View File

@@ -9,28 +9,40 @@ join part p
where
(
p_brand = 'Brand#12'
and p_container REGEXP 'SM CASE||SM BOX||SM PACK||SM PKG'
and (p_container = 'SM CASE' or
p_container = 'SM BOX' or
p_container = 'SM PACK' or
p_container = 'SM PKG')
and l_quantity >= 1 and l_quantity <= 11
and p_size >= 1 and p_size <= 5
and l_shipmode REGEXP 'AIR||AIR REG'
and (l_shipmode = 'AIR' or
l_shipmode = 'AIR REG')
and l_shipinstruct = 'DELIVER IN PERSON'
)
or
(
p_brand = 'Brand#23'
and p_container REGEXP 'MED BAG||MED BOX||MED PKG||MED PACK'
and (p_container = 'MED BAG' or
p_container = 'MED BOX' or
p_container = 'MED PKG' or
p_container = 'MED PACK')
and l_quantity >= 10 and l_quantity <= 20
and p_size >= 1 and p_size <= 10
and l_shipmode REGEXP 'AIR||AIR REG'
and (l_shipmode = 'AIR' or
l_shipmode = 'AIR REG')
and l_shipinstruct = 'DELIVER IN PERSON'
)
or
(
p_brand = 'Brand#34'
and p_container REGEXP 'LG CASE||LG BOX||LG PACK||LG PKG'
and (p_container = 'LG CASE' or
p_container = 'LG BOX' or
p_container = 'LG PKG' or
p_container = 'LG PACK')
and l_quantity >= 20 and l_quantity <= 30
and p_size >= 1 and p_size <= 15
and l_shipmode REGEXP 'AIR||AIR REG'
and (l_shipmode = 'AIR' or
l_shipmode = 'AIR REG')
and l_shipinstruct = 'DELIVER IN PERSON'
)
---- TYPES