mirror of
https://github.com/apache/impala.git
synced 2025-12-22 03:18:15 -05:00
IMPALA-3973: add position and occurrence to instr()
Change-Id: Ie9648de458d243306fa14adc5e7f7002bf6f67fd Reviewed-on: http://gerrit.cloudera.org:8080/4094 Tested-by: Internal Jenkins Reviewed-by: Matthew Jacobs <mj@cloudera.com>
This commit is contained in:
committed by
Matthew Jacobs
parent
94fc6a3d07
commit
64ffbc4b51
@@ -38,4 +38,4 @@ target_link_libraries(tuple-splitter-test Experiments ${IMPALA_LINK_LIBS})
|
|||||||
target_link_libraries(hash-partition-test ${IMPALA_LINK_LIBS})
|
target_link_libraries(hash-partition-test ${IMPALA_LINK_LIBS})
|
||||||
target_link_libraries(compression-test ${IMPALA_LINK_LIBS})
|
target_link_libraries(compression-test ${IMPALA_LINK_LIBS})
|
||||||
|
|
||||||
ADD_BE_TEST(string-search-test)
|
ADD_BE_TEST(string-search-sse-test)
|
||||||
|
|||||||
@@ -15,8 +15,8 @@
|
|||||||
// specific language governing permissions and limitations
|
// specific language governing permissions and limitations
|
||||||
// under the License.
|
// under the License.
|
||||||
|
|
||||||
#include <string>
|
|
||||||
#include <gtest/gtest.h>
|
#include <gtest/gtest.h>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
#include "experiments/string-search-sse.h"
|
#include "experiments/string-search-sse.h"
|
||||||
|
|
||||||
@@ -27,9 +27,8 @@ namespace impala {
|
|||||||
// Test string search for haystack/needle, up to len for each.
|
// Test string search for haystack/needle, up to len for each.
|
||||||
// If the length is -1, use the full string length
|
// If the length is -1, use the full string length
|
||||||
// haystack/needle are null terminated
|
// haystack/needle are null terminated
|
||||||
void TestSearch(const char* haystack_orig, const char* needle_orig,
|
void TestSearch(const char* haystack_orig, const char* needle_orig, int haystack_len = -1,
|
||||||
int haystack_len = -1, int needle_len = -1) {
|
int needle_len = -1) {
|
||||||
|
|
||||||
string haystack_copy(haystack_orig);
|
string haystack_copy(haystack_orig);
|
||||||
string needle_copy(needle_orig);
|
string needle_copy(needle_orig);
|
||||||
const char* haystack = haystack_copy.c_str();
|
const char* haystack = haystack_copy.c_str();
|
||||||
@@ -79,7 +78,6 @@ void TestSearch(const char* haystack_orig, const char* needle_orig,
|
|||||||
EXPECT_EQ(strcmp(needle, needle_orig), 0);
|
EXPECT_EQ(strcmp(needle, needle_orig), 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
TEST(StringSearchTest, Basic) {
|
TEST(StringSearchTest, Basic) {
|
||||||
// Basic Tests
|
// Basic Tests
|
||||||
TestSearch("abcd", "a");
|
TestSearch("abcd", "a");
|
||||||
@@ -113,11 +111,9 @@ TEST(StringSearchTest, Basic) {
|
|||||||
TestSearch("abcdede", "cde");
|
TestSearch("abcdede", "cde");
|
||||||
TestSearch("abcdacde", "cde");
|
TestSearch("abcdacde", "cde");
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int main(int argc, char **argv) {
|
int main(int argc, char** argv) {
|
||||||
::testing::InitGoogleTest(&argc, argv);
|
::testing::InitGoogleTest(&argc, argv);
|
||||||
return RUN_ALL_TESTS();
|
return RUN_ALL_TESTS();
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2064,9 +2064,74 @@ TEST_F(ExprTest, StringFunctions) {
|
|||||||
TestValue("instr('', '')", TYPE_INT, 0);
|
TestValue("instr('', '')", TYPE_INT, 0);
|
||||||
TestValue("instr('', 'abc')", TYPE_INT, 0);
|
TestValue("instr('', 'abc')", TYPE_INT, 0);
|
||||||
TestValue("instr('abc', '')", TYPE_INT, 0);
|
TestValue("instr('abc', '')", TYPE_INT, 0);
|
||||||
|
TestValue("instr('abcdef', 'xyz')", TYPE_INT, 0);
|
||||||
|
TestValue("instr('xyz', 'abcdef')", TYPE_INT, 0);
|
||||||
TestValue("instr('abc', 'abc')", TYPE_INT, 1);
|
TestValue("instr('abc', 'abc')", TYPE_INT, 1);
|
||||||
TestValue("instr('xyzabc', 'abc')", TYPE_INT, 4);
|
TestValue("instr('xyzabc', 'abc')", TYPE_INT, 4);
|
||||||
TestValue("instr('xyzabcxyz', 'bcx')", TYPE_INT, 5);
|
TestValue("instr('xyzabcxyz', 'bcx')", TYPE_INT, 5);
|
||||||
|
|
||||||
|
TestValue("instr('', '', -1)", TYPE_INT, 0);
|
||||||
|
TestValue("instr('', 'abc', -1)", TYPE_INT, 0);
|
||||||
|
TestValue("instr('abc', '', -1)", TYPE_INT, 0);
|
||||||
|
TestValue("instr('abc', 'abc', -1)", TYPE_INT, 1);
|
||||||
|
TestValue("instr('xyzabc', 'abc', -1)", TYPE_INT, 4);
|
||||||
|
TestValue("instr('xyzabcxyz', 'bcx', -1)", TYPE_INT, 5);
|
||||||
|
|
||||||
|
TestValue("instr('corporate floor', 'or', 0)", TYPE_INT, 0);
|
||||||
|
TestValue("instr('corporate floor', 'or', 14)", TYPE_INT, 14);
|
||||||
|
TestValue("instr('corporate floor', 'or', 15)", TYPE_INT, 0);
|
||||||
|
TestValue("instr('corporate floor', 'or', -14)", TYPE_INT, 2);
|
||||||
|
TestValue("instr('corporate floor', 'or', -15)", TYPE_INT, 0);
|
||||||
|
|
||||||
|
TestValue("instr('corporate floor', 'or', 0, 1)", TYPE_INT, 0);
|
||||||
|
TestValue("instr('corporate floor', 'or', 14, 1)", TYPE_INT, 14);
|
||||||
|
TestValue("instr('corporate floor', 'or', 15, 1)", TYPE_INT, 0);
|
||||||
|
TestValue("instr('corporate floor', 'or', -14, 1)", TYPE_INT, 2);
|
||||||
|
TestValue("instr('corporate floor', 'or', -15, 1)", TYPE_INT, 0);
|
||||||
|
|
||||||
|
TestValue("instr('corporate floor', 'or', 2, 2)", TYPE_INT, 5);
|
||||||
|
TestValue("instr('corporate floor', 'or', 3, 2)", TYPE_INT, 14);
|
||||||
|
TestValue("instr('corporate floor', 'or', -3, 2)", TYPE_INT, 2);
|
||||||
|
TestValue("instr('corporate floor', 'or', -2, 2)", TYPE_INT, 5);
|
||||||
|
|
||||||
|
TestValue("instr('corporate floor', 'or', 3, 3)", TYPE_INT, 0);
|
||||||
|
TestValue("instr('corporate floor', 'or', -3, 3)", TYPE_INT, 0);
|
||||||
|
TestValue("instr('corporate floor', '', 3, 3)", TYPE_INT, 0);
|
||||||
|
TestValue("instr('corporate floor', '', -3, 3)", TYPE_INT, 0);
|
||||||
|
|
||||||
|
TestValue("instr('abababa', 'aba', 1, 1)", TYPE_INT, 1);
|
||||||
|
TestValue("instr('abababa', 'aba', 1, 2)", TYPE_INT, 3);
|
||||||
|
TestValue("instr('abababa', 'aba', 1, 3)", TYPE_INT, 5);
|
||||||
|
TestValue("instr('abababa', 'aba', cast(1 as bigint), cast(3 as bigint))", TYPE_INT, 5);
|
||||||
|
|
||||||
|
TestError("instr('corporate floor', 'or', 0, 0)");
|
||||||
|
TestError("instr('corporate floor', 'or', 0, -1)");
|
||||||
|
TestError("instr('corporate floor', 'or', 1, 0)");
|
||||||
|
TestError("instr('corporate floor', 'or', 1, -1)");
|
||||||
|
|
||||||
|
TestIsNull("instr(NULL, 'or', 2)", TYPE_INT);
|
||||||
|
TestIsNull("instr('corporate floor', NULL, 2)", TYPE_INT);
|
||||||
|
TestIsNull("instr('corporate floor', 'or', NULL)", TYPE_INT);
|
||||||
|
|
||||||
|
TestIsNull("instr(NULL, 'or', 2, 2)", TYPE_INT);
|
||||||
|
TestIsNull("instr('corporate floor', NULL, 2, 2)", TYPE_INT);
|
||||||
|
TestIsNull("instr('corporate floor', 'or', NULL, 2)", TYPE_INT);
|
||||||
|
TestIsNull("instr('corporate floor', 'or', 2, NULL)", TYPE_INT);
|
||||||
|
|
||||||
|
TestValue("instr('a', 'a', 1, 1)", TYPE_INT, 1);
|
||||||
|
TestValue("instr('axyz', 'a', 1, 1)", TYPE_INT, 1);
|
||||||
|
TestValue("instr('xyza', 'a', 1, 1)", TYPE_INT, 4);
|
||||||
|
TestValue("instr('a', 'a', 1, 2)", TYPE_INT, 0);
|
||||||
|
TestValue("instr('axyz', 'a', 1, 2)", TYPE_INT, 0);
|
||||||
|
TestValue("instr('xyza', 'a', 1, 2)", TYPE_INT, 0);
|
||||||
|
|
||||||
|
TestValue("instr('a', 'a', -1, 1)", TYPE_INT, 1);
|
||||||
|
TestValue("instr('axyz', 'a', -1, 1)", TYPE_INT, 1);
|
||||||
|
TestValue("instr('xyza', 'a', -1, 1)", TYPE_INT, 4);
|
||||||
|
TestValue("instr('a', 'a', -1, 2)", TYPE_INT, 0);
|
||||||
|
TestValue("instr('axyz', 'a', -1, 2)", TYPE_INT, 0);
|
||||||
|
TestValue("instr('xyza', 'a', -1, 2)", TYPE_INT, 0);
|
||||||
|
|
||||||
TestIsNull("instr(NULL, 'bcx')", TYPE_INT);
|
TestIsNull("instr(NULL, 'bcx')", TYPE_INT);
|
||||||
TestIsNull("instr('xyzabcxyz', NULL)", TYPE_INT);
|
TestIsNull("instr('xyzabcxyz', NULL)", TYPE_INT);
|
||||||
TestIsNull("instr(NULL, NULL)", TYPE_INT);
|
TestIsNull("instr(NULL, NULL)", TYPE_INT);
|
||||||
|
|||||||
@@ -284,13 +284,68 @@ IntVal StringFunctions::Ascii(FunctionContext* context, const StringVal& str) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
IntVal StringFunctions::Instr(FunctionContext* context, const StringVal& str,
|
IntVal StringFunctions::Instr(FunctionContext* context, const StringVal& str,
|
||||||
const StringVal& substr) {
|
const StringVal& substr, const BigIntVal& start_position,
|
||||||
if (str.is_null || substr.is_null) return IntVal::null();
|
const BigIntVal& occurrence) {
|
||||||
StringValue str_sv = StringValue::FromStringVal(str);
|
if (str.is_null || substr.is_null || start_position.is_null || occurrence.is_null) {
|
||||||
StringValue substr_sv = StringValue::FromStringVal(substr);
|
return IntVal::null();
|
||||||
StringSearch search(&substr_sv);
|
}
|
||||||
// Hive returns positions starting from 1.
|
if (occurrence.val <= 0) {
|
||||||
return IntVal(search.Search(&str_sv) + 1);
|
stringstream ss;
|
||||||
|
ss << "Invalid occurrence parameter to instr function: " << occurrence.val;
|
||||||
|
context->SetError(ss.str().c_str());
|
||||||
|
return IntVal(0);
|
||||||
|
}
|
||||||
|
if (start_position.val == 0) return IntVal(0);
|
||||||
|
|
||||||
|
StringValue haystack = StringValue::FromStringVal(str);
|
||||||
|
StringValue needle = StringValue::FromStringVal(substr);
|
||||||
|
StringSearch search(&needle);
|
||||||
|
if (start_position.val > 0) {
|
||||||
|
// A positive starting position indicates regular searching from the left.
|
||||||
|
int search_start_pos = start_position.val - 1;
|
||||||
|
if (search_start_pos >= haystack.len) return IntVal(0);
|
||||||
|
int match_pos = -1;
|
||||||
|
for (int match_num = 0; match_num < occurrence.val; ++match_num) {
|
||||||
|
DCHECK_LE(search_start_pos, haystack.len);
|
||||||
|
StringValue haystack_substring = haystack.Substring(search_start_pos);
|
||||||
|
int match_pos_in_substring = search.Search(&haystack_substring);
|
||||||
|
if (match_pos_in_substring < 0) return IntVal(0);
|
||||||
|
match_pos = search_start_pos + match_pos_in_substring;
|
||||||
|
search_start_pos = match_pos + 1;
|
||||||
|
}
|
||||||
|
// Return positions starting from 1 at the leftmost position.
|
||||||
|
return IntVal(match_pos + 1);
|
||||||
|
} else {
|
||||||
|
// A negative starting position indicates searching from the right.
|
||||||
|
int search_start_pos = haystack.len + start_position.val;
|
||||||
|
// The needle must fit between search_start_pos and the end of the string
|
||||||
|
if (search_start_pos + needle.len > haystack.len) {
|
||||||
|
search_start_pos = haystack.len - needle.len;
|
||||||
|
}
|
||||||
|
if (search_start_pos < 0) return IntVal(0);
|
||||||
|
int match_pos = -1;
|
||||||
|
for (int match_num = 0; match_num < occurrence.val; ++match_num) {
|
||||||
|
DCHECK_GE(search_start_pos + needle.len, 0);
|
||||||
|
DCHECK_LE(search_start_pos + needle.len, haystack.len);
|
||||||
|
StringValue haystack_substring =
|
||||||
|
haystack.Substring(0, search_start_pos + needle.len);
|
||||||
|
match_pos = search.RSearch(&haystack_substring);
|
||||||
|
if (match_pos < 0) return IntVal(0);
|
||||||
|
search_start_pos = match_pos - 1;
|
||||||
|
}
|
||||||
|
// Return positions starting from 1 at the leftmost position.
|
||||||
|
return IntVal(match_pos + 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
IntVal StringFunctions::Instr(FunctionContext* context, const StringVal& str,
|
||||||
|
const StringVal& substr, const BigIntVal& start_position) {
|
||||||
|
return Instr(context, str, substr, start_position, BigIntVal(1));
|
||||||
|
}
|
||||||
|
|
||||||
|
IntVal StringFunctions::Instr(
|
||||||
|
FunctionContext* context, const StringVal& str, const StringVal& substr) {
|
||||||
|
return Instr(context, str, substr, BigIntVal(1), BigIntVal(1));
|
||||||
}
|
}
|
||||||
|
|
||||||
IntVal StringFunctions::Locate(FunctionContext* context, const StringVal& substr,
|
IntVal StringFunctions::Locate(FunctionContext* context, const StringVal& substr,
|
||||||
|
|||||||
@@ -60,7 +60,12 @@ class StringFunctions {
|
|||||||
static StringVal Ltrim(FunctionContext*, const StringVal& str);
|
static StringVal Ltrim(FunctionContext*, const StringVal& str);
|
||||||
static StringVal Rtrim(FunctionContext*, const StringVal& str);
|
static StringVal Rtrim(FunctionContext*, const StringVal& str);
|
||||||
static IntVal Ascii(FunctionContext*, const StringVal& str);
|
static IntVal Ascii(FunctionContext*, const StringVal& str);
|
||||||
|
static IntVal Instr(FunctionContext*, const StringVal& str, const StringVal& substr,
|
||||||
|
const BigIntVal& start_position, const BigIntVal& occurrence);
|
||||||
|
static IntVal Instr(FunctionContext*, const StringVal& str, const StringVal& substr,
|
||||||
|
const BigIntVal& start_position);
|
||||||
static IntVal Instr(FunctionContext*, const StringVal& str, const StringVal& substr);
|
static IntVal Instr(FunctionContext*, const StringVal& str, const StringVal& substr);
|
||||||
|
|
||||||
static IntVal Locate(FunctionContext*, const StringVal& substr, const StringVal& str);
|
static IntVal Locate(FunctionContext*, const StringVal& substr, const StringVal& str);
|
||||||
static IntVal LocatePos(FunctionContext*, const StringVal& substr, const StringVal& str,
|
static IntVal LocatePos(FunctionContext*, const StringVal& substr, const StringVal& str,
|
||||||
const BigIntVal& start_pos);
|
const BigIntVal& start_pos);
|
||||||
|
|||||||
@@ -78,6 +78,7 @@ ADD_BE_TEST(disk-io-mgr-test)
|
|||||||
ADD_BE_TEST(buffered-block-mgr-test)
|
ADD_BE_TEST(buffered-block-mgr-test)
|
||||||
ADD_BE_TEST(parallel-executor-test)
|
ADD_BE_TEST(parallel-executor-test)
|
||||||
ADD_BE_TEST(raw-value-test)
|
ADD_BE_TEST(raw-value-test)
|
||||||
|
ADD_BE_TEST(string-search-test)
|
||||||
ADD_BE_TEST(string-value-test)
|
ADD_BE_TEST(string-value-test)
|
||||||
ADD_BE_TEST(thread-resource-mgr-test)
|
ADD_BE_TEST(thread-resource-mgr-test)
|
||||||
ADD_BE_TEST(mem-tracker-test)
|
ADD_BE_TEST(mem-tracker-test)
|
||||||
|
|||||||
135
be/src/runtime/string-search-test.cc
Normal file
135
be/src/runtime/string-search-test.cc
Normal file
@@ -0,0 +1,135 @@
|
|||||||
|
// Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
// or more contributor license agreements. See the NOTICE file
|
||||||
|
// distributed with this work for additional information
|
||||||
|
// regarding copyright ownership. The ASF licenses this file
|
||||||
|
// to you under the Apache License, Version 2.0 (the
|
||||||
|
// "License"); you may not use this file except in compliance
|
||||||
|
// with the License. You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing,
|
||||||
|
// software distributed under the License is distributed on an
|
||||||
|
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
// KIND, either express or implied. See the License for the
|
||||||
|
// specific language governing permissions and limitations
|
||||||
|
// under the License.
|
||||||
|
|
||||||
|
#include <gtest/gtest.h>
|
||||||
|
|
||||||
|
#include "runtime/string-search.h"
|
||||||
|
|
||||||
|
namespace impala {
|
||||||
|
|
||||||
|
StringValue StrValFromCString(const char* str, int len) {
|
||||||
|
if (len == -1) len = strlen(str);
|
||||||
|
return StringValue(const_cast<char*>(str), len);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test string search for haystack/needle, up to len for each.
|
||||||
|
// If the length is -1, use the full string length.
|
||||||
|
// Searching in a substring allows checking whether the search respects the
|
||||||
|
// length stored in the StringValue or also reads parts of the string which
|
||||||
|
// should be excluded.
|
||||||
|
int TestSearch(const char* haystack, const char* needle, int haystack_len = -1,
|
||||||
|
int needle_len = -1) {
|
||||||
|
StringValue needle_str_val = StrValFromCString(needle, needle_len);
|
||||||
|
StringValue haystack_str_val = StrValFromCString(haystack, haystack_len);
|
||||||
|
StringSearch search(&needle_str_val);
|
||||||
|
return search.Search(&haystack_str_val);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test reverse string search for haystack/needle, up to len for each.
|
||||||
|
// If the length is -1, use the full string length.
|
||||||
|
// Searching in a substring allows checking whether the search respects the
|
||||||
|
// length stored in the StringValue or also reads parts of the string which
|
||||||
|
// should be excluded.
|
||||||
|
int TestRSearch(const char* haystack, const char* needle, int haystack_len = -1,
|
||||||
|
int needle_len = -1) {
|
||||||
|
StringValue needle_str_val = StrValFromCString(needle, needle_len);
|
||||||
|
StringValue haystack_str_val = StrValFromCString(haystack, haystack_len);
|
||||||
|
StringSearch search(&needle_str_val);
|
||||||
|
return search.RSearch(&haystack_str_val);
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(StringSearchTest, RegularSearch) {
|
||||||
|
// Basic Tests
|
||||||
|
EXPECT_EQ(0, TestSearch("abcdabcd", "a"));
|
||||||
|
EXPECT_EQ(0, TestSearch("abcdabcd", "ab"));
|
||||||
|
EXPECT_EQ(2, TestSearch("abcdabcd", "c"));
|
||||||
|
EXPECT_EQ(2, TestSearch("abcdabcd", "cd"));
|
||||||
|
EXPECT_EQ(-1, TestSearch("", ""));
|
||||||
|
EXPECT_EQ(-1, TestSearch("abc", ""));
|
||||||
|
EXPECT_EQ(-1, TestSearch("", "a"));
|
||||||
|
|
||||||
|
// Test single chars
|
||||||
|
EXPECT_EQ(0, TestSearch("a", "a"));
|
||||||
|
EXPECT_EQ(-1, TestSearch("a", "b"));
|
||||||
|
EXPECT_EQ(1, TestSearch("abc", "b"));
|
||||||
|
|
||||||
|
// Test searching in a substring of the haystack
|
||||||
|
EXPECT_EQ(0, TestSearch("abcabcd", "abc", 5));
|
||||||
|
EXPECT_EQ(-1, TestSearch("abcabcd", "abcd", 5));
|
||||||
|
EXPECT_EQ(-1, TestSearch("abcabcd", "abcd", 0));
|
||||||
|
|
||||||
|
// Test searching a substring of the needle in a substring of the haystack.
|
||||||
|
EXPECT_EQ(0, TestSearch("abcde", "abaaaaaa", 4, 2));
|
||||||
|
EXPECT_EQ(-1, TestSearch("abcde", "abaaaaaa", 4, 3));
|
||||||
|
EXPECT_EQ(-1, TestSearch("abcdabaaaaa", "abaaaaaa", 4, 3));
|
||||||
|
EXPECT_EQ(-1, TestSearch("abcdabaaaaa", "abaaaaaa", 4, 0));
|
||||||
|
EXPECT_EQ(-1, TestSearch("abcdabaaaaa", "abaaaaaa"));
|
||||||
|
|
||||||
|
// Test with needle matching the end of the substring.
|
||||||
|
EXPECT_EQ(4, TestSearch("abcde", "e"));
|
||||||
|
EXPECT_EQ(3, TestSearch("abcde", "de"));
|
||||||
|
EXPECT_EQ(2, TestSearch("abcdede", "cde"));
|
||||||
|
EXPECT_EQ(5, TestSearch("abcdacde", "cde"));
|
||||||
|
|
||||||
|
// Test correct skip_ handling, which depends on which char of the needle is
|
||||||
|
// the same as the last one.
|
||||||
|
EXPECT_EQ(2, TestSearch("ababcacac", "abcacac"));
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(StringSearchTest, ReverseSearch) {
|
||||||
|
// Basic Tests
|
||||||
|
EXPECT_EQ(4, TestRSearch("abcdabcd", "a"));
|
||||||
|
EXPECT_EQ(4, TestRSearch("abcdabcd", "ab"));
|
||||||
|
EXPECT_EQ(6, TestRSearch("abcdabcd", "c"));
|
||||||
|
EXPECT_EQ(6, TestRSearch("abcdabcd", "cd"));
|
||||||
|
EXPECT_EQ(-1, TestRSearch("", ""));
|
||||||
|
EXPECT_EQ(-1, TestRSearch("abc", ""));
|
||||||
|
EXPECT_EQ(-1, TestRSearch("", "a"));
|
||||||
|
|
||||||
|
// Test single chars
|
||||||
|
EXPECT_EQ(0, TestRSearch("a", "a"));
|
||||||
|
EXPECT_EQ(-1, TestRSearch("a", "b"));
|
||||||
|
EXPECT_EQ(1, TestRSearch("abc", "b"));
|
||||||
|
|
||||||
|
// Test searching in a substring of the haystack
|
||||||
|
EXPECT_EQ(0, TestRSearch("abcabcd", "abc", 5));
|
||||||
|
EXPECT_EQ(-1, TestRSearch("abcabcd", "abcd", 5));
|
||||||
|
EXPECT_EQ(-1, TestRSearch("abcabcd", "abcd", 0));
|
||||||
|
|
||||||
|
// Test searching a substring of the needle in a substring of the haystack.
|
||||||
|
EXPECT_EQ(0, TestRSearch("abcde", "abaaaaaa", 4, 2));
|
||||||
|
EXPECT_EQ(-1, TestRSearch("abcde", "abaaaaaa", 4, 3));
|
||||||
|
EXPECT_EQ(-1, TestRSearch("abcdabaaaaa", "abaaaaaa", 4, 3));
|
||||||
|
EXPECT_EQ(-1, TestRSearch("abcdabaaaaa", "abaaaaaa", 4, 0));
|
||||||
|
EXPECT_EQ(-1, TestRSearch("abcdabaaaaa", "abaaaaaa"));
|
||||||
|
|
||||||
|
// Test with needle matching the end of the substring.
|
||||||
|
EXPECT_EQ(4, TestRSearch("abcde", "e"));
|
||||||
|
EXPECT_EQ(3, TestRSearch("abcde", "de"));
|
||||||
|
EXPECT_EQ(2, TestRSearch("abcdede", "cde"));
|
||||||
|
EXPECT_EQ(5, TestRSearch("abcdacde", "cde"));
|
||||||
|
|
||||||
|
// Test correct rskip_ handling, which depends on which char of the needle is
|
||||||
|
// the same as the first one.
|
||||||
|
EXPECT_EQ(0, TestRSearch("cacacbaba", "cacacba"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char** argv) {
|
||||||
|
::testing::InitGoogleTest(&argc, argv);
|
||||||
|
return RUN_ALL_TESTS();
|
||||||
|
}
|
||||||
@@ -15,12 +15,13 @@
|
|||||||
// specific language governing permissions and limitations
|
// specific language governing permissions and limitations
|
||||||
// under the License.
|
// under the License.
|
||||||
|
|
||||||
|
|
||||||
#ifndef IMPALA_RUNTIME_STRING_SEARCH_H
|
#ifndef IMPALA_RUNTIME_STRING_SEARCH_H
|
||||||
#define IMPALA_RUNTIME_STRING_SEARCH_H
|
#define IMPALA_RUNTIME_STRING_SEARCH_H
|
||||||
|
|
||||||
#include <vector>
|
#include <algorithm>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
#include <boost/cstdint.hpp>
|
#include <boost/cstdint.hpp>
|
||||||
|
|
||||||
#include "common/logging.h"
|
#include "common/logging.h"
|
||||||
@@ -44,7 +45,8 @@ class StringSearch {
|
|||||||
StringSearch() : pattern_(NULL), mask_(0) {}
|
StringSearch() : pattern_(NULL), mask_(0) {}
|
||||||
|
|
||||||
/// Initialize/Precompute a StringSearch object from the pattern
|
/// Initialize/Precompute a StringSearch object from the pattern
|
||||||
StringSearch(const StringValue* pattern) : pattern_(pattern), mask_(0), skip_(0) {
|
StringSearch(const StringValue* pattern)
|
||||||
|
: pattern_(pattern), mask_(0), skip_(0), rskip_(0) {
|
||||||
// Special cases
|
// Special cases
|
||||||
if (pattern_->len <= 1) {
|
if (pattern_->len <= 1) {
|
||||||
return;
|
return;
|
||||||
@@ -53,13 +55,25 @@ class StringSearch {
|
|||||||
// Build compressed lookup table
|
// Build compressed lookup table
|
||||||
int mlast = pattern_->len - 1;
|
int mlast = pattern_->len - 1;
|
||||||
skip_ = mlast - 1;
|
skip_ = mlast - 1;
|
||||||
|
rskip_ = mlast - 1;
|
||||||
|
|
||||||
|
// In the Python implementation, building the bloom filter happens at the
|
||||||
|
// beginning of the Search operation. We build it during construction
|
||||||
|
// instead, so that the same StringSearch instance can be reused multiple
|
||||||
|
// times without rebuilding the bloom filter every time.
|
||||||
for (int i = 0; i < mlast; ++i) {
|
for (int i = 0; i < mlast; ++i) {
|
||||||
BloomAdd(pattern_->ptr[i]);
|
BloomAdd(pattern_->ptr[i]);
|
||||||
if (pattern_->ptr[i] == pattern_->ptr[mlast])
|
if (pattern_->ptr[i] == pattern_->ptr[mlast])
|
||||||
skip_ = mlast - i - 1;
|
skip_ = mlast - i - 1;
|
||||||
}
|
}
|
||||||
BloomAdd(pattern_->ptr[mlast]);
|
BloomAdd(pattern_->ptr[mlast]);
|
||||||
|
|
||||||
|
// The order of iteration doesn't have any effect on the bloom filter, but
|
||||||
|
// it does on skip_. For this reason we need to calculate a separate rskip_
|
||||||
|
// for reverse search.
|
||||||
|
for (int i = mlast; i > 0; i--) {
|
||||||
|
if (pattern_->ptr[i] == pattern_->ptr[0]) rskip_ = i - 1;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Search for this pattern in str.
|
/// Search for this pattern in str.
|
||||||
@@ -114,6 +128,54 @@ class StringSearch {
|
|||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Search for this pattern in str backwards.
|
||||||
|
/// Returns the offset into str if the pattern exists
|
||||||
|
/// Returns -1 if the pattern is not found
|
||||||
|
int RSearch(const StringValue* str) const {
|
||||||
|
// Special cases
|
||||||
|
if (str == NULL || pattern_ == NULL || pattern_->len == 0) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
int mlast = pattern_->len - 1;
|
||||||
|
int w = str->len - pattern_->len;
|
||||||
|
int n = str->len;
|
||||||
|
int m = pattern_->len;
|
||||||
|
const char* s = str->ptr;
|
||||||
|
const char* p = pattern_->ptr;
|
||||||
|
|
||||||
|
// Special case if pattern->len == 1
|
||||||
|
if (m == 1) {
|
||||||
|
const char* result = reinterpret_cast<const char*>(memrchr(s, p[0], n));
|
||||||
|
if (result != NULL) return result - s;
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// General case.
|
||||||
|
int j;
|
||||||
|
for (int i = w; i >= 0; i--) {
|
||||||
|
if (s[i] == p[0]) {
|
||||||
|
// candidate match
|
||||||
|
for (j = mlast; j > 0; j--)
|
||||||
|
if (s[i + j] != p[j]) break;
|
||||||
|
if (j == 0) {
|
||||||
|
return i;
|
||||||
|
}
|
||||||
|
// miss: check if previous character is part of pattern
|
||||||
|
if (i > 0 && !BloomQuery(s[i - 1]))
|
||||||
|
i = i - m;
|
||||||
|
else
|
||||||
|
i = i - rskip_;
|
||||||
|
} else {
|
||||||
|
// skip: check if previous character is part of pattern
|
||||||
|
if (i > 0 && !BloomQuery(s[i - 1])) {
|
||||||
|
i = i - m;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
static const int BLOOM_WIDTH = 64;
|
static const int BLOOM_WIDTH = 64;
|
||||||
|
|
||||||
@@ -127,7 +189,8 @@ class StringSearch {
|
|||||||
|
|
||||||
const StringValue* pattern_;
|
const StringValue* pattern_;
|
||||||
int64_t mask_;
|
int64_t mask_;
|
||||||
int64_t skip_;
|
int skip_;
|
||||||
|
int rskip_;
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -431,6 +431,9 @@ visible_functions = [
|
|||||||
[['rtrim'], 'STRING', ['STRING'], 'impala::StringFunctions::Rtrim'],
|
[['rtrim'], 'STRING', ['STRING'], 'impala::StringFunctions::Rtrim'],
|
||||||
[['ascii'], 'INT', ['STRING'], 'impala::StringFunctions::Ascii'],
|
[['ascii'], 'INT', ['STRING'], 'impala::StringFunctions::Ascii'],
|
||||||
[['instr'], 'INT', ['STRING', 'STRING'], 'impala::StringFunctions::Instr'],
|
[['instr'], 'INT', ['STRING', 'STRING'], 'impala::StringFunctions::Instr'],
|
||||||
|
[['instr'], 'INT', ['STRING', 'STRING', 'BIGINT'], 'impala::StringFunctions::Instr'],
|
||||||
|
[['instr'], 'INT', ['STRING', 'STRING', 'BIGINT', 'BIGINT'],
|
||||||
|
'impala::StringFunctions::Instr'],
|
||||||
[['locate'], 'INT', ['STRING', 'STRING'], 'impala::StringFunctions::Locate'],
|
[['locate'], 'INT', ['STRING', 'STRING'], 'impala::StringFunctions::Locate'],
|
||||||
[['locate'], 'INT', ['STRING', 'STRING', 'BIGINT'],
|
[['locate'], 'INT', ['STRING', 'STRING', 'BIGINT'],
|
||||||
'impala::StringFunctions::LocatePos'],
|
'impala::StringFunctions::LocatePos'],
|
||||||
|
|||||||
Reference in New Issue
Block a user