IMPALA-10349: Support constant folding for non ascii strings

Before this patch constant folding only converted the result of an
expression to StringLiteral if all characters were ASCII. The
change allows both UTF8 strings with non ascii characters and
byte arrays that are not valid UTF8 strings - the latter can
occur when constant folding is applied to BINARY columns,
for example in geospatial functions like st_polygon().

The main goal is being able to push down more predicates, e.g.
before that patch a filter like col="á" couldn't be pushed down
to Iceberg/Kudu/Parquet stat filtering, as all these expect literals.

Main changes:
- TStringLiteral uses a binary instead of a string member.
  This doesn't affect BE as in c++ both types are compiled
  to std::string. In Jave a java.nio.ByteBuffer is used instead of
  String.
- StringLiteral uses a byte[] member to store the value of
  the literal in case it is not valid UTF8 and cannot be
  represented as Java String. In other cases still a String
  is used to keep the change minimal, though it may be more
  optimal to use UTF8 byte[] due to the smaller size. Always
  converting from byte[] to String may be costy in the catalog
  as partition values are stored as *Literals and rest of the
  catalog operates on String.
- StringLiteral#compareTo() is switched to byte wise compare on byte[]
  to be consistent with BE. This was not needed for ASCII strings
  as Java String behaves the same way in that case, but non-ASCII
  can have different order (note that Impala does not support
  collations).
- When an invalid UTF8 StringLiteral is printed, for example in
  case of EXPLAIN output, then it is printed as
  unhex("<byte array in hexadecimal>"). This is a non-lossy way to
  represent it, but it may be too verbose in some cases, e.g. for
  large polygons. A follow up commit may refine this, e.g. by
  limiting the max size printed.

An issue found while implementing this is that INSERT does not
handle invalid UTF8 partition values correctly, see IMPALA-14096.
This behavior is not changed in the patch.

Testing:
- Added a few tests that push down non-ascii const expressions in
  predicates (both with utf8_mode=true and false).

Change-Id: I70663457a0b0a3443e586350f0a5996bb75ba64a
Reviewed-on: http://gerrit.cloudera.org:8080/22603
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
This commit is contained in:
Csaba Ringhofer
2025-03-09 23:47:22 +01:00
committed by Impala Public Jenkins
parent 3475e6b506
commit 85a2211bfb
18 changed files with 345 additions and 63 deletions

View File

@@ -117,7 +117,8 @@ struct TSlotRef {
}
struct TStringLiteral {
1: required string value;
// Use binary instead of string as the value may not be valid utf8.
1: required binary value;
}
// Additional information for aggregate functions.

View File

@@ -319,6 +319,15 @@ public class ColumnDef {
Preconditions.checkNotNull(defaultValLiteral);
outputDefaultValue_ = defaultValLiteral;
// Non-utf8 values could be allowed for BINARY literals, but those don't work for
// other reason: IMPALA-14173
if (outputDefaultValue_ instanceof StringLiteral) {
if (!((StringLiteral) outputDefaultValue_).isValidUtf8()) {
throw new AnalysisException(
"Invalid String default value: " + defaultValue_.toSql());
}
}
// TODO: Remove when Impala supports a 64-bit TIMESTAMP type.
if (type_.isTimestamp()) {
try {

View File

@@ -1840,7 +1840,7 @@ abstract public class Expr extends TreeNode<Expr> implements ParseNode, Cloneabl
return DEFAULT_AVG_STRING_LENGTH;
}
} else if (e instanceof StringLiteral) {
return ((StringLiteral) e).getUnescapedValue().length();
return ((StringLiteral) e).utf8ArrayLength();
} else {
// TODO(tmarshall): Extend this to support other string Exprs, such as
// function calls that return string.

View File

@@ -146,8 +146,7 @@ public abstract class LiteralExpr extends Expr implements Comparable<LiteralExpr
Long.toString(exprNode.int_literal.value), colType);
break;
case STRING_LITERAL:
result = LiteralExpr.createFromUnescapedStr(
exprNode.string_literal.value, colType);
result = new StringLiteral(exprNode.string_literal.getValue(), colType);
break;
case BOOL_LITERAL:
result = LiteralExpr.createFromUnescapedStr(
@@ -194,7 +193,7 @@ public abstract class LiteralExpr extends Expr implements Comparable<LiteralExpr
/**
* Evaluates the given constant expr and returns its result as a LiteralExpr.
* Assumes expr has been analyzed. Returns constExpr if is it already a LiteralExpr.
* Returns null for types that do not have a LiteralExpr subclass, e.g. TIMESTAMP, or
* Returns null for types that do not have a LiteralExpr subclass and
* in cases where the corresponding LiteralExpr is not able to represent the evaluation
* result, e.g., NaN or infinity. Returns null if the expr evaluation encountered errors
* or warnings in the BE.
@@ -259,18 +258,7 @@ public abstract class LiteralExpr extends Expr implements Comparable<LiteralExpr
if (val.isSetBinary_val()) {
byte[] bytes = new byte[val.binary_val.remaining()];
val.binary_val.get(bytes);
// Converting strings between the BE/FE does not work properly for the
// extended ASCII characters above 127. Bail in such cases to avoid
// producing incorrect results.
for (byte b: bytes) if (b < 0) return null;
try {
// US-ASCII is 7-bit ASCII.
String strVal = new String(bytes, "US-ASCII");
// The evaluation result is a raw string that must not be unescaped.
result = new StringLiteral(strVal, constExpr.getType(), false);
} catch (UnsupportedEncodingException e) {
return null;
}
result = new StringLiteral(bytes, constExpr.getType());
}
break;
case TIMESTAMP:
@@ -312,7 +300,8 @@ public abstract class LiteralExpr extends Expr implements Comparable<LiteralExpr
if (Expr.IS_NULL_LITERAL.apply(this) && Expr.IS_NULL_LITERAL.apply(other)) return 0;
if (Expr.IS_NULL_LITERAL.apply(this)) return -1;
if (Expr.IS_NULL_LITERAL.apply(other)) return 1;
if (getClass() != other.getClass()) return -1;
// Sublcass implementations do not handle comparison with other Literal types.
Preconditions.checkState(getClass() == other.getClass());
return 0;
}

View File

@@ -201,6 +201,13 @@ public class RangePartition extends StmtNode {
"NULL. Range partition: '%s'", toSql()));
}
if (literal instanceof StringLiteral) {
if (!((StringLiteral) literal).isValidUtf8()) {
throw new AnalysisException(
"Invalid String range partition value: " + literal.toSql());
}
}
// Special case string literals in timestamp columns for convenience.
if (literal.getType().isStringType() && colType.isTimestamp()) {
// Add an explicit cast to TIMESTAMP

View File

@@ -20,7 +20,10 @@ package org.apache.impala.analysis;
import java.io.IOException;
import java.io.StringReader;
import java.math.BigDecimal;
import java.nio.ByteBuffer;
import java.util.Arrays;
import org.apache.commons.codec.binary.Hex;
import org.apache.impala.catalog.ScalarType;
import org.apache.impala.catalog.Type;
import org.apache.impala.catalog.TypeCompatibility;
@@ -29,14 +32,22 @@ import org.apache.impala.compat.MetastoreShim;
import org.apache.impala.thrift.TExprNode;
import org.apache.impala.thrift.TExprNodeType;
import org.apache.impala.thrift.TStringLiteral;
import org.apache.impala.util.StringUtils;
import com.google.common.base.MoreObjects;
import com.google.common.base.Preconditions;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java_cup.runtime.Symbol;
public class StringLiteral extends LiteralExpr {
private final String value_;
private final static Logger LOG = LoggerFactory.getLogger(StringLiteral.class);
// From the strValue_ and binValue_ always only one is null while the other is non-null.
// strValue_ == null means that the value is not decodable as utf8.
private final String strValue_;
private final byte[] binValue_;
public static int MAX_STRING_LEN = Integer.MAX_VALUE;
// Indicates whether this value needs to be unescaped in toThrift() or comparison.
@@ -50,17 +61,33 @@ public class StringLiteral extends LiteralExpr {
}
public StringLiteral(String value, Type type, boolean needsUnescaping) {
value_ = value;
strValue_ = value;
binValue_ = null;
type_ = type;
needsUnescaping_ = needsUnescaping;
}
public StringLiteral(byte[] value, Type type) {
// It is ok to fail as StringLiteral can contain binary data. strValue_ is null in
// this case.
strValue_ = StringUtils.fromUtf8Buffer(ByteBuffer.wrap(value), true);
if (strValue_ == null) {
// Save binValue_ only if it cannot be represented as String.
binValue_ = value;
} else {
binValue_ = null;
}
type_ = type;
needsUnescaping_ = false;
}
/**
* Copy c'tor used in clone().
*/
protected StringLiteral(StringLiteral other) {
super(other);
value_ = other.value_;
strValue_ = other.strValue_;
binValue_ = other.binValue_;
needsUnescaping_ = other.needsUnescaping_;
}
@@ -68,12 +95,22 @@ public class StringLiteral extends LiteralExpr {
protected boolean localEquals(Expr that) {
if (!super.localEquals(that)) return false;
StringLiteral other = (StringLiteral) that;
return needsUnescaping_ == other.needsUnescaping_ && type_.equals(other.type_)
&& value_.equals(other.value_);
if (!type_.equals(other.type_)) return false;
if (needsUnescaping_ != other.needsUnescaping_) return false;
if (binValue_ != null) {
Preconditions.checkState(strValue_ == null);
if (other.binValue_ == null) return false;
return Arrays.equals(binValue_, other.binValue_);
}
Preconditions.checkState(strValue_ != null);
return strValue_.equals(other.strValue_);
}
// TODO: shouldn't this also check needsUnescaping?
@Override
public int hashCode() { return value_.hashCode(); }
public int hashCode() {
return binValue_ != null ? Arrays.hashCode(binValue_) : strValue_.hashCode();
}
@Override
public String toSqlImpl(ToSqlOptions options) {
@@ -83,21 +120,35 @@ public class StringLiteral extends LiteralExpr {
@Override
protected void toThrift(TExprNode msg) {
msg.node_type = TExprNodeType.STRING_LITERAL;
String val = (needsUnescaping_) ? getUnescapedValue() : value_;
msg.string_literal = new TStringLiteral(val);
byte[] val;
if (needsUnescaping_) {
val = StringUtils.toUtf8Array(getUnescapedValue());
} else {
val = getBinValue();
}
msg.string_literal = new TStringLiteral(ByteBuffer.wrap(val));
}
/**
* Returns the original value that the string literal was constructed with,
* without escaping or unescaping it.
* Can be only used when the StringLiteral can be represented as java String.
*/
public String getValueWithOriginalEscapes() { return value_; }
public String getValueWithOriginalEscapes() {
checkHasValidString();
return strValue_;
}
public String getUnescapedValue() {
checkHasValidString();
// Unescape string exactly like Hive does. Hive's method assumes
// quotes so we add them here to reuse Hive's code.
return MetastoreShim.unescapeSQLString("'" + getNormalizedValue()
+ "'");
return MetastoreShim.unescapeSQLString("'" + getNormalizedValue() + "'");
}
private String unescapeIfNeeded() {
checkHasValidString();
return needsUnescaping_ ? getUnescapedValue() : strValue_;
}
/**
@@ -109,12 +160,17 @@ public class StringLiteral extends LiteralExpr {
* SQL as a single-quoted string literal.
*/
private String getNormalizedValue() {
final int len = value_.length();
if (strValue_ == null) {
// express binary using unhex()
String hex = Hex.encodeHexString(binValue_);
return "unhex(\"" + hex.toUpperCase() +"\")";
}
final int len = strValue_.length();
final StringBuilder sb = new StringBuilder(len);
for (int i = 0; i < len; ++i) {
final char currentChar = value_.charAt(i);
final char currentChar = strValue_.charAt(i);
if (currentChar == '\\' && (i + 1) < len) {
final char nextChar = value_.charAt(i + 1);
final char nextChar = strValue_.charAt(i + 1);
// unescape an escaped double quote: remove back-slash in front of the quote.
if (nextChar == '"' || nextChar == '\'' || nextChar == '\\') {
if (nextChar != '"') {
@@ -143,8 +199,10 @@ public class StringLiteral extends LiteralExpr {
@Override
public String debugString() {
// TODO: hex encode the binary case if someone needs this
String str = strValue_ != null ? strValue_ : "<can't encode as String>";
return MoreObjects.toStringHelper(this)
.add("value", value_)
.add("strValue", str)
.toString();
}
@@ -179,7 +237,8 @@ public class StringLiteral extends LiteralExpr {
* or if floating point value is NaN or infinite
*/
public LiteralExpr convertToNumber(Type targetType) throws AnalysisException {
StringReader reader = new StringReader(value_);
checkHasValidString();
StringReader reader = new StringReader(strValue_);
SqlScanner scanner = new SqlScanner(reader);
// For distinguishing positive and negative numbers.
boolean negative = false;
@@ -198,7 +257,7 @@ public class StringLiteral extends LiteralExpr {
throw new AnalysisException("Failed to convert string literal to number.", e);
}
if (sym.sym == SqlParserSymbols.NUMERIC_OVERFLOW) {
throw new AnalysisException("Number too large: " + value_);
throw new AnalysisException("Number too large: " + strValue_);
}
if (sym.sym == SqlParserSymbols.INTEGER_LITERAL) {
BigDecimal val = (BigDecimal) sym.value;
@@ -212,20 +271,43 @@ public class StringLiteral extends LiteralExpr {
}
// Symbol is not an integer or floating point literal.
throw new AnalysisException("Failed to convert string literal '"
+ value_ + "' to number.");
+ strValue_ + "' to number.");
}
@Override
public int compareTo(LiteralExpr o) {
int ret = super.compareTo(o);
if (ret != 0) return ret;
if (ret != 0) return ret; // TODO: this doesn't seem to handle the NULL case well
StringLiteral other = (StringLiteral) o;
String thisValue = needsUnescaping_? getUnescapedValue() : value_;
String otherValue = other.needsUnescaping_?
other.getUnescapedValue() : other.getStringValue();
return thisValue.compareTo(otherValue);
// Do byte wise comparisin on UTF8 format to be consistant with BE.
// TODO: it would be clearer to use backand's comparision functions in literals.
byte[] arr1 = getBinValue();
byte[] arr2 = other.getBinValue();
// TODO: rewrite once Java 8 support is dropped
//return Arrays.compare(binValue_, other.binValue_); // added in java 9
for (int i = 0; i < arr1.length && i < arr2.length; i++) {
if (arr1[i] < arr2[i]) return -1;
if (arr1[i] > arr2[i]) return 1;
}
if (arr1.length < arr2.length) return -1;
if (arr1.length > arr2.length) return 1;
return 0;
}
@Override
public Expr clone() { return new StringLiteral(this); }
public int utf8ArrayLength() { return getBinValue().length; }
public boolean isValidUtf8() { return binValue_ == null; }
public byte[] getBinValue() {
if (binValue_ != null) return binValue_;
return StringUtils.toUtf8Array(unescapeIfNeeded());
}
private void checkHasValidString() {
Preconditions.checkState(strValue_ != null,
"non-utf8 string: %s", getNormalizedValue());
}
}

View File

@@ -576,13 +576,16 @@ public class KuduScanNode extends ScanNode {
case STRING:
case VARCHAR:
case CHAR: {
StringLiteral strLit = (StringLiteral)literal;
if (!strLit.isValidUtf8()) return false;
kuduPredicate = KuduPredicate.newComparisonPredicate(column, op,
((StringLiteral)literal).getUnescapedValue());
strLit.getUnescapedValue());
break;
}
case BINARY: {
StringLiteral strLit = (StringLiteral)literal;
kuduPredicate = KuduPredicate.newComparisonPredicate(column, op,
((StringLiteral)literal).getUnescapedValue().getBytes());
strLit.getBinValue());
break;
}
case TIMESTAMP: {

View File

@@ -115,6 +115,7 @@ import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.lang.IllegalArgumentException;
import java.nio.charset.StandardCharsets;
import java.util.Calendar;
import java.util.Collections;
import java.util.GregorianCalendar;
@@ -807,11 +808,13 @@ public class JniFrontend {
JniUtil.deserializeThrift(protocolFactory_, secretKey, secretKeyRequest);
String secret = null;
try {
char[] secretCharArray = CONF.getPassword(secretKey.getValue());
String secretKeyStr =
StandardCharsets.UTF_8.decode(secretKey.value).toString();
char[] secretCharArray = CONF.getPassword(secretKeyStr);
if (secretCharArray != null) {
secret = new String(secretCharArray);
} else {
String errMsg = String.format(KEYSTORE_ERROR_MSG, secretKey.getValue());
String errMsg = String.format(KEYSTORE_ERROR_MSG, secretKeyStr);
LOG.error(errMsg);
throw new InternalException(errMsg);
}
@@ -830,7 +833,11 @@ public class JniFrontend {
final TStringLiteral timezone = new TStringLiteral();
JniUtil.deserializeThrift(protocolFactory_, timezone, timezoneT);
// TimeZone.getTimeZone defaults to GMT if it doesn't recognize the timezone.
Calendar c = new GregorianCalendar(TimeZone.getTimeZone(timezone.getValue()));
// Invalid characters are replaced if decoding is not successful. This should lead
// to a failed timezone lookup and using default GMT.
String timezoneStr =
StandardCharsets.UTF_8.decode(timezone.value).toString();
Calendar c = new GregorianCalendar(TimeZone.getTimeZone(timezoneStr));
c.setTimeInMillis(utc_time_millis);
final TCivilTime civilTime = new TCivilTime(

View File

@@ -51,6 +51,7 @@ import org.apache.impala.thrift.TExpr;
import org.apache.impala.thrift.TExprNode;
import org.apache.impala.thrift.TExprNodeType;
import org.apache.impala.thrift.THdfsCompression;
import org.apache.impala.util.StringUtils;
import org.apache.kudu.ColumnSchema;
import org.apache.kudu.ColumnSchema.CompressionAlgorithm;
import org.apache.kudu.ColumnSchema.Encoding;
@@ -163,6 +164,7 @@ public class KuduUtil {
TExprNode literal = boundaryVal.getNodes().get(0);
String colName = col.getName();
org.apache.kudu.Type type = col.getType();
String strLiteral;
switch (type) {
case INT8:
checkCorrectType(literal.isSetInt_literal(), type, colName, literal);
@@ -182,11 +184,13 @@ public class KuduUtil {
break;
case VARCHAR:
checkCorrectType(literal.isSetString_literal(), type, colName, literal);
key.addVarchar(pos, literal.getString_literal().getValue());
key.addVarchar(pos,
StringUtils.fromUtf8Buffer(literal.getString_literal().value, false));
break;
case STRING:
checkCorrectType(literal.isSetString_literal(), type, colName, literal);
key.addString(pos, literal.getString_literal().getValue());
key.addString(pos,
StringUtils.fromUtf8Buffer(literal.getString_literal().value, false));
break;
case UNIXTIME_MICROS:
checkCorrectType(literal.isSetInt_literal(), type, colName, literal);
@@ -244,7 +248,7 @@ public class KuduUtil {
case VARCHAR:
case STRING:
checkCorrectType(literal.isSetString_literal(), type, colName, literal);
return literal.getString_literal().getValue();
return StringUtils.fromUtf8Buffer(literal.getString_literal().value, false);
case BOOL:
checkCorrectType(literal.isSetBool_literal(), type, colName, literal);
return literal.getBool_literal().isValue();

View File

@@ -0,0 +1,55 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
package org.apache.impala.util;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.StandardCharsets;
import com.google.common.base.Preconditions;
public class StringUtils {
/**
* Converts String to utf8 byte array. Assumes that this conversion is successful.
*/
public static byte[] toUtf8Array(String str) {
ByteBuffer buf = StandardCharsets.UTF_8.encode(str);
byte[] arr = new byte[buf.limit()];
buf.get(arr);
return arr;
}
/**
* Converts utf8 ByteBuffer to String.
* Handling decoding errors depends on 'canfail':
* if true: return null
* if false: hit precondition
*/
public static String fromUtf8Buffer(ByteBuffer buf, boolean canFail) {
try {
return StandardCharsets.UTF_8.newDecoder().decode(buf).toString();
} catch (CharacterCodingException ex) {
Preconditions.checkState(canFail, "UTF8 decoding failed: %s", ex.getMessage());
return null;
}
}
}

View File

@@ -392,8 +392,9 @@ public class ExprRewriteRulesTest extends FrontendTestBase {
RewritesOk("base64decode(base64encode('\\047\\001\\132\\060')) = " +
"'\\047\\001\\132\\060'", rule, "TRUE");
// Tests correct handling of strings with chars > 127. Should not be folded.
RewritesOk("hex(unhex(hex(unhex('D3'))))", rule, null);
// Tests correct handling of strings with chars > 127.
RewritesOk("concat('á', '你好')", rule, "'á你好'");
RewritesOk("hex(unhex(hex(unhex('D3'))))", rule, "'D3'");
// Tests that non-deterministic functions are not folded.
RewritesOk("rand()", rule, null);
RewritesOk("random()", rule, null);

View File

@@ -141,7 +141,13 @@ public class PlannerTest extends PlannerTestBase {
// Tests that constant folding is applied to all relevant PlanNodes and DataSinks.
// Note that not all Exprs are printed in the explain plan, so validating those
// via this test is currently not possible.
runPlannerTestFile("constant-folding",
TQueryOptions options = defaultQueryOptions();
options.setUtf8_mode(false);
runPlannerTestFile("constant-folding", options,
ImmutableSet.of(PlannerTestOption.EXTENDED_EXPLAIN,
PlannerTestOption.INCLUDE_QUERY_WITH_IMPLICIT_CASTS));
options.setUtf8_mode(true);
runPlannerTestFile("constant-folding-utf8-mode", options,
ImmutableSet.of(PlannerTestOption.EXTENDED_EXPLAIN,
PlannerTestOption.INCLUDE_QUERY_WITH_IMPLICIT_CASTS));
}

View File

@@ -23,6 +23,7 @@ import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
import java.io.File;
import java.nio.charset.StandardCharsets;
import java.util.Random;
import org.apache.hadoop.conf.Configuration;
@@ -134,18 +135,20 @@ public class JniFrontendTest {
@Test
public void testGetSecretFromKeyStore() throws ImpalaException {
// valid secret-key returns the correct secret
TStringLiteral secretKey = new TStringLiteral("openai-api-key-secret");
String keyName = "openai-api-key-secret";
TStringLiteral secretKey = new TStringLiteral(StandardCharsets.UTF_8.encode(keyName));
byte[] secretKeyBytes = JniUtil.serializeToThrift(secretKey);
String secret = JniFrontend.getSecretFromKeyStore(secretKeyBytes);
assertEquals(secret, "secret");
// invalid secret-key returns error
secretKey = new TStringLiteral("dummy-secret");
keyName = "dummy-secret";
secretKey = new TStringLiteral(StandardCharsets.UTF_8.encode(keyName));
secretKeyBytes = JniUtil.serializeToThrift(secretKey);
try {
secret = JniFrontend.getSecretFromKeyStore(secretKeyBytes);
} catch (InternalException e) {
assertEquals(e.getMessage(),
String.format(JniFrontend.KEYSTORE_ERROR_MSG, secretKey.getValue()));
String.format(JniFrontend.KEYSTORE_ERROR_MSG, keyName));
}
}
}

View File

@@ -0,0 +1,25 @@
# Constant folding of utf8-aware functions with utf8_mode=true.
select string_col from functional.alltypes
where int_col = length(concat("á", "é"))
or string_col = substr("áé", 1, 1)
---- PLAN
Analyzed query: SELECT string_col FROM functional.alltypes WHERE int_col =
CAST(2 AS INT) OR string_col = 'á'
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
| Per-Host Resources: mem-estimate=132.00MB mem-reservation=4.03MB thread-reservation=2
PLAN-ROOT SINK
| output exprs: string_col
| mem-estimate=4.00MB mem-reservation=4.00MB spill-buffer=2.00MB thread-reservation=0
|
00:SCAN HDFS [functional.alltypes]
HDFS partitions=24/24 files=24 size=478.45KB
predicates: int_col = CAST(2 AS INT) OR string_col = 'á'
stored statistics:
table: rows=7.30K size=478.45KB
partitions: 24/24 rows=7.30K
columns: all
extrapolated-rows=disabled max-scan-range-rows=310
mem-estimate=128.00MB mem-reservation=32.00KB thread-reservation=1
tuple-ids=0 row-size=17B cardinality=1.39K
in pipelines: 00(GETNEXT)
====

View File

@@ -483,3 +483,79 @@ PLAN-ROOT SINK
tuple-ids=0 row-size=4B cardinality=2
in pipelines: 00(GETNEXT)
====
# Constant folding for non-ascii string literals.
select string_col from functional.alltypes
where string_col = concat("a", "á")
or string_col = hex(unhex("aa"))
or string_col = unhex("aa")
---- PLAN
Analyzed query: SELECT string_col FROM functional.alltypes WHERE string_col IN
('aá', 'AA', 'unhex("AA")')
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
| Per-Host Resources: mem-estimate=132.00MB mem-reservation=4.03MB thread-reservation=2
PLAN-ROOT SINK
| output exprs: string_col
| mem-estimate=4.00MB mem-reservation=4.00MB spill-buffer=2.00MB thread-reservation=0
|
00:SCAN HDFS [functional.alltypes]
HDFS partitions=24/24 files=24 size=478.45KB
predicates: string_col IN ('aá', 'AA', 'unhex("AA")')
stored statistics:
table: rows=7.30K size=478.45KB
partitions: 24/24 rows=7.30K
columns: all
extrapolated-rows=disabled max-scan-range-rows=310
mem-estimate=128.00MB mem-reservation=32.00KB thread-reservation=1
tuple-ids=0 row-size=13B cardinality=2.19K
in pipelines: 00(GETNEXT)
====
# Constant folding for non-ascii string literals cast to BINARY.
select binary_col from functional.binary_tbl
where binary_col = cast(concat("a", "á") as binary)
or binary_col = cast(hex(unhex("aa")) as binary)
or binary_col = cast(unhex("aa") as binary)
---- PLAN
Analyzed query: SELECT binary_col FROM functional.binary_tbl WHERE binary_col IN
('aá', 'AA', 'unhex("AA")')
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
| Per-Host Resources: mem-estimate=36.00MB mem-reservation=4.01MB thread-reservation=2
PLAN-ROOT SINK
| output exprs: binary_col
| mem-estimate=4.00MB mem-reservation=4.00MB spill-buffer=2.00MB thread-reservation=0
|
00:SCAN HDFS [functional.binary_tbl]
HDFS partitions=1/1 files=1 size=189B
predicates: binary_col IN ('aá', 'AA', 'unhex("AA")')
stored statistics:
table: rows=8 size=189B
columns: all
extrapolated-rows=disabled max-scan-range-rows=8
mem-estimate=32.00MB mem-reservation=8.00KB thread-reservation=1
tuple-ids=0 row-size=12B cardinality=1
in pipelines: 00(GETNEXT)
====
# Constant folding of utf8-aware functions with utf8_mode=false.
select string_col from functional.alltypes
where int_col = length(concat("á", "é"))
or string_col = substr("áé", 1, 1)
---- PLAN
Analyzed query: SELECT string_col FROM functional.alltypes WHERE int_col =
CAST(4 AS INT) OR string_col = 'unhex("C3")'
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
| Per-Host Resources: mem-estimate=132.00MB mem-reservation=4.03MB thread-reservation=2
PLAN-ROOT SINK
| output exprs: string_col
| mem-estimate=4.00MB mem-reservation=4.00MB spill-buffer=2.00MB thread-reservation=0
|
00:SCAN HDFS [functional.alltypes]
HDFS partitions=24/24 files=24 size=478.45KB
predicates: int_col = CAST(4 AS INT) OR string_col = 'unhex("C3")'
stored statistics:
table: rows=7.30K size=478.45KB
partitions: 24/24 rows=7.30K
columns: all
extrapolated-rows=disabled max-scan-range-rows=310
mem-estimate=128.00MB mem-reservation=32.00KB thread-reservation=1
tuple-ids=0 row-size=17B cardinality=1.39K
in pipelines: 00(GETNEXT)
====

View File

@@ -761,22 +761,22 @@ PLAN-ROOT SINK
row-size=36B cardinality=0
====
# BINARY predicate.
# Non-ASCII strings cannot be pushed down to Kudu (IMPALA-10349)
# Non-ASCII strings can be pushed down to Kudu (IMPALA-10349)
select * from functional_kudu.binary_tbl where binary_col=cast("á" as binary);
---- PLAN
PLAN-ROOT SINK
|
00:SCAN KUDU [functional_kudu.binary_tbl]
predicates: binary_col = CAST('á' AS BINARY)
kudu predicates: binary_col = 'á'
row-size=36B cardinality=0
====
# BINARY predicate.
# Not valid utf8 strings cannot be pushed down to Kudu (IMPALA-10349)
# Not valid utf8 strings can be pushed down to Kudu (IMPALA-10349)
select * from functional_kudu.binary_tbl where binary_col=cast(unhex("aa") as binary);
---- PLAN
PLAN-ROOT SINK
|
00:SCAN KUDU [functional_kudu.binary_tbl]
predicates: binary_col = CAST(unhex('aa') AS BINARY)
kudu predicates: binary_col = 'unhex("AA")'
row-size=36B cardinality=0
====

View File

@@ -167,7 +167,7 @@ BIGINT
!row_regex:.*TotalKuduScanRoundTrips: 1.*
====
---- QUERY
# Check that filters with constant folding and non ascii characters cannot
# Check that filters with constant folding and non ascii (utf8) characters can
# be pushed down to kudu (IMPALA-10349).
select count(*) from binary_tbl
where binary_col = cast("á" as binary)
@@ -176,10 +176,10 @@ BIGINT
---- RESULTS
0
---- RUNTIME_PROFILE: table_format=kudu
row_regex:.*TotalKuduScanRoundTrips: 1.*
row_regex:.*TotalKuduScanRoundTrips: 0.*
====
---- QUERY
# Check that filters with constant folding and not valid utf8 characters cannot
# Check that filters with constant folding and not valid utf8 characters can
# be pushed down to kudu (IMPALA-10349).
select count(*) from binary_tbl
where binary_col = cast(unhex("AA") as binary)
@@ -188,5 +188,5 @@ BIGINT
---- RESULTS
0
---- RUNTIME_PROFILE: table_format=kudu
row_regex:.*TotalKuduScanRoundTrips: 1.*
row_regex:.*TotalKuduScanRoundTrips: 0.*
====

View File

@@ -383,6 +383,20 @@ stored as kudu
AnalysisException: Invalid date literal: '111111-33-33'
====
---- QUERY
# create table with invalid default STRING value
create table kudu_invalid_default_string (s string primary key default unhex("aa"))
stored as kudu
---- CATCH
AnalysisException: Invalid String default value: unhex('aa')
====
---- QUERY
# create table with invalid range STRING value
create table kudu_invalid_range_string (s string primary key)
partition by range (s) (partition values < unhex("aa")) stored as kudu;
---- CATCH
AnalysisException: Invalid String range partition value: 'unhex("AA")'
====
---- QUERY
# create table with DATE primary key partitioned by range
create table kudu_datepk_range (fdate DATE not null primary key)
partition by range (fdate)