IMPALA-10349: Support constant folding for non ascii strings

Before this patch constant folding only converted the result of an expression to StringLiteral if all characters were ASCII. The change allows both UTF8 strings with non ascii characters and byte arrays that are not valid UTF8 strings - the latter can occur when constant folding is applied to BINARY columns, for example in geospatial functions like st_polygon(). The main goal is being able to push down more predicates, e.g. before that patch a filter like col="á" couldn't be pushed down to Iceberg/Kudu/Parquet stat filtering, as all these expect literals. Main changes: - TStringLiteral uses a binary instead of a string member. This doesn't affect BE as in c++ both types are compiled to std::string. In Jave a java.nio.ByteBuffer is used instead of String. - StringLiteral uses a byte[] member to store the value of the literal in case it is not valid UTF8 and cannot be represented as Java String. In other cases still a String is used to keep the change minimal, though it may be more optimal to use UTF8 byte[] due to the smaller size. Always converting from byte[] to String may be costy in the catalog as partition values are stored as *Literals and rest of the catalog operates on String. - StringLiteral#compareTo() is switched to byte wise compare on byte[] to be consistent with BE. This was not needed for ASCII strings as Java String behaves the same way in that case, but non-ASCII can have different order (note that Impala does not support collations). - When an invalid UTF8 StringLiteral is printed, for example in case of EXPLAIN output, then it is printed as unhex("<byte array in hexadecimal>"). This is a non-lossy way to represent it, but it may be too verbose in some cases, e.g. for large polygons. A follow up commit may refine this, e.g. by limiting the max size printed. An issue found while implementing this is that INSERT does not handle invalid UTF8 partition values correctly, see IMPALA-14096. This behavior is not changed in the patch. Testing: - Added a few tests that push down non-ascii const expressions in predicates (both with utf8_mode=true and false). Change-Id: I70663457a0b0a3443e586350f0a5996bb75ba64a Reviewed-on: http://gerrit.cloudera.org:8080/22603 Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
2025-12-19 18:12:08 -05:00 · 2025-03-09 23:47:22 +01:00
parent 3475e6b506
commit 85a2211bfb
18 changed files with 345 additions and 63 deletions
--- a/common/thrift/Exprs.thrift
+++ b/common/thrift/Exprs.thrift
@@ -117,7 +117,8 @@ struct TSlotRef {
 }

 struct TStringLiteral {
-  1: required string value;
+  // Use binary instead of string as the value may not be valid utf8.
+  1: required binary value;
 }

 // Additional information for aggregate functions.
--- a/fe/src/main/java/org/apache/impala/analysis/ColumnDef.java
+++ b/fe/src/main/java/org/apache/impala/analysis/ColumnDef.java
@@ -319,6 +319,15 @@ public class ColumnDef {
      Preconditions.checkNotNull(defaultValLiteral);
      outputDefaultValue_ = defaultValLiteral;

+      // Non-utf8 values could be allowed for BINARY literals, but those don't work for
+      // other reason: IMPALA-14173
+      if (outputDefaultValue_ instanceof StringLiteral) {
+        if (!((StringLiteral) outputDefaultValue_).isValidUtf8()) {
+          throw new AnalysisException(
+            "Invalid String default value: " + defaultValue_.toSql());
+        }
+      }
+
      // TODO: Remove when Impala supports a 64-bit TIMESTAMP type.
      if (type_.isTimestamp()) {
        try {
--- a/fe/src/main/java/org/apache/impala/analysis/Expr.java
+++ b/fe/src/main/java/org/apache/impala/analysis/Expr.java
@@ -1840,7 +1840,7 @@ abstract public class Expr extends TreeNode<Expr> implements ParseNode, Cloneabl
        return DEFAULT_AVG_STRING_LENGTH;
      }
    } else if (e instanceof StringLiteral) {
-      return ((StringLiteral) e).getUnescapedValue().length();
+      return ((StringLiteral) e).utf8ArrayLength();
    } else {
      // TODO(tmarshall): Extend this to support other string Exprs, such as
      // function calls that return string.
--- a/fe/src/main/java/org/apache/impala/analysis/LiteralExpr.java
+++ b/fe/src/main/java/org/apache/impala/analysis/LiteralExpr.java
@@ -146,8 +146,7 @@ public abstract class LiteralExpr extends Expr implements Comparable<LiteralExpr
              Long.toString(exprNode.int_literal.value), colType);
          break;
        case STRING_LITERAL:
-          result = LiteralExpr.createFromUnescapedStr(
-              exprNode.string_literal.value, colType);
+          result = new StringLiteral(exprNode.string_literal.getValue(), colType);
          break;
        case BOOL_LITERAL:
          result =  LiteralExpr.createFromUnescapedStr(
@@ -194,7 +193,7 @@ public abstract class LiteralExpr extends Expr implements Comparable<LiteralExpr
  /**
   * Evaluates the given constant expr and returns its result as a LiteralExpr.
   * Assumes expr has been analyzed. Returns constExpr if is it already a LiteralExpr.
-   * Returns null for types that do not have a LiteralExpr subclass, e.g. TIMESTAMP, or
+   * Returns null for types that do not have a LiteralExpr subclass and
   * in cases where the corresponding LiteralExpr is not able to represent the evaluation
   * result, e.g., NaN or infinity. Returns null if the expr evaluation encountered errors
   * or warnings in the BE.
@@ -259,18 +258,7 @@ public abstract class LiteralExpr extends Expr implements Comparable<LiteralExpr
        if (val.isSetBinary_val()) {
          byte[] bytes = new byte[val.binary_val.remaining()];
          val.binary_val.get(bytes);
-          // Converting strings between the BE/FE does not work properly for the
-          // extended ASCII characters above 127. Bail in such cases to avoid
-          // producing incorrect results.
-          for (byte b: bytes) if (b < 0) return null;
-          try {
-            // US-ASCII is 7-bit ASCII.
-            String strVal = new String(bytes, "US-ASCII");
-            // The evaluation result is a raw string that must not be unescaped.
-            result = new StringLiteral(strVal, constExpr.getType(), false);
-          } catch (UnsupportedEncodingException e) {
-            return null;
-          }
+          result = new StringLiteral(bytes, constExpr.getType());
        }
        break;
      case TIMESTAMP:
@@ -312,7 +300,8 @@ public abstract class LiteralExpr extends Expr implements Comparable<LiteralExpr
    if (Expr.IS_NULL_LITERAL.apply(this) && Expr.IS_NULL_LITERAL.apply(other)) return 0;
    if (Expr.IS_NULL_LITERAL.apply(this)) return -1;
    if (Expr.IS_NULL_LITERAL.apply(other)) return 1;
-    if (getClass() != other.getClass()) return -1;
+    // Sublcass implementations do not handle comparison with other Literal types.
+    Preconditions.checkState(getClass() == other.getClass());
    return 0;
  }

--- a/fe/src/main/java/org/apache/impala/analysis/RangePartition.java
+++ b/fe/src/main/java/org/apache/impala/analysis/RangePartition.java
@@ -201,6 +201,13 @@ public class RangePartition extends StmtNode {
          "NULL. Range partition: '%s'", toSql()));
    }

+    if (literal instanceof StringLiteral) {
+        if (!((StringLiteral) literal).isValidUtf8()) {
+          throw new AnalysisException(
+            "Invalid String range partition value: " + literal.toSql());
+        }
+    }
+
    // Special case string literals in timestamp columns for convenience.
    if (literal.getType().isStringType() && colType.isTimestamp()) {
      // Add an explicit cast to TIMESTAMP
--- a/fe/src/main/java/org/apache/impala/analysis/StringLiteral.java
+++ b/fe/src/main/java/org/apache/impala/analysis/StringLiteral.java
@@ -20,7 +20,10 @@ package org.apache.impala.analysis;
 import java.io.IOException;
 import java.io.StringReader;
 import java.math.BigDecimal;
+import java.nio.ByteBuffer;
+import java.util.Arrays;

+import org.apache.commons.codec.binary.Hex;
 import org.apache.impala.catalog.ScalarType;
 import org.apache.impala.catalog.Type;
 import org.apache.impala.catalog.TypeCompatibility;
@@ -29,14 +32,22 @@ import org.apache.impala.compat.MetastoreShim;
 import org.apache.impala.thrift.TExprNode;
 import org.apache.impala.thrift.TExprNodeType;
 import org.apache.impala.thrift.TStringLiteral;
+import org.apache.impala.util.StringUtils;

 import com.google.common.base.MoreObjects;
 import com.google.common.base.Preconditions;

+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import java_cup.runtime.Symbol;

 public class StringLiteral extends LiteralExpr {
-  private final String value_;
+  private final static Logger LOG = LoggerFactory.getLogger(StringLiteral.class);
+  // From the strValue_ and binValue_ always only one is null while the other is non-null.
+  // strValue_ == null means that the value is not decodable as utf8.
+  private final String strValue_;
+  private final byte[] binValue_;
  public static int MAX_STRING_LEN = Integer.MAX_VALUE;

  // Indicates whether this value needs to be unescaped in toThrift() or comparison.
@@ -50,17 +61,33 @@ public class StringLiteral extends LiteralExpr {
  }

  public StringLiteral(String value, Type type, boolean needsUnescaping) {
-    value_ = value;
+    strValue_ = value;
+    binValue_ = null;
    type_ = type;
    needsUnescaping_ = needsUnescaping;
  }

+  public StringLiteral(byte[] value, Type type) {
+    // It is ok to fail as StringLiteral can contain binary data. strValue_ is null in
+    // this case.
+    strValue_ = StringUtils.fromUtf8Buffer(ByteBuffer.wrap(value), true);
+    if (strValue_ == null) {
+      // Save binValue_ only if it cannot be represented as String.
+      binValue_ = value;
+    } else {
+      binValue_ = null;
+    }
+    type_ = type;
+    needsUnescaping_ = false;
+  }
+
  /**
   * Copy c'tor used in clone().
   */
  protected StringLiteral(StringLiteral other) {
    super(other);
-    value_ = other.value_;
+    strValue_ = other.strValue_;
+    binValue_ = other.binValue_;
    needsUnescaping_ = other.needsUnescaping_;
  }

@@ -68,12 +95,22 @@ public class StringLiteral extends LiteralExpr {
  protected boolean localEquals(Expr that) {
    if (!super.localEquals(that)) return false;
    StringLiteral other = (StringLiteral) that;
-    return needsUnescaping_ == other.needsUnescaping_ && type_.equals(other.type_)
-        && value_.equals(other.value_);
+    if (!type_.equals(other.type_)) return false;
+    if (needsUnescaping_ != other.needsUnescaping_) return false;
+    if (binValue_ != null) {
+      Preconditions.checkState(strValue_ == null);
+      if (other.binValue_ == null) return false;
+      return Arrays.equals(binValue_, other.binValue_);
+    }
+    Preconditions.checkState(strValue_ != null);
+    return strValue_.equals(other.strValue_);
  }

+  // TODO: shouldn't this also check needsUnescaping?
  @Override
-  public int hashCode() { return value_.hashCode(); }
+  public int hashCode() {
+    return binValue_ != null ? Arrays.hashCode(binValue_) : strValue_.hashCode();
+  }

  @Override
  public String toSqlImpl(ToSqlOptions options) {
@@ -83,21 +120,35 @@ public class StringLiteral extends LiteralExpr {
  @Override
  protected void toThrift(TExprNode msg) {
    msg.node_type = TExprNodeType.STRING_LITERAL;
-    String val = (needsUnescaping_) ? getUnescapedValue() : value_;
-    msg.string_literal = new TStringLiteral(val);
+    byte[] val;
+    if (needsUnescaping_) {
+      val = StringUtils.toUtf8Array(getUnescapedValue());
+    } else {
+      val = getBinValue();
+    }
+    msg.string_literal = new TStringLiteral(ByteBuffer.wrap(val));
  }

  /**
   * Returns the original value that the string literal was constructed with,
   * without escaping or unescaping it.
+   * Can be only used when the StringLiteral can be represented as java String.
   */
-  public String getValueWithOriginalEscapes() { return value_; }
+  public String getValueWithOriginalEscapes() {
+    checkHasValidString();
+    return strValue_;
+  }

  public String getUnescapedValue() {
+    checkHasValidString();
    // Unescape string exactly like Hive does. Hive's method assumes
    // quotes so we add them here to reuse Hive's code.
-    return MetastoreShim.unescapeSQLString("'" + getNormalizedValue()
-        + "'");
+    return MetastoreShim.unescapeSQLString("'" + getNormalizedValue() + "'");
+  }
+
+  private String unescapeIfNeeded() {
+    checkHasValidString();
+    return needsUnescaping_ ? getUnescapedValue() : strValue_;
  }

  /**
@@ -109,12 +160,17 @@ public class StringLiteral extends LiteralExpr {
   *          SQL as a single-quoted string literal.
   */
  private String getNormalizedValue() {
-    final int len = value_.length();
+    if (strValue_ == null) {
+      // express binary using unhex()
+      String hex = Hex.encodeHexString(binValue_);
+      return "unhex(\"" + hex.toUpperCase() +"\")";
+    }
+    final int len = strValue_.length();
    final StringBuilder sb = new StringBuilder(len);
    for (int i = 0; i < len; ++i) {
-      final char currentChar = value_.charAt(i);
+      final char currentChar = strValue_.charAt(i);
      if (currentChar == '\\' && (i + 1) < len) {
-        final char nextChar = value_.charAt(i + 1);
+        final char nextChar = strValue_.charAt(i + 1);
        // unescape an escaped double quote: remove back-slash in front of the quote.
        if (nextChar == '"' || nextChar == '\'' || nextChar == '\\') {
          if (nextChar != '"') {
@@ -143,8 +199,10 @@ public class StringLiteral extends LiteralExpr {

  @Override
  public String debugString() {
+    // TODO: hex encode the binary case if someone needs this
+    String str = strValue_ != null ? strValue_ : "<can't encode as String>";
    return MoreObjects.toStringHelper(this)
-        .add("value", value_)
+        .add("strValue", str)
        .toString();
  }

@@ -179,7 +237,8 @@ public class StringLiteral extends LiteralExpr {
   *           or if floating point value is NaN or infinite
   */
  public LiteralExpr convertToNumber(Type targetType) throws AnalysisException {
-    StringReader reader = new StringReader(value_);
+    checkHasValidString();
+    StringReader reader = new StringReader(strValue_);
    SqlScanner scanner = new SqlScanner(reader);
    // For distinguishing positive and negative numbers.
    boolean negative = false;
@@ -198,7 +257,7 @@ public class StringLiteral extends LiteralExpr {
      throw new AnalysisException("Failed to convert string literal to number.", e);
    }
    if (sym.sym == SqlParserSymbols.NUMERIC_OVERFLOW) {
-      throw new AnalysisException("Number too large: " + value_);
+      throw new AnalysisException("Number too large: " + strValue_);
    }
    if (sym.sym == SqlParserSymbols.INTEGER_LITERAL) {
      BigDecimal val = (BigDecimal) sym.value;
@@ -212,20 +271,43 @@ public class StringLiteral extends LiteralExpr {
    }
    // Symbol is not an integer or floating point literal.
    throw new AnalysisException("Failed to convert string literal '"
-        + value_ + "' to number.");
+        + strValue_ + "' to number.");
  }

  @Override
  public int compareTo(LiteralExpr o) {
    int ret = super.compareTo(o);
-    if (ret != 0) return ret;
+    if (ret != 0) return ret; // TODO: this doesn't seem to handle the NULL case well
    StringLiteral other = (StringLiteral) o;
-    String thisValue = needsUnescaping_? getUnescapedValue() : value_;
-    String otherValue = other.needsUnescaping_?
-        other.getUnescapedValue() : other.getStringValue();
-    return thisValue.compareTo(otherValue);
+    // Do byte wise comparisin on UTF8 format to be consistant with BE.
+    // TODO: it would be clearer to use backand's comparision functions in literals.
+    byte[] arr1 = getBinValue();
+    byte[] arr2 = other.getBinValue();
+    // TODO: rewrite once Java 8 support is dropped
+    //return Arrays.compare(binValue_, other.binValue_); // added in java 9
+    for (int i = 0; i < arr1.length && i < arr2.length; i++) {
+      if (arr1[i] < arr2[i]) return -1;
+      if (arr1[i] > arr2[i]) return 1;
+    }
+    if (arr1.length < arr2.length) return -1;
+    if (arr1.length > arr2.length) return 1;
+    return 0;
  }

  @Override
  public Expr clone() { return new StringLiteral(this); }
+
+  public int utf8ArrayLength() { return getBinValue().length; }
+
+  public boolean isValidUtf8() { return binValue_ == null; }
+
+  public byte[] getBinValue() {
+    if (binValue_ != null) return binValue_;
+    return StringUtils.toUtf8Array(unescapeIfNeeded());
+  }
+
+  private void checkHasValidString() {
+    Preconditions.checkState(strValue_ != null,
+        "non-utf8 string: %s", getNormalizedValue());
+  }
 }
--- a/fe/src/main/java/org/apache/impala/planner/KuduScanNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/KuduScanNode.java
@@ -576,13 +576,16 @@ public class KuduScanNode extends ScanNode {
      case STRING:
      case VARCHAR:
      case CHAR: {
+        StringLiteral strLit = (StringLiteral)literal;
+        if (!strLit.isValidUtf8()) return false;
        kuduPredicate = KuduPredicate.newComparisonPredicate(column, op,
-            ((StringLiteral)literal).getUnescapedValue());
+            strLit.getUnescapedValue());
        break;
      }
      case BINARY: {
+        StringLiteral strLit = (StringLiteral)literal;
        kuduPredicate = KuduPredicate.newComparisonPredicate(column, op,
-            ((StringLiteral)literal).getUnescapedValue().getBytes());
+            strLit.getBinValue());
        break;
      }
      case TIMESTAMP: {
--- a/fe/src/main/java/org/apache/impala/service/JniFrontend.java
+++ b/fe/src/main/java/org/apache/impala/service/JniFrontend.java
@@ -115,6 +115,7 @@ import org.slf4j.LoggerFactory;
 import java.io.File;
 import java.io.IOException;
 import java.lang.IllegalArgumentException;
+import java.nio.charset.StandardCharsets;
 import java.util.Calendar;
 import java.util.Collections;
 import java.util.GregorianCalendar;
@@ -807,11 +808,13 @@ public class JniFrontend {
    JniUtil.deserializeThrift(protocolFactory_, secretKey, secretKeyRequest);
    String secret = null;
    try {
-      char[] secretCharArray = CONF.getPassword(secretKey.getValue());
+      String secretKeyStr =
+          StandardCharsets.UTF_8.decode(secretKey.value).toString();
+      char[] secretCharArray = CONF.getPassword(secretKeyStr);
      if (secretCharArray != null) {
        secret = new String(secretCharArray);
      } else {
-        String errMsg = String.format(KEYSTORE_ERROR_MSG, secretKey.getValue());
+        String errMsg = String.format(KEYSTORE_ERROR_MSG, secretKeyStr);
        LOG.error(errMsg);
        throw new InternalException(errMsg);
      }
@@ -830,7 +833,11 @@ public class JniFrontend {
    final TStringLiteral timezone = new TStringLiteral();
    JniUtil.deserializeThrift(protocolFactory_, timezone, timezoneT);
    // TimeZone.getTimeZone defaults to GMT if it doesn't recognize the timezone.
-    Calendar c = new GregorianCalendar(TimeZone.getTimeZone(timezone.getValue()));
+    // Invalid characters are replaced if decoding is not successful. This should lead
+    // to a failed timezone lookup and using default GMT.
+    String timezoneStr =
+        StandardCharsets.UTF_8.decode(timezone.value).toString();
+    Calendar c = new GregorianCalendar(TimeZone.getTimeZone(timezoneStr));
    c.setTimeInMillis(utc_time_millis);

    final TCivilTime civilTime = new TCivilTime(
--- a/fe/src/main/java/org/apache/impala/util/KuduUtil.java
+++ b/fe/src/main/java/org/apache/impala/util/KuduUtil.java
@@ -51,6 +51,7 @@ import org.apache.impala.thrift.TExpr;
 import org.apache.impala.thrift.TExprNode;
 import org.apache.impala.thrift.TExprNodeType;
 import org.apache.impala.thrift.THdfsCompression;
+import org.apache.impala.util.StringUtils;
 import org.apache.kudu.ColumnSchema;
 import org.apache.kudu.ColumnSchema.CompressionAlgorithm;
 import org.apache.kudu.ColumnSchema.Encoding;
@@ -163,6 +164,7 @@ public class KuduUtil {
    TExprNode literal = boundaryVal.getNodes().get(0);
    String colName = col.getName();
    org.apache.kudu.Type type = col.getType();
+    String strLiteral;
    switch (type) {
      case INT8:
        checkCorrectType(literal.isSetInt_literal(), type, colName, literal);
@@ -182,11 +184,13 @@ public class KuduUtil {
        break;
      case VARCHAR:
        checkCorrectType(literal.isSetString_literal(), type, colName, literal);
-        key.addVarchar(pos, literal.getString_literal().getValue());
+        key.addVarchar(pos,
+            StringUtils.fromUtf8Buffer(literal.getString_literal().value, false));
        break;
      case STRING:
        checkCorrectType(literal.isSetString_literal(), type, colName, literal);
-        key.addString(pos, literal.getString_literal().getValue());
+        key.addString(pos,
+            StringUtils.fromUtf8Buffer(literal.getString_literal().value, false));
        break;
      case UNIXTIME_MICROS:
        checkCorrectType(literal.isSetInt_literal(), type, colName, literal);
@@ -244,7 +248,7 @@ public class KuduUtil {
      case VARCHAR:
      case STRING:
        checkCorrectType(literal.isSetString_literal(), type, colName, literal);
-        return literal.getString_literal().getValue();
+        return StringUtils.fromUtf8Buffer(literal.getString_literal().value, false);
      case BOOL:
        checkCorrectType(literal.isSetBool_literal(), type, colName, literal);
        return literal.getBool_literal().isValue();
--- a/fe/src/main/java/org/apache/impala/util/StringUtils.java
+++ b/fe/src/main/java/org/apache/impala/util/StringUtils.java
@@ -0,0 +1,55 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.impala.util;
+
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.CharacterCodingException;
+import java.nio.charset.StandardCharsets;
+
+import com.google.common.base.Preconditions;
+
+
+public class StringUtils {
+
+  /**
+   * Converts String to utf8 byte array. Assumes that this conversion is successful.
+   */
+  public static byte[] toUtf8Array(String str) {
+    ByteBuffer buf = StandardCharsets.UTF_8.encode(str);
+    byte[] arr = new byte[buf.limit()];
+    buf.get(arr);
+    return arr;
+  }
+
+  /**
+   * Converts utf8 ByteBuffer to String.
+   * Handling decoding errors depends on 'canfail':
+   * if true: return null
+   * if false: hit precondition
+   */
+  public static String fromUtf8Buffer(ByteBuffer buf, boolean canFail) {
+    try {
+      return StandardCharsets.UTF_8.newDecoder().decode(buf).toString();
+    } catch (CharacterCodingException ex) {
+      Preconditions.checkState(canFail, "UTF8 decoding failed: %s", ex.getMessage());
+      return null;
+    }
+  }
+
+}
--- a/fe/src/test/java/org/apache/impala/analysis/ExprRewriteRulesTest.java
+++ b/fe/src/test/java/org/apache/impala/analysis/ExprRewriteRulesTest.java
@@ -392,8 +392,9 @@ public class ExprRewriteRulesTest extends FrontendTestBase {
    RewritesOk("base64decode(base64encode('\\047\\001\\132\\060')) = " +
      "'\\047\\001\\132\\060'", rule, "TRUE");

-    // Tests correct handling of strings with chars > 127. Should not be folded.
-    RewritesOk("hex(unhex(hex(unhex('D3'))))", rule, null);
+    // Tests correct handling of strings with chars > 127.
+    RewritesOk("concat('á', '你好')", rule, "'á你好'");
+    RewritesOk("hex(unhex(hex(unhex('D3'))))", rule, "'D3'");
    // Tests that non-deterministic functions are not folded.
    RewritesOk("rand()", rule, null);
    RewritesOk("random()", rule, null);
--- a/fe/src/test/java/org/apache/impala/planner/PlannerTest.java
+++ b/fe/src/test/java/org/apache/impala/planner/PlannerTest.java
@@ -141,7 +141,13 @@ public class PlannerTest extends PlannerTestBase {
    // Tests that constant folding is applied to all relevant PlanNodes and DataSinks.
    // Note that not all Exprs are printed in the explain plan, so validating those
    // via this test is currently not possible.
-    runPlannerTestFile("constant-folding",
+    TQueryOptions options = defaultQueryOptions();
+    options.setUtf8_mode(false);
+    runPlannerTestFile("constant-folding", options,
+        ImmutableSet.of(PlannerTestOption.EXTENDED_EXPLAIN,
+            PlannerTestOption.INCLUDE_QUERY_WITH_IMPLICIT_CASTS));
+    options.setUtf8_mode(true);
+    runPlannerTestFile("constant-folding-utf8-mode", options,
        ImmutableSet.of(PlannerTestOption.EXTENDED_EXPLAIN,
            PlannerTestOption.INCLUDE_QUERY_WITH_IMPLICIT_CASTS));
  }
--- a/fe/src/test/java/org/apache/impala/service/JniFrontendTest.java
+++ b/fe/src/test/java/org/apache/impala/service/JniFrontendTest.java
@@ -23,6 +23,7 @@ import static org.mockito.Mockito.mock;
 import static org.mockito.Mockito.when;

 import java.io.File;
+import java.nio.charset.StandardCharsets;
 import java.util.Random;

 import org.apache.hadoop.conf.Configuration;
@@ -134,18 +135,20 @@ public class JniFrontendTest {
  @Test
  public void testGetSecretFromKeyStore() throws ImpalaException {
    // valid secret-key returns the correct secret
-    TStringLiteral secretKey = new TStringLiteral("openai-api-key-secret");
+    String keyName = "openai-api-key-secret";
+    TStringLiteral secretKey = new TStringLiteral(StandardCharsets.UTF_8.encode(keyName));
    byte[] secretKeyBytes = JniUtil.serializeToThrift(secretKey);
    String secret = JniFrontend.getSecretFromKeyStore(secretKeyBytes);
    assertEquals(secret, "secret");
    // invalid secret-key returns error
-    secretKey = new TStringLiteral("dummy-secret");
+    keyName = "dummy-secret";
+    secretKey = new TStringLiteral(StandardCharsets.UTF_8.encode(keyName));
    secretKeyBytes = JniUtil.serializeToThrift(secretKey);
    try {
      secret = JniFrontend.getSecretFromKeyStore(secretKeyBytes);
    } catch (InternalException e) {
      assertEquals(e.getMessage(),
-          String.format(JniFrontend.KEYSTORE_ERROR_MSG, secretKey.getValue()));
+          String.format(JniFrontend.KEYSTORE_ERROR_MSG, keyName));
    }
  }
 }
--- a/testdata/workloads/functional-planner/queries/PlannerTest/constant-folding-utf8-mode.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/constant-folding-utf8-mode.test
@@ -0,0 +1,25 @@
+# Constant folding of utf8-aware functions with utf8_mode=true.
+select string_col from functional.alltypes
+where int_col = length(concat("á", "é"))
+   or string_col = substr("áé", 1, 1)
+---- PLAN
+Analyzed query: SELECT string_col FROM functional.alltypes WHERE int_col =
+CAST(2 AS INT) OR string_col = 'á'
+F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
+|  Per-Host Resources: mem-estimate=132.00MB mem-reservation=4.03MB thread-reservation=2
+PLAN-ROOT SINK
+|  output exprs: string_col
+|  mem-estimate=4.00MB mem-reservation=4.00MB spill-buffer=2.00MB thread-reservation=0
+|
+00:SCAN HDFS [functional.alltypes]
+   HDFS partitions=24/24 files=24 size=478.45KB
+   predicates: int_col = CAST(2 AS INT) OR string_col = 'á'
+   stored statistics:
+     table: rows=7.30K size=478.45KB
+     partitions: 24/24 rows=7.30K
+     columns: all
+   extrapolated-rows=disabled max-scan-range-rows=310
+   mem-estimate=128.00MB mem-reservation=32.00KB thread-reservation=1
+   tuple-ids=0 row-size=17B cardinality=1.39K
+   in pipelines: 00(GETNEXT)
+====
--- a/testdata/workloads/functional-planner/queries/PlannerTest/constant-folding.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/constant-folding.test
@@ -483,3 +483,79 @@ PLAN-ROOT SINK
   tuple-ids=0 row-size=4B cardinality=2
   in pipelines: 00(GETNEXT)
 ====
+# Constant folding for non-ascii string literals.
+select string_col from functional.alltypes
+where string_col = concat("a", "á")
+   or string_col = hex(unhex("aa"))
+   or string_col = unhex("aa")
+---- PLAN
+Analyzed query: SELECT string_col FROM functional.alltypes WHERE string_col IN
+('aá', 'AA', 'unhex("AA")')
+F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
+|  Per-Host Resources: mem-estimate=132.00MB mem-reservation=4.03MB thread-reservation=2
+PLAN-ROOT SINK
+|  output exprs: string_col
+|  mem-estimate=4.00MB mem-reservation=4.00MB spill-buffer=2.00MB thread-reservation=0
+|
+00:SCAN HDFS [functional.alltypes]
+   HDFS partitions=24/24 files=24 size=478.45KB
+   predicates: string_col IN ('aá', 'AA', 'unhex("AA")')
+   stored statistics:
+     table: rows=7.30K size=478.45KB
+     partitions: 24/24 rows=7.30K
+     columns: all
+   extrapolated-rows=disabled max-scan-range-rows=310
+   mem-estimate=128.00MB mem-reservation=32.00KB thread-reservation=1
+   tuple-ids=0 row-size=13B cardinality=2.19K
+   in pipelines: 00(GETNEXT)
+====
+# Constant folding for non-ascii string literals cast to BINARY.
+select binary_col from functional.binary_tbl
+where binary_col = cast(concat("a", "á") as binary)
+   or binary_col = cast(hex(unhex("aa")) as binary)
+   or binary_col = cast(unhex("aa") as binary)
+---- PLAN
+Analyzed query: SELECT binary_col FROM functional.binary_tbl WHERE binary_col IN
+('aá', 'AA', 'unhex("AA")')
+F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
+|  Per-Host Resources: mem-estimate=36.00MB mem-reservation=4.01MB thread-reservation=2
+PLAN-ROOT SINK
+|  output exprs: binary_col
+|  mem-estimate=4.00MB mem-reservation=4.00MB spill-buffer=2.00MB thread-reservation=0
+|
+00:SCAN HDFS [functional.binary_tbl]
+   HDFS partitions=1/1 files=1 size=189B
+   predicates: binary_col IN ('aá', 'AA', 'unhex("AA")')
+   stored statistics:
+     table: rows=8 size=189B
+     columns: all
+   extrapolated-rows=disabled max-scan-range-rows=8
+   mem-estimate=32.00MB mem-reservation=8.00KB thread-reservation=1
+   tuple-ids=0 row-size=12B cardinality=1
+   in pipelines: 00(GETNEXT)
+====
+# Constant folding of utf8-aware functions with utf8_mode=false.
+select string_col from functional.alltypes
+where int_col = length(concat("á", "é"))
+   or string_col = substr("áé", 1, 1)
+---- PLAN
+Analyzed query: SELECT string_col FROM functional.alltypes WHERE int_col =
+CAST(4 AS INT) OR string_col = 'unhex("C3")'
+F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
+|  Per-Host Resources: mem-estimate=132.00MB mem-reservation=4.03MB thread-reservation=2
+PLAN-ROOT SINK
+|  output exprs: string_col
+|  mem-estimate=4.00MB mem-reservation=4.00MB spill-buffer=2.00MB thread-reservation=0
+|
+00:SCAN HDFS [functional.alltypes]
+   HDFS partitions=24/24 files=24 size=478.45KB
+   predicates: int_col = CAST(4 AS INT) OR string_col = 'unhex("C3")'
+   stored statistics:
+     table: rows=7.30K size=478.45KB
+     partitions: 24/24 rows=7.30K
+     columns: all
+   extrapolated-rows=disabled max-scan-range-rows=310
+   mem-estimate=128.00MB mem-reservation=32.00KB thread-reservation=1
+   tuple-ids=0 row-size=17B cardinality=1.39K
+   in pipelines: 00(GETNEXT)
+====
--- a/testdata/workloads/functional-planner/queries/PlannerTest/kudu.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/kudu.test
@@ -761,22 +761,22 @@ PLAN-ROOT SINK
   row-size=36B cardinality=0
 ====
 # BINARY predicate.
-# Non-ASCII strings cannot be pushed down to Kudu (IMPALA-10349)
+# Non-ASCII strings can be pushed down to Kudu (IMPALA-10349)
 select * from functional_kudu.binary_tbl where binary_col=cast("á" as binary);
 ---- PLAN
 PLAN-ROOT SINK
 |
 00:SCAN KUDU [functional_kudu.binary_tbl]
-   predicates: binary_col = CAST('á' AS BINARY)
+   kudu predicates: binary_col = 'á'
   row-size=36B cardinality=0
 ====
 # BINARY predicate.
-# Not valid utf8 strings cannot be pushed down to Kudu (IMPALA-10349)
+# Not valid utf8 strings can be pushed down to Kudu (IMPALA-10349)
 select * from functional_kudu.binary_tbl where binary_col=cast(unhex("aa") as binary);
 ---- PLAN
 PLAN-ROOT SINK
 |
 00:SCAN KUDU [functional_kudu.binary_tbl]
-   predicates: binary_col = CAST(unhex('aa') AS BINARY)
+   kudu predicates: binary_col = 'unhex("AA")'
   row-size=36B cardinality=0
 ====
--- a/testdata/workloads/functional-query/queries/QueryTest/binary-type.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/binary-type.test
@@ -167,7 +167,7 @@ BIGINT
 !row_regex:.*TotalKuduScanRoundTrips: 1.*
 ====
 ---- QUERY
-# Check that filters with constant folding and non ascii characters cannot
+# Check that filters with constant folding and non ascii (utf8) characters can
 # be pushed down to kudu (IMPALA-10349).
 select count(*) from binary_tbl
 where binary_col = cast("á" as binary)
@@ -176,10 +176,10 @@ BIGINT
 ---- RESULTS
 0
 ---- RUNTIME_PROFILE: table_format=kudu
-row_regex:.*TotalKuduScanRoundTrips: 1.*
+row_regex:.*TotalKuduScanRoundTrips: 0.*
 ====
 ---- QUERY
-# Check that filters with constant folding and not valid utf8 characters cannot
+# Check that filters with constant folding and not valid utf8 characters can
 # be pushed down to kudu (IMPALA-10349).
 select count(*) from binary_tbl
 where binary_col = cast(unhex("AA") as binary)
@@ -188,5 +188,5 @@ BIGINT
 ---- RESULTS
 0
 ---- RUNTIME_PROFILE: table_format=kudu
-row_regex:.*TotalKuduScanRoundTrips: 1.*
+row_regex:.*TotalKuduScanRoundTrips: 0.*
 ====
--- a/testdata/workloads/functional-query/queries/QueryTest/kudu_create.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/kudu_create.test
@@ -383,6 +383,20 @@ stored as kudu
 AnalysisException: Invalid date literal: '111111-33-33'
 ====
 ---- QUERY
+# create table with invalid default STRING value
+create table kudu_invalid_default_string (s string primary key default unhex("aa"))
+stored as kudu
+---- CATCH
+AnalysisException: Invalid String default value: unhex('aa')
+====
+---- QUERY
+# create table with invalid range STRING value
+create table kudu_invalid_range_string (s string primary key)
+partition by range (s) (partition values < unhex("aa")) stored as kudu;
+---- CATCH
+AnalysisException: Invalid String range partition value: 'unhex("AA")'
+====
+---- QUERY
 # create table with DATE primary key partitioned by range
 create table kudu_datepk_range (fdate DATE not null primary key)
 partition by range (fdate)