mirror of
https://github.com/apache/impala.git
synced 2025-12-19 18:12:08 -05:00
Before this patch constant folding only converted the result of an
expression to StringLiteral if all characters were ASCII. The
change allows both UTF8 strings with non ascii characters and
byte arrays that are not valid UTF8 strings - the latter can
occur when constant folding is applied to BINARY columns,
for example in geospatial functions like st_polygon().
The main goal is being able to push down more predicates, e.g.
before that patch a filter like col="á" couldn't be pushed down
to Iceberg/Kudu/Parquet stat filtering, as all these expect literals.
Main changes:
- TStringLiteral uses a binary instead of a string member.
This doesn't affect BE as in c++ both types are compiled
to std::string. In Jave a java.nio.ByteBuffer is used instead of
String.
- StringLiteral uses a byte[] member to store the value of
the literal in case it is not valid UTF8 and cannot be
represented as Java String. In other cases still a String
is used to keep the change minimal, though it may be more
optimal to use UTF8 byte[] due to the smaller size. Always
converting from byte[] to String may be costy in the catalog
as partition values are stored as *Literals and rest of the
catalog operates on String.
- StringLiteral#compareTo() is switched to byte wise compare on byte[]
to be consistent with BE. This was not needed for ASCII strings
as Java String behaves the same way in that case, but non-ASCII
can have different order (note that Impala does not support
collations).
- When an invalid UTF8 StringLiteral is printed, for example in
case of EXPLAIN output, then it is printed as
unhex("<byte array in hexadecimal>"). This is a non-lossy way to
represent it, but it may be too verbose in some cases, e.g. for
large polygons. A follow up commit may refine this, e.g. by
limiting the max size printed.
An issue found while implementing this is that INSERT does not
handle invalid UTF8 partition values correctly, see IMPALA-14096.
This behavior is not changed in the patch.
Testing:
- Added a few tests that push down non-ascii const expressions in
predicates (both with utf8_mode=true and false).
Change-Id: I70663457a0b0a3443e586350f0a5996bb75ba64a
Reviewed-on: http://gerrit.cloudera.org:8080/22603
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
198 lines
5.2 KiB
Thrift
198 lines
5.2 KiB
Thrift
// Licensed to the Apache Software Foundation (ASF) under one
|
|
// or more contributor license agreements. See the NOTICE file
|
|
// distributed with this work for additional information
|
|
// regarding copyright ownership. The ASF licenses this file
|
|
// to you under the Apache License, Version 2.0 (the
|
|
// "License"); you may not use this file except in compliance
|
|
// with the License. You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing,
|
|
// software distributed under the License is distributed on an
|
|
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
// KIND, either express or implied. See the License for the
|
|
// specific language governing permissions and limitations
|
|
// under the License.
|
|
|
|
namespace py impala_thrift_gen.Exprs
|
|
namespace cpp impala
|
|
namespace java org.apache.impala.thrift
|
|
|
|
include "Types.thrift"
|
|
|
|
enum TExprNodeType {
|
|
NULL_LITERAL = 0
|
|
BOOL_LITERAL = 1
|
|
INT_LITERAL = 2
|
|
FLOAT_LITERAL = 3
|
|
STRING_LITERAL = 4
|
|
DECIMAL_LITERAL = 5
|
|
TIMESTAMP_LITERAL = 6
|
|
CASE_EXPR = 7
|
|
COMPOUND_PRED = 8
|
|
IN_PRED = 9
|
|
IS_NULL_PRED = 10
|
|
LIKE_PRED = 11
|
|
SLOT_REF = 12
|
|
TUPLE_IS_NULL_PRED = 13
|
|
FUNCTION_CALL = 14
|
|
AGGREGATE_EXPR = 15
|
|
IS_NOT_EMPTY_PRED = 16
|
|
KUDU_PARTITION_EXPR = 17
|
|
VALID_TUPLE_ID_EXPR = 18
|
|
DATE_LITERAL = 19
|
|
}
|
|
|
|
struct TBoolLiteral {
|
|
1: required bool value
|
|
}
|
|
|
|
struct TCaseExpr {
|
|
1: required bool has_case_expr
|
|
2: required bool has_else_expr
|
|
}
|
|
|
|
struct TDateLiteral {
|
|
// Number of days since 1970-01-01.
|
|
1: required i32 days_since_epoch
|
|
// String representation
|
|
2: required string date_string
|
|
}
|
|
|
|
struct TDecimalLiteral {
|
|
// Value of the unscaled decimal in two's complement big endian
|
|
// i.e. BigInteger.getBytes()
|
|
1: required binary value
|
|
}
|
|
|
|
struct TFloatLiteral {
|
|
1: required double value
|
|
}
|
|
|
|
struct TIntLiteral {
|
|
1: required i64 value
|
|
}
|
|
|
|
struct TTimestampLiteral {
|
|
// 16-byte raw representation of a TimestampValue
|
|
1: required binary value
|
|
}
|
|
|
|
// The units which can be used when extracting a Timestamp. TExtractField is never used
|
|
// in any messages. This enum is here to provide a single definition that can be shared
|
|
// by the front and backend.
|
|
enum TExtractField {
|
|
INVALID_FIELD = 0
|
|
YEAR = 1
|
|
QUARTER = 2
|
|
MONTH = 3
|
|
DAY = 4
|
|
HOUR = 5
|
|
MINUTE = 6
|
|
SECOND = 7
|
|
MILLISECOND = 8
|
|
EPOCH = 9
|
|
}
|
|
|
|
struct TInPredicate {
|
|
1: required bool is_not_in
|
|
}
|
|
|
|
struct TIsNullPredicate {
|
|
1: required bool is_not_null
|
|
}
|
|
|
|
struct TLiteralPredicate {
|
|
1: required bool value
|
|
2: required bool is_null
|
|
}
|
|
|
|
struct TTupleIsNullPredicate {
|
|
1: required list<Types.TTupleId> tuple_ids
|
|
}
|
|
|
|
struct TSlotRef {
|
|
1: required Types.TSlotId slot_id
|
|
}
|
|
|
|
struct TStringLiteral {
|
|
// Use binary instead of string as the value may not be valid utf8.
|
|
1: required binary value;
|
|
}
|
|
|
|
// Additional information for aggregate functions.
|
|
struct TAggregateExpr {
|
|
// Indicates whether this expr is the merge() of an aggregation.
|
|
1: required bool is_merge_agg
|
|
|
|
// The types of the input arguments to the aggregate function. May differ from the
|
|
// input expr types if this is the merge() of an aggregation.
|
|
2: required list<Types.TColumnType> arg_types;
|
|
}
|
|
|
|
// Expr used to call into the Kudu client to determine the partition index for rows. The
|
|
// values for the partition columns are produced by its children.
|
|
struct TKuduPartitionExpr {
|
|
// The Kudu table to use the partitioning scheme from.
|
|
1: required Types.TTableId target_table_id
|
|
|
|
// Mapping from the children of this expr to their column positions in the table, i.e.
|
|
// child(i) produces the value for column referenced_columns[i].
|
|
// TODO: Include the partition cols in the KuduTableDesciptor and remove this.
|
|
2: required list<i32> referenced_columns
|
|
}
|
|
|
|
struct TCastExpr {
|
|
// Holds the format clause of a cast expression.
|
|
1: required string cast_format
|
|
}
|
|
|
|
// This is essentially a union over the subclasses of Expr.
|
|
struct TExprNode {
|
|
1: required TExprNodeType node_type
|
|
2: required Types.TColumnType type
|
|
3: required i32 num_children
|
|
|
|
// Whether the Expr is constant according to the frontend.
|
|
4: required bool is_constant
|
|
|
|
// The function to execute. Not set for SlotRefs and Literals.
|
|
5: optional Types.TFunction fn
|
|
|
|
// If set, child[vararg_start_idx] is the first vararg child.
|
|
6: optional i32 vararg_start_idx
|
|
|
|
7: optional TBoolLiteral bool_literal
|
|
8: optional TCaseExpr case_expr
|
|
9: optional TDateLiteral date_literal
|
|
10: optional TFloatLiteral float_literal
|
|
11: optional TIntLiteral int_literal
|
|
12: optional TInPredicate in_predicate
|
|
13: optional TIsNullPredicate is_null_pred
|
|
14: optional TLiteralPredicate literal_pred
|
|
15: optional TSlotRef slot_ref
|
|
16: optional TStringLiteral string_literal
|
|
17: optional TTupleIsNullPredicate tuple_is_null_pred
|
|
18: optional TDecimalLiteral decimal_literal
|
|
19: optional TAggregateExpr agg_expr
|
|
20: optional TTimestampLiteral timestamp_literal
|
|
21: optional TKuduPartitionExpr kudu_partition_expr
|
|
22: optional TCastExpr cast_expr
|
|
|
|
// If codegen is disabled for this Expr
|
|
23: optional bool is_codegen_disabled
|
|
}
|
|
|
|
// A flattened representation of a tree of Expr nodes, obtained by depth-first
|
|
// traversal.
|
|
struct TExpr {
|
|
1: required list<TExprNode> nodes
|
|
}
|
|
|
|
// A list of TExprs
|
|
struct TExprBatch {
|
|
1: required list<TExpr> exprs
|
|
}
|
|
|