mirror of
https://github.com/apache/impala.git
synced 2025-12-19 18:12:08 -05:00
IMPALA-12927: Support specifying format for reading JSON BINARY columns
Currently, Impala always assumes that the data in the binary columns of JSON tables is base64 encoded. However, before HIVE-21240, Hive wrote binary data to JSON tables without base64 encoding it, instead writing it as escaped strings. After HIVE-21240, Hive defaults to base64 encoding binary data when writing to JSON tables and introduces the serde property 'json.binary.format' to indicate the encoding method of binary data in JSON tables. To maintain consistency with Hive and avoid correctness issues caused by reading data with the wrong encoding, this patch also introduces the serde property 'json.binary.format' to specify the reading method for binary data in JSON tables. Currently, this property supports reading in either base64 or rawstring format, the same as Hive. Additionally, this patch introduces a query option 'json_binary_format' to achieve the same effect. This query option only takes effect for JSON tables where the serde property 'json.binary.format' is not set. The reading format of binary columns in JSON tables can be configured globally by setting 'json_binary_format' in 'default_query_options'. Note that the default value of 'json_binary_format' is 'NONE'. Impala refuses to read binary columns of a JSON table in two cases: (1) the table has no 'json.binary.format' set and 'json_binary_format' is 'NONE', or (2) the table has an invalid 'json.binary.format' value set. In both cases Impala returns an error message, so that an incorrect format is never used without the user noticing. Testing: - Enabled existing binary type E2E tests for JSON tables - Added new E2E test for 'json.binary.format' Change-Id: Idf61fa3afc0f33caa63fbc05393e975733165e82 Reviewed-on: http://gerrit.cloudera.org:8080/22289 Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
This commit is contained in:
committed by
Impala Public Jenkins
parent
3d24f45f9c
commit
faf322dd41
@@ -178,6 +178,13 @@ enum TBucketType {
|
||||
HASH = 1
|
||||
}
|
||||
|
||||
// Options for JSON binary format to determine how binary data is encoded in JSON.
|
||||
enum TJsonBinaryFormat {
|
||||
NONE = 0
|
||||
BASE64 = 1
|
||||
RAWSTRING = 2
|
||||
}
|
||||
|
||||
struct TCompressionCodec {
|
||||
// Compression codec
|
||||
1: required THdfsCompression codec
|
||||
@@ -357,6 +364,7 @@ struct THdfsStorageDescriptor {
|
||||
6: required byte quoteChar
|
||||
7: required THdfsFileFormat fileFormat
|
||||
8: required i32 blockSize
|
||||
9: optional TJsonBinaryFormat jsonBinaryFormat
|
||||
}
|
||||
|
||||
// Represents an HDFS partition
|
||||
|
||||
@@ -1026,6 +1026,13 @@ enum TImpalaQueryOptions {
|
||||
|
||||
// If True, use the Calcite planner for compilation
|
||||
USE_CALCITE_PLANNER = 191
|
||||
|
||||
// The default format for reading JSON binary columns, can be overridden by table
|
||||
// property 'json.binary.format' (if set). The valid values are:
|
||||
// NONE - default value; the format is unspecified and must come from the table property.
|
||||
// BASE64 - the JSON binary data is read as a base64-encoded string.
|
||||
// RAWSTRING - the JSON binary data is read as a raw string.
|
||||
JSON_BINARY_FORMAT = 192
|
||||
}
|
||||
|
||||
// The summary of a DML statement.
|
||||
|
||||
@@ -778,6 +778,10 @@ struct TQueryOptions {
|
||||
|
||||
// See comment in ImpalaService.thrift
|
||||
192: optional bool use_calcite_planner = false;
|
||||
|
||||
// See comment in ImpalaService.thrift
|
||||
193: optional CatalogObjects.TJsonBinaryFormat json_binary_format =
|
||||
TJsonBinaryFormat.NONE;
|
||||
}
|
||||
|
||||
// Impala currently has three types of sessions: Beeswax, HiveServer2 and external
|
||||
|
||||
Reference in New Issue
Block a user