Files
impala/testdata/workloads/functional-query/queries/QueryTest/json-binary-format.test
Eyizoha faf322dd41 IMPALA-12927: Support specifying format for reading JSON BINARY columns
Currently, Impala always assumes that the data in the binary columns of
JSON tables is base64 encoded. However, before HIVE-21240, Hive wrote
binary data to JSON tables without base64 encoding it, instead writing
it as escaped strings. After HIVE-21240, Hive defaults to base64
encoding binary data when writing to JSON tables and introduces the
serde property 'json.binary.format' to indicate the encoding method of
binary data in JSON tables.

To maintain consistency with Hive and avoid correctness issues caused by
reading data in an incorrect manner, this patch also introduces the
serde property 'json.binary.format' to specify the reading method for
binary data in JSON tables. Currently, this property supports reading in
either base64 or rawstring formats, same as Hive.

Additionally, this patch introduces a query option 'json_binary_format'
to achieve the same effect. This query option will only take effect for
JSON tables where the serde property 'json.binary.format' is not set.
The reading format of binary columns in JSON tables can be configured
globally by setting the 'default_query_options'. It should be noted that
the default value of 'json_binary_format' is 'NONE', and Impala will
refuse to read binary columns of a JSON table when either
"'json.binary.format' is not set and 'json_binary_format' is 'NONE'" or
"'json.binary.format' is set to an invalid value". In both cases it returns
an error message, so that an incorrect format is never used without the user noticing.

Testing:
  - Enabled existing binary type E2E tests for JSON tables
  - Added new E2E test for 'json.binary.format'

Change-Id: Idf61fa3afc0f33caa63fbc05393e975733165e82
Reviewed-on: http://gerrit.cloudera.org:8080/22289
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
2025-04-29 16:16:12 +00:00

110 lines
3.6 KiB
Plaintext

====
---- QUERY
# Neither the serde property nor the query option provides a format, so scanning
# the binary column is expected to fail with the error in the CATCH section.
# A refresh is required for serdeproperties changes to take effect, see IMPALA-13748.
alter table binary_tbl unset serdeproperties ('json.binary.format');
refresh binary_tbl;
set json_binary_format=none;
select
  id,
  string_col,
  cast(binary_col as string)
from binary_tbl
---- CATCH
No valid serde properties 'json.binary.format' or query option 'json_binary_format' ('base64' or 'rawstring') provided for scanning binary column of json table '$DATABASE.binary_tbl'.
====
---- QUERY
# Only non-binary columns are selected, so the query succeeds even though
# json_binary_format is 'none'.
set json_binary_format=none;
select
  id,
  string_col
from binary_tbl
---- TYPES
INT, STRING
---- RESULTS:
1,'ascii'
2,'ascii'
3,'null'
4,'empty'
5,'valid utf8'
6,'valid utf8'
7,'invalid utf8'
8,'invalid utf8'
====
---- QUERY
# The serde property is not set, so the query option decides how the binary
# column is read. Here the option is 'rawstring'.
set json_binary_format=rawstring;
select
  id,
  string_col,
  cast(binary_col as string)
from binary_tbl
---- TYPES
INT, STRING, STRING
---- RESULTS:
1,'ascii','YmluYXJ5MQ=='
2,'ascii','YmluYXJ5Mg=='
3,'null','NULL'
4,'empty',''
5,'valid utf8','w6FydsOtenTFsXLFkXTDvGvDtnJmw7pyw7M='
6,'valid utf8','5L2g5aW9aGVsbG8='
7,'invalid utf8','AP8A/w=='
8,'invalid utf8','/0QzIhEA'
====
---- QUERY
# A serde property that is set wins over the query option even when its value is
# invalid: the valid query option 'rawstring' is ignored and the scan fails.
alter table binary_tbl set serdeproperties ('json.binary.format'='foobar');
refresh binary_tbl;
set json_binary_format=rawstring;
select
  id,
  string_col,
  cast(binary_col as string)
from binary_tbl
---- CATCH
Invalid serde property 'json.binary.format' for scanning binary column of json table '$DATABASE.binary_tbl'. Valid values are 'base64' or 'rawstring'.
====
---- QUERY
# With the property set to 'base64', the binary column is read using base64
# encoding; the query option 'rawstring' is not applied.
# The select re-encodes the value with base64encode() for comparison.
alter table binary_tbl set serdeproperties ('json.binary.format'='base64');
refresh binary_tbl;
set json_binary_format=rawstring;
select
  id,
  string_col,
  base64encode(cast(binary_col as string))
from binary_tbl
---- TYPES
INT, STRING, STRING
---- RESULTS:
1,'ascii','YmluYXJ5MQ=='
2,'ascii','YmluYXJ5Mg=='
3,'null','NULL'
4,'empty',''
5,'valid utf8','w6FydsOtenTFsXLFkXTDvGvDtnJmw7pyw7M='
6,'valid utf8','5L2g5aW9aGVsbG8='
7,'invalid utf8','AP8A/w=='
8,'invalid utf8','/0QzIhEA'
====
---- QUERY
# Removing the property and supplying 'base64' through the query option instead
# is expected to give the same results as setting the property to 'base64'.
alter table binary_tbl unset serdeproperties ('json.binary.format');
refresh binary_tbl;
set json_binary_format=base64;
select
  id,
  string_col,
  base64encode(cast(binary_col as string))
from binary_tbl
---- TYPES
INT, STRING, STRING
---- RESULTS:
1,'ascii','YmluYXJ5MQ=='
2,'ascii','YmluYXJ5Mg=='
3,'null','NULL'
4,'empty',''
5,'valid utf8','w6FydsOtenTFsXLFkXTDvGvDtnJmw7pyw7M='
6,'valid utf8','5L2g5aW9aGVsbG8='
7,'invalid utf8','AP8A/w=='
8,'invalid utf8','/0QzIhEA'
====
---- QUERY
# A single query scans two json tables whose binary columns use different formats:
# the local 'binary_tbl' is switched to 'rawstring' here, while
# 'functional_json.binary_tbl' has 'base64'.
alter table binary_tbl set serdeproperties ('json.binary.format'='rawstring');
refresh binary_tbl;
select
  r.id,
  cast(r.binary_col as string),
  base64encode(cast(b.binary_col as string))
from binary_tbl r
inner join functional_json.binary_tbl b using (id)
---- TYPES
INT, STRING, STRING
---- RESULTS:
1,'YmluYXJ5MQ==','YmluYXJ5MQ=='
2,'YmluYXJ5Mg==','YmluYXJ5Mg=='
3,'NULL','NULL'
4,'',''
5,'w6FydsOtenTFsXLFkXTDvGvDtnJmw7pyw7M=','w6FydsOtenTFsXLFkXTDvGvDtnJmw7pyw7M='
6,'5L2g5aW9aGVsbG8=','5L2g5aW9aGVsbG8='
7,'AP8A/w==','AP8A/w=='
8,'/0QzIhEA','/0QzIhEA'
====