Files
impala/testdata/bin/load-dependent-tables.sql
Eyizoha 2f06a7b052 IMPALA-10798: Initial support for reading JSON files
Prototype of HdfsJsonScanner, implemented on top of rapidjson, which
supports scanning data from JSON files that are split into scan ranges.

The scanning of JSON data is mainly completed by two parts working
together. The first part is the JsonParser responsible for parsing the
JSON object, which is implemented based on the SAX-style API of
rapidjson. It reads data from the char stream, parses it, and calls the
corresponding callback function when encountering the corresponding JSON
element. See the comments of the JsonParser class for more details.

The other part is the HdfsJsonScanner, which inherits from HdfsScanner
and provides callback functions for the JsonParser. The callback
functions are responsible for providing data buffers to the Parser and
converting and materializing the Parser's parsing results into RowBatch.
It should be noted that the parser returns numeric values as strings to
the scanner. The scanner uses the TextConverter class to convert the
strings to the desired types, similar to how the HdfsTextScanner works.
This is an advantage over using the numeric values provided by
rapidjson directly, as it eliminates concerns about inconsistencies in
converting decimals (e.g. losing precision).

Added a startup flag, enable_json_scanner, to be able to disable this
feature if we hit critical bugs in production.

Limitations
 - Multiline json objects are not fully supported yet. It is ok when
   each file has only one scan range. However, when a file has multiple
   scan ranges, there is a small probability of incomplete scanning of
   multiline JSON objects that span ScanRange boundaries (in such cases,
   parsing errors may be reported). For more details, please refer to
   the comments in the 'multiline_json.test'.
 - Compressed JSON files are not supported yet.
 - Complex types are not supported yet.

Tests
 - Most of the existing end-to-end tests can run on JSON format.
 - Add TestQueriesJsonTables in test_queries.py for testing multiline,
   malformed, and overflow in JSON.

Change-Id: I31309cb8f2d04722a0508b3f9b8f1532ad49a569
Reviewed-on: http://gerrit.cloudera.org:8080/19699
Reviewed-by: Quanlong Huang <huangquanlong@gmail.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
2023-09-05 16:55:41 +00:00

118 lines
4.7 KiB
SQL

-- Licensed to the Apache Software Foundation (ASF) under one
-- or more contributor license agreements. See the NOTICE file
-- distributed with this work for additional information
-- regarding copyright ownership. The ASF licenses this file
-- to you under the Apache License, Version 2.0 (the
-- "License"); you may not use this file except in compliance
-- with the License. You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing,
-- software distributed under the License is distributed on an
-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-- KIND, either express or implied. See the License for the
-- specific language governing permissions and limitations
-- under the License.
-- Create and load tables that depend upon data in the hive test-warehouse already existing
-- Load a mixed-format table. Hive behaves oddly when mixing formats,
-- but the following incantation ensures that the result is a
-- four-partition table. First is text format, second is sequence
-- file, third is RC file, fourth is Parquet. Must be called after
-- test-warehouse is successfully populated
-- Rebuild alltypesmixedformat in the functional database. The table-level file
-- format is switched before each LOAD DATA, and the per-partition formats and
-- delimiters are set explicitly again at the end. Keep the statement order as is.
USE functional;

DROP TABLE IF EXISTS alltypesmixedformat;

CREATE EXTERNAL TABLE alltypesmixedformat (
  id INT,
  bool_col BOOLEAN,
  tinyint_col TINYINT,
  smallint_col SMALLINT,
  int_col INT,
  bigint_col BIGINT,
  float_col FLOAT,
  double_col DOUBLE,
  date_string_col STRING,
  string_col STRING,
  timestamp_col TIMESTAMP
)
PARTITIONED BY (year INT, month INT)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY ','
  ESCAPED BY '\\'
STORED AS TEXTFILE
LOCATION '/test-warehouse/alltypesmixedformat';

-- Partition year=2009/month=1: written while the table is TEXTFILE, copied from alltypes.
INSERT OVERWRITE TABLE alltypesmixedformat PARTITION (year=2009, month=1)
SELECT
  id,
  bool_col,
  tinyint_col,
  smallint_col,
  int_col,
  bigint_col,
  float_col,
  double_col,
  date_string_col,
  string_col,
  timestamp_col
FROM alltypes
WHERE year = 2009 AND month = 1;

-- Partition year=2009/month=2: loaded from /tmp/alltypes_seq after switching to SEQUENCEFILE.
ALTER TABLE alltypesmixedformat SET FILEFORMAT SEQUENCEFILE;
LOAD DATA INPATH '/tmp/alltypes_seq/year=2009/month=2/'
  OVERWRITE INTO TABLE alltypesmixedformat PARTITION (year=2009, month=2);

-- Partition year=2009/month=3: loaded from /tmp/alltypes_rc after switching to RCFILE.
ALTER TABLE alltypesmixedformat SET FILEFORMAT RCFILE;
LOAD DATA INPATH '/tmp/alltypes_rc/year=2009/month=3/'
  OVERWRITE INTO TABLE alltypesmixedformat PARTITION (year=2009, month=3);

-- Partition year=2009/month=4: loaded from /tmp/alltypes_parquet after switching to PARQUET.
ALTER TABLE alltypesmixedformat SET FILEFORMAT PARQUET;
LOAD DATA INPATH '/tmp/alltypes_parquet/year=2009/month=4'
  OVERWRITE INTO TABLE alltypesmixedformat PARTITION (year=2009, month=4);

-- Set the serde delimiters (months 1 and 2 only) and the file format on each
-- partition explicitly.
ALTER TABLE alltypesmixedformat PARTITION (year=2009, month=1)
  SET SERDEPROPERTIES('field.delim'=',', 'escape.delim'='\\');
ALTER TABLE alltypesmixedformat PARTITION (year=2009, month=1)
  SET FILEFORMAT TEXTFILE;

ALTER TABLE alltypesmixedformat PARTITION (year=2009, month=2)
  SET SERDEPROPERTIES('field.delim'=',', 'escape.delim'='\\');
ALTER TABLE alltypesmixedformat PARTITION (year=2009, month=2)
  SET FILEFORMAT SEQUENCEFILE;

ALTER TABLE alltypesmixedformat PARTITION (year=2009, month=3)
  SET FILEFORMAT RCFILE;

ALTER TABLE alltypesmixedformat PARTITION (year=2009, month=4)
  SET FILEFORMAT PARQUET;
-- Parquet variant of chars_formats: an external table over
-- /test-warehouse/chars_formats_parquet with two CHAR columns and one VARCHAR column.
DROP TABLE IF EXISTS functional_parquet.chars_formats;

CREATE EXTERNAL TABLE functional_parquet.chars_formats (
  cs CHAR(5),
  cl CHAR(140),
  vc VARCHAR(32)
)
STORED AS PARQUET
LOCATION '/test-warehouse/chars_formats_parquet';
-- ORC variant of chars_formats: an external table over
-- /test-warehouse/chars_formats_orc_def with two CHAR columns and one VARCHAR column.
DROP TABLE IF EXISTS functional_orc_def.chars_formats;

CREATE EXTERNAL TABLE functional_orc_def.chars_formats (
  cs CHAR(5),
  cl CHAR(140),
  vc VARCHAR(32)
)
STORED AS ORC
LOCATION '/test-warehouse/chars_formats_orc_def';
-- Text variant of chars_formats: an external, comma-delimited table over
-- /test-warehouse/chars_formats_text with backslash as the escape character.
DROP TABLE IF EXISTS functional.chars_formats;

CREATE EXTERNAL TABLE functional.chars_formats (
  cs CHAR(5),
  cl CHAR(140),
  vc VARCHAR(32)
)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY ','
  ESCAPED BY '\\'
STORED AS TEXTFILE
LOCATION '/test-warehouse/chars_formats_text';
-- JSON variant of chars_formats: an external table over
-- /test-warehouse/chars_formats_json with two CHAR columns and one VARCHAR column.
-- NOTE(review): the delimited row format clause matches the text table definition.
-- Confirm whether the JSON serde actually uses these delimiters.
DROP TABLE IF EXISTS functional_json.chars_formats;

CREATE EXTERNAL TABLE functional_json.chars_formats (
  cs CHAR(5),
  cl CHAR(140),
  vc VARCHAR(32)
)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY ','
  ESCAPED BY '\\'
STORED AS JSONFILE
LOCATION '/test-warehouse/chars_formats_json';
-- Avro variant of chars_formats: an external table over
-- /test-warehouse/chars_formats_avro_snap. The inline Avro schema declares each of
-- the three fields as a nullable string. The schema literal spans several lines and
-- must stay unindented because its line breaks are part of the string value.
DROP TABLE IF EXISTS functional_avro_snap.chars_formats;

CREATE EXTERNAL TABLE functional_avro_snap.chars_formats (
  cs CHAR(5),
  cl CHAR(140),
  vc VARCHAR(32)
)
STORED AS AVRO
LOCATION '/test-warehouse/chars_formats_avro_snap'
TBLPROPERTIES ('avro.schema.literal'='{"type":"record",
"name":"CharTypesTest","doc":"Schema generated by Kite",
"fields":[
{"name":"cs","type":["null","string"], "doc":"Type inferred"},
{"name":"cl","type":["null","string"], "doc":"Type inferred"},
{"name":"vc","type":["null","string"], "doc":"Type inferred"}
]}');
---- Unsupported Impala table types
-- Recreate hive_view in the functional database: a view over alltypes that yields
-- the constant 1 as int_col, limited to a single row.
USE functional;
DROP VIEW IF EXISTS hive_view;
CREATE VIEW hive_view AS SELECT 1 AS int_col FROM alltypes limit 1;