mirror of
https://github.com/apache/impala.git
synced 2026-01-08 03:02:48 -05:00
IMPALA-2523: Make HdfsTableSink aware of clustered input
IMPALA-2521 introduced clustering for insert statements. This change makes the HdfsTableSink aware of clustered inputs, so that partitions are opened, written, and closed one by one.

This change also adds/modifies tests in several ways:
- clustered insert tests switch from selecting all rows from alltypessmall to alltypes. Together with varying settings for batch_size, this results in a larger number of row batches being written.
- clustered insert tests select from alltypes instead of functional.alltypes to make sure we also select from various input formats.
- clustered insert tests have been added to select from alltypestiny to create inserts with 1 and 2 rows per partition respectively.
- exhaustive insert tests now use different values for batch_size: 1, 16, 0 (meaning default, 1024). This is limited to uncompressed parquet files, to maintain a reasonable runtime. On my machine, execution of test.insert took 1778 seconds, compared to 1002 seconds with just the default row batch size.
- There is additional testing in test_insert_behaviour.py to make sure that insertion over several row batches only creates one file per partition.
- It renames the test_insert method to make it unique in the file and allow for effective filtering with -k.
- It adds tests to the Analyzer test suite.

Change-Id: Ibeda0bdabbfe44c8ac95bf7c982a75649e1b82d0
Reviewed-on: http://gerrit.cloudera.org:8080/4863
Reviewed-by: Lars Volker <lv@cloudera.com>
Reviewed-by: Tim Armstrong <tarmstrong@cloudera.com>
Tested-by: Internal Jenkins
This commit is contained in:
committed by
Internal Jenkins
parent
178fd59142
commit
8ea21d099f
@@ -861,14 +861,87 @@ Memory limit exceeded
|
||||
---- QUERY
|
||||
# IMPALA-2521: clustered insert into table
|
||||
insert into table alltypesinsert
|
||||
partition (year, month) /*+ clustered */
|
||||
select * from functional.alltypessmall;
|
||||
partition (year, month) /*+ clustered,shuffle */
|
||||
select * from alltypes;
|
||||
---- SETUP
|
||||
DROP PARTITIONS alltypesinsert
|
||||
RESET alltypesinsert
|
||||
---- RESULTS
|
||||
year=2009/month=1/: 25
|
||||
year=2009/month=2/: 25
|
||||
year=2009/month=3/: 25
|
||||
year=2009/month=4/: 25
|
||||
year=2009/month=1/: 310
|
||||
year=2009/month=10/: 310
|
||||
year=2009/month=11/: 300
|
||||
year=2009/month=12/: 310
|
||||
year=2009/month=2/: 280
|
||||
year=2009/month=3/: 310
|
||||
year=2009/month=4/: 300
|
||||
year=2009/month=5/: 310
|
||||
year=2009/month=6/: 300
|
||||
year=2009/month=7/: 310
|
||||
year=2009/month=8/: 310
|
||||
year=2009/month=9/: 300
|
||||
year=2010/month=1/: 310
|
||||
year=2010/month=10/: 310
|
||||
year=2010/month=11/: 300
|
||||
year=2010/month=12/: 310
|
||||
year=2010/month=2/: 280
|
||||
year=2010/month=3/: 310
|
||||
year=2010/month=4/: 300
|
||||
year=2010/month=5/: 310
|
||||
year=2010/month=6/: 300
|
||||
year=2010/month=7/: 310
|
||||
year=2010/month=8/: 310
|
||||
year=2010/month=9/: 300
|
||||
====
|
||||
---- QUERY
|
||||
# IMPALA-2521: clustered insert into table
|
||||
insert into table alltypesinsert
|
||||
partition (year, month) /*+ clustered,shuffle */
|
||||
select * from alltypestiny;
|
||||
---- SETUP
|
||||
DROP PARTITIONS alltypesinsert
|
||||
RESET alltypesinsert
|
||||
---- RESULTS
|
||||
year=2009/month=1/: 2
|
||||
year=2009/month=2/: 2
|
||||
year=2009/month=3/: 2
|
||||
year=2009/month=4/: 2
|
||||
====
|
||||
---- QUERY
|
||||
# IMPALA-2521: clustered insert into table
|
||||
insert into table alltypesinsert
|
||||
partition (year, month) /*+ clustered,noshuffle */
|
||||
select * from alltypestiny;
|
||||
---- SETUP
|
||||
DROP PARTITIONS alltypesinsert
|
||||
RESET alltypesinsert
|
||||
---- RESULTS
|
||||
year=2009/month=1/: 2
|
||||
year=2009/month=2/: 2
|
||||
year=2009/month=3/: 2
|
||||
year=2009/month=4/: 2
|
||||
====
|
||||
---- QUERY
|
||||
# IMPALA-2521: clustered insert into table
|
||||
insert into table alltypesinsert
|
||||
partition (year, month) /*+ clustered,shuffle */
|
||||
select * from alltypestiny where int_col = 0;
|
||||
---- SETUP
|
||||
DROP PARTITIONS alltypesinsert
|
||||
RESET alltypesinsert
|
||||
---- RESULTS
|
||||
year=2009/month=1/: 1
|
||||
year=2009/month=2/: 1
|
||||
year=2009/month=3/: 1
|
||||
year=2009/month=4/: 1
|
||||
====
|
||||
---- QUERY
|
||||
# IMPALA-2521: clustered, unpartitioned insert into table
|
||||
insert into table alltypesnopart_insert
|
||||
/*+ clustered,shuffle */
|
||||
select id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col,
|
||||
double_col, date_string_col, string_col, timestamp_col from alltypessmall;
|
||||
---- SETUP
|
||||
RESET alltypesnopart_insert
|
||||
---- RESULTS
|
||||
: 100
|
||||
====
|
||||
|
||||
Reference in New Issue
Block a user