mirror of
https://github.com/apache/impala.git
synced 2026-01-27 06:10:53 -05:00
Minor compactions can compact several delta directories into a single
delta directory. The current directory filtering algorithm had to be
modified to handle minor compacted directories and prefer those over
plain delta directories. This happens in the Frontend, mostly in
AcidUtils.java.
Hive Streaming Ingestion writes similar delta directories, but they
might contain rows Impala cannot see based on its valid write id list.
E.g. we can have the following delta directory:
full_acid/delta_0000001_0000010/0000 # minWriteId: 1
# maxWriteId: 10
This delta dir contains rows with write ids between 1 and 10. But maybe
we are only allowed to see write ids less than 5. Therefore we need to
check the ACID write id column (named originalTransaction) to determine
which rows are valid.
Delta directories written by Hive Streaming don't have a visibility txn
id, so we can recognize them based on the directory name. If there's
a visibilityTxnId and it is committed => every row is valid:
full_acid/delta_0000001_0000010_v01234 # has visibilityTxnId
# every row is valid
If there's no visibilityTxnId then it was created via Hive Streaming,
therefore we need to validate rows. Fortunately Hive Streaming writes
rows with different write ids into different ORC stripes, therefore we
don't need to validate the write id per row. If we had statistics,
we could validate per stripe, but since Hive Streaming doesn't write
statistics we validate the write id per ORC row batch (an alternative
could be to do a 2-pass read, first we'd read a single value from each
stripe's 'currentTransaction' field, then we'd read the stripe if the
write id is valid).
Testing
* the frontend logic is tested in AcidUtilsTest
* the backend row validation is tested in test_acid_row_validation
Change-Id: I5ed74585a2d73ebbcee763b0545be4412926299d
Reviewed-on: http://gerrit.cloudera.org:8080/15818
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
139 lines
2.6 KiB
Plaintext
====
|
|
---- HIVE_QUERY
|
|
# Create a table with Hive and run insert, select, and drop from Impala on it.
|
|
use $DATABASE;
|
|
create table tt (x int) tblproperties (
|
|
'transactional'='true',
|
|
'transactional_properties'='insert_only');
|
|
|
|
insert into tt values (1);
|
|
====
|
|
---- QUERY
|
|
invalidate metadata tt;
|
|
select * from tt
|
|
---- RESULTS
|
|
1
|
|
====
|
|
---- HIVE_QUERY
|
|
# Insert from Hive to test refreshing the table from Impala in the test below.
|
|
use $DATABASE;
|
|
insert into tt values (2);
|
|
====
|
|
---- QUERY
|
|
refresh tt;
|
|
select * from tt order by x;
|
|
---- RESULTS
|
|
1
|
|
2
|
|
====
|
|
---- QUERY
|
|
# Do a second refresh on an already refreshed ACID table.
|
|
refresh tt;
|
|
select * from tt order by x;
|
|
---- RESULTS
|
|
1
|
|
2
|
|
====
|
|
---- QUERY
|
|
insert overwrite table tt values (3);
|
|
insert into tt values (4);
|
|
====
|
|
---- QUERY
|
|
refresh tt;
|
|
select * from tt order by x;
|
|
---- RESULTS
|
|
3
|
|
4
|
|
====
|
|
---- QUERY
|
|
create table upgraded_table (x int);
|
|
insert into upgraded_table values (1);
|
|
====
|
|
---- HIVE_QUERY
|
|
use $DATABASE;
|
|
# Upgrade the table to insert-only ACID when there are already values in it.
|
|
alter table upgraded_table set tblproperties
|
|
('transactional' = 'true', 'transactional_properties' = 'insert_only',
|
|
'EXTERNAL' = 'FALSE');
|
|
====
|
|
---- QUERY
|
|
refresh upgraded_table;
|
|
insert into upgraded_table values (2);
|
|
insert into upgraded_table values (3);
|
|
====
|
|
---- QUERY
|
|
select * from upgraded_table;
|
|
---- RESULTS
|
|
1
|
|
2
|
|
3
|
|
====
|
|
---- QUERY
|
|
drop table tt;
|
|
show tables;
|
|
---- RESULTS
|
|
'upgraded_table'
|
|
====
|
|
---- QUERY
|
|
# After dropping the table, re-create and drop it again to check that all the locks
|
|
# are released properly from HMS.
|
|
create table tt (x int) tblproperties (
|
|
'transactional'='true',
|
|
'transactional_properties'='insert_only');
|
|
====
|
|
---- QUERY
|
|
show tables;
|
|
---- RESULTS
|
|
'upgraded_table'
|
|
'tt'
|
|
====
|
|
---- QUERY
|
|
drop table tt;
|
|
show tables;
|
|
---- RESULTS
|
|
'upgraded_table'
|
|
====
|
|
---- QUERY
|
|
create table full_acid (i int) stored as orc
|
|
tblproperties('transactional'='true');
|
|
show tables;
|
|
---- RESULTS
|
|
'full_acid'
|
|
'upgraded_table'
|
|
====
|
|
---- QUERY
|
|
drop table full_acid;
|
|
---- RESULTS
|
|
'Table has been dropped.'
|
|
====
|
|
---- QUERY
|
|
show tables;
|
|
---- RESULTS
|
|
'upgraded_table'
|
|
====
|
|
---- QUERY
|
|
# Test reading minor-compacted table.
|
|
show files in functional_orc_def.complextypestbl_minor_compacted;
|
|
---- LABELS
|
|
Path,Size,Partition
|
|
---- RESULTS
|
|
row_regex:'$NAMENODE/test-warehouse/managed/complextypestbl_minor_compacted_orc_def/delta_0000001_0000008_v\d+/bucket_00000','.+KB',''
|
|
---- TYPES
|
|
STRING,STRING,STRING
|
|
====
|
|
---- QUERY
|
|
select row__id.originaltransaction, id
|
|
from functional_orc_def.complextypestbl_minor_compacted;
|
|
---- RESULTS
|
|
1,1
|
|
2,2
|
|
3,3
|
|
4,4
|
|
5,5
|
|
6,6
|
|
7,7
|
|
8,8
|
|
---- TYPES
|
|
BIGINT,BIGINT
|
|
====
|