mirror of
https://github.com/apache/impala.git
synced 2026-01-31 09:00:19 -05:00
This implements scanning full ACID tables that contain complex types. The same technique works that we use for primitive types. I.e. we add a LEFT ANTI JOIN on top of the Hdfs scan node in order to subtract the deleted rows from the inserted rows. However, there were some types of queries where we couldn't do that. These are the queries that scan the nested collection items directly. E.g.: SELECT item FROM complextypestbl.int_array; The above query only creates a single tuple descriptor that holds the collection items. Since this tuple descriptor is not at the table-level, we cannot add slot references to the hidden ACID column which are at the top level of the table schema. To resolve this I added a statement rewriter that rewrites the above statement to the following: SELECT item FROM complextypestbl $a$1, $a$1.int_array; Now in this example we'll have two tuple descriptors, one for the table-level, and one for the collection item. So we can add the ACID slot refs to the table-level tuple descriptor. The rewrite is implemented by the new AcidRewriter class. Performance I executed the following query with num_nodes=1 on a non-transactional table (without the rewrite), and on an ACID table (with the rewrite): select count(*) from customer_nested.c_orders.o_lineitems; Without the rewrite: Fetched 1 row(s) in 0.41s +--------------+--------+-------+----------+----------+-------+------------+----------+---------------+---------------------------------------------------+ | Operator | #Hosts | #Inst | Avg Time | Max Time | #Rows | Est. #Rows | Peak Mem | Est. 
Peak Mem | Detail | +--------------+--------+-------+----------+----------+-------+------------+----------+---------------+---------------------------------------------------+ | F00:ROOT | 1 | 1 | 13.61us | 13.61us | | | 0 B | 0 B | | | 01:AGGREGATE | 1 | 1 | 3.68ms | 3.68ms | 1 | 1 | 16.00 KB | 10.00 MB | FINALIZE | | 00:SCAN HDFS | 1 | 1 | 280.47ms | 280.47ms | 6.00M | 15.00M | 56.98 MB | 8.00 MB | tpch_nested_orc_def.customer.c_orders.o_lineitems | +--------------+--------+-------+----------+----------+-------+------------+----------+---------------+---------------------------------------------------+ With the rewrite: Fetched 1 row(s) in 0.42s +---------------------------+--------+-------+----------+----------+---------+------------+----------+---------------+---------------------------------------+ | Operator | #Hosts | #Inst | Avg Time | Max Time | #Rows | Est. #Rows | Peak Mem | Est. Peak Mem | Detail | +---------------------------+--------+-------+----------+----------+---------+------------+----------+---------------+---------------------------------------+ | F00:ROOT | 1 | 1 | 25.16us | 25.16us | | | 0 B | 0 B | | | 05:AGGREGATE | 1 | 1 | 3.44ms | 3.44ms | 1 | 1 | 63.00 KB | 10.00 MB | FINALIZE | | 01:SUBPLAN | 1 | 1 | 16.52ms | 16.52ms | 6.00M | 125.92M | 47.00 KB | 0 B | | | |--04:NESTED LOOP JOIN | 1 | 1 | 188.47ms | 188.47ms | 0 | 10 | 24.00 KB | 12 B | CROSS JOIN | | | |--02:SINGULAR ROW SRC | 1 | 1 | 0ns | 0ns | 0 | 1 | 0 B | 0 B | | | | 03:UNNEST | 1 | 1 | 25.37ms | 25.37ms | 0 | 10 | 0 B | 0 B | $a$1.c_orders.o_lineitems o_lineitems | | 00:SCAN HDFS | 1 | 1 | 96.26ms | 96.26ms | 100.00K | 12.59M | 38.19 MB | 72.00 MB | default.customer_nested $a$1 | +---------------------------+--------+-------+----------+----------+---------+------------+----------+---------------+---------------------------------------+ So the overhead is very little. 
Testing * Added planner tests to PlannerTest/acid-scans.test * E2E query tests to QueryTest/full-acid-complex-type-scans.test * E2E tests for rowid-generation: QueryTest/full-acid-rowid.test Change-Id: I8b2c6cd3d87c452c5b96a913b14c90ada78d4c6f Reviewed-on: http://gerrit.cloudera.org:8080/16228 Reviewed-by: Zoltan Borok-Nagy <boroknagyz@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com> Reviewed-by: Tim Armstrong <tarmstrong@cloudera.com>
42 lines
1.5 KiB
Plaintext
42 lines
1.5 KiB
Plaintext
====
|
|
---- QUERY
|
|
# Negative test: changing a column of this transactional table via ALTER TABLE
# is expected to fail with the AnalysisException listed in the CATCH section.
alter table functional.insert_only_transactional_table change column x y bigint;
|
|
---- CATCH
|
|
AnalysisException: ALTER TABLE not supported on transactional (ACID) table: functional.insert_only_transactional_table
|
|
====
|
|
---- QUERY
|
|
# Negative test: DROP STATS on this transactional table is expected to fail
# with the AnalysisException listed in the CATCH section.
drop stats functional.insert_only_transactional_table;
|
|
---- CATCH
|
|
AnalysisException: DROP STATS not supported on transactional (ACID) table: functional.insert_only_transactional_table
|
|
====
|
|
---- QUERY
|
|
# Negative test: INSERT into this full transactional table is expected to fail
# with the AnalysisException listed in the CATCH section.
insert into functional_orc_def.full_transactional_table values (1);
|
|
---- CATCH
|
|
AnalysisException: INSERT not supported on full transactional (ACID) table: functional_orc_def.full_transactional_table
|
|
====
|
|
---- QUERY
|
|
# Negative test: TRUNCATE of this full transactional table is expected to fail
# with the AnalysisException listed in the CATCH section.
truncate table functional_orc_def.full_transactional_table;
|
|
---- CATCH
|
|
AnalysisException: TRUNCATE not supported on full transactional (ACID) table: functional_orc_def.full_transactional_table
|
|
====
|
|
---- QUERY
|
|
# Impala should reject tables that have multiple files in the same
|
|
# bucket in the same directory.
|
|
# Note: This table is clearly not bucketed, but for row ID
|
|
# generation it has virtual buckets based on the file names.
|
|
# Scenario, step 1 of 3: create a plain ORC table. The HIVE_QUERY section that
# follows inserts into it twice and then sets 'transactional'='true' on it;
# the final QUERY section scans it and expects an error.
# NOTE(review): presumably the two separate Hive inserts are what produce the
# multiple files described above -- confirm against the Hive file layout.
create table test_promotion_fail (i int) stored as orc;
|
|
====
|
|
---- HIVE_QUERY
|
|
use $DATABASE;
|
|
insert into test_promotion_fail values (1);
|
|
insert into test_promotion_fail values (1);
|
|
alter table test_promotion_fail
|
|
set tblproperties('EXTERNAL'='false','transactional'='true');
|
|
====
|
|
---- QUERY
|
|
# Scenario, step 3 of 3: refresh the table after the Hive-side changes, then
# select from it. The statements are expected to fail with the message in the
# CATCH section.
refresh test_promotion_fail;
|
|
select * from test_promotion_fail;
|
|
---- CATCH
|
|
Found original file with unexpected name
|
|
====
|