Files
impala/testdata/workloads/functional-query/queries/QueryTest/inline-view.test
Fang-Yu Rao 07d44b7aff IMPALA-13262: Do not always migrate inferred predicates into inline view
This patch removes a predicate inferred from a set of analytic
predicates if both sides of the inferred predicate reference the same
TupleId when migrating those analytic predicates into an inline view.
This is to prevent Impala from pushing the inferred conjunct to the
scan node before the analytic functions are applied, which could produce
an incorrect result.

Testing:
 - Added additional query and planner test cases to verify Impala's
   behavior after this patch.
 - Verified the patch passed the core tests.

Change-Id: I6e2632b3b1a140ae0104ceba4e2f474ac1bbcda1
Reviewed-on: http://gerrit.cloudera.org:8080/21688
Reviewed-by: Michael Smith <michael.smith@cloudera.com>
Reviewed-by: Riza Suminto <riza.suminto@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
2024-08-21 23:25:39 +00:00

708 lines
18 KiB
Plaintext

====
---- QUERY
# join between three tables, extra join predicates, extra scan predicates, nulls in joins
# cols
# (alltypesagg.tinyint_col contains nulls instead of 0s)
select x.smallint_col, x.id, x.tinyint_col, c.id, x.int_col, x.float_col, c.string_col
from (
select a.smallint_col smallint_col, a.tinyint_col tinyint_col, a.day day,
a.int_col int_col, a.month month,
b.float_col float_col, b.id id
from (
select *
from alltypesagg a
where month=1
) a
join alltypessmall b
on (a.smallint_col = b.id)
) x
join alltypessmall c on (x.tinyint_col = c.id)
where x.day=1
and x.int_col > 899
and x.float_col > 4.5
and c.string_col < '7'
and x.int_col + x.float_col + cast(c.string_col as float) < 1000
---- RESULTS
15,15,5,5,915,5.5,'5'
16,16,6,6,916,6.599999904632568,'6'
31,31,1,1,931,6.599999904632568,'1'
32,32,2,2,932,7.699999809265137,'2'
33,33,3,3,933,8.800000190734863,'3'
34,34,4,4,934,9.899999618530273,'4'
41,41,1,1,941,6.599999904632568,'1'
42,42,2,2,942,7.699999809265137,'2'
43,43,3,3,943,8.800000190734863,'3'
44,44,4,4,944,9.899999618530273,'4'
5,5,5,5,905,5.5,'5'
55,55,5,5,955,5.5,'5'
56,56,6,6,956,6.599999904632568,'6'
6,6,6,6,906,6.599999904632568,'6'
65,65,5,5,965,5.5,'5'
66,66,6,6,966,6.599999904632568,'6'
81,81,1,1,981,6.599999904632568,'1'
82,82,2,2,982,7.699999809265137,'2'
83,83,3,3,983,8.800000190734863,'3'
84,84,4,4,984,9.899999618530273,'4'
91,91,1,1,991,6.599999904632568,'1'
---- TYPES
smallint, int, tinyint, int, int, float, string
====
---- QUERY
# Same join as above, but subquery on the RHS
select x.smallint_col, x.id, x.tinyint_col, c.id, x.int_col, x.float_col, c.string_col
from alltypessmall c
join
(
select a.smallint_col smallint_col, a.tinyint_col tinyint_col, a.day day,
a.int_col int_col, a.month month,
b.float_col float_col, b.id id
from alltypessmall b
join
(
select *
from alltypesagg a
where month=1
) a
on (a.smallint_col = b.id)
) x
on (x.tinyint_col = c.id)
where x.day=1
and x.int_col > 899
and x.float_col > 4.5
and c.string_col < '7'
and x.int_col + x.float_col + cast(c.string_col as float) < 1000
---- RESULTS
15,15,5,5,915,5.5,'5'
16,16,6,6,916,6.599999904632568,'6'
31,31,1,1,931,6.599999904632568,'1'
32,32,2,2,932,7.699999809265137,'2'
33,33,3,3,933,8.800000190734863,'3'
34,34,4,4,934,9.899999618530273,'4'
41,41,1,1,941,6.599999904632568,'1'
42,42,2,2,942,7.699999809265137,'2'
43,43,3,3,943,8.800000190734863,'3'
44,44,4,4,944,9.899999618530273,'4'
5,5,5,5,905,5.5,'5'
55,55,5,5,955,5.5,'5'
56,56,6,6,956,6.599999904632568,'6'
6,6,6,6,906,6.599999904632568,'6'
65,65,5,5,965,5.5,'5'
66,66,6,6,966,6.599999904632568,'6'
81,81,1,1,981,6.599999904632568,'1'
82,82,2,2,982,7.699999809265137,'2'
83,83,3,3,983,8.800000190734863,'3'
84,84,4,4,984,9.899999618530273,'4'
91,91,1,1,991,6.599999904632568,'1'
---- TYPES
smallint, int, tinyint, int, int, float, string
====
---- QUERY
# aggregate without group by
select *
from (
select count(*), count(tinyint_col), min(tinyint_col), max(tinyint_col),
sum(tinyint_col), avg(tinyint_col)
from (
select * from alltypesagg
) a
) b
---- RESULTS
11000,9000,1,9,45000,5
---- TYPES
bigint, bigint, tinyint, tinyint, bigint, double
====
---- QUERY
# aggregate with group-by, having
select *
from (
select int_col % 7 c1, count(*) c2, avg(int_col) c3
from (
select * from alltypesagg where day is not null
) a
group by 1
having avg(int_col) > 500 or count(*) = 10
) b
where c1 is not null
and c2 > 10
---- RESULTS
4,1430,501
5,1430,502
0,1420,500.5
---- TYPES
int, bigint, double
====
---- QUERY
# subquery with aggregation and order by/limit, as left-hand side of join;
# having clause in subquery is transfered to merge agg step in distrib plan
select *
from (
select int_col, count(*)
from alltypessmall
where month = 1
group by int_col
having count(*) > 2
order by count(*) desc, int_col limit 5
) t1
join alltypes t2 on (t1.int_col = t2.id)
where month = 1
---- RESULTS: VERIFY_IS_EQUAL_SORTED
0,3,0,true,0,0,0,0,0,0,'01/01/09','0',2009-01-01 00:00:00,2009,1
1,3,1,false,1,1,1,10,1.100000023841858,10.1,'01/01/09','1',2009-01-01 00:01:00,2009,1
2,3,2,true,2,2,2,20,2.200000047683716,20.2,'01/01/09','2',2009-01-01 00:02:00.100000000,2009,1
3,3,3,false,3,3,3,30,3.299999952316284,30.3,'01/01/09','3',2009-01-01 00:03:00.300000000,2009,1
4,3,4,true,4,4,4,40,4.400000095367432,40.4,'01/01/09','4',2009-01-01 00:04:00.600000000,2009,1
---- TYPES
int, bigint, int, boolean, tinyint, smallint, int, bigint, float, double, string, string, timestamp, int, int
====
---- QUERY
# subquery with aggregation and order by/limit, as left-hand side of join;
# having clause in subquery is transfered to merge agg step in distrib plan
# Disable the estimation of cardinality for an hdfs table withot stats.
set DISABLE_HDFS_NUM_ROWS_ESTIMATE=1;
select *
from (
select int_col, count(*)
from alltypessmall
where month = 1
group by int_col
having count(*) > 2
order by count(*) desc, int_col limit 5
) t1
join alltypes t2 on (t1.int_col = t2.id)
where month = 1
---- RESULTS
0,3,0,true,0,0,0,0,0,0,'01/01/09','0',2009-01-01 00:00:00,2009,1
1,3,1,false,1,1,1,10,1.100000023841858,10.1,'01/01/09','1',2009-01-01 00:01:00,2009,1
2,3,2,true,2,2,2,20,2.200000047683716,20.2,'01/01/09','2',2009-01-01 00:02:00.100000000,2009,1
3,3,3,false,3,3,3,30,3.299999952316284,30.3,'01/01/09','3',2009-01-01 00:03:00.300000000,2009,1
4,3,4,true,4,4,4,40,4.400000095367432,40.4,'01/01/09','4',2009-01-01 00:04:00.600000000,2009,1
---- TYPES
int, bigint, int, boolean, tinyint, smallint, int, bigint, float, double, string, string, timestamp, int, int
====
---- QUERY
select distinct *
from (
select bool_col, tinyint_col, count(*)
from alltypesagg
group by bool_col, tinyint_col
having bool_col = true
) x
where tinyint_col < 6
---- RESULTS
true,2,1000
true,4,1000
---- TYPES
boolean, tinyint, bigint
====
---- QUERY
# distinct w/ explicit select list
select *
from (
select distinct bool_col, tinyint_col
from (
select * from alltypesagg where tinyint_col < 7
) y
) x
where bool_col = true
---- RESULTS
true,2
true,4
true,6
---- TYPES
boolean, tinyint
====
---- QUERY
# semi-join on string
select *
from (
select d.*
from DimTbl d left semi join JoinTbl j on (j.test_name = d.name)
) x
where x.name > 'Name1'
---- RESULTS
1002,'Name2',94611
1003,'Name3',94612
1004,'Name4',94612
1005,'Name5',94613
1006,'Name6',94613
---- TYPES
bigint, string, int
====
---- QUERY
select j.*, d.*
from (
select *
from JoinTbl a
) j
left outer join
(
select *
from DimTbl b
) d
on (j.test_name = d.name)
where j.test_id <= 1006
---- RESULTS
1001,'Name1',94611,5000,1001,'Name1',94611
1002,'Name2',94611,5000,1002,'Name2',94611
1003,'Name3',94611,5000,1003,'Name3',94612
1004,'Name4',94611,5000,1004,'Name4',94612
1005,'Name5',94611,5000,1005,'Name5',94613
1006,'Name16',94612,15000,NULL,'NULL',NULL
1006,'Name16',94612,5000,NULL,'NULL',NULL
1006,'Name16',94616,15000,NULL,'NULL',NULL
1006,'Name16',94616,5000,NULL,'NULL',NULL
1006,'Name6',94616,15000,1006,'Name6',94613
1006,'Name6',94616,5000,1006,'Name6',94613
---- TYPES
bigint, string, int, int, bigint, string, int
====
---- QUERY
# TODO: If we apply predicate on d, the result will be incorrect. This is a general
# predicate evaluation issue.
#
select j.*, d.*
from (
select *
from JoinTbl a
) j
left outer join
(
select *
from DimTbl b
) d
on (j.test_name = d.name)
where j.test_id <= 1006
---- RESULTS
1001,'Name1',94611,5000,1001,'Name1',94611
1002,'Name2',94611,5000,1002,'Name2',94611
1003,'Name3',94611,5000,1003,'Name3',94612
1004,'Name4',94611,5000,1004,'Name4',94612
1005,'Name5',94611,5000,1005,'Name5',94613
1006,'Name16',94612,15000,NULL,'NULL',NULL
1006,'Name16',94612,5000,NULL,'NULL',NULL
1006,'Name16',94616,15000,NULL,'NULL',NULL
1006,'Name16',94616,5000,NULL,'NULL',NULL
1006,'Name6',94616,15000,1006,'Name6',94613
1006,'Name6',94616,5000,1006,'Name6',94613
---- TYPES
bigint, string, int, int, bigint, string, int
====
---- QUERY
# Constant selects in subqueries
select * from (select 1, 2) x
---- RESULTS
1,2
---- TYPES
tinyint, tinyint
====
---- QUERY
# Constant selects in subqueries
select * from (select y from (select 1 y) a where y < 10) b
---- RESULTS
1
---- TYPES
tinyint
====
---- QUERY
# Constant selects in subqueries
select * from (select 1 union all select 2 union all select * from (select 3) y) x
---- RESULTS
1
2
3
---- TYPES
tinyint
====
---- QUERY
# Join on inline views made up of unions of constant selects
select * from
(select 1 a, 2 b union all select 1 a, 2 b) x
inner join
(select 1 a, 3 b union all select 1 a, 2 b) y on x.a = y.a
inner join
(select 1 a, 3 b union all select 1 a, 3 b) z on z.b = y.b
---- RESULTS
1,2,1,3,1,3
1,2,1,3,1,3
1,2,1,3,1,3
1,2,1,3,1,3
---- TYPES
tinyint, tinyint, tinyint, tinyint, tinyint, tinyint
====
---- QUERY
# Semi and inner join on a table and on inline views made up of constant selects
select x.date_string_col, z.* from alltypessmall x
left semi join
(select 1 a, 3 b union all select 1 a, 3 b) y on y.a = x.id
inner join
(select 1 a, 3 b union all select 1 a, 3 b) z on z.b = x.id + 2
---- RESULTS
'01/01/09',1,3
'01/01/09',1,3
---- TYPES
string, tinyint, tinyint
====
---- QUERY
# Values statement in subqueries
select * from (values(1, 2), (3, 4)) x
---- RESULTS
1,2
3,4
---- TYPES
TINYINT, TINYINT
====
---- QUERY
# Values statement in subqueries
select * from (select y from (values(1 as y), (11)) a where y < 10) b
---- RESULTS
1
---- TYPES
TINYINT
====
---- QUERY
# Values statement in subqueries with union
select * from (values(1), (2) union all select * from (values(3)) y) x
---- RESULTS
1
2
3
---- TYPES
TINYINT
====
---- QUERY
# Join on inline views made up of values statements
select * from
(values(1 a, 2 b), (1, 2)) x
inner join
(values(1 a, 3 b), (1, 2)) y on x.a = y.a
inner join
(values(1 a, 3 b), (1, 3)) z on z.b = y.b
---- RESULTS
1,2,1,3,1,3
1,2,1,3,1,3
1,2,1,3,1,3
1,2,1,3,1,3
---- TYPES
TINYINT, TINYINT, TINYINT, TINYINT, TINYINT, TINYINT
====
---- QUERY
# Semi and inner join on a table and on inline views made up of values statements
select x.date_string_col, z.* from alltypessmall x
left semi join
(values(1 a, 3 b), (1, 3)) y on y.a = x.id
inner join
(values(1 a, 3 b), (1, 3)) z on z.b = x.id + 2
---- RESULTS
'01/01/09',1,3
'01/01/09',1,3
---- TYPES
STRING, TINYINT, TINYINT
====
---- QUERY
# Select constant with unreferenced aggregate in subquery
select 1 from (select count(*) from alltypessmall) x
---- RESULTS
1
---- TYPES
tinyint
====
---- QUERY
# Select constant with unreferenced distinct aggregate in subquery
select 1 from (select count(distinct tinyint_col) from alltypessmall) x
---- RESULTS
1
---- TYPES
tinyint
====
---- QUERY
# Select aggregate from unreferenced aggregate in subquery
select count(*) from (select count(*) from alltypessmall) x
---- RESULTS
1
---- TYPES
bigint
====
---- QUERY
# Select * from aggregate in subquery
select * from (select count(*) from alltypessmall) x
---- RESULTS
100
---- TYPES
bigint
====
---- QUERY
# Select from aliased aggregate in subquery
select c from (select count(*) c from alltypessmall) x
---- RESULTS
100
---- TYPES
bigint
====
---- QUERY
# Select aggregate from aliased aggregate in subquery
select count(c) from (select count(*) c from alltypessmall) x
---- RESULTS
1
---- TYPES
bigint
====
---- QUERY
# Select aggregate from aggregate of basetable column in subquery
select count(1) from (select count(tinyint_col) from alltypessmall) x
---- RESULTS
1
---- TYPES
bigint
====
---- QUERY
# Select aggregate from aggregate in subquery with group by
select count(1) from
(select count(*) from alltypessmall group by tinyint_col) x
---- RESULTS
10
---- TYPES
bigint
====
---- QUERY
# Regression test for IMPALA-812
select bool_col, count(*) from
(select bool_col FROM ( SELECT bool_col FROM alltypessmall t ) t WHERE bool_col ) t
group by 1
---- RESULTS
true,50
---- TYPES
boolean, bigint
====
---- QUERY
# Test that tuple ids from semi-joined table refs are not added in an
# IsNullPredicate expr; inline view with left semi join between a base table
# and an inline view (IMPALA-1526)
select t1.int_col
from alltypestiny t1 left join
(select coalesce(t1.year, 384) as int_col
from alltypesagg t1
where t1.bigint_col in
(select day as int_col from alltypesagg where t1.id = day)) t2
on t2.int_col = t1.month
where t1.month is not null
---- RESULTS: VERIFY_IS_EQUAL_SORTED
0
1
0
1
0
1
0
1
---- TYPES
INT
====
---- QUERY
# Test that tuple ids from semi-joined table refs are not added in
# an IsNullPredicate expr; inline view with right semi join between
# two inline views (IMPALA-1526)
select distinct t1.int_col
from alltypestiny t1 left join
(select coalesce(t3.int_col, 384) as int_col
from (select int_col from alltypes) t1
right semi join (select int_col from alltypesagg) t3 on t1.int_col = t3.int_col) t2
on t2.int_col = t1.month
where t1.month is not null
---- RESULTS: VERIFY_IS_EQUAL_SORTED
0
1
---- TYPES
INT
====
---- QUERY
# Regression test for IMPALA-1528
SELECT t2.int_col is NULL FROM alltypesagg t1
LEFT JOIN (SELECT COALESCE(76, -937, -981) AS int_col FROM alltypestiny) t2
ON (t2.int_col) = (t1.year) limit 1
---- RESULTS
true
---- TYPES
boolean
====
---- QUERY
# IMPALA-1987: Equi-join predicates of outer joins contain TupleIsNullPredicate exprs.
select t1.int_col, t2.int_col, t3.id
from alltypestiny t1 left outer join
(select coalesce(int_col, 384) as int_col from alltypestiny) t2
on t1.int_col = t2.int_col
left outer join
(select 0 as id from alltypestiny) t3
on t1.int_col = t3.id
order by 1 limit 5
---- RESULTS
0,0,0
0,0,0
0,0,0
0,0,0
0,0,0
---- TYPES
INT,INT,TINYINT
====
---- QUERY
# IMPALA-10182: Nulls get eliminated with union-all for duplicate columns
select count(*) from (
select c1, c2 from (select tinyint_col c1, tinyint_col c2
from alltypesagg group by 1, 2) t1
group by 1, 2
union all
select c1, c2 from (select tinyint_col c1, tinyint_col c2
from alltypesagg group by 1, 2) t1
group by 1, 2) tt;
---- RESULTS
20
---- TYPES
BIGINT
====
---- QUERY
# IMPALA-10182: Nulls get eliminated with union-all for duplicate columns
# Introduce nulls from the null producing side of left outer join
with dt1 as (select t2.int_col y from alltypessmall t1
left outer join alltypestiny t2 on t1.int_col = t2.int_col)
select * from (
select c1, c2 from (select dt1.y c1, dt1.y c2 from dt1 group by 1, 2) t1
group by 1, 2
union all
select c1, c2 from (select dt1.y c1, dt1.y c2 from dt1 group by 1, 2) t1
group by 1, 2) tt order by 1, 2;
---- RESULTS
0,0
0,0
1,1
1,1
NULL,NULL
NULL,NULL
---- TYPES
INT, INT
====
---- QUERY
# IMPALA-11323: Constant expressions should get filtered out so they
# don't get placed in the select node in the planner.
with t as (select 1 a), v as
(select distinct a, cast(null as smallint) b, cast(null as smallint) c from t)
select distinct a,b,c from v
union all
select distinct a,b,c from v;
---- RESULTS
1,NULL,NULL
1,NULL,NULL
---- TYPES
TINYINT, SMALLINT, SMALLINT
====
---- QUERY
# IMPALA-13262: Do not migrate an inferred predicate into an inline view if both sides of
# the inferred predicate reference the same TupleId.
# Note that the following are the rows in the inline view 'iv' that satisfy
# 'iv.id = iv.int_col'.
# If the predicate 'id = int_col' was pushed down to the scan node of the table
# 'functional.alltypes', Impala would have produced 0 row, which is the wrong result.
# +------+------+---------+------+
# | year | id | int_col | rn |
# +------+------+---------+------+
# | 2009 | 9 | 9 | 3641 |
# | 2009 | 8 | 8 | 3642 |
# | 2009 | 7 | 7 | 3643 |
# | 2009 | 6 | 6 | 3644 |
# | 2009 | 5 | 5 | 3645 |
# | 2009 | 4 | 4 | 3646 |
# | 2009 | 3 | 3 | 3647 |
# | 2009 | 2 | 2 | 3648 |
# | 2009 | 1 | 1 | 3649 |
# | 2009 | 0 | 0 | 3650 |
SELECT * FROM
(
SELECT year, id, int_col, ROW_NUMBER() over(PARTITION BY year ORDER BY id DESC) rn
FROM functional.alltypes
) iv
WHERE rn = 3650 AND iv.id = iv.int_col;
---- RESULTS
2009,0,0,3650
---- TYPES
INT, INT, INT, BIGINT
====
---- QUERY
# IMPALA-13262: This is a variant of the query above to show the rows satisfying
# 'rn = 3650' but not necessarily 'iv.id = iv.int_col'.
SELECT * FROM
(
SELECT year, id, int_col, ROW_NUMBER() over(PARTITION BY year ORDER BY id DESC) rn
FROM functional.alltypes
) iv
WHERE rn = 3650;
---- RESULTS: VERIFY_IS_EQUAL_SORTED
2009,0,0,3650
2010,3650,0,3650
---- TYPES
INT, INT, INT, BIGINT
====
---- QUERY
# IMPALA-13262: This is a variant of the query above to verify we won't get any row if
# we apply 'id = int_col' to the base table 'functional.alltypes' before the application
# of the analytic predicate 'rn = 3650'.
SELECT * FROM
(
SELECT year, id, int_col, ROW_NUMBER() over(PARTITION BY year ORDER BY id DESC) rn
FROM functional.alltypes WHERE id = int_col
) iv
WHERE rn = 3650;
---- RESULTS
---- TYPES
INT, INT, INT, BIGINT
====
---- QUERY
# IMPALA-13262: This is the query that shows the results of the inline view 'iv' above.
SELECT year, id, int_col, ROW_NUMBER() over(PARTITION BY year ORDER BY id DESC) rn
FROM functional.alltypes WHERE id = int_col;
---- RESULTS
2009,9,9,1
2009,8,8,2
2009,7,7,3
2009,6,6,4
2009,5,5,5
2009,4,4,6
2009,3,3,7
2009,2,2,8
2009,1,1,9
2009,0,0,10
---- TYPES
INT, INT, INT, BIGINT
====
---- QUERY
# IMPALA-13262: This query shows that the inferred predicate 'iv_1.id = iv_1.int_col'
# won't be migrated into the inline view 'iv_2' so that 'id = int_col' won't be pushed
# to the scan node of the table 'functional.alltypes'.
SELECT * FROM
(
SELECT year, id, int_col, ROW_NUMBER() over(PARTITION BY year ORDER BY id DESC) rn
FROM (SELECT * FROM functional.alltypes UNION ALL SELECT * FROM functional.alltypes) iv_2
) iv_1
WHERE rn > 7200 and iv_1.id = iv_1.int_col;
---- RESULTS
2009,9,9,7281
2009,9,9,7282
2009,8,8,7283
2009,8,8,7284
2009,7,7,7285
2009,7,7,7286
2009,6,6,7287
2009,6,6,7288
2009,5,5,7289
2009,5,5,7290
2009,4,4,7291
2009,4,4,7292
2009,3,3,7293
2009,3,3,7294
2009,2,2,7295
2009,2,2,7296
2009,1,1,7297
2009,1,1,7298
2009,0,0,7299
2009,0,0,7300
---- TYPES
INT, INT, INT, BIGINT
====