Files
impala/testdata/workloads/functional-query/queries/QueryTest/subquery.test
Lenni Kuff 04edc8f534 Update benchmark tests to run against generic workload, data loading with scale factor, +more
This change updates the run-benchmark script to enable it to target one or more
workloads. Now benchmarks can be run like:

./run-benchmark --workloads=hive-benchmark,tpch

We lookup the workload in the workloads directory, then read the associated
query .test files and start executing them.

To ensure the queries are not duplicated between benchmark and query tests, I
moved all existing queries (under fe/src/test/resources/* to the workloads
directory. You do NOT need to look through all the .test files, I've just moved
them. The one new file is the 'hive-benchmark.test' which contains the hive
benchmark queries.

Also added support for generating schema for different scale factors as well as
executing against these scale factors. For example, let's say we have a dataset
with a scale factor called "SF1". We would first generate the schema using:

./generate_schema_statements --workload=<workload> --scale_factor="SF3"
This will create tables with a unique names from the other scale factors.

Run the generated .sql file to load the data. Alternatively, the data can loaded
by running a new python script:
./bin/load-data.py -w <workload1>,<workload2> -e <exploration strategy> -s [scale factor]
For example: load-data.sh -w tpch -e core -s SF3

Then run against this:
./run-benchmark --workloads=<workload> --scale_factor=SF3

This changeset also includes a few other minor tweaks to some of the test
scripts.

Change-Id: Ife8a8d91567d75c9612be37bec96c1e7780f50d6
2014-01-08 10:44:22 -08:00

438 lines
10 KiB
Plaintext

select *
from (
select y x
from (
select id y from hbasealltypessmall
) a
) b
---- TYPES
int
---- RESULTS
0
1
10
11
12
13
14
15
16
17
18
19
2
20
21
22
23
24
25
26
27
28
29
3
30
31
32
33
34
35
36
37
38
39
4
40
41
42
43
44
45
46
47
48
49
5
50
51
52
53
54
55
56
57
58
59
6
60
61
62
63
64
65
66
67
68
69
7
70
71
72
73
74
75
76
77
78
79
8
80
81
82
83
84
85
86
87
88
89
9
90
91
92
93
94
95
96
97
98
99
====
# subquery with predicate inside
select *
from (
select * from hbasealltypessmall where string_col = '4'
) a
---- TYPES
int, boolean, double, float, bigint, int, smallint, tinyint, string, string, timestamp
---- RESULTS
14,true,40.4,4.400000095367432,40,4,4,4,'01/02/09','4',2009-01-02 00:14:00.510000000
24,true,40.4,4.400000095367432,40,4,4,4,'01/03/09','4',2009-01-03 00:24:00.960000000
29,false,40.4,4.400000095367432,40,4,4,4,'02/01/09','4',2009-02-01 00:04:00.600000000
39,false,40.4,4.400000095367432,40,4,4,4,'02/02/09','4',2009-02-02 00:14:00.510000000
4,true,40.4,4.400000095367432,40,4,4,4,'01/01/09','4',2009-01-01 00:04:00.600000000
49,false,40.4,4.400000095367432,40,4,4,4,'02/03/09','4',2009-02-03 00:24:00.960000000
54,true,40.4,4.400000095367432,40,4,4,4,'03/01/09','4',2009-03-01 00:04:00.600000000
64,true,40.4,4.400000095367432,40,4,4,4,'03/02/09','4',2009-03-02 00:14:00.510000000
74,true,40.4,4.400000095367432,40,4,4,4,'03/03/09','4',2009-03-03 00:24:00.960000000
79,false,40.4,4.400000095367432,40,4,4,4,'04/01/09','4',2009-04-01 00:04:00.600000000
89,false,40.4,4.400000095367432,40,4,4,4,'04/02/09','4',2009-04-02 00:14:00.510000000
99,false,40.4,4.400000095367432,40,4,4,4,'04/03/09','4',2009-04-03 00:24:00.960000000
====
# subquery with predicate push down
select *
from (
select *
from (
select * from hbasealltypessmall
) x
) y
where string_col = '4'
---- TYPES
int, boolean, double, float, bigint, int, smallint, tinyint, string, string, timestamp
---- RESULTS
14,true,40.4,4.400000095367432,40,4,4,4,'01/02/09','4',2009-01-02 00:14:00.510000000
24,true,40.4,4.400000095367432,40,4,4,4,'01/03/09','4',2009-01-03 00:24:00.960000000
29,false,40.4,4.400000095367432,40,4,4,4,'02/01/09','4',2009-02-01 00:04:00.600000000
39,false,40.4,4.400000095367432,40,4,4,4,'02/02/09','4',2009-02-02 00:14:00.510000000
4,true,40.4,4.400000095367432,40,4,4,4,'01/01/09','4',2009-01-01 00:04:00.600000000
49,false,40.4,4.400000095367432,40,4,4,4,'02/03/09','4',2009-02-03 00:24:00.960000000
54,true,40.4,4.400000095367432,40,4,4,4,'03/01/09','4',2009-03-01 00:04:00.600000000
64,true,40.4,4.400000095367432,40,4,4,4,'03/02/09','4',2009-03-02 00:14:00.510000000
74,true,40.4,4.400000095367432,40,4,4,4,'03/03/09','4',2009-03-03 00:24:00.960000000
79,false,40.4,4.400000095367432,40,4,4,4,'04/01/09','4',2009-04-01 00:04:00.600000000
89,false,40.4,4.400000095367432,40,4,4,4,'04/02/09','4',2009-04-02 00:14:00.510000000
99,false,40.4,4.400000095367432,40,4,4,4,'04/03/09','4',2009-04-03 00:24:00.960000000
====
# join between three tables, extra join predicates, extra scan predicates, nulls in joins
# cols
# (alltypesagg.tinyint_col contains nulls instead of 0s)
select x.smallint_col, x.id, x.tinyint_col, c.id, x.int_col, x.float_col, c.string_col
from (
select a.smallint_col smallint_col, a.tinyint_col tinyint_col, a.day day,
a.int_col int_col, a.month month,
b.float_col float_col, b.id id
from (
select *
from alltypesagg$TABLE a
where month=1
) a
join alltypessmall$TABLE b
on (a.smallint_col = b.id)
) x
join alltypessmall$TABLE c on (x.tinyint_col = c.id)
where x.day=1
and x.int_col > 899
and x.float_col > 4.5
and c.string_col < '7'
and x.int_col + x.float_col + c.string_col < 1000
---- TYPES
smallint, int, tinyint, int, int, float, string
---- RESULTS
15,15,5,5,915,5.5,'5'
16,16,6,6,916,6.599999904632568,'6'
31,31,1,1,931,6.599999904632568,'1'
32,32,2,2,932,7.699999809265137,'2'
33,33,3,3,933,8.800000190734863,'3'
34,34,4,4,934,9.899999618530273,'4'
41,41,1,1,941,6.599999904632568,'1'
42,42,2,2,942,7.699999809265137,'2'
43,43,3,3,943,8.800000190734863,'3'
44,44,4,4,944,9.899999618530273,'4'
5,5,5,5,905,5.5,'5'
55,55,5,5,955,5.5,'5'
56,56,6,6,956,6.599999904632568,'6'
6,6,6,6,906,6.599999904632568,'6'
65,65,5,5,965,5.5,'5'
66,66,6,6,966,6.599999904632568,'6'
81,81,1,1,981,6.599999904632568,'1'
82,82,2,2,982,7.699999809265137,'2'
83,83,3,3,983,8.800000190734863,'3'
84,84,4,4,984,9.899999618530273,'4'
91,91,1,1,991,6.599999904632568,'1'
====
# Same join as above, but subquery on the RHS
select x.smallint_col, x.id, x.tinyint_col, c.id, x.int_col, x.float_col, c.string_col
from alltypessmall$TABLE c
join
(
select a.smallint_col smallint_col, a.tinyint_col tinyint_col, a.day day,
a.int_col int_col, a.month month,
b.float_col float_col, b.id id
from alltypessmall$TABLE b
join
(
select *
from alltypesagg$TABLE a
where month=1
) a
on (a.smallint_col = b.id)
) x
on (x.tinyint_col = c.id)
where x.day=1
and x.int_col > 899
and x.float_col > 4.5
and c.string_col < '7'
and x.int_col + x.float_col + c.string_col < 1000
---- TYPES
smallint, int, tinyint, int, int, float, string
---- RESULTS
15,15,5,5,915,5.5,'5'
16,16,6,6,916,6.599999904632568,'6'
31,31,1,1,931,6.599999904632568,'1'
32,32,2,2,932,7.699999809265137,'2'
33,33,3,3,933,8.800000190734863,'3'
34,34,4,4,934,9.899999618530273,'4'
41,41,1,1,941,6.599999904632568,'1'
42,42,2,2,942,7.699999809265137,'2'
43,43,3,3,943,8.800000190734863,'3'
44,44,4,4,944,9.899999618530273,'4'
5,5,5,5,905,5.5,'5'
55,55,5,5,955,5.5,'5'
56,56,6,6,956,6.599999904632568,'6'
6,6,6,6,906,6.599999904632568,'6'
65,65,5,5,965,5.5,'5'
66,66,6,6,966,6.599999904632568,'6'
81,81,1,1,981,6.599999904632568,'1'
82,82,2,2,982,7.699999809265137,'2'
83,83,3,3,983,8.800000190734863,'3'
84,84,4,4,984,9.899999618530273,'4'
91,91,1,1,991,6.599999904632568,'1'
====
# aggregate without group by
select *
from (
select count(*), count(tinyint_col), min(tinyint_col), max(tinyint_col),
sum(tinyint_col), avg(tinyint_col)
from (
select * from alltypesagg$TABLE
) a
) b
---- TYPES
bigint, bigint, tinyint, tinyint, bigint, double
---- RESULTS
10000,9000,1,9,45000,5
====
# aggregate with group-by, having
select *
from (
select int_col % 7 c1, count(*) c2, avg(int_col) c3
from (
select * from alltypesagg$TABLE
) a
group by 1
having avg(int_col) > 500 or count(*) = 10
) b
where c1 is not null
and c2 > 10
---- TYPES
int, bigint, double
---- RESULTS
0,1420,500.5
4,1430,501
5,1430,502
====
#
# So, we've this test to test multiple level of aggregate instead.
select c1, c3, m2
from (
select c1, c3, max(c2) m2
from (
select c1, c2, c3
from (
select int_col c1, tinyint_col c2, max(id) c3
from hbasealltypessmall
group by 1, 2
order by 1,2
limit 5
) x
) x2
group by c1, c3
limit 10
) t
where c1 > 0
order by 2, 1 desc
limit 3
---- TYPES
int, int, tinyint
---- RESULTS
1,96,1
2,97,2
3,98,3
====
#Do not materialize the agg expr slot
select c1, c2
from (
select int_col c1, tinyint_col c2, min(float_col) c3
from hbasealltypessmall
group by 1, 2
) x
---- TYPES
int, tinyint
---- RESULTS
0,0
1,1
2,2
3,3
4,4
5,5
6,6
7,7
8,8
9,9
====
select distinct *
from (
select bool_col, tinyint_col, count(*)
from alltypesagg$TABLE
group by bool_col, tinyint_col
having bool_col = true
) x
where tinyint_col < 6
---- TYPES
boolean, tinyint, bigint
---- RESULTS
true,2,1000
true,4,1000
====
# distinct w/ explicit select list
select *
from (
select distinct bool_col, tinyint_col
from (
select * from alltypesagg$TABLE where tinyint_col < 7
) y
) x
where bool_col = true
---- TYPES
boolean, tinyint
---- RESULTS
true,2
true,4
true,6
====
# semi-join on string
select *
from (
select d.*
from DimTbl d left semi join JoinTbl j on (j.test_name = d.name)
) x
where x.name > 'Name1'
---- TYPES
bigint, string, int
---- RESULTS
1002,'Name2',94611
1003,'Name3',94612
1004,'Name4',94612
1005,'Name5',94613
1006,'Name6',94613
====
select j.*, d.*
from (
select *
from JoinTbl a
) j
left outer join
(
select *
from DimTbl b
) d
on (j.test_name = d.name)
where j.test_id <= 1006
---- TYPES
bigint, string, int, int, bigint, string, int
---- RESULTS
1001,'Name1',94611,5000,1001,'Name1',94611
1002,'Name2',94611,5000,1002,'Name2',94611
1003,'Name3',94611,5000,1003,'Name3',94612
1004,'Name4',94611,5000,1004,'Name4',94612
1005,'Name5',94611,5000,1005,'Name5',94613
1006,'Name16',94612,15000,NULL,'NULL',NULL
1006,'Name16',94612,5000,NULL,'NULL',NULL
1006,'Name16',94616,15000,NULL,'NULL',NULL
1006,'Name16',94616,5000,NULL,'NULL',NULL
1006,'Name6',94616,15000,1006,'Name6',94613
1006,'Name6',94616,5000,1006,'Name6',94613
====
# TODO: If we apply predicate on d, the result will be incorrect. This is a general
# predicate evaluation issue.
#
select j.*, d.*
from (
select *
from JoinTbl a
) j
left outer join
(
select *
from DimTbl b
) d
on (j.test_name = d.name)
where j.test_id <= 1006
---- TYPES
bigint, string, int, int, bigint, string, int
---- RESULTS
1001,'Name1',94611,5000,1001,'Name1',94611
1002,'Name2',94611,5000,1002,'Name2',94611
1003,'Name3',94611,5000,1003,'Name3',94612
1004,'Name4',94611,5000,1004,'Name4',94612
1005,'Name5',94611,5000,1005,'Name5',94613
1006,'Name16',94612,15000,NULL,'NULL',NULL
1006,'Name16',94612,5000,NULL,'NULL',NULL
1006,'Name16',94616,15000,NULL,'NULL',NULL
1006,'Name16',94616,5000,NULL,'NULL',NULL
1006,'Name6',94616,15000,1006,'Name6',94613
1006,'Name6',94616,5000,1006,'Name6',94613
====