Files
impala/testdata/workloads/functional-query/queries/QueryTest/distinct.test
Lenni Kuff 04edc8f534 Update benchmark tests to run against generic workload, data loading with scale factor, +more
This change updates the run-benchmark script to enable it to target one or more
workloads. Now benchmarks can be run like:

./run-benchmark --workloads=hive-benchmark,tpch

We look up the workload in the workloads directory, then read the associated
query .test files and start executing them.

To ensure the queries are not duplicated between benchmark and query tests, I
moved all existing queries (under fe/src/test/resources/*) to the workloads
directory. You do NOT need to look through all the .test files; I've just moved
them. The one new file is 'hive-benchmark.test', which contains the hive
benchmark queries.

Also added support for generating schema for different scale factors as well as
executing against these scale factors. For example, let's say we have a dataset
with a scale factor called "SF3". We would first generate the schema using:

./generate_schema_statements --workload=<workload> --scale_factor="SF3"
This will create tables whose names are distinct from those of the other scale factors.

Run the generated .sql file to load the data. Alternatively, the data can be loaded
by running a new python script:
./bin/load-data.py -w <workload1>,<workload2> -e <exploration strategy> -s [scale factor]
For example: ./bin/load-data.py -w tpch -e core -s SF3

Then run against this:
./run-benchmark --workloads=<workload> --scale_factor=SF3

This changeset also includes a few other minor tweaks to some of the test
scripts.

Change-Id: Ife8a8d91567d75c9612be37bec96c1e7780f50d6
2014-01-08 10:44:22 -08:00

125 lines
4.0 KiB
Plaintext

# distinct *
# De-duplicates whole rows (all 14 columns listed under TYPES) for id < 20.
# The expected output holds exactly one row per id in 0..19. Rows are listed in
# string-sorted order (id 1, 10, 11, ..., 19, 2, ...), not numeric order.
# NOTE(review): the query has no ORDER BY, so the harness presumably sorts rows
# before comparing them - confirm.
select distinct *
from alltypesagg$TABLE
where id < 20
---- TYPES
int, int, int, int, boolean, tinyint, smallint, int, bigint, float, double, string, string, timestamp
---- RESULTS
2010,1,1,0,true,NULL,NULL,NULL,NULL,NULL,NULL,'01/01/10','0',2010-01-01 00:00:00
2010,1,1,1,false,1,1,1,10,1.100000023841858,10.1,'01/01/10','1',2010-01-01 00:01:00
2010,1,1,10,true,NULL,10,10,100,11,101,'01/01/10','10',2010-01-01 00:10:00.450000000
2010,1,1,11,false,1,11,11,110,12.10000038146973,111.1,'01/01/10','11',2010-01-01 00:11:00.550000000
2010,1,1,12,true,2,12,12,120,13.19999980926514,121.2,'01/01/10','12',2010-01-01 00:12:00.660000000
2010,1,1,13,false,3,13,13,130,14.30000019073486,131.3,'01/01/10','13',2010-01-01 00:13:00.780000000
2010,1,1,14,true,4,14,14,140,15.39999961853027,141.4,'01/01/10','14',2010-01-01 00:14:00.910000000
2010,1,1,15,false,5,15,15,150,16.5,151.5,'01/01/10','15',2010-01-01 00:15:01.500000000
2010,1,1,16,true,6,16,16,160,17.60000038146973,161.6,'01/01/10','16',2010-01-01 00:16:01.200000000
2010,1,1,17,false,7,17,17,170,18.70000076293945,171.7,'01/01/10','17',2010-01-01 00:17:01.360000000
2010,1,1,18,true,8,18,18,180,19.79999923706055,181.8,'01/01/10','18',2010-01-01 00:18:01.530000000
2010,1,1,19,false,9,19,19,190,20.89999961853027,191.9,'01/01/10','19',2010-01-01 00:19:01.710000000
2010,1,1,2,true,2,2,2,20,2.200000047683716,20.2,'01/01/10','2',2010-01-01 00:02:00.100000000
2010,1,1,3,false,3,3,3,30,3.299999952316284,30.3,'01/01/10','3',2010-01-01 00:03:00.300000000
2010,1,1,4,true,4,4,4,40,4.400000095367432,40.4,'01/01/10','4',2010-01-01 00:04:00.600000000
2010,1,1,5,false,5,5,5,50,5.5,50.5,'01/01/10','5',2010-01-01 00:05:00.100000000
2010,1,1,6,true,6,6,6,60,6.599999904632568,60.6,'01/01/10','6',2010-01-01 00:06:00.150000000
2010,1,1,7,false,7,7,7,70,7.699999809265137,70.7,'01/01/10','7',2010-01-01 00:07:00.210000000
2010,1,1,8,true,8,8,8,80,8.800000190734863,80.8,'01/01/10','8',2010-01-01 00:08:00.280000000
2010,1,1,9,false,9,9,9,90,9.899999618530273,90.90000000000001,'01/01/10','9',2010-01-01 00:09:00.360000000
====
# distinct w/ explicit select list
# No WHERE clause: de-duplicates (bool_col, tinyint_col) pairs over the whole
# table. The expected output includes the pair (true, NULL), so a NULL
# tinyint_col is reported as its own distinct value.
select distinct bool_col, tinyint_col
from alltypesagg$TABLE
---- TYPES
boolean, tinyint
---- RESULTS
false,1
false,3
false,5
false,7
false,9
true,2
true,4
true,6
true,8
true,NULL
====
# count(distinct)
# Multi-column form without grouping: counts distinct
# (smallint_col, tinyint_col) tuples over the whole table and expects a single
# bigint row.
select count(distinct smallint_col, tinyint_col)
from alltypesagg$TABLE
---- TYPES
bigint
---- RESULTS
100
====
# count(distinct) w/ grouping and non-distinct count()
# Groups by select-list ordinal 1 (tinyint_col) and mixes a distinct and a
# non-distinct aggregate over the same column (smallint_col).
# The NULL tinyint_col group is reported as its own group; its non-distinct
# count is 900 rather than 1000, i.e. count(smallint_col) skips NULL inputs.
select tinyint_col, count(distinct smallint_col), count(smallint_col)
from alltypesagg$TABLE group by 1
---- TYPES
tinyint, bigint, bigint
---- RESULTS
1,10,1000
2,10,1000
3,10,1000
4,10,1000
5,10,1000
6,10,1000
7,10,1000
8,10,1000
9,10,1000
NULL,10,900
====
# count(distinct) over multiple columns w/ grouping and non-distinct count()
# Same shape as the single-column grouped case, but the distinct aggregate
# takes the tuple (int_col, smallint_col). Groups by select-list ordinal 1
# (tinyint_col).
select tinyint_col, count(distinct int_col, smallint_col), count(smallint_col)
from alltypesagg$TABLE group by 1
---- TYPES
tinyint, bigint, bigint
---- RESULTS
1,100,1000
2,100,1000
3,100,1000
4,100,1000
5,100,1000
6,100,1000
7,100,1000
8,100,1000
9,100,1000
NULL,100,900
====
# count(distinct) and sum(distinct) w/ grouping and non-distinct count()
# Two distinct aggregates over the same column (smallint_col) combined with a
# non-distinct count(); groups by select-list ordinal 1 (tinyint_col).
# NOTE(review): in the NULL group count(smallint_col) is 900 (< 1000), so
# smallint_col contains NULLs there, yet count(distinct smallint_col) is 10
# like every other group. Verify whether that 10 includes NULL as a distinct
# value; standard SQL COUNT(DISTINCT ...) skips NULLs - confirm against the
# engine before relying on this row.
select tinyint_col, count(distinct smallint_col), sum(distinct smallint_col),
count(smallint_col)
from alltypesagg$TABLE group by 1
---- TYPES
tinyint, bigint, bigint, bigint
---- RESULTS
1,10,460,1000
2,10,470,1000
3,10,480,1000
4,10,490,1000
5,10,500,1000
6,10,510,1000
7,10,520,1000
8,10,530,1000
9,10,540,1000
NULL,10,450,900
====
# count(distinct) and sum(distinct) w/ grouping; distinct in min() and max()
# is ignored, since removing duplicates cannot change a minimum or maximum.
# min(distinct int_col) and max(distinct float_col) keep their input column
# types (int, float) in the TYPES section. Groups by select-list ordinal 1
# (tinyint_col).
select tinyint_col, count(distinct smallint_col), sum(distinct smallint_col),
count(smallint_col), min(distinct int_col), max(distinct float_col)
from alltypesagg$TABLE group by 1
---- TYPES
tinyint, bigint, bigint, bigint, int, float
---- RESULTS
1,10,460,1000,1,1090.099975585938
2,10,470,1000,2,1091.199951171875
3,10,480,1000,3,1092.300048828125
4,10,490,1000,4,1093.400024414062
5,10,500,1000,5,1094.5
6,10,510,1000,6,1095.599975585938
7,10,520,1000,7,1096.699951171875
8,10,530,1000,8,1097.800048828125
9,10,540,1000,9,1098.900024414062
NULL,10,450,900,10,1089
====