mirror of
https://github.com/apache/impala.git
synced 2026-01-07 09:02:19 -05:00
This adds most of the Hive TPCH queries into the functional Impala tests. This code review doesn't actually include the TPCH data. The data set is relatively large. Instead I updated scripts to copy the data from a data host. This change has a few parts: 1) Update the benchmark schema generation/test vector generation to be more generic. This way we can use the same schema creation/data loading steps for TPCH as we do for benchmark tests. 2) Add in schema template for the TPCH workload along with test vectors and dimensions which are used for schema generation. 3) Add in a new test file for each TPC-H query. The Hive TPCH work broke down the queries to generate some "temp" tables, then execute using joins/selects from these temp tables. Since creating the temp tables does some real work it is good to execute these via Impala. Each test a) Runs all the Insert statements to generate the temp tables b) runs the additional TPCH queries 4) Updated all the TPCH insert statements and queries to be parameterized on $TABLE name. This way we can run the tests across all combinations of file format/compression/etc. 5) Updated data loading Change-Id: I6891acc4c7464eaf1dc7dbbb532ddbeb6c259bab
115 lines
5.6 KiB
SQL
115 lines
5.6 KiB
SQL
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
|
|
# This file is used to define schema templates for generating and loading data for
|
|
# Impala tests. The goal is to provide a single place to define a table + data files
|
|
# and have the schema and data load statements generated for each combination of file
|
|
# format, compression, etc. The way this works is by specifying how to create a
|
|
# 'base table'. The base table can be used to generate tables in other file formats
|
|
# by performing the defined INSERT / SELECT INTO statement. Each new table using the
|
|
# file format/compression combination needs to have a unique name, so all the
|
|
# statements are parameterized on table name.
|
|
# This file is read in by the 'generate_schema_statements.py' script to
|
|
# generate all the schema for the Impala benchmark tests.
|
|
#
|
|
# Each table is defined as a new section in this file with the following format:
|
|
# === <- Start new section
|
|
# Data set name - Used to group sets of tables together
|
|
# --- <- End sub-section
|
|
# Base table name
|
|
# --- <- End sub-section
|
|
# DROP TABLE statement
|
|
# CREATE TABLE statement - Statement to drop and create a table
|
|
# --- <- End sub-section
|
|
# INSERT/SELECT * - The INSERT/SELECT * command for loading from the base table
|
|
# --- <- End sub-section
|
|
# LOAD from LOCAL - How to load data for the base table
|
|
# === <- End section
|
|
===
|
|
grep1gb
|
|
---
|
|
grep1gb
|
|
---
|
|
DROP TABLE %(table_name)s;
|
|
CREATE EXTERNAL TABLE %(table_name)s (field string) partitioned by (chunk int) stored as %(file_format)s
|
|
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
|
|
ALTER TABLE %(table_name)s ADD PARTITION (chunk=0);
|
|
ALTER TABLE %(table_name)s ADD PARTITION (chunk=1);
|
|
ALTER TABLE %(table_name)s ADD PARTITION (chunk=2);
|
|
ALTER TABLE %(table_name)s ADD PARTITION (chunk=3);
|
|
ALTER TABLE %(table_name)s ADD PARTITION (chunk=4);
|
|
ALTER TABLE %(table_name)s ADD PARTITION (chunk=5);
|
|
---
|
|
set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
|
|
FROM %(base_table_name)s INSERT OVERWRITE TABLE %(table_name)s PARTITION(chunk) SELECT *;
|
|
---
|
|
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/grep1GB/part-00000' OVERWRITE INTO TABLE %(table_name)s PARTITION(chunk=0);
|
|
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/grep1GB/part-00001' OVERWRITE INTO TABLE %(table_name)s PARTITION(chunk=1);
|
|
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/grep1GB/part-00002' OVERWRITE INTO TABLE %(table_name)s PARTITION(chunk=2);
|
|
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/grep1GB/part-00003' OVERWRITE INTO TABLE %(table_name)s PARTITION(chunk=3);
|
|
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/grep1GB/part-00004' OVERWRITE INTO TABLE %(table_name)s PARTITION(chunk=4);
|
|
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/grep1GB/part-00005' OVERWRITE INTO TABLE %(table_name)s PARTITION(chunk=5);
|
|
===
|
|
grep10gb
|
|
---
|
|
grep10gb
|
|
---
|
|
DROP TABLE %(table_name)s;
|
|
CREATE EXTERNAL TABLE %(table_name)s (field string) partitioned by (chunk int) stored as %(file_format)s
|
|
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
|
|
ALTER TABLE %(table_name)s ADD PARTITION (chunk=0);
|
|
ALTER TABLE %(table_name)s ADD PARTITION (chunk=1);
|
|
ALTER TABLE %(table_name)s ADD PARTITION (chunk=2);
|
|
ALTER TABLE %(table_name)s ADD PARTITION (chunk=3);
|
|
ALTER TABLE %(table_name)s ADD PARTITION (chunk=4);
|
|
ALTER TABLE %(table_name)s ADD PARTITION (chunk=5);
|
|
---
|
|
set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
|
|
FROM %(base_table_name)s INSERT OVERWRITE TABLE %(table_name)s PARTITION(chunk) SELECT *;
|
|
---
|
|
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/grep10GB/part-00000' OVERWRITE INTO TABLE %(table_name)s PARTITION(chunk=0);
|
|
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/grep10GB/part-00001' OVERWRITE INTO TABLE %(table_name)s PARTITION(chunk=1);
|
|
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/grep10GB/part-00002' OVERWRITE INTO TABLE %(table_name)s PARTITION(chunk=2);
|
|
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/grep10GB/part-00003' OVERWRITE INTO TABLE %(table_name)s PARTITION(chunk=3);
|
|
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/grep10GB/part-00004' OVERWRITE INTO TABLE %(table_name)s PARTITION(chunk=4);
|
|
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/grep10GB/part-00005' OVERWRITE INTO TABLE %(table_name)s PARTITION(chunk=5);
|
|
===
|
|
web
|
|
---
|
|
rankings
|
|
---
|
|
DROP TABLE %(table_name)s;
|
|
CREATE EXTERNAL TABLE %(table_name)s (
|
|
pageRank int,
|
|
pageURL string,
|
|
avgDuration int)
|
|
row format delimited fields terminated by '|' stored as %(file_format)s
|
|
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s/Rankings.dat';
|
|
---
|
|
set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
|
|
FROM %(base_table_name)s INSERT OVERWRITE TABLE %(table_name)s SELECT *;
|
|
---
|
|
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/html1GB/Rankings.dat' OVERWRITE INTO TABLE %(table_name)s;
|
|
===
|
|
web
|
|
---
|
|
uservisits
|
|
---
|
|
DROP TABLE %(table_name)s;
|
|
CREATE EXTERNAL TABLE %(table_name)s (
|
|
sourceIP string,
|
|
destURL string,
|
|
visitDate string,
|
|
adRevenue float,
|
|
userAgent string,
|
|
cCode string,
|
|
lCode string,
|
|
sKeyword string,
|
|
avgTimeOnSite int)
|
|
row format delimited fields terminated by '|' stored as %(file_format)s
|
|
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s/UserVisits.dat';
|
|
---
|
|
set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
|
|
FROM %(base_table_name)s INSERT OVERWRITE TABLE %(table_name)s SELECT *;
|
|
---
|
|
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/html1GB/UserVisits.dat' OVERWRITE INTO TABLE %(table_name)s;
|
|
===
|