mirror of
https://github.com/apache/impala.git
synced 2025-12-31 06:02:51 -05:00
This patch converts the tpcds schemas to use decimal instead of float/double. Currently, Impala can only r/w decimal in text, therefore, the tables are constrained to text. The schemas were obtained from the official tpc spec: http://www.tpc.org/tpcds/spec/tpcds_1.1.0.pdf Change-Id: I1ef0113dcb48bad52af75ee93b47b08adf9e1a69 Reviewed-on: http://gerrit.ent.cloudera.com:8080/2403 Reviewed-by: Ishaan Joshi <ishaan@cloudera.com> Tested-by: jenkins
410 lines
12 KiB
SQL
410 lines
12 KiB
SQL
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
|
|
# For details on this file format please see hive-benchmark_schema_template.sql
|
|
====
|
|
---- DATASET
|
|
tpcds
|
|
---- BASE_TABLE_NAME
|
|
customer_demographics
|
|
---- COLUMNS
|
|
cd_demo_sk bigint
|
|
cd_gender string
|
|
cd_marital_status string
|
|
cd_education_status string
|
|
cd_purchase_estimate int
|
|
cd_credit_rating string
|
|
cd_dep_count int
|
|
cd_dep_employed_count int
|
|
cd_dep_college_count int
|
|
---- ROW_FORMAT
|
|
delimited fields terminated by '|'
|
|
---- DEPENDENT_LOAD
|
|
INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name}
|
|
SELECT * FROM {db_name}.{table_name};
|
|
---- LOAD
|
|
LOAD DATA LOCAL INPATH
|
|
'{impala_home}/testdata/impala-data/{db_name}/customer_demographics/'
|
|
OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
|
|
====
|
|
---- DATASET
|
|
tpcds
|
|
---- BASE_TABLE_NAME
|
|
date_dim
|
|
---- COLUMNS
|
|
d_date_sk bigint
|
|
d_date_id string
|
|
d_date string
|
|
d_month_seq int
|
|
d_week_seq int
|
|
d_quarter_seq int
|
|
d_year int
|
|
d_dow int
|
|
d_moy int
|
|
d_dom int
|
|
d_qoy int
|
|
d_fy_year int
|
|
d_fy_quarter_seq int
|
|
d_fy_week_seq int
|
|
d_day_name string
|
|
d_quarter_name string
|
|
d_holiday string
|
|
d_weekend string
|
|
d_following_holiday string
|
|
d_first_dom int
|
|
d_last_dom int
|
|
d_same_day_ly int
|
|
d_same_day_lq int
|
|
d_current_day string
|
|
d_current_week string
|
|
d_current_month string
|
|
d_current_quarter string
|
|
d_current_year string
|
|
---- ROW_FORMAT
|
|
delimited fields terminated by '|'
|
|
---- DEPENDENT_LOAD
|
|
INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name}
|
|
SELECT * FROM {db_name}.{table_name};
|
|
---- LOAD
|
|
LOAD DATA LOCAL INPATH '{impala_home}/testdata/impala-data/{db_name}/date_dim/'
|
|
OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
|
|
====
|
|
---- DATASET
|
|
tpcds
|
|
---- BASE_TABLE_NAME
|
|
time_dim
|
|
---- COLUMNS
|
|
t_time_sk bigint
|
|
t_time_id string
|
|
t_time int
|
|
t_hour int
|
|
t_minute int
|
|
t_second int
|
|
t_am_pm string
|
|
t_shift string
|
|
t_sub_shift string
|
|
t_meal_time string
|
|
---- ROW_FORMAT
|
|
delimited fields terminated by '|'
|
|
---- DEPENDENT_LOAD
|
|
INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name}
|
|
SELECT * FROM {db_name}.{table_name};
|
|
---- LOAD
|
|
LOAD DATA LOCAL INPATH '{impala_home}/testdata/impala-data/{db_name}/time_dim/'
|
|
OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
|
|
====
|
|
---- DATASET
|
|
tpcds
|
|
---- BASE_TABLE_NAME
|
|
item
|
|
---- COLUMNS
|
|
i_item_sk bigint
|
|
i_item_id string
|
|
i_rec_start_date string
|
|
i_rec_end_date string
|
|
i_item_desc string
|
|
i_current_price decimal(7,2)
|
|
i_wholesale_cost decimal(7,2)
|
|
i_brand_id int
|
|
i_brand string
|
|
i_class_id int
|
|
i_class string
|
|
i_category_id int
|
|
i_category string
|
|
i_manufact_id int
|
|
i_manufact string
|
|
i_size string
|
|
i_formulation string
|
|
i_color string
|
|
i_units string
|
|
i_container string
|
|
i_manager_id int
|
|
i_product_name string
|
|
---- ROW_FORMAT
|
|
delimited fields terminated by '|'
|
|
---- DEPENDENT_LOAD
|
|
INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name}
|
|
SELECT * FROM {db_name}.{table_name};
|
|
---- LOAD
|
|
LOAD DATA LOCAL INPATH '{impala_home}/testdata/impala-data/{db_name}/item/'
|
|
OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
|
|
====
|
|
---- DATASET
|
|
tpcds
|
|
---- BASE_TABLE_NAME
|
|
store
|
|
---- COLUMNS
|
|
s_store_sk bigint
|
|
s_store_id string
|
|
s_rec_start_date string
|
|
s_rec_end_date string
|
|
s_closed_date_sk int
|
|
s_store_name string
|
|
s_number_employees int
|
|
s_floor_space int
|
|
s_hours string
|
|
s_manager string
|
|
s_market_id int
|
|
s_geography_class string
|
|
s_market_desc string
|
|
s_market_manager string
|
|
s_division_id int
|
|
s_division_name string
|
|
s_company_id int
|
|
s_company_name string
|
|
s_street_number string
|
|
s_street_name string
|
|
s_street_type string
|
|
s_suite_number string
|
|
s_city string
|
|
s_county string
|
|
s_state string
|
|
s_zip string
|
|
s_country string
|
|
s_gmt_offset decimal(5,2)
|
|
s_tax_precentage decimal(5,2)
|
|
---- ROW_FORMAT
|
|
delimited fields terminated by '|'
|
|
---- DEPENDENT_LOAD
|
|
INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name}
|
|
SELECT * FROM {db_name}.{table_name};
|
|
---- LOAD
|
|
LOAD DATA LOCAL INPATH '{impala_home}/testdata/impala-data/{db_name}/store/'
|
|
OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
|
|
====
|
|
---- DATASET
|
|
tpcds
|
|
---- BASE_TABLE_NAME
|
|
customer
|
|
---- COLUMNS
|
|
c_customer_sk bigint
|
|
c_customer_id string
|
|
c_current_cdemo_sk int
|
|
c_current_hdemo_sk int
|
|
c_current_addr_sk int
|
|
c_first_shipto_date_sk int
|
|
c_first_sales_date_sk int
|
|
c_salutation string
|
|
c_first_name string
|
|
c_last_name string
|
|
c_preferred_cust_flag string
|
|
c_birth_day int
|
|
c_birth_month int
|
|
c_birth_year int
|
|
c_birth_country string
|
|
c_login string
|
|
c_email_address string
|
|
c_last_review_date string
|
|
---- ROW_FORMAT
|
|
delimited fields terminated by '|'
|
|
---- DEPENDENT_LOAD
|
|
INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name}
|
|
SELECT * FROM {db_name}.{table_name};
|
|
---- LOAD
|
|
LOAD DATA LOCAL INPATH '{impala_home}/testdata/impala-data/{db_name}/customer/'
|
|
OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
|
|
====
|
|
---- DATASET
|
|
tpcds
|
|
---- BASE_TABLE_NAME
|
|
promotion
|
|
---- COLUMNS
|
|
p_promo_sk bigint
|
|
p_promo_id string
|
|
p_start_date_sk int
|
|
p_end_date_sk int
|
|
p_item_sk int
|
|
p_cost decimal(15,2)
|
|
p_response_target int
|
|
p_promo_name string
|
|
p_channel_dmail string
|
|
p_channel_email string
|
|
p_channel_catalog string
|
|
p_channel_tv string
|
|
p_channel_radio string
|
|
p_channel_press string
|
|
p_channel_event string
|
|
p_channel_demo string
|
|
p_channel_details string
|
|
p_purpose string
|
|
p_discount_active string
|
|
---- ROW_FORMAT
|
|
delimited fields terminated by '|'
|
|
---- DEPENDENT_LOAD
|
|
INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name}
|
|
SELECT * FROM {db_name}.{table_name};
|
|
---- LOAD
|
|
LOAD DATA LOCAL INPATH '{impala_home}/testdata/impala-data/{db_name}/promotion/'
|
|
OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
|
|
====
|
|
---- DATASET
|
|
tpcds
|
|
---- BASE_TABLE_NAME
|
|
household_demographics
|
|
---- COLUMNS
|
|
hd_demo_sk bigint
|
|
hd_income_band_sk int
|
|
hd_buy_potential string
|
|
hd_dep_count int
|
|
hd_vehicle_count int
|
|
---- ROW_FORMAT
|
|
delimited fields terminated by '|'
|
|
---- DEPENDENT_LOAD
|
|
INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name}
|
|
SELECT * FROM {db_name}.{table_name};
|
|
---- LOAD
|
|
LOAD DATA LOCAL INPATH
|
|
'{impala_home}/testdata/impala-data/{db_name}/household_demographics/'
|
|
OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
|
|
====
|
|
---- DATASET
|
|
tpcds
|
|
---- BASE_TABLE_NAME
|
|
customer_address
|
|
---- COLUMNS
|
|
ca_address_sk bigint
|
|
ca_address_id string
|
|
ca_street_number string
|
|
ca_street_name string
|
|
ca_street_type string
|
|
ca_suite_number string
|
|
ca_city string
|
|
ca_county string
|
|
ca_state string
|
|
ca_zip string
|
|
ca_country string
|
|
ca_gmt_offset decimal(5,2)
|
|
ca_location_type string
|
|
---- ROW_FORMAT
|
|
delimited fields terminated by '|'
|
|
---- DEPENDENT_LOAD
|
|
INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name}
|
|
SELECT * FROM {db_name}.{table_name};
|
|
---- LOAD
|
|
LOAD DATA LOCAL INPATH '{impala_home}/testdata/impala-data/{db_name}/customer_address/'
|
|
OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
|
|
====
|
|
---- DATASET
|
|
tpcds
|
|
---- BASE_TABLE_NAME
|
|
store_sales_unpartitioned
|
|
---- COLUMNS
|
|
ss_sold_date_sk bigint
|
|
ss_sold_time_sk bigint
|
|
ss_item_sk bigint
|
|
ss_customer_sk bigint
|
|
ss_cdemo_sk bigint
|
|
ss_hdemo_sk bigint
|
|
ss_addr_sk bigint
|
|
ss_store_sk bigint
|
|
ss_promo_sk bigint
|
|
ss_ticket_number int
|
|
ss_quantity int
|
|
ss_wholesale_cost decimal(7,2)
|
|
ss_list_price decimal(7,2)
|
|
ss_sales_price decimal(7,2)
|
|
ss_ext_discount_amt decimal(7,2)
|
|
ss_ext_sales_price decimal(7,2)
|
|
ss_ext_wholesale_cost decimal(7,2)
|
|
ss_ext_list_price decimal(7,2)
|
|
ss_ext_tax decimal(7,2)
|
|
ss_coupon_amt decimal(7,2)
|
|
ss_net_paid decimal(7,2)
|
|
ss_net_paid_inc_tax decimal(7,2)
|
|
ss_net_profit decimal(7,2)
|
|
---- ROW_FORMAT
|
|
delimited fields terminated by '|'
|
|
---- DEPENDENT_LOAD
|
|
INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name}
|
|
SELECT * FROM {db_name}.{table_name};
|
|
---- LOAD
|
|
LOAD DATA LOCAL INPATH '{impala_home}/testdata/impala-data/{db_name}/store_sales/'
|
|
OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
|
|
====
|
|
---- DATASET
|
|
tpcds
|
|
---- BASE_TABLE_NAME
|
|
store_sales
|
|
---- COLUMNS
|
|
ss_sold_date_sk bigint
|
|
ss_sold_time_sk bigint
|
|
ss_item_sk bigint
|
|
ss_customer_sk bigint
|
|
ss_cdemo_sk bigint
|
|
ss_hdemo_sk bigint
|
|
ss_addr_sk bigint
|
|
ss_store_sk bigint
|
|
ss_promo_sk bigint
|
|
ss_ticket_number int
|
|
ss_quantity int
|
|
ss_wholesale_cost decimal(7,2)
|
|
ss_list_price decimal(7,2)
|
|
ss_sales_price decimal(7,2)
|
|
ss_ext_discount_amt decimal(7,2)
|
|
ss_ext_sales_price decimal(7,2)
|
|
ss_ext_wholesale_cost decimal(7,2)
|
|
ss_ext_list_price decimal(7,2)
|
|
ss_ext_tax decimal(7,2)
|
|
ss_coupon_amt decimal(7,2)
|
|
ss_net_paid decimal(7,2)
|
|
ss_net_paid_inc_tax decimal(7,2)
|
|
ss_net_profit decimal(7,2)
|
|
---- PARTITION_COLUMNS
|
|
ss_date string
|
|
---- ROW_FORMAT
|
|
delimited fields terminated by '|'
|
|
---- DEPENDENT_LOAD
|
|
-- Split the load into multiple steps to reduce total memory usage for larger
|
|
-- scale factors. TODO: Dynamically scale this based on the scale factor?
|
|
INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} PARTITION (ss_date)
|
|
SELECT * FROM {db_name}.{table_name}
|
|
WHERE ss_date <= '1999-03-29';
|
|
|
|
INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} PARTITION (ss_date)
|
|
SELECT * FROM {db_name}.{table_name}
|
|
WHERE ss_date > '1999-03-29' and ss_date <= '2000-06-21';
|
|
|
|
INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} PARTITION (ss_date)
|
|
SELECT * FROM {db_name}.{table_name}
|
|
WHERE ss_date > '2000-06-21' and ss_date <= '2001-09-15';
|
|
|
|
INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} PARTITION (ss_date)
|
|
SELECT * FROM {db_name}.{table_name}
|
|
WHERE ss_date > '2001-09-15';
|
|
---- LOAD
|
|
USE {db_name};
|
|
|
|
-- Disable auto.convert.join due to HIVE-5068
|
|
set hive.auto.convert.join=false;
|
|
set hive.exec.max.dynamic.partitions.pernode=10000;
|
|
set hive.exec.max.dynamic.partitions=10000;
|
|
set hive.exec.dynamic.partition.mode=nonstrict;
|
|
set hive.exec.dynamic.partition=true;
|
|
|
|
insert overwrite table {table_name} partition(ss_date)
|
|
select ss.*, d.d_date as ss_date
|
|
from date_dim d
|
|
join store_sales_unpartitioned ss
|
|
on (ss.ss_sold_date_sk = d.d_date_sk)
|
|
where ss.ss_sold_date_sk is not null
|
|
distribute by ss_date;
|
|
---- LOAD_LOCAL
|
|
USE {db_name};
|
|
|
|
-- Disable auto.convert.join due to HIVE-5068
|
|
set hive.auto.convert.join=false;
|
|
set hive.exec.max.dynamic.partitions.pernode=10000;
|
|
set hive.exec.max.dynamic.partitions=10000;
|
|
set hive.exec.dynamic.partition.mode=nonstrict;
|
|
set hive.exec.dynamic.partition=true;
|
|
|
|
insert overwrite table {table_name} partition(ss_date)
|
|
select ss.*, d.d_date as ss_date
|
|
from date_dim d
|
|
join store_sales_unpartitioned ss
|
|
on (ss.ss_sold_date_sk = d.d_date_sk)
|
|
where ss.ss_sold_date_sk is not null
|
|
-- The filter below on d_date is needed to reduce the number of partitions generated for
|
|
-- local testing. This filter reduces the number of partitions from ~1800 to 120.
|
|
and (d.d_date like '%-01' or d.d_date like '%-15')
|
|
distribute by ss_date;
|
|
====
|