Files
impala/testdata/workloads/functional-query/queries/QueryTest/hdfs_scanner_profile.test
aphadke f87da848f5 IMPALA-4863/IMPALA-5311: Correctly account the file type and compression codec
If a scan range is skipped at runtime, the scan node never reads the
range and therefore never determines the compression codec used to
compress the file. In that case the compression codec defaults to NONE,
which can be misleading. This change marks such files as skipped in the
scan node profile.

e.g. File Formats: TEXT/NONE:364 TEXT/NONE(Skipped):1460

Change-Id: I797916505f62e568f4159e07099481b8ff571da2
Reviewed-on: http://gerrit.cloudera.org:8080/7245
Reviewed-by: Tim Armstrong <tarmstrong@cloudera.com>
Tested-by: Tim Armstrong <tarmstrong@cloudera.com>
2017-09-21 17:38:08 +00:00

35 lines
1.2 KiB
Plaintext

====
---- QUERY
# Full scan of alltypesagg. The runtime profile must contain a RowsRead
# counter of 11.00K for the scan.
# The '.' in "11.00K" is escaped so that it only matches a literal dot.
select * from alltypesagg
---- RUNTIME_PROFILE
row_regex: .*RowsRead: 11\.00K .
====
---- QUERY
# count(*) over the Parquet store_sales table. When the compression codec
# of a scan range cannot be determined, the profile is expected to report
# that range as PARQUET/Unknown(Skipped).
select count(*)
from tpcds_parquet.store_sales
---- RUNTIME_PROFILE
row_regex: .*File Formats: PARQUET/Unknown\(Skipped\):.*
====
---- QUERY
# Verifies that when a Parquet scan range is eliminated by a runtime
# filter, it is reported as skipped with an unknown compression codec,
# alongside the PARQUET/NONE entry for the ranges that were read.
# NOTE(review): the large runtime_filter_wait_time_ms presumably makes the
# scan wait for the filter so that ranges are reliably skipped - confirm.
set runtime_filter_wait_time_ms=500000;
select count(*)
from tpcds_parquet.store_sales
join tpcds_parquet.date_dim
  on ss_sold_date_sk = d_date_sk
where d_qoy = 1
---- RUNTIME_PROFILE
row_regex: .*File Formats: PARQUET/NONE:.* PARQUET/Unknown\(Skipped\):.*
====
---- QUERY
# Verifies that when a text scan range is eliminated by a runtime filter,
# the profile reports it as TEXT/NONE(Skipped) next to the TEXT/NONE entry
# for the ranges that were read.
set runtime_filter_wait_time_ms=100000;
select count(*)
from tpcds.store_sales
join tpcds.date_dim
  on ss_sold_date_sk = d_date_sk
where d_qoy = 1
---- RUNTIME_PROFILE
row_regex: .*File Formats: TEXT/NONE:.* TEXT/NONE\(Skipped\):.*
====