Files
impala/testdata/workloads/functional-query/queries/QueryTest/virtual-column-input-file-name.test
Zoltan Borok-Nagy 23d09638de IMPALA-801, IMPALA-8011: Add INPUT__FILE__NAME virtual column for file name
Hive has a virtual column, INPUT__FILE__NAME, which returns the name of
the data file that stores the given row. It can be used in several ways;
see the two Jira tickets above for examples. This virtual column is also
needed to support position-based delete files in Iceberg V2 tables.

This patch also adds the foundations to support further table-level
virtual columns later. Virtual columns are stored at the table level
in a separate list from the table schema. During path resolution
in Path.resolve() we also try to resolve virtual columns. Slot
descriptors also store the information whether they refer to a virtual
column.

Currently we only add the INPUT__FILE__NAME virtual column. The value
of this column can be set in the template tuple of the scanners.

All kinds of operations are possible on this virtual column, users
can invoke additional functions on it, can filter rows, can group by,
etc.

Special care is needed for virtual columns when column masking/row
filtering is applicable to them. They are added as "hidden" select
list items to the table masking views, which means they are not
expanded by * expressions. They still need to be included in *
expressions, though, when they come from user-written views.

Testing:
 * analyzer tests
 * added e2e tests

Change-Id: I498591f1db08a91a5c846df59086d2291df4ff61
Reviewed-on: http://gerrit.cloudera.org:8080/18514
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
2022-06-08 13:02:52 +00:00

184 lines
12 KiB
Plaintext

====
---- QUERY
# Select INPUT__FILE__NAME together with every table column (via *)
select
  input__file__name,
  *
from alltypestiny;
---- RESULTS
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=1(/base_\d*/|/).*',0,true,0,0,0,0,0,0,'01/01/09','0',2009-01-01 00:00:00,2009,1
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=1(/base_\d*/|/).*',1,false,1,1,1,10,1.100000023841858,10.1,'01/01/09','1',2009-01-01 00:01:00,2009,1
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=2(/base_\d*/|/).*',2,true,0,0,0,0,0,0,'02/01/09','0',2009-02-01 00:00:00,2009,2
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=2(/base_\d*/|/).*',3,false,1,1,1,10,1.100000023841858,10.1,'02/01/09','1',2009-02-01 00:01:00,2009,2
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=3(/base_\d*/|/).*',4,true,0,0,0,0,0,0,'03/01/09','0',2009-03-01 00:00:00,2009,3
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=3(/base_\d*/|/).*',5,false,1,1,1,10,1.100000023841858,10.1,'03/01/09','1',2009-03-01 00:01:00,2009,3
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=4(/base_\d*/|/).*',6,true,0,0,0,0,0,0,'04/01/09','0',2009-04-01 00:00:00,2009,4
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=4(/base_\d*/|/).*',7,false,1,1,1,10,1.100000023841858,10.1,'04/01/09','1',2009-04-01 00:01:00,2009,4
---- TYPES
STRING, INT, BOOLEAN, TINYINT, SMALLINT, INT, BIGINT, FLOAT, DOUBLE, STRING, STRING, TIMESTAMP, INT, INT
====
---- QUERY
# Select INPUT__FILE__NAME alongside a non-clustering column (id)
select
  input__file__name,
  id
from alltypestiny;
---- RESULTS
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=1(/base_\d*/|/).*',0
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=1(/base_\d*/|/).*',1
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=2(/base_\d*/|/).*',2
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=2(/base_\d*/|/).*',3
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=3(/base_\d*/|/).*',4
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=3(/base_\d*/|/).*',5
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=4(/base_\d*/|/).*',6
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=4(/base_\d*/|/).*',7
---- TYPES
STRING, INT
====
---- QUERY
# Select INPUT__FILE__NAME alongside a clustering column (month)
select
  input__file__name,
  month
from alltypestiny;
---- RESULTS
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=1(/base_\d*/|/).*',1
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=1(/base_\d*/|/).*',1
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=2(/base_\d*/|/).*',2
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=2(/base_\d*/|/).*',2
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=3(/base_\d*/|/).*',3
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=3(/base_\d*/|/).*',3
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=4(/base_\d*/|/).*',4
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=4(/base_\d*/|/).*',4
---- TYPES
STRING, INT
====
---- QUERY
# Select only the INPUT__FILE__NAME virtual column
select
  input__file__name
from alltypestiny;
---- RESULTS
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=1(/base_\d*/|/).*'
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=1(/base_\d*/|/).*'
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=2(/base_\d*/|/).*'
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=2(/base_\d*/|/).*'
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=3(/base_\d*/|/).*'
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=3(/base_\d*/|/).*'
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=4(/base_\d*/|/).*'
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=4(/base_\d*/|/).*'
---- TYPES
STRING
====
---- QUERY
# Select INPUT__FILE__NAME twice in the same select list
select
  input__file__name,
  input__file__name
from alltypestiny;
---- RESULTS
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=1(/base_\d*/|/).*',regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=1(/base_\d*/|/).*'
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=1(/base_\d*/|/).*',regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=1(/base_\d*/|/).*'
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=2(/base_\d*/|/).*',regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=2(/base_\d*/|/).*'
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=2(/base_\d*/|/).*',regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=2(/base_\d*/|/).*'
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=3(/base_\d*/|/).*',regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=3(/base_\d*/|/).*'
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=3(/base_\d*/|/).*',regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=3(/base_\d*/|/).*'
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=4(/base_\d*/|/).*',regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=4(/base_\d*/|/).*'
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=4(/base_\d*/|/).*',regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=4(/base_\d*/|/).*'
---- TYPES
STRING, STRING
====
---- QUERY
# Select INPUT__FILE__NAME from both sides of a two-table join on id
select
  att.input__file__name,
  att.id,
  ats.input__file__name
from alltypestiny att
  inner join alltypessmall ats
  on att.id = ats.id;
---- RESULTS
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*alltypestiny[^/]*/year=2009/month=1(/base_\d*/|/).*',0,regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*alltypessmall[^/]*/year=2009/month=1(/base_\d*/|/).*'
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*alltypestiny[^/]*/year=2009/month=1(/base_\d*/|/).*',1,regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*alltypessmall[^/]*/year=2009/month=1(/base_\d*/|/).*'
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*alltypestiny[^/]*/year=2009/month=2(/base_\d*/|/).*',2,regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*alltypessmall[^/]*/year=2009/month=1(/base_\d*/|/).*'
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*alltypestiny[^/]*/year=2009/month=2(/base_\d*/|/).*',3,regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*alltypessmall[^/]*/year=2009/month=1(/base_\d*/|/).*'
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*alltypestiny[^/]*/year=2009/month=3(/base_\d*/|/).*',4,regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*alltypessmall[^/]*/year=2009/month=1(/base_\d*/|/).*'
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*alltypestiny[^/]*/year=2009/month=3(/base_\d*/|/).*',5,regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*alltypessmall[^/]*/year=2009/month=1(/base_\d*/|/).*'
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*alltypestiny[^/]*/year=2009/month=4(/base_\d*/|/).*',6,regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*alltypessmall[^/]*/year=2009/month=1(/base_\d*/|/).*'
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*alltypestiny[^/]*/year=2009/month=4(/base_\d*/|/).*',7,regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*alltypessmall[^/]*/year=2009/month=1(/base_\d*/|/).*'
---- TYPES
STRING, INT, STRING
====
---- QUERY
# Count rows per data file by grouping on INPUT__FILE__NAME
select
  input__file__name,
  count(*)
from alltypes
group by
  input__file__name
order by
  input__file__name;
---- RESULTS
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=1(/base_\d*/|/).*',310
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=10(/base_\d*/|/).*',310
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=11(/base_\d*/|/).*',300
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=12(/base_\d*/|/).*',310
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=2(/base_\d*/|/).*',280
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=3(/base_\d*/|/).*',310
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=4(/base_\d*/|/).*',300
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=5(/base_\d*/|/).*',310
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=6(/base_\d*/|/).*',300
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=7(/base_\d*/|/).*',310
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=8(/base_\d*/|/).*',310
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=9(/base_\d*/|/).*',300
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2010/month=1(/base_\d*/|/).*',310
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2010/month=10(/base_\d*/|/).*',310
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2010/month=11(/base_\d*/|/).*',300
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2010/month=12(/base_\d*/|/).*',310
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2010/month=2(/base_\d*/|/).*',280
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2010/month=3(/base_\d*/|/).*',310
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2010/month=4(/base_\d*/|/).*',300
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2010/month=5(/base_\d*/|/).*',310
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2010/month=6(/base_\d*/|/).*',300
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2010/month=7(/base_\d*/|/).*',310
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2010/month=8(/base_\d*/|/).*',310
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2010/month=9(/base_\d*/|/).*',300
---- TYPES
STRING, BIGINT
====
---- QUERY
# Filter on INPUT__FILE__NAME with LIKE. The pattern ends in 'month=1%', so
# besides month=1 it also matches months 10, 11 and 12 of 2009.
select
  input__file__name,
  count(*)
from alltypes
where
  input__file__name like '%year=2009/month=1%'
group by
  input__file__name
order by
  input__file__name;
---- RESULTS
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=1(/base_\d*/|/).*',310
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=10(/base_\d*/|/).*',310
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=11(/base_\d*/|/).*',300
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2009/month=12(/base_\d*/|/).*',310
---- TYPES
STRING, BIGINT
====
---- QUERY
# Filter on INPUT__FILE__NAME with REGEXP_LIKE
select
  input__file__name,
  count(*)
from alltypes
where
  regexp_like(input__file__name, 'year=2010/month=2')
group by
  input__file__name
order by
  input__file__name;
---- RESULTS
regex:'$NAMENODE/test-warehouse(/managed/functional[^/]*)?/[^/]*/year=2010/month=2(/base_\d*/|/).*',280
---- TYPES
STRING, BIGINT
====
---- QUERY
# REGEXP_EXTRACT: pull the 'year=.../month=...' fragment out of the file name
select
  regexp_extract(input__file__name, 'year=\\d+/month=\\d+', 0)
from
  alltypestiny;
---- RESULTS
'year=2009/month=1'
'year=2009/month=1'
'year=2009/month=2'
'year=2009/month=2'
'year=2009/month=3'
'year=2009/month=3'
'year=2009/month=4'
'year=2009/month=4'
---- TYPES
STRING
====
---- QUERY
# REGEXP_REPLACE: rewrite the extracted 'year=Y/month=M' fragment as 'M/Y'
select
  regexp_replace(
    regexp_extract(input__file__name, 'year=\\d+/month=\\d+', 0),
    'year=(\\d+)/month=(\\d+)',
    '\\2/\\1')
from
  alltypestiny;
---- RESULTS
'1/2009'
'1/2009'
'2/2009'
'2/2009'
'3/2009'
'3/2009'
'4/2009'
'4/2009'
---- TYPES
STRING
====