mirror of
https://github.com/apache/impala.git
synced 2026-02-01 03:00:22 -05:00
List all file formats that a HdfsScanNode needs to process in any fragment instance. It is possible that some file formats will not be needed in all fragment instances. This is a step towards sharing codegen between different impala backends. Using the file formats provided in the thrift file, a backend can codegen code for file formats that are not needed in its own process but are needed in other fragment instances running on other backends, and the resulting binary can be shared between multiple backends. Codegenning for file formats will be done based on the thrift message and not on what is needed for the actual backend. This leads to some extra work in case a file format is not needed for the current backend and codegen sharing is not available (at this point it is not implemented). However, the overall number of such cases is low. Also adding the file formats to the node's explain string at level 3. Testing: - Added tests to verify that the file formats are present in the explain string at level 3. Change-Id: Iad6b8271bd248983f327c07883a3bedf50f25b5d Reviewed-on: http://gerrit.cloudera.org:8080/16728 Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com> Reviewed-by: Csaba Ringhofer <csringhofer@cloudera.com>
91 lines
4.7 KiB
Plaintext
====
---- QUERY
# Explain a simple hash join query.
explain
select *
from tpch.lineitem join tpch.orders on l_orderkey = o_orderkey;
---- RESULTS: VERIFY_IS_EQUAL
row_regex:.*Max Per-Host Resource Reservation: Memory=[0-9.]*MB Threads=[0-9]*.*
row_regex:.*Per-Host Resource Estimates: Memory=[0-9.]*MB.*
'Analyzed query: SELECT * FROM tpch.lineitem INNER JOIN tpch.orders ON l_orderkey'
'= o_orderkey'
''
'F02:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1'
row_regex:.*Per-Host Resources: mem-estimate=[0-9.]*[A-Z]*B mem-reservation=[0-9.]*[A-Z]*B thread-reservation=1
' PLAN-ROOT SINK'
' | output exprs: tpch.lineitem.l_orderkey, tpch.lineitem.l_partkey, tpch.lineitem.l_suppkey, tpch.lineitem.l_linenumber, tpch.lineitem.l_quantity, tpch.lineitem.l_extendedprice, tpch.lineitem.l_discount, tpch.lineitem.l_tax, tpch.lineitem.l_returnflag, tpch.lineitem.l_linestatus, tpch.lineitem.l_shipdate, tpch.lineitem.l_commitdate, tpch.lineitem.l_receiptdate, tpch.lineitem.l_shipinstruct, tpch.lineitem.l_shipmode, tpch.lineitem.l_comment, tpch.orders.o_orderkey, tpch.orders.o_custkey, tpch.orders.o_orderstatus, tpch.orders.o_totalprice, tpch.orders.o_orderdate, tpch.orders.o_orderpriority, tpch.orders.o_clerk, tpch.orders.o_shippriority, tpch.orders.o_comment'
row_regex:.*mem-estimate=[0-9.]*[A-Z]*B mem-reservation=[0-9.]*[A-Z]*B thread-reservation=0
' |'
' 04:EXCHANGE [UNPARTITIONED]'
row_regex:.*mem-estimate=[0-9.]*[A-Z]*B mem-reservation=[0-9.]*[A-Z]*B thread-reservation=0
' tuple-ids=0,1 row-size=402B cardinality=5.76M'
' in pipelines: 00(GETNEXT)'
''
'F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=3'
row_regex:.*Per-Host Resources: mem-estimate=[0-9.]*[A-Z]*B mem-reservation=[0-9.]*[A-Z]*B thread-reservation=.*
' DATASTREAM SINK [FRAGMENT=F02, EXCHANGE=04, UNPARTITIONED]'
row_regex:.*mem-estimate=[0-9.]*[A-Z]*B mem-reservation=[0-9.]*[A-Z]*B thread-reservation=0
' 02:HASH JOIN [INNER JOIN, BROADCAST]'
' | hash predicates: l_orderkey = o_orderkey'
' | fk/pk conjuncts: l_orderkey = o_orderkey'
' | runtime filters: RF000[bloom] <- o_orderkey'
row_regex:.*mem-estimate=[0-9.]*[A-Z]*B mem-reservation=[0-9.]*[A-Z]*B spill-buffer=[0-9.]*MB thread-reservation=.*
' | tuple-ids=0,1 row-size=402B cardinality=5.76M'
' | in pipelines: 00(GETNEXT), 01(OPEN)'
' |'
' |--03:EXCHANGE [BROADCAST]'
row_regex:.*mem-estimate=[0-9.]*[A-Z]*B mem-reservation=[0-9.]*[A-Z]*B thread-reservation=0
' | tuple-ids=1 row-size=171B cardinality=1.50M'
' | in pipelines: 01(GETNEXT)'
' |'
' 00:SCAN HDFS [tpch.lineitem, RANDOM]'
row_regex:.*partitions=1/1 files=1 size=.*
' runtime filters: RF000[bloom] -> l_orderkey'
' stored statistics:'
row_regex:.*table: rows=[0-9.]*[A-Z]* size=.*
' columns: all'
row_regex:.*extrapolated-rows=disabled max-scan-range-rows=[0-9]*.*
' file formats: [TEXT]'
row_regex:.*mem-estimate=[0-9.]*[A-Z]*B mem-reservation=[0-9.]*[A-Z]*B thread-reservation=1.*
' tuple-ids=0 row-size=231B cardinality=6.00M'
' in pipelines: 00(GETNEXT)'
''
'F01:PLAN FRAGMENT [RANDOM] hosts=2 instances=2'
row_regex:.*Per-Host Resources: mem-estimate=[0-9.]*[A-Z]*B mem-reservation=[0-9.]*[A-Z]*B thread-reservation=.*
' DATASTREAM SINK [FRAGMENT=F00, EXCHANGE=03, BROADCAST]'
row_regex:.* | mem-estimate=[0-9.]*[A-Z]*B mem-reservation=[0-9.]*[A-Z]*B thread-reservation=0
' 01:SCAN HDFS [tpch.orders, RANDOM]'
row_regex:.*partitions=1/1 files=1 size=.*
' stored statistics:'
row_regex:.*table: rows=[0-9.]*[A-Z]* size=.*
' columns: all'
row_regex:.* extrapolated-rows=disabled max-scan-range-rows=[0-9]*.*
' file formats: [TEXT]'
row_regex:.* mem-estimate=[0-9.]*[A-Z]*B mem-reservation=[0-9.]*[A-Z]*B thread-reservation=.*
' tuple-ids=1 row-size=171B cardinality=1.50M'
' in pipelines: 01(GETNEXT)'
====
---- QUERY
# Tests the warning about missing table stats in the explain header.
explain select count(t1.int_col), avg(t2.float_col), sum(t3.bigint_col)
from functional_avro.alltypes t1
inner join functional_parquet.alltypessmall t2 on (t1.id = t2.id)
left outer join functional_avro.alltypes t3 on (t2.id = t3.id)
where t1.month = 1 and t2.year = 2009 and t3.bool_col = false
---- RESULTS: VERIFY_IS_SUBSET
'WARNING: The following tables are missing relevant table and/or column statistics.'
'functional_avro.alltypes, functional_parquet.alltypessmall'
====
---- QUERY
# Tests whether all file formats are present in the explain string.
explain select * from functional.alltypesmixedformat;
---- RESULTS: VERIFY_IS_SUBSET
' file formats: [PARQUET, RC_FILE, SEQUENCE_FILE, TEXT]'
====
---- QUERY
# Tests whether file formats are present in the explain string in case of Iceberg tables.
explain select * from functional_parquet.iceberg_partitioned;
---- RESULTS: VERIFY_IS_SUBSET
' file formats: [PARQUET]'
====