mirror of
https://github.com/apache/impala.git
synced 2025-12-25 02:03:09 -05:00
IMPALA-7165: [DOCS] Correct example for dynamic partition pruning
Change-Id: I44d1054f55d3dc7947ccf4c2ef440e506c41f963 Reviewed-on: http://gerrit.cloudera.org:8080/10703 Reviewed-by: Vuk Ercegovac <vercegovac@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
This commit is contained in:
committed by
Impala Public Jenkins
parent
b864112fa2
commit
d29d5a25e5
@@ -1870,42 +1870,44 @@ from length_demo;
|
||||
</p>
|
||||
|
||||
<codeblock id="simple_dpp_example">
|
||||
create table yy (s string) partitioned by (year int) stored as parquet;
|
||||
insert into yy partition (year) values ('1999', 1999), ('2000', 2000),
|
||||
('2001', 2001), ('2010',2010);
|
||||
compute stats yy;
|
||||
CREATE TABLE yy (s STRING) PARTITIONED BY (year INT);
|
||||
INSERT INTO yy PARTITION (year) VALUES ('1999', 1999), ('2000', 2000),
|
||||
('2001', 2001), ('2010', 2010), ('2018', 2018);
|
||||
COMPUTE STATS yy;
|
||||
|
||||
create table yy2 (s string) partitioned by (year int) stored as parquet;
|
||||
insert into yy2 partition (year) values ('1999', 1999), ('2000', 2000),
|
||||
('2001', 2001);
|
||||
compute stats yy2;
|
||||
CREATE TABLE yy2 (s STRING, year INT);
|
||||
INSERT INTO yy2 VALUES ('1999', 1999), ('2000', 2000), ('2001', 2001);
|
||||
COMPUTE STATS yy2;
|
||||
|
||||
-- The query reads an unknown number of partitions, whose key values are only
|
||||
-- known at run time. The 'runtime filters' lines show how the information about
|
||||
-- the partitions is calculated in query fragment 02, and then used in query
|
||||
-- fragment 00 to decide which partitions to skip.
|
||||
explain select s from yy2 where year in (select year from yy where year between 2000 and 2005);
|
||||
+----------------------------------------------------------+
|
||||
| Explain String |
|
||||
+----------------------------------------------------------+
|
||||
| Estimated Per-Host Requirements: Memory=16.00MB VCores=2 |
|
||||
| |
|
||||
| 04:EXCHANGE [UNPARTITIONED] |
|
||||
| | |
|
||||
| 02:HASH JOIN [LEFT SEMI JOIN, BROADCAST] |
|
||||
| | hash predicates: year = year |
|
||||
| | <b>runtime filters: RF000 <- year</b> |
|
||||
| | |
|
||||
| |--03:EXCHANGE [BROADCAST] |
|
||||
| | | |
|
||||
| | 01:SCAN HDFS [dpp.yy] |
|
||||
| | partitions=2/4 files=2 size=468B |
|
||||
| | |
|
||||
| 00:SCAN HDFS [dpp.yy2] |
|
||||
| partitions=2/3 files=2 size=468B |
|
||||
| <b>runtime filters: RF000 -> year</b> |
|
||||
+----------------------------------------------------------+
|
||||
-- The following query reads an unknown number of partitions, whose key values
|
||||
-- are only known at run time. The 'runtime filters' lines show how the
|
||||
-- information about the partitions is calculated in query fragment 02, and then
|
||||
-- used in query fragment 00 to decide which partitions to skip.
|
||||
|
||||
EXPLAIN SELECT s FROM yy WHERE year IN (SELECT year FROM yy2);
|
||||
+--------------------------------------------------------------------------+
|
||||
| PLAN-ROOT SINK |
|
||||
| | |
|
||||
| 04:EXCHANGE [UNPARTITIONED] |
|
||||
| | |
|
||||
| 02:HASH JOIN [LEFT SEMI JOIN, BROADCAST] |
|
||||
| | hash predicates: year = year |
|
||||
| | <b>runtime filters: RF000 <- year</b> |
|
||||
| | |
|
||||
| |--03:EXCHANGE [BROADCAST] |
|
||||
| | | |
|
||||
| | 01:SCAN HDFS [default.yy2] |
|
||||
| | partitions=1/1 files=1 size=620B |
|
||||
| | |
|
||||
| 00:SCAN HDFS [default.yy] |
|
||||
| <b>partitions=5/5</b> files=5 size=1.71KB |
|
||||
| runtime filters: RF000 -> year |
|
||||
+--------------------------------------------------------------------------+
|
||||
|
||||
SELECT s FROM yy WHERE year IN (SELECT year FROM yy2); -- Returns 3 rows from yy
|
||||
PROFILE;
|
||||
</codeblock>
|
||||
|
||||
<p id="order_by_scratch_dir">
|
||||
By default, intermediate files used during large sort, join, aggregation, or analytic function operations
|
||||
are stored in the directory <filepath>/tmp/impala-scratch</filepath> . These files are removed when the
|
||||
|
||||
@@ -143,19 +143,27 @@ under the License.
|
||||
</li>
|
||||
|
||||
<li>
|
||||
<codeph><xref href="impala_alter_table.xml#alter_table">ALTER TABLE</xref></codeph>: you can add or drop partitions, to work with
|
||||
different portions of a huge data set. You can designate the HDFS directory that holds the data files for a specific partition.
|
||||
With data partitioned by date values, you might <q>age out</q> data that is no longer relevant.
|
||||
<note conref="../shared/impala_common.xml#common/add_partition_set_location"/>
|
||||
<codeph><xref href="impala_alter_table.xml#alter_table">ALTER
|
||||
TABLE</xref></codeph>: you can add or drop partitions, to work
|
||||
with different portions of a huge data set. You can designate the HDFS
|
||||
directory that holds the data files for a specific partition. With
|
||||
data partitioned by date values, you might <q>age out</q> data that is
|
||||
no longer relevant. <note
|
||||
conref="../shared/impala_common.xml#common/add_partition_set_location"
|
||||
/>
|
||||
</li>
|
||||
|
||||
<li>
|
||||
<codeph><xref href="impala_insert.xml#insert">INSERT</xref></codeph>: When you insert data into a partitioned table, you identify
|
||||
the partitioning columns. One or more values from each inserted row are not stored in data files, but instead determine the
|
||||
directory where that row value is stored. You can also specify which partition to load a set of data into, with <codeph>INSERT
|
||||
OVERWRITE</codeph> statements; you can replace the contents of a specific partition but you cannot append data to a specific
|
||||
partition.
|
||||
<p rev="1.3.1" conref="../shared/impala_common.xml#common/insert_inherit_permissions"/>
|
||||
<codeph><xref href="impala_insert.xml#insert">INSERT</xref></codeph>:
|
||||
When you insert data into a partitioned table, you identify the
|
||||
partitioning columns. One or more values from each inserted row are
|
||||
not stored in data files, but instead determine the directory where
|
||||
that row value is stored. You can also specify which partition to load
|
||||
a set of data into, with <codeph>INSERT OVERWRITE</codeph> statements;
|
||||
you can replace the contents of a specific partition but you cannot
|
||||
append data to a specific partition. <p rev="1.3.1"
|
||||
conref="../shared/impala_common.xml#common/insert_inherit_permissions"
|
||||
/>
|
||||
</li>
|
||||
|
||||
<li>
|
||||
@@ -242,7 +250,8 @@ insert into weather <b>partition (year=2014, month=04, day)</b> select 'sunny',2
|
||||
|
||||
<conbody>
|
||||
|
||||
<p rev="1.3.1" conref="../shared/impala_common.xml#common/insert_inherit_permissions"/>
|
||||
<p rev="1.3.1"
|
||||
conref="../shared/impala_common.xml#common/insert_inherit_permissions"/>
|
||||
|
||||
</conbody>
|
||||
|
||||
@@ -377,7 +386,8 @@ insert into weather <b>partition (year=2014, month=04, day)</b> select 'sunny',2
|
||||
|
||||
<p conref="../shared/impala_common.xml#common/partitions_and_views"/>
|
||||
|
||||
<p conref="../shared/impala_common.xml#common/analytic_partition_pruning_caveat"/>
|
||||
<p
|
||||
conref="../shared/impala_common.xml#common/analytic_partition_pruning_caveat"/>
|
||||
|
||||
</conbody>
|
||||
|
||||
@@ -408,19 +418,38 @@ SELECT COUNT(*) FROM sales_table WHERE year IN (2005, 2010, 2015);
|
||||
</codeblock>
|
||||
|
||||
<p>
|
||||
Dynamic partition pruning involves using information only available at run time, such as the result of a subquery:
|
||||
Dynamic partition pruning involves using information only available
|
||||
at run time, such as the result of a subquery. The following example
|
||||
shows a simple dynamic partition pruning.
|
||||
</p>
|
||||
|
||||
<codeblock conref="../shared/impala_common.xml#common/simple_dpp_example"/>
|
||||
|
||||
<!-- Former example. Not sure it really would trigger DPP. SELECT COUNT(*) FROM sales_table WHERE year = (SELECT MAX(year) FROM some_other_table); -->
|
||||
<p>
|
||||
In the above example, Impala evaluates the subquery, sends the
|
||||
subquery results to all Impala nodes participating in the query, and
|
||||
then each <cmdname>impalad</cmdname> daemon uses the dynamic partition
|
||||
pruning optimization to read only the partitions with the relevant key
|
||||
values.
|
||||
</p>
|
||||
|
||||
<p>
|
||||
In this case, Impala evaluates the subquery, sends the subquery results to all Impala nodes participating in the query, and then
|
||||
each <cmdname>impalad</cmdname> daemon uses the dynamic partition pruning optimization to read only the partitions with the
|
||||
relevant key values.
|
||||
The output query plan from the <codeph>EXPLAIN</codeph> statement
|
||||
shows that runtime filters are enabled. The plan also shows that it
|
||||
expects to read all 5 partitions of the <codeph>yy</codeph> table,
|
||||
indicating that static partition pruning will not happen.
|
||||
</p>
|
||||
|
||||
<p>The Filter summary in the <codeph>PROFILE</codeph> output shows that
|
||||
the scan node filtered out based on a runtime filter of dynamic
|
||||
partition pruning. </p>
|
||||
|
||||
<codeblock>Filter 0 (1.00 MB):
|
||||
- Files processed: 3
|
||||
- <b>Files rejected: 1 (1)</b>
|
||||
- Files total: 3 (3)
|
||||
</codeblock>
|
||||
|
||||
<p>
|
||||
Dynamic partition pruning is especially effective for queries involving joins of several large partitioned tables. Evaluating the
|
||||
<codeph>ON</codeph> clauses of the join predicates might normally require reading data from all partitions of certain tables. If
|
||||
@@ -429,7 +458,8 @@ SELECT COUNT(*) FROM sales_table WHERE year IN (2005, 2010, 2015);
|
||||
and the amount of intermediate data stored and transmitted across the network during the query.
|
||||
</p>
|
||||
|
||||
<p conref="../shared/impala_common.xml#common/spill_to_disk_vs_dynamic_partition_pruning"/>
|
||||
<p
|
||||
conref="../shared/impala_common.xml#common/spill_to_disk_vs_dynamic_partition_pruning"/>
|
||||
|
||||
<p>
|
||||
Dynamic partition pruning is part of the runtime filtering feature, which applies to other kinds of queries in addition to queries
|
||||
@@ -479,7 +509,8 @@ SELECT COUNT(*) FROM sales_table WHERE year IN (2005, 2010, 2015);
|
||||
</li>
|
||||
|
||||
<li>
|
||||
<p conref="../shared/impala_common.xml#common/complex_types_partitioning"/>
|
||||
<p
|
||||
conref="../shared/impala_common.xml#common/complex_types_partitioning"/>
|
||||
</li>
|
||||
|
||||
<li>
|
||||
@@ -559,7 +590,8 @@ SELECT COUNT(*) FROM sales_table WHERE year IN (2005, 2010, 2015);
|
||||
formats.
|
||||
</p>
|
||||
|
||||
<note conref="../shared/impala_common.xml#common/add_partition_set_location"/>
|
||||
<note
|
||||
conref="../shared/impala_common.xml#common/add_partition_set_location"/>
|
||||
|
||||
<p>
|
||||
What happens to the data files when a partition is dropped depends on whether the partitioned table is designated as internal or
|
||||
@@ -615,7 +647,8 @@ SELECT COUNT(*) FROM sales_table WHERE year IN (2005, 2010, 2015);
|
||||
|
||||
<note type="important">
|
||||
<p conref="../shared/impala_common.xml#common/cs_or_cis"/>
|
||||
<p conref="../shared/impala_common.xml#common/incremental_stats_after_full"/>
|
||||
<p
|
||||
conref="../shared/impala_common.xml#common/incremental_stats_after_full"/>
|
||||
<p conref="../shared/impala_common.xml#common/incremental_stats_caveats"/>
|
||||
</note>
|
||||
|
||||
|
||||
@@ -344,18 +344,21 @@ under the License.
|
||||
</p>
|
||||
|
||||
<p>
|
||||
The following example shows a query that uses a single runtime filter (labelled <codeph>RF00</codeph>)
|
||||
to prune the partitions that are scanned in one stage of the query, based on evaluating the
|
||||
result set of a subquery:
|
||||
The following example shows a query that uses a single runtime filter,
|
||||
labeled <codeph>RF000</codeph>, to prune the partitions that are scanned
|
||||
in one stage of the query, based on evaluating the result set of a
|
||||
subquery:
|
||||
</p>
|
||||
|
||||
<codeblock conref="../shared/impala_common.xml#common/simple_dpp_example"/>
|
||||
|
||||
<p>
|
||||
The query profile (displayed by the <codeph>PROFILE</codeph> command in <cmdname>impala-shell</cmdname>)
|
||||
contains both the <codeph>EXPLAIN</codeph> plan and more detailed information about the internal
|
||||
workings of the query. The profile output includes a section labelled the <q>filter routing table</q>,
|
||||
with information about each filter based on its ID.
|
||||
The query profile (displayed by the <codeph>PROFILE</codeph> command
|
||||
in <cmdname>impala-shell</cmdname>) contains both the
|
||||
<codeph>EXPLAIN</codeph> plan and more detailed information about the
|
||||
internal workings of the query. The profile output includes the
|
||||
<codeph>Filter routing table</codeph> section with information about
|
||||
each filter based on its ID.
|
||||
</p>
|
||||
</conbody>
|
||||
</concept>
|
||||
|
||||
Reference in New Issue
Block a user