IMPALA-7165: [DOCS] Correct example for dynamic partition pruning

Change-Id: I44d1054f55d3dc7947ccf4c2ef440e506c41f963 Reviewed-on: http://gerrit.cloudera.org:8080/10703 Reviewed-by: Vuk Ercegovac <vercegovac@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
2025-12-25 02:03:09 -05:00 · 2018-06-12 16:18:20 -07:00
parent b864112fa2
commit d29d5a25e5
3 changed files with 99 additions and 61 deletions
--- a/docs/shared/impala_common.xml
+++ b/docs/shared/impala_common.xml
@@ -1870,42 +1870,44 @@ from length_demo;
      </p>

 <codeblock id="simple_dpp_example">
-create table yy (s string) partitioned by (year int) stored as parquet;
-insert into yy partition (year) values ('1999', 1999), ('2000', 2000),
-  ('2001', 2001), ('2010',2010);
-compute stats yy;
+CREATE TABLE yy (s STRING) PARTITIONED BY (year INT);
+INSERT INTO yy PARTITION (year) VALUES ('1999', 1999), ('2000', 2000),
+  ('2001', 2001), ('2010', 2010), ('2018', 2018);
+COMPUTE STATS yy;

-create table yy2 (s string) partitioned by (year int) stored as parquet;
-insert into yy2 partition (year) values ('1999', 1999), ('2000', 2000),
-  ('2001', 2001);
-compute stats yy2;
+CREATE TABLE yy2 (s STRING, year INT);
+INSERT INTO yy2 VALUES ('1999', 1999), ('2000', 2000), ('2001', 2001);
+COMPUTE STATS yy2;

-- The query reads an unknown number of partitions, whose key values are only
-- known at run time. The 'runtime filters' lines show how the information about
-- the partitions is calculated in query fragment 02, and then used in query
-- fragment 00 to decide which partitions to skip.
-explain select s from yy2 where year in (select year from yy where year between 2000 and 2005);
-+----------------------------------------------------------+
-| Explain String                                           |
-+----------------------------------------------------------+
-| Estimated Per-Host Requirements: Memory=16.00MB VCores=2 |
-|                                                          |
-| 04:EXCHANGE [UNPARTITIONED]                              |
-| |                                                        |
-| 02:HASH JOIN [LEFT SEMI JOIN, BROADCAST]                 |
-| |  hash predicates: year = year                          |
-| |  <b>runtime filters: RF000 &lt;- year</b>                        |
-| |                                                        |
-| |--03:EXCHANGE [BROADCAST]                               |
-| |  |                                                     |
-| |  01:SCAN HDFS [dpp.yy]                                 |
-| |     partitions=2/4 files=2 size=468B                   |
-| |                                                        |
-| 00:SCAN HDFS [dpp.yy2]                                   |
-|    partitions=2/3 files=2 size=468B                      |
-|    <b>runtime filters: RF000 -> year</b>                        |
-+----------------------------------------------------------+
+-- The following query reads an unknown number of partitions, whose key values
+-- are only known at run time. The 'runtime filters' lines show how the
+-- information about the partitions is calculated in query fragment 02, and then
+-- used in query fragment 00 to decide which partitions to skip.
+
+EXPLAIN SELECT s FROM yy WHERE year IN (SELECT year FROM yy2);
+--------------------------------------------------------------------------+
+| PLAN-ROOT SINK                                                           |
+| |                                                                        |
+| 04:EXCHANGE [UNPARTITIONED]                                              |
+| |                                                                        |
+| 02:HASH JOIN [LEFT SEMI JOIN, BROADCAST]                                 |
+| |  hash predicates: year = year                                          |
+| |  <b>runtime filters: RF000 &lt;- year</b>                              |
+| |                                                                        |
+| |--03:EXCHANGE [BROADCAST]                                               |
+| |  |                                                                     |
+| |  01:SCAN HDFS [default.yy2]                                            |
+| |     partitions=1/1 files=1 size=620B                                   |
+| |                                                                        |
+| 00:SCAN HDFS [default.yy]                                                |
+|    <b>partitions=5/5</b> files=5 size=1.71KB                             |
+|    runtime filters: RF000 -> year                                        |
+--------------------------------------------------------------------------+
+
+SELECT s FROM yy WHERE year IN (SELECT year FROM yy2); -- Returns 3 rows from yy
+PROFILE;
 </codeblock>
+
      <p id="order_by_scratch_dir">
        By default, intermediate files used during large sort, join, aggregation, or analytic function operations
        are stored in the directory <filepath>/tmp/impala-scratch</filepath> . These files are removed when the
--- a/docs/topics/impala_partitioning.xml
+++ b/docs/topics/impala_partitioning.xml
@@ -143,19 +143,27 @@ under the License.
        </li>

        <li>
-          <codeph><xref href="impala_alter_table.xml#alter_table">ALTER TABLE</xref></codeph>: you can add or drop partitions, to work with
-          different portions of a huge data set. You can designate the HDFS directory that holds the data files for a specific partition.
-          With data partitioned by date values, you might <q>age out</q> data that is no longer relevant.
-          <note conref="../shared/impala_common.xml#common/add_partition_set_location"/>
+          <codeph><xref href="impala_alter_table.xml#alter_table">ALTER
+              TABLE</xref></codeph>: you can add or drop partitions, to work
+          with different portions of a huge data set. You can designate the HDFS
+          directory that holds the data files for a specific partition. With
+          data partitioned by date values, you might <q>age out</q> data that is
+          no longer relevant. <note
+            conref="../shared/impala_common.xml#common/add_partition_set_location"
+          />
        </li>

        <li>
-          <codeph><xref href="impala_insert.xml#insert">INSERT</xref></codeph>: When you insert data into a partitioned table, you identify
-          the partitioning columns. One or more values from each inserted row are not stored in data files, but instead determine the
-          directory where that row value is stored. You can also specify which partition to load a set of data into, with <codeph>INSERT
-          OVERWRITE</codeph> statements; you can replace the contents of a specific partition but you cannot append data to a specific
-          partition.
-          <p rev="1.3.1" conref="../shared/impala_common.xml#common/insert_inherit_permissions"/>
+          <codeph><xref href="impala_insert.xml#insert">INSERT</xref></codeph>:
+          When you insert data into a partitioned table, you identify the
+          partitioning columns. One or more values from each inserted row are
+          not stored in data files, but instead determine the directory where
+          that row value is stored. You can also specify which partition to load
+          a set of data into, with <codeph>INSERT OVERWRITE</codeph> statements;
+          you can replace the contents of a specific partition but you cannot
+          append data to a specific partition. <p rev="1.3.1"
+            conref="../shared/impala_common.xml#common/insert_inherit_permissions"
+          />
        </li>

        <li>
@@ -242,7 +250,8 @@ insert into weather <b>partition (year=2014, month=04, day)</b> select 'sunny',2

    <conbody>

-      <p rev="1.3.1" conref="../shared/impala_common.xml#common/insert_inherit_permissions"/>
+      <p rev="1.3.1"
+        conref="../shared/impala_common.xml#common/insert_inherit_permissions"/>

    </conbody>

@@ -377,7 +386,8 @@ insert into weather <b>partition (year=2014, month=04, day)</b> select 'sunny',2

        <p conref="../shared/impala_common.xml#common/partitions_and_views"/>

-        <p conref="../shared/impala_common.xml#common/analytic_partition_pruning_caveat"/>
+        <p
+          conref="../shared/impala_common.xml#common/analytic_partition_pruning_caveat"/>

      </conbody>

@@ -408,19 +418,38 @@ SELECT COUNT(*) FROM sales_table WHERE year IN (2005, 2010, 2015);
 </codeblock>

        <p>
-          Dynamic partition pruning involves using information only available at run time, such as the result of a subquery:
+          Dynamic partition pruning involves using information only available
+          at run time, such as the result of a subquery. The following example
+          shows a simple dynamic partition pruning.
        </p>

 <codeblock conref="../shared/impala_common.xml#common/simple_dpp_example"/>

-<!-- Former example. Not sure it really would trigger DPP. SELECT COUNT(*) FROM sales_table WHERE year = (SELECT MAX(year) FROM some_other_table); -->
+        <p>
+          In the above example, Impala evaluates the subquery, sends the
+          subquery results to all Impala nodes participating in the query, and
+          then each <cmdname>impalad</cmdname> daemon uses the dynamic partition
+          pruning optimization to read only the partitions with the relevant key
+          values.
+        </p>

        <p>
-          In this case, Impala evaluates the subquery, sends the subquery results to all Impala nodes participating in the query, and then
-          each <cmdname>impalad</cmdname> daemon uses the dynamic partition pruning optimization to read only the partitions with the
-          relevant key values.
+          The output query plan from the <codeph>EXPLAIN</codeph> statement
+          shows that runtime filters are enabled. The plan also shows that it
+          expects to read all 5 partitions of the <codeph>yy</codeph> table,
+          indicating that static partition pruning will not happen.
        </p>

+        <p>The Filter summary in the <codeph>PROFILE</codeph> output shows that
+          the scan node filtered out based on a runtime filter of dynamic
+          partition pruning. </p>
+
+<codeblock>Filter 0 (1.00 MB):
+ - Files processed: 3
+ - <b>Files rejected: 1 (1)</b>
+ - Files total: 3 (3)
+</codeblock>
+
        <p>
          Dynamic partition pruning is especially effective for queries involving joins of several large partitioned tables. Evaluating the
          <codeph>ON</codeph> clauses of the join predicates might normally require reading data from all partitions of certain tables. If
@@ -429,7 +458,8 @@ SELECT COUNT(*) FROM sales_table WHERE year IN (2005, 2010, 2015);
          and the amount of intermediate data stored and transmitted across the network during the query.
        </p>

-        <p conref="../shared/impala_common.xml#common/spill_to_disk_vs_dynamic_partition_pruning"/>
+        <p
+          conref="../shared/impala_common.xml#common/spill_to_disk_vs_dynamic_partition_pruning"/>

        <p>
          Dynamic partition pruning is part of the runtime filtering feature, which applies to other kinds of queries in addition to queries
@@ -479,7 +509,8 @@ SELECT COUNT(*) FROM sales_table WHERE year IN (2005, 2010, 2015);
        </li>

        <li>
-          <p conref="../shared/impala_common.xml#common/complex_types_partitioning"/>
+          <p
+            conref="../shared/impala_common.xml#common/complex_types_partitioning"/>
        </li>

        <li>
@@ -559,7 +590,8 @@ SELECT COUNT(*) FROM sales_table WHERE year IN (2005, 2010, 2015);
        formats.
      </p>

-      <note conref="../shared/impala_common.xml#common/add_partition_set_location"/>
+      <note
+        conref="../shared/impala_common.xml#common/add_partition_set_location"/>

      <p>
        What happens to the data files when a partition is dropped depends on whether the partitioned table is designated as internal or
@@ -615,7 +647,8 @@ SELECT COUNT(*) FROM sales_table WHERE year IN (2005, 2010, 2015);

      <note type="important">
        <p conref="../shared/impala_common.xml#common/cs_or_cis"/>
-        <p conref="../shared/impala_common.xml#common/incremental_stats_after_full"/>
+        <p
+          conref="../shared/impala_common.xml#common/incremental_stats_after_full"/>
        <p conref="../shared/impala_common.xml#common/incremental_stats_caveats"/>
      </note>

--- a/docs/topics/impala_runtime_filtering.xml
+++ b/docs/topics/impala_runtime_filtering.xml
@@ -344,18 +344,21 @@ under the License.
      </p>

      <p>
-        The following example shows a query that uses a single runtime filter (labelled <codeph>RF00</codeph>)
-        to prune the partitions that are scanned in one stage of the query, based on evaluating the
-        result set of a subquery:
+        The following example shows a query that uses a single runtime filter,
+        labeled <codeph>RF000</codeph>, to prune the partitions that are scanned
+        in one stage of the query, based on evaluating the result set of a
+        subquery:
      </p>

 <codeblock conref="../shared/impala_common.xml#common/simple_dpp_example"/>

      <p>
-        The query profile (displayed by the <codeph>PROFILE</codeph> command in <cmdname>impala-shell</cmdname>)
-        contains both the <codeph>EXPLAIN</codeph> plan and more detailed information about the internal
-        workings of the query. The profile output includes a section labelled the <q>filter routing table</q>,
-        with information about each filter based on its ID.
+        The query profile (displayed by the <codeph>PROFILE</codeph> command
+        in <cmdname>impala-shell</cmdname>) contains both the
+          <codeph>EXPLAIN</codeph> plan and more detailed information about the
+        internal workings of the query. The profile output includes the
+          <codeph>Filter routing table</codeph> section with information about
+        each filter based on its ID.
      </p>
    </conbody>
  </concept>