Files
impala/testdata/workloads/functional-query/queries/QueryTest/stats-extrapolation.test
Paul Rogers 85a8b34645 IMPALA-7905: Hive keywords not quoted for identifiers
Impala often generates SQL for statements using the toSql() call.
Generated SQL is often used during testing or when writing the query
plan. Impala keywords such as "create", when used as identifiers,
must be quoted:

SELECT `select`, `from` FROM `order` ...

The code in ToSqlUtils.getIdentSql() quotes the identifier if it is
an Impala or Hive keyword, or if it does not follow the identifier
pattern. The code uses the Hive lexer to detect a keyword. But, the
code contained a flaw: the lexer expects a case-insensitive input.
We provide a case-sensitive input. As a result, "MONTH" is caught as a
Hive keyword and quoted, but "month" is not. This patch fixes that flaw.

This patch also fixes:

IMPALA-8051: Compute stats fails on a column with comment character in
name

The code uses the Hive lexical analyzer to check names. Since "#" and
"--" are comment characters, a name like "foo#" is parsed as "foo" which
does not need quotes, hence we don't quote "foo#", which causes issues.
Added a special check for "#" and "--" to resolve this issue.

Testing:

* Refactored getIdentSql() for easier testing.
* Added tests to the recently added ToSqlUtilsTest for this case and
  several others.
* Making this change caused the columns `month`, `year`, and `key` to be
  quoted when before they were not. Updated many tests as a result.
* Added a new identSql() function, for use in tests, to match the
  quoting that Impala uses, and to handle the wildcard, and multi-part
  names. Used this in ToSqlTest to handle the quoted names.
* PlannerTest emits statement SQL to the output file wrapped to 80
  columns and sometimes leaves trailing spaces at the end of the line.
  Some tools remove that trailing space, resulting in trivial file
  differences.  Fixed this to remove trailing spaces in order to simplify
  file comparisons.
* Tweaked the "In pipelines" output to avoid trailing spaces when no
  pipelines are listed.
* Reran all FE tests.

Change-Id: I06cc20b052a3a66535a171c36b4b31477c0ba6d0
Reviewed-on: http://gerrit.cloudera.org:8080/12009
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
2019-01-24 01:26:08 +00:00

211 lines
8.3 KiB
Plaintext

====
---- QUERY
# Setup: build the partitioned test table used by every test case in this file.
# This test relies on a deterministic row order so we use "sort by (id)".
# The table copies the definition of functional_parquet.alltypes, has the
# "impala.enable.stats.extrapolation" table property set to "true", and is
# loaded with the year = 2009 rows through a dynamic-partition insert on
# (year, month).
create table alltypes sort by (id) like functional_parquet.alltypes;
alter table alltypes set tblproperties("impala.enable.stats.extrapolation"="true");
insert into alltypes partition(year, month)
select * from functional_parquet.alltypes where year = 2009;
====
---- QUERY
# No stats are available.
# Before any COMPUTE STATS the plan is expected to report the table stats,
# column stats, extrapolated row count and cardinality as unavailable, with
# 0 of 12 partitions having a row count.
# VERIFY_IS_SUBSET: only the listed lines are expected in the explain output.
explain select id from alltypes;
---- RESULTS: VERIFY_IS_SUBSET
' stored statistics:'
' table: rows=unavailable size=unavailable'
' partitions: 0/12 rows=unavailable'
' columns: unavailable'
row_regex:.* extrapolated-rows=unavailable.*
' tuple-ids=0 row-size=4B cardinality=unavailable'
---- TYPES
STRING
====
---- QUERY
# Compute stats on the freshly loaded table.
# The expected summary reports 1 updated partition and 11 columns even though
# the table holds 12 partitions -- presumably because, with stats
# extrapolation enabled, only table-level stats are stored (the SHOW TABLE
# STATS test case in this file checks the stored values).
compute stats alltypes
---- RESULTS
'Updated 1 partition(s) and 11 column(s).'
---- TYPES
STRING
====
---- QUERY
# Only the table-level row count is stored. The partition row counts are extrapolated.
# Every partition row is expected to show #ROWS = -1 (nothing stored per
# partition) next to a non-negative EXTRAP #ROWS estimate; the 'Total' row
# carries the stored table count of 3650. SIZE is matched by regex only.
show table stats alltypes
---- LABELS
YEAR, MONTH, #ROWS, EXTRAP #ROWS, #FILES, SIZE, BYTES CACHED, CACHE REPLICATION, FORMAT, INCREMENTAL STATS, LOCATION
---- RESULTS
'2009','1',-1,308,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=1'
'2009','2',-1,289,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=2'
'2009','3',-1,308,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=3'
'2009','4',-1,302,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=4'
'2009','5',-1,308,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=5'
'2009','6',-1,302,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=6'
'2009','7',-1,308,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=7'
'2009','8',-1,308,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=8'
'2009','9',-1,302,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=9'
'2009','10',-1,308,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=10'
'2009','11',-1,302,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=11'
'2009','12',-1,308,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=12'
'Total','',3650,3650,12,regex:.*B,'0B','','','',''
---- TYPES
STRING,STRING,BIGINT,BIGINT,BIGINT,STRING,STRING,STRING,STRING,STRING,STRING
====
---- QUERY
# Stats are available now.
# The whole plan is compared (VERIFY_IS_EQUAL). Expected: the stored table row
# count is 3650, per-partition row counts are still unavailable (0/12), and
# the scan reports extrapolated-rows=3650 with cardinality=3.65K.
# Memory figures and the generated database name are matched by regex only.
explain select id from alltypes;
---- RESULTS: VERIFY_IS_EQUAL
row_regex:.*Max Per-Host Resource Reservation: Memory=.*
row_regex:.*Per-Host Resource Estimates: Memory=.*
'Codegen disabled by planner'
row_regex:.*Analyzed query: SELECT id FROM test_stats_extrapolation_.*.alltypes.*
''
'F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1'
row_regex:.*Per-Host Resources: mem-estimate=.* mem-reservation=.*
'PLAN-ROOT SINK'
row_regex:.*mem-estimate=.* mem-reservation=.*
'|'
'00:SCAN HDFS [$DATABASE.alltypes]'
row_regex:.*partitions=12/12 files=12 size=.*
' stored statistics:'
row_regex:.*table: rows=3650 size=.*
' partitions: 0/12 rows=unavailable'
' columns: all'
row_regex:.* extrapolated-rows=3650.*
row_regex:.*mem-estimate=.* mem-reservation=.*
' tuple-ids=0 row-size=4B cardinality=3.65K'
' in pipelines: 00(GETNEXT)'
---- TYPES
STRING
====
---- QUERY
# Select a subset of partitions.
# Restricting month to (1, 2, 3) is expected to leave 3 of the 12 partitions
# and 3 files, with the row count for them extrapolated to 904.
# The partition column appears back-quoted (`month`) both in the analyzed
# query text and in the partition predicate line.
explain select id from alltypes where month in (1, 2, 3);
---- RESULTS: VERIFY_IS_EQUAL
row_regex:.*Max Per-Host Resource Reservation: Memory=.*
row_regex:.*Per-Host Resource Estimates: Memory=.*
'Codegen disabled by planner'
row_regex:.*Analyzed query: SELECT id FROM test_stats_extrapolation_.*.alltypes WHERE.*
'`month` IN (CAST(1 AS INT), CAST(2 AS INT), CAST(3 AS INT))'
''
'F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1'
row_regex:.*mem-estimate=.* mem-reservation=.*
'PLAN-ROOT SINK'
row_regex:.*mem-estimate=.* mem-reservation=.*
'|'
'00:SCAN HDFS [$DATABASE.alltypes]'
' partition predicates: `month` IN (CAST(1 AS INT), CAST(2 AS INT), CAST(3 AS INT))'
row_regex:.*partitions=3/12 files=3 size=.*
' stored statistics:'
row_regex:.*table: rows=3650 size=.*
' partitions: 0/3 rows=unavailable'
' columns: all'
row_regex:.* extrapolated-rows=904.*
row_regex:.*mem-estimate=.* mem-reservation=.*
' tuple-ids=0 row-size=4B cardinality=904'
' in pipelines: 00(GETNEXT)'
---- TYPES
STRING
====
---- QUERY
# Double the data in existing partitions.
# Stats are not recomputed here, so the stored table row count is expected to
# stay at 3650 while the file count grows to 24 and the extrapolated row
# count doubles to 7300 (cardinality=7.30K).
insert into alltypes partition(year, month)
select * from functional_parquet.alltypes where year = 2009;
explain select id from alltypes;
---- RESULTS: VERIFY_IS_EQUAL
row_regex:.*Max Per-Host Resource Reservation: Memory=.*
row_regex:.*Per-Host Resource Estimates: Memory=.*
row_regex:.*Analyzed query: SELECT id FROM test_stats_extrapolation_.*.alltypes.*
''
'F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1'
row_regex:.*Per-Host Resources: mem-estimate=.* mem-reservation=.*
'PLAN-ROOT SINK'
row_regex:.*mem-estimate=.* mem-reservation=.*
'|'
'00:SCAN HDFS [$DATABASE.alltypes]'
row_regex:.*partitions=12/12 files=24 size=.*
' stored statistics:'
row_regex:.*table: rows=3650 size=.*
' partitions: 0/12 rows=unavailable'
' columns: all'
row_regex:.* extrapolated-rows=7300.*
row_regex:.*mem-estimate=.* mem-reservation=.*
' tuple-ids=0 row-size=4B cardinality=7.30K'
' in pipelines: 00(GETNEXT)'
---- TYPES
STRING
====
---- QUERY
# Create new partitions and extrapolate their row count.
# The year = 2010 partitions are inserted after the last COMPUTE STATS: the
# stored table row count is expected to still read 3650, yet the 12 selected
# partitions (out of 24) get an extrapolated estimate of 3651 rows.
insert into alltypes partition(year, month)
select * from functional_parquet.alltypes where year = 2010;
explain select id from alltypes where year = 2010;
---- RESULTS: VERIFY_IS_EQUAL
row_regex:.*Max Per-Host Resource Reservation: Memory=.*
row_regex:.*Per-Host Resource Estimates: Memory=16MB.*
'Codegen disabled by planner'
row_regex:.*Analyzed query: SELECT id FROM test_stats_extrapolation_.*.alltypes WHERE.*
'`year` = CAST(2010 AS INT)'
''
'F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1'
row_regex:.*Per-Host Resources: mem-estimate=.* mem-reservation=.*
'PLAN-ROOT SINK'
row_regex:.*mem-estimate=.* mem-reservation=.*
'|'
'00:SCAN HDFS [$DATABASE.alltypes]'
' partition predicates: `year` = CAST(2010 AS INT)'
row_regex:.*partitions=12/24 files=12 size=.*
' stored statistics:'
row_regex:.*table: rows=3650 size=.*
' partitions: 0/12 rows=unavailable'
' columns: all'
row_regex:.* extrapolated-rows=3651.*
row_regex:.*mem-estimate=.* mem-reservation=.*
' tuple-ids=0 row-size=4B cardinality=3.65K'
' in pipelines: 00(GETNEXT)'
---- TYPES
STRING
====
---- QUERY
# Compute stats and run the same query again.
# After recomputing, the stored table row count is expected to become 10950,
# while the extrapolated estimate for the 12 year = 2010 partitions stays at
# 3651 (cardinality=3.65K).
# The extrapolated-rows pattern ends in ".*" like every other
# extrapolated-rows pattern in this file, so text that follows the row count
# on that plan line does not affect the match.
compute stats alltypes;
explain select id from alltypes where year = 2010;
---- RESULTS: VERIFY_IS_EQUAL
row_regex:.*Max Per-Host Resource Reservation: Memory=.*
row_regex:.*Per-Host Resource Estimates: Memory=16MB.*
'Codegen disabled by planner'
row_regex:.*Analyzed query: SELECT id FROM test_stats_extrapolation_.*.alltypes WHERE.*
'`year` = CAST(2010 AS INT)'
''
'F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1'
row_regex:.*Per-Host Resources: mem-estimate=.* mem-reservation=.*
'PLAN-ROOT SINK'
row_regex:.*mem-estimate=.* mem-reservation=.*
'|'
'00:SCAN HDFS [$DATABASE.alltypes]'
' partition predicates: `year` = CAST(2010 AS INT)'
row_regex:.*partitions=12/24 files=12 size=.*
' stored statistics:'
row_regex:.*table: rows=10950 size=.*
' partitions: 0/12 rows=unavailable'
' columns: all'
row_regex:.* extrapolated-rows=3651.*
row_regex:.*mem-estimate=.* mem-reservation=.*
' tuple-ids=0 row-size=4B cardinality=3.65K'
' in pipelines: 00(GETNEXT)'
---- TYPES
STRING
====
---- QUERY
# Test that dropping stats resets everything.
# After DROP STATS the plan is expected to report table stats, column stats,
# extrapolated rows and cardinality as unavailable again, now over all 24
# partitions. VERIFY_IS_SUBSET: only the listed lines are expected in the
# explain output.
drop stats alltypes;
explain select id from alltypes;
---- RESULTS: VERIFY_IS_SUBSET
' stored statistics:'
' table: rows=unavailable size=unavailable'
' partitions: 0/24 rows=unavailable'
' columns: unavailable'
row_regex:.* extrapolated-rows=unavailable.*
' tuple-ids=0 row-size=4B cardinality=unavailable'
' in pipelines: 00(GETNEXT)'
---- TYPES
STRING
====