impala/testdata/workloads/functional-planner/queries/PlannerTest/tablesample.test

# Sample 10% of functional.alltypes with a fixed seed, repeatable(1234).
# The expected plan scans 3 of the 24 partitions (3 files) and estimates
# cardinality=730, i.e. 10% of the 7300 rows recorded in the table stats.
select * from functional.alltypes tablesample system(10) repeatable(1234)
---- PLAN
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
|  Per-Host Resources: mem-estimate=32.00MB mem-reservation=32.00KB
PLAN-ROOT SINK
|  mem-estimate=0B mem-reservation=0B
|
00:SCAN HDFS [functional.alltypes]
   partitions=3/24 files=3 size=60.68KB
   stored statistics:
     table: rows=7300 size=478.45KB
     partitions: 24/24 rows=7300
     columns: all
   extrapolated-rows=disabled max-scan-range-rows=2449
   mem-estimate=32.00MB mem-reservation=32.00KB
   tuple-ids=0 row-size=97B cardinality=730
====
# Sample 50% of functional.alltypes with a fixed seed, repeatable(1234).
# The expected plan scans 12 of the 24 partitions (12 files) and estimates
# cardinality=3650, i.e. 50% of the 7300 rows recorded in the table stats.
select * from functional.alltypes tablesample system(50) repeatable(1234)
---- PLAN
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
|  Per-Host Resources: mem-estimate=80.00MB mem-reservation=32.00KB
PLAN-ROOT SINK
|  mem-estimate=0B mem-reservation=0B
|
00:SCAN HDFS [functional.alltypes]
   partitions=12/24 files=12 size=239.26KB
   stored statistics:
     table: rows=7300 size=478.45KB
     partitions: 24/24 rows=7300
     columns: all
   extrapolated-rows=disabled max-scan-range-rows=621
   mem-estimate=80.00MB mem-reservation=32.00KB
   tuple-ids=0 row-size=97B cardinality=3650
====
# Sampling and scan predicates. Scan predicates are evaluated after sampling and
# that is reflected in the cardinality: the scan still selects 12 of 24 partitions
# (the 50% sample), and the 'id < 10' predicate is listed on the scan node with
# cardinality=365, which is one tenth of the 3650 rows that a 50% sample of the
# 7300-row table amounts to.
select * from functional.alltypes tablesample system(50) repeatable(1234)
where id < 10
---- PLAN
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
|  Per-Host Resources: mem-estimate=80.00MB mem-reservation=32.00KB
PLAN-ROOT SINK
|  mem-estimate=0B mem-reservation=0B
|
00:SCAN HDFS [functional.alltypes]
   partitions=12/24 files=12 size=239.26KB
   predicates: id < 10
   stored statistics:
     table: rows=7300 size=478.45KB
     partitions: 24/24 rows=7300
     columns: all
   extrapolated-rows=disabled max-scan-range-rows=621
   parquet dictionary predicates: id < 10
   mem-estimate=80.00MB mem-reservation=32.00KB
   tuple-ids=0 row-size=97B cardinality=365
====
# Partition pruning + sampling. Partition pruning happens before sampling.
# The 'year = 2009' predicate leaves 12 partitions with 3650 rows (see the
# 'partitions: 12/12 rows=3650' stats line); the 50% sample then selects 6 of
# those partitions (6 files), giving cardinality=1825 (50% of 3650).
select * from functional.alltypes tablesample system(50) repeatable(1234)
where year = 2009
---- PLAN
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
|  Per-Host Resources: mem-estimate=48.00MB mem-reservation=32.00KB
PLAN-ROOT SINK
|  mem-estimate=0B mem-reservation=0B
|
00:SCAN HDFS [functional.alltypes]
   partitions=6/24 files=6 size=119.70KB
   stored statistics:
     table: rows=7300 size=478.45KB
     partitions: 12/12 rows=3650
     columns: all
   extrapolated-rows=disabled max-scan-range-rows=1241
   mem-estimate=48.00MB mem-reservation=32.00KB
   tuple-ids=0 row-size=97B cardinality=1825
====
# Edge case: sample 0%, no files should be selected.
# The expected plan shows partitions=0/24, files=0, size=0B, cardinality=0 and
# zero memory estimate/reservation for the scan. No repeatable() clause is used
# in this query.
select * from functional.alltypes tablesample system(0)
---- PLAN
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
|  Per-Host Resources: mem-estimate=0B mem-reservation=0B
PLAN-ROOT SINK
|  mem-estimate=0B mem-reservation=0B
|
00:SCAN HDFS [functional.alltypes]
   partitions=0/24 files=0 size=0B
   stored statistics:
     table: rows=7300 size=478.45KB
     partitions: 24/24 rows=7300
     columns: all
   extrapolated-rows=disabled max-scan-range-rows=0
   mem-estimate=0B mem-reservation=0B
   tuple-ids=0 row-size=97B cardinality=0
====
# Edge case: sample 1%, at least one file should be selected.
# The expected plan selects exactly one partition/file (partitions=1/24 files=1)
# and estimates cardinality=73, i.e. 1% of the 7300 rows in the table stats.
select * from functional.alltypes tablesample system(1) repeatable(1234)
---- PLAN
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
|  Per-Host Resources: mem-estimate=32.00MB mem-reservation=32.00KB
PLAN-ROOT SINK
|  mem-estimate=0B mem-reservation=0B
|
00:SCAN HDFS [functional.alltypes]
   partitions=1/24 files=1 size=19.95KB
   stored statistics:
     table: rows=7300 size=478.45KB
     partitions: 24/24 rows=7300
     columns: all
   extrapolated-rows=disabled max-scan-range-rows=7300
   mem-estimate=32.00MB mem-reservation=32.00KB
   tuple-ids=0 row-size=97B cardinality=73
====
# Edge case: sample 1% and prune partitions, at least one file should be selected.
# The 'year = 2010' predicate leaves 12 partitions with 3650 rows (see the
# 'partitions: 12/12 rows=3650' stats line); the 1% sample still selects one
# partition/file and the estimate is cardinality=37 (1% of 3650 is 36.5).
select * from functional.alltypes tablesample system(1) repeatable(1234)
where year = 2010
---- PLAN
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
|  Per-Host Resources: mem-estimate=32.00MB mem-reservation=32.00KB
PLAN-ROOT SINK
|  mem-estimate=0B mem-reservation=0B
|
00:SCAN HDFS [functional.alltypes]
   partitions=1/24 files=1 size=20.36KB
   stored statistics:
     table: rows=7300 size=478.45KB
     partitions: 12/12 rows=3650
     columns: all
   extrapolated-rows=disabled max-scan-range-rows=7300
   mem-estimate=32.00MB mem-reservation=32.00KB
   tuple-ids=0 row-size=97B cardinality=37
====
# Edge case: sample 100%, all files should be selected.
# The expected plan scans all partitions and files (partitions=24/24 files=24,
# size equal to the table size of 478.45KB) with cardinality=7300, the full row
# count from the table stats. No repeatable() clause is used in this query, and
# the query writes 'system (100)' with a space before the parenthesis.
select * from functional.alltypes tablesample system (100)
---- PLAN
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
|  Per-Host Resources: mem-estimate=128.00MB mem-reservation=32.00KB
PLAN-ROOT SINK
|  mem-estimate=0B mem-reservation=0B
|
00:SCAN HDFS [functional.alltypes]
   partitions=24/24 files=24 size=478.45KB
   stored statistics:
     table: rows=7300 size=478.45KB
     partitions: 24/24 rows=7300
     columns: all
   extrapolated-rows=disabled max-scan-range-rows=310
   mem-estimate=128.00MB mem-reservation=32.00KB
   tuple-ids=0 row-size=97B cardinality=7300
====
# Table that has no stats.
# Sampling 10% of functional_parquet.alltypes still selects files (partitions=3/24
# files=3), but because table, partition and column stats are all unavailable,
# the expected plan reports max-scan-range-rows=unavailable and
# cardinality=unavailable.
select id from functional_parquet.alltypes tablesample system(10) repeatable(1234)
---- PLAN
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
|  Per-Host Resources: mem-estimate=16.00MB mem-reservation=16.00KB
PLAN-ROOT SINK
|  mem-estimate=0B mem-reservation=0B
|
00:SCAN HDFS [functional_parquet.alltypes]
   partitions=3/24 files=3 size=24.23KB
   stored statistics:
     table: rows=unavailable size=unavailable
     partitions: 0/24 rows=unavailable
     columns: unavailable
   extrapolated-rows=disabled max-scan-range-rows=unavailable
   mem-estimate=16.00MB mem-reservation=16.00KB
   tuple-ids=0 row-size=4B cardinality=unavailable
====
# Sampling in a subquery.
# The tablesample clause is attached to t2 inside the EXISTS subquery only. In the
# expected plan the subquery becomes the build side of a LEFT SEMI JOIN: the t2
# scan is sampled (partitions=1/4 files=1, cardinality=10 out of 100 rows), while
# the t1 scan is unsampled (partitions=24/24 files=24, cardinality=7300).
select id from functional.alltypes t1 where exists (
  select id from functional.alltypessmall t2 tablesample system(10) repeatable(1234)
  where t1.id = t2.id)
---- PLAN
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
|  Per-Host Resources: mem-estimate=162.94MB mem-reservation=2.98MB runtime-filters-memory=1.00MB
PLAN-ROOT SINK
|  mem-estimate=0B mem-reservation=0B
|
02:HASH JOIN [LEFT SEMI JOIN]
|  hash predicates: t1.id = t2.id
|  runtime filters: RF000[bloom] <- t2.id
|  mem-estimate=1.94MB mem-reservation=1.94MB spill-buffer=64.00KB
|  tuple-ids=0 row-size=4B cardinality=10
|
|--01:SCAN HDFS [functional.alltypessmall t2]
|     partitions=1/4 files=1 size=1.57KB
|     stored statistics:
|       table: rows=100 size=6.32KB
|       partitions: 4/4 rows=100
|       columns: all
|     extrapolated-rows=disabled max-scan-range-rows=100
|     mem-estimate=32.00MB mem-reservation=8.00KB
|     tuple-ids=1 row-size=4B cardinality=10
|
00:SCAN HDFS [functional.alltypes t1]
   partitions=24/24 files=24 size=478.45KB
   runtime filters: RF000[bloom] -> t1.id
   stored statistics:
     table: rows=7300 size=478.45KB
     partitions: 24/24 rows=7300
     columns: all
   extrapolated-rows=disabled max-scan-range-rows=310
   mem-estimate=128.00MB mem-reservation=32.00KB
   tuple-ids=0 row-size=4B cardinality=7300
====
# Sampling in WITH-clause view.
# The tablesample clause inside the WITH-clause view carries through to the scan:
# the expected plan selects partitions=3/24 files=3 with cardinality=730 (10% of
# 7300 rows). The outer query selects only 'id', so the scan has row-size=4B even
# though the view itself is defined with 'select *'.
with t as (select * from functional.alltypes tablesample system(10) repeatable(1234))
select id from t
---- PLAN
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
|  Per-Host Resources: mem-estimate=32.00MB mem-reservation=32.00KB
PLAN-ROOT SINK
|  mem-estimate=0B mem-reservation=0B
|
00:SCAN HDFS [functional.alltypes]
   partitions=3/24 files=3 size=60.68KB
   stored statistics:
     table: rows=7300 size=478.45KB
     partitions: 24/24 rows=7300
     columns: all
   extrapolated-rows=disabled max-scan-range-rows=2449
   mem-estimate=32.00MB mem-reservation=32.00KB
   tuple-ids=0 row-size=4B cardinality=730
====