Mirror of https://github.com/apache/impala.git, synced 2026-01-08 21:03:01 -05:00
Split out the encoder/type for parquet reader/writer. I think this puts us
in a better place to support future encodings.
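As a rough illustration of the shape of this split (all names below are hypothetical, not Impala's actual classes): each column gets an encoder object, chosen from its type and encoding when the writer is set up, behind a small virtual interface.

// Hedged sketch of the encoder/type split; illustrative names only.
#include <cstdint>
#include <cstring>

class ColumnEncoder {
 public:
  virtual ~ColumnEncoder() {}
  // Appends one value to 'buffer'; returns the number of bytes written.
  virtual int Encode(const void* value, uint8_t* buffer) = 0;
};

class PlainInt32Encoder : public ColumnEncoder {
 public:
  virtual int Encode(const void* value, uint8_t* buffer) {
    memcpy(buffer, value, sizeof(int32_t));  // PLAIN: raw little-endian bytes
    return sizeof(int32_t);
  }
};
// A dictionary encoder would be just another subclass, so future encodings
// slot in without touching the writer loop.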
On the TPC-H lineitem table, the results are:
Before:
BytesWritten: 236.45 MB
Per Column Sizes:
l_comment: 75.71 MB
l_commitdate: 8.64 MB
l_discount: 11.19 MB
l_extendedprice: 33.02 MB
l_linenumber: 4.56 MB
l_linestatus: 869.98 KB
l_orderkey: 8.99 MB
l_partkey: 27.02 MB
l_quantity: 11.58 MB
l_receiptdate: 8.65 MB
l_returnflag: 1.40 MB
l_shipdate: 8.65 MB
l_shipinstruct: 1.45 MB
l_shipmode: 2.17 MB
l_suppkey: 21.91 MB
l_tax: 10.68 MB
After:
BytesWritten: 198.63 MB (84%)
Per Column Sizes:
l_comment: 75.71 MB (100%)
l_commitdate: 8.64 MB (100%)
l_discount: 2.89 MB (25.8%)
l_extendedprice: 33.13 MB (100.33%)
l_linenumber: 1.50 MB (32.89%)
l_linestatus: 870.26 KB (100.032%)
l_orderkey: 9.18 MB (102.11%)
l_partkey: 27.10 MB (100.29%)
l_quantity: 4.32 MB (37.31%)
l_receiptdate: 8.65 MB (100%)
l_returnflag: 1.40 MB (100%)
l_shipdate: 8.65 MB (100%)
l_shipinstruct: 1.45 MB (100%)
l_shipmode: 2.17 MB (100%)
l_suppkey: 10.11 MB (46.14%)
l_tax: 2.89 MB (27.06%)
The table is overall 84% as big (198.63 MB / 236.45 MB), i.e. 16% smaller. A few
columns got marginally bigger, presumably from the fixed overhead of writing the
dictionary itself. If the file filled the full 1 GB, I'd expect that overhead to
decrease even more as a fraction of the total.
The restructuring to use a virtual call doesn't seem to change things much and
will go away when we codegen the scanner.
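For intuition on that last point (a sketch, not Impala's actual code): once the scanner is codegen'd, the concrete encoder type is known, so the per-value call can be inlined away. Templates give the same effect at compile time:

// Sketch: binding the encoder type statically lets the compiler inline
// Encode() and drop the per-value virtual dispatch, roughly the effect
// codegen'ing the scanner achieves at runtime.
template <typename Encoder>
int64_t EncodeColumn(Encoder& enc, const int32_t* values, int n, uint8_t* out) {
  int64_t bytes = 0;
  for (int i = 0; i < n; ++i) bytes += enc.Encode(&values[i], out + bytes);
  return bytes;
}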
Here's what the query times look like with this patch (note these runs are on the
before data files, so only the string cols are dictionary encoded).
Before query times:
Insert Time: 8.5 sec
select *: 2.3 sec
select avg(l_orderkey): 0.33 sec
After query times:
Insert Time: 9.5 sec <-- Longer due to dictionary encoding
select *: 2.4 sec <-- kind of noisy; possibly a slight slowdown
select avg(l_orderkey): 0.33 sec
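The size and timing effects above are what dictionary encoding predicts: each distinct value is stored once in a dictionary page and each row stores only a small code, at the cost of a hash lookup per value on the write path. A simplified sketch of the idea (real Parquet dictionary-encoded pages also RLE/bit-pack the codes):

// Toy dictionary encoder for a double column; illustrative only.
#include <cstdint>
#include <unordered_map>
#include <vector>

struct DictEncoder {
  std::unordered_map<double, uint16_t> codes;  // value -> code
  std::vector<double> dict;                    // dictionary page: code -> value
  std::vector<uint16_t> encoded;               // data pages: one code per row

  void Add(double v) {
    auto it = codes.find(v);
    if (it == codes.end()) {
      it = codes.emplace(v, static_cast<uint16_t>(dict.size())).first;
      dict.push_back(v);  // pay the full 8 bytes once, in the dictionary
    }
    encoded.push_back(it->second);  // pay at most 2 bytes per row after that
  }
};

A low-cardinality column like l_tax then costs roughly the code width per row instead of 8 bytes, consistent with the 25-40% sizes above, and the per-value lookup is a plausible source of the extra second of insert time.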
Change-Id: I213fdca1bb972cc200dc0cd9fb14b77a8d36d9e6
Reviewed-on: http://gerrit.ent.cloudera.com:8080/238
Tested-by: jenkins <kitchen-build@cloudera.com>
Reviewed-by: Skye Wanderman-Milne <skye@cloudera.com>
52 lines, 2.2 KiB, Python
#!/usr/bin/env python
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
# Targeted Impala insert tests
#
import logging
import pytest
from tests.common.test_vector import *
from tests.common.impala_test_suite import *
from tests.common.test_dimensions import create_exec_option_dimension

# TODO: Add Gzip back. IMPALA-424
PARQUET_CODECS = ['none', 'snappy']

class TestInsertQueries(ImpalaTestSuite):
  @classmethod
  def get_workload(cls):
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    super(TestInsertQueries, cls).add_test_dimensions()
    # Fix the exec_option vector to have a single value. This is needed should we decide
    # to run the insert tests in parallel (otherwise there will be two tests inserting
    # into the same table at the same time for the same file format).
    # TODO: When we do decide to run these tests in parallel we could create unique temp
    # tables for each test case to resolve the concurrency problems.
    cls.TestMatrix.add_dimension(create_exec_option_dimension(
        cluster_sizes=[0], disable_codegen_options=[False], batch_sizes=[0]))

    cls.TestMatrix.add_dimension(TestDimension("compression_codec", *PARQUET_CODECS))

    # Insert is currently only supported for text and parquet
    # For parquet, we want to iterate through all the compression codecs
    # TODO: each column in parquet can have a different codec. We could
    # test all the codecs in one table/file with some additional flags.
    cls.TestMatrix.add_constraint(lambda v:
        v.get_value('table_format').file_format == 'parquet' or
        (v.get_value('table_format').file_format == 'text' and
         v.get_value('compression_codec') == 'none'))
    cls.TestMatrix.add_constraint(lambda v:
        v.get_value('table_format').compression_codec == 'none')

  @pytest.mark.execute_serially
  def test_insert1(self, vector):
    vector.get_value('exec_option')['PARQUET_COMPRESSION_CODEC'] = \
        vector.get_value('compression_codec')
    self.run_test_case('QueryTest/insert', vector)

  @pytest.mark.execute_serially
  def test_insert_overwrite(self, vector):
    self.run_test_case('QueryTest/insert_overwrite', vector)