<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements.  See the NOTICE file
distributed with this work for additional information
regarding copyright ownership.  The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License.  You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied.  See the License for the
specific language governing permissions and limitations
under the License.
-->
<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
<concept id="seqfile">

  <title id="sequencefile">Using the SequenceFile File Format with Impala Tables</title>
  <titlealts audience="PDF"><navtitle>SequenceFile Data Files</navtitle></titlealts>
  <prolog>
    <metadata>
      <data name="Category" value="Impala"/>
      <!-- <data name="Category" value="SequenceFile"/> -->
      <data name="Category" value="File Formats"/>
      <data name="Category" value="Developers"/>
      <data name="Category" value="Data Analysts"/>
    </metadata>
  </prolog>

  <conbody>

    <p>Impala supports using SequenceFile data files.</p>

    <table>
      <title>SequenceFile Format Support in Impala</title>
      <tgroup cols="5">
        <colspec colname="1" colwidth="10*"/>
        <colspec colname="2" colwidth="10*"/>
        <colspec colname="3" colwidth="20*"/>
        <colspec colname="4" colwidth="30*"/>
        <colspec colname="5" colwidth="30*"/>
        <thead>
          <row>
            <entry>File Type</entry>
            <entry>Format</entry>
            <entry>Compression Codecs</entry>
            <entry>Impala Can CREATE?</entry>
            <entry>Impala Can INSERT?</entry>
          </row>
        </thead>
        <tbody>
          <row conref="impala_file_formats.xml#file_formats/sequencefile_support">
            <entry/>
          </row>
        </tbody>
      </tgroup>
    </table>

    <p outputclass="toc inpage"/>

  </conbody>

  <concept id="seqfile_create">

    <title>Creating SequenceFile Tables and Loading Data</title>
    <prolog>
      <metadata>
        <data name="Category" value="ETL"/>
      </metadata>
    </prolog>

    <conbody>

      <p>
        If you do not have an existing data file to use, begin by creating one in the appropriate format.
      </p>

      <p>
        <b>To create a SequenceFile table:</b>
      </p>

      <p>
        In the <codeph>impala-shell</codeph> interpreter, issue a command similar to:
      </p>

<codeblock>create table sequencefile_table (<varname>column_specs</varname>) stored as sequencefile;</codeblock>
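      <p>
        For instance, a minimal sketch with hypothetical column names substituted for
        <varname>column_specs</varname>:
      </p>

<codeblock>create table seq_demo (id int, name string) stored as sequencefile;</codeblock>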

      <p>
        Because Impala can query some kinds of tables that it cannot currently write to, after creating tables of
        certain file formats, you might use the Hive shell to load the data. See
        <xref href="impala_file_formats.xml#file_formats"/> for details. After loading data into a table through
        Hive or another mechanism outside of Impala, issue a <codeph>REFRESH <varname>table_name</varname></codeph>
        statement the next time you connect to the Impala node, before querying the table, to make Impala recognize
        the new data.
      </p>

      <p>
        For example, here is how you might create some SequenceFile tables in Impala (by specifying the columns
        explicitly, or cloning the structure of another table), load data through Hive, and query them through
        Impala:
      </p>

<codeblock>$ impala-shell -i localhost
[localhost:21000] > create table seqfile_table (x int) stored as sequencefile;
[localhost:21000] > create table seqfile_clone like some_other_table stored as sequencefile;
[localhost:21000] > quit;

$ hive
hive> insert into table seqfile_table select x from some_other_table;
3 Rows loaded to seqfile_table
Time taken: 19.047 seconds
hive> quit;

$ impala-shell -i localhost
[localhost:21000] > select * from seqfile_table;
Returned 0 row(s) in 0.23s
[localhost:21000] > -- Make Impala recognize the data loaded through Hive;
[localhost:21000] > refresh seqfile_table;
[localhost:21000] > select * from seqfile_table;
+---+
| x |
+---+
| 1 |
| 2 |
| 3 |
+---+
Returned 3 row(s) in 0.23s</codeblock>

      <p conref="../shared/impala_common.xml#common/complex_types_unsupported_filetype"/>

    </conbody>
  </concept>

  <concept id="seqfile_compression">

    <title>Enabling Compression for SequenceFile Tables</title>
    <prolog>
      <metadata>
        <data name="Category" value="Compression"/>
        <data name="Category" value="Snappy"/>
      </metadata>
    </prolog>

    <conbody>

      <p>
        You may want to enable compression on existing tables. Enabling compression provides performance
        gains in most cases and is supported for SequenceFile tables. For example, to enable Snappy
        compression, you would specify the following additional settings when loading data through the
        Hive shell:
      </p>

<codeblock>hive> SET hive.exec.compress.output=true;
hive> SET mapred.max.split.size=256000000;
hive> SET mapred.output.compression.type=BLOCK;
hive> SET mapred.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec;
hive> insert overwrite table <varname>new_table</varname> select * from <varname>old_table</varname>;</codeblock>

      <p>
        If you are converting partitioned tables, you must complete additional steps. In such a case, specify
        additional settings similar to the following:
      </p>

<codeblock>hive> create table <varname>new_table</varname> (<varname>your_cols</varname>) partitioned by (<varname>partition_cols</varname>) stored as <varname>new_format</varname>;
hive> SET hive.exec.dynamic.partition.mode=nonstrict;
hive> SET hive.exec.dynamic.partition=true;
hive> insert overwrite table <varname>new_table</varname> partition(<varname>comma_separated_partition_cols</varname>) select * from <varname>old_table</varname>;</codeblock>

      <p>
        Remember that Hive does not require you to specify the source table's file format; the
        <codeph>INSERT OVERWRITE ... SELECT</codeph> statement reads the old table regardless of how it is
        stored. Combining the components outlined previously, here is how you would convert a table to a
        Snappy-compressed SequenceFile table:
      </p>

<codeblock>hive> CREATE TABLE tbl_seq (int_col INT, string_col STRING) STORED AS SEQUENCEFILE;
hive> SET hive.exec.compress.output=true;
hive> SET mapred.max.split.size=256000000;
hive> SET mapred.output.compression.type=BLOCK;
hive> SET mapred.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec;
hive> SET hive.exec.dynamic.partition.mode=nonstrict;
hive> SET hive.exec.dynamic.partition=true;
hive> INSERT OVERWRITE TABLE tbl_seq SELECT * FROM tbl;</codeblock>

      <p>
        To complete a similar process for a table that includes partitions (here, a single partition column
        called <codeph>year</codeph>), you would specify settings similar to the following:
      </p>

<codeblock>hive> CREATE TABLE tbl_seq (int_col INT, string_col STRING) PARTITIONED BY (year INT) STORED AS SEQUENCEFILE;
hive> SET hive.exec.compress.output=true;
hive> SET mapred.max.split.size=256000000;
hive> SET mapred.output.compression.type=BLOCK;
hive> SET mapred.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec;
hive> SET hive.exec.dynamic.partition.mode=nonstrict;
hive> SET hive.exec.dynamic.partition=true;
hive> INSERT OVERWRITE TABLE tbl_seq PARTITION(year) SELECT * FROM tbl;</codeblock>

      <note>
        <p>
          The compression codec is specified in the following command:
        </p>
<codeblock>SET mapred.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec;</codeblock>
        <p>
          You could elect to specify an alternative codec, such as <codeph>GzipCodec</codeph>, here.
        </p>
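        <p>
          For example, to switch the output to gzip compression, only the codec class changes; all the
          other settings shown previously stay the same:
        </p>
<codeblock>SET mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec;</codeblock>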
      </note>
    </conbody>
  </concept>

  <concept id="seqfile_performance">

    <title>Query Performance for Impala SequenceFile Tables</title>

    <conbody>

      <p>
        In general, expect query performance with SequenceFile tables to be faster than with tables using
        text data, but slower than with Parquet tables. See <xref href="impala_parquet.xml#parquet"/> for
        information about using the Parquet file format for high-performance analytic queries.
      </p>
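      <p>
        As a quick, rough check on the factors behind scan performance (number of files, total size, and
        file format), you can examine a table with the <codeph>SHOW TABLE STATS</codeph> statement.
        For example, reusing the table created earlier:
      </p>

<codeblock>[localhost:21000] > show table stats seqfile_table;</codeblock>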

      <p conref="../shared/impala_common.xml#common/s3_block_splitting"/>

    </conbody>
  </concept>

</concept>