<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
<concept id="rcfile">

  <title>Using the RCFile File Format with Impala Tables</title>
  <titlealts audience="PDF"><navtitle>RCFile Data Files</navtitle></titlealts>
  <prolog>
    <metadata>
      <data name="Category" value="Impala"/>
      <!-- <data name="Category" value="RCFile"/> -->
      <data name="Category" value="File Formats"/>
      <data name="Category" value="Tables"/>
      <data name="Category" value="Developers"/>
      <data name="Category" value="Data Analysts"/>
    </metadata>
  </prolog>

  <conbody>

    <p>
      <indexterm audience="hidden">RCFile support in Impala</indexterm>
      Impala supports using RCFile data files.
    </p>

    <table>
      <title>RCFile Format Support in Impala</title>
      <tgroup cols="5">
        <colspec colname="1" colwidth="10*"/>
        <colspec colname="2" colwidth="10*"/>
        <colspec colname="3" colwidth="20*"/>
        <colspec colname="4" colwidth="30*"/>
        <colspec colname="5" colwidth="30*"/>
        <thead>
          <row>
            <entry>File Type</entry>
            <entry>Format</entry>
            <entry>Compression Codecs</entry>
            <entry>Impala Can CREATE?</entry>
            <entry>Impala Can INSERT?</entry>
          </row>
        </thead>
        <tbody>
          <row conref="impala_file_formats.xml#file_formats/rcfile_support">
            <entry/>
          </row>
        </tbody>
      </tgroup>
    </table>

    <p outputclass="toc inpage"/>

  </conbody>

  <concept id="rcfile_create">

    <title>Creating RCFile Tables and Loading Data</title>
    <prolog>
      <metadata>
        <data name="Category" value="ETL"/>
      </metadata>
    </prolog>

    <conbody>

      <p>
        If you do not have an existing data file to use, begin by creating one in the appropriate format.
      </p>

      <p>
        <b>To create an RCFile table:</b>
      </p>

      <p>
        In the <codeph>impala-shell</codeph> interpreter, issue a command similar to:
      </p>

<codeblock>create table rcfile_table (<varname>column_specs</varname>) stored as rcfile;</codeblock>

      <p>
        Because Impala can query some kinds of tables that it cannot currently write to, after creating tables of
        certain file formats, you might use the Hive shell to load the data. See
        <xref href="impala_file_formats.xml#file_formats"/> for details. After loading data into a table through
        Hive or another mechanism outside of Impala, issue a <codeph>REFRESH <varname>table_name</varname></codeph>
        statement the next time you connect to the Impala node, before querying the table, so that Impala
        recognizes the new data.
      </p>

      <note type="important">
        See <xref href="impala_known_issues.xml#known_issues"/> for potential compatibility issues with
        RCFile tables created in Hive 0.12, due to a change in the default RCFile SerDe for Hive.
      </note>

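      <p>
        If you are not sure which SerDe an existing RCFile table was created with, one quick way to check
        (a minimal sketch; <codeph>rcfile_table</codeph> is just a placeholder name) is to display the table
        definition from the Hive shell and look at the SerDe library reported in the output:
      </p>

<codeblock>hive> -- The "SerDe Library" field in the output shows which RCFile SerDe the table uses.
hive> DESCRIBE FORMATTED rcfile_table;</codeblock>
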
      <p>
        For example, here is how you might create some RCFile tables in Impala (by specifying the columns
        explicitly, or cloning the structure of another table), load data through Hive, and query them through
        Impala:
      </p>

<codeblock>$ impala-shell -i localhost
[localhost:21000] > create table rcfile_table (x int) stored as rcfile;
[localhost:21000] > create table rcfile_clone like some_other_table stored as rcfile;
[localhost:21000] > quit;

$ hive
hive> insert into table rcfile_table select x from some_other_table;
3 Rows loaded to rcfile_table
Time taken: 19.015 seconds
hive> quit;

$ impala-shell -i localhost
[localhost:21000] > select * from rcfile_table;
Returned 0 row(s) in 0.23s
[localhost:21000] > -- Make Impala recognize the data loaded through Hive;
[localhost:21000] > refresh rcfile_table;
[localhost:21000] > select * from rcfile_table;
+---+
| x |
+---+
| 1 |
| 2 |
| 3 |
+---+
Returned 3 row(s) in 0.23s</codeblock>

      <p conref="../shared/impala_common.xml#common/complex_types_unsupported_filetype"/>

    </conbody>
  </concept>

  <concept id="rcfile_compression">

    <title>Enabling Compression for RCFile Tables</title>
    <prolog>
      <metadata>
        <data name="Category" value="Snappy"/>
        <data name="Category" value="Compression"/>
      </metadata>
    </prolog>

    <conbody>

      <p>
        <indexterm audience="hidden">compression</indexterm>
        You may want to enable compression on existing tables. Enabling compression provides performance gains in
        most cases and is supported for RCFile tables. For example, to enable Snappy compression, you would specify
        the following additional settings when loading data through the Hive shell:
      </p>

<codeblock>hive> SET hive.exec.compress.output=true;
hive> SET mapred.max.split.size=256000000;
hive> SET mapred.output.compression.type=BLOCK;
hive> SET mapred.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec;
hive> INSERT OVERWRITE TABLE <varname>new_table</varname> SELECT * FROM <varname>old_table</varname>;</codeblock>

      <p>
        If you are converting partitioned tables, you must complete additional steps. In such a case, specify
        additional settings similar to the following:
      </p>

<codeblock>hive> CREATE TABLE <varname>new_table</varname> (<varname>your_cols</varname>) PARTITIONED BY (<varname>partition_cols</varname>) STORED AS <varname>new_format</varname>;
hive> SET hive.exec.dynamic.partition.mode=nonstrict;
hive> SET hive.exec.dynamic.partition=true;
hive> INSERT OVERWRITE TABLE <varname>new_table</varname> PARTITION(<varname>comma_separated_partition_cols</varname>) SELECT * FROM <varname>old_table</varname>;</codeblock>

      <p>
        Remember that Hive does not require you to specify a source format for the conversion. Consider the
        case of converting an existing table to a Snappy-compressed RCFile table. Combining the components
        outlined previously to complete this table conversion, you would specify settings similar to the
        following:
      </p>

<codeblock>hive> CREATE TABLE tbl_rc (int_col INT, string_col STRING) STORED AS RCFILE;
hive> SET hive.exec.compress.output=true;
hive> SET mapred.max.split.size=256000000;
hive> SET mapred.output.compression.type=BLOCK;
hive> SET mapred.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec;
hive> SET hive.exec.dynamic.partition.mode=nonstrict;
hive> SET hive.exec.dynamic.partition=true;
hive> INSERT OVERWRITE TABLE tbl_rc SELECT * FROM tbl;</codeblock>

      <p>
        To complete a similar process for a table that includes partitions, you would specify settings similar to
        the following:
      </p>

<codeblock>hive> CREATE TABLE tbl_rc (int_col INT, string_col STRING) PARTITIONED BY (year INT) STORED AS RCFILE;
hive> SET hive.exec.compress.output=true;
hive> SET mapred.max.split.size=256000000;
hive> SET mapred.output.compression.type=BLOCK;
hive> SET mapred.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec;
hive> SET hive.exec.dynamic.partition.mode=nonstrict;
hive> SET hive.exec.dynamic.partition=true;
hive> INSERT OVERWRITE TABLE tbl_rc PARTITION(year) SELECT * FROM tbl;</codeblock>

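      <p>
        After converting data through Hive, you can confirm from Impala that the new RCFile data is visible.
        The following is a minimal sketch, assuming the <codeph>tbl_rc</codeph> table from the preceding
        examples; because the table was created through Hive rather than Impala, it uses
        <codeph>INVALIDATE METADATA</codeph> before querying:
      </p>

<codeblock>$ impala-shell -i localhost
[localhost:21000] > -- The table was created through Hive, so make Impala aware of it first.
[localhost:21000] > invalidate metadata tbl_rc;
[localhost:21000] > show table stats tbl_rc;
[localhost:21000] > select count(*) from tbl_rc;</codeblock>
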
      <note>
        <p>
          The compression codec is specified in the following command:
        </p>
<codeblock>SET mapred.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec;</codeblock>
        <p>
          You could elect to specify alternative codecs such as <codeph>GzipCodec</codeph> here.
        </p>
      </note>

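      <p>
        For example, to use gzip compression instead of Snappy, you would point the same setting at the
        Gzip codec class (a minimal sketch; no other conversion steps change):
      </p>

<codeblock>hive> SET mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec;</codeblock>
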
    </conbody>
  </concept>

  <concept id="rcfile_performance">

    <title>Query Performance for Impala RCFile Tables</title>

    <conbody>

      <p>
        In general, expect query performance with RCFile tables to be
        faster than with tables using text data, but slower than with
        Parquet tables. See <xref href="impala_parquet.xml#parquet"/>
        for information about using the Parquet file format for
        high-performance analytic queries.
      </p>

      <p conref="../shared/impala_common.xml#common/s3_block_splitting"/>

    </conbody>
  </concept>

  <concept audience="hidden" id="rcfile_data_types">

    <title>Data Type Considerations for RCFile Tables</title>

    <conbody>

      <p></p>
    </conbody>
  </concept>

</concept>