<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
<concept id="txtfile">

<title>Using Text Data Files with Impala Tables</title>
<titlealts audience="PDF"><navtitle>Text Data Files</navtitle></titlealts>
<prolog>
<metadata>
<data name="Category" value="Impala"/>
<data name="Category" value="File Formats"/>
<data name="Category" value="Tables"/>
<data name="Category" value="Developers"/>
<data name="Category" value="Data Analysts"/>
</metadata>
</prolog>

<conbody>

<p> Impala supports using text files as the storage format for input and
output. Text files are a convenient format to use for interchange with
other applications or scripts that produce or read delimited text files,
such as CSV or TSV with commas or tabs for delimiters. </p>

<p>
Text files are also very flexible in their column definitions. For example, a text file could have more
fields than the Impala table, and those extra fields are ignored during queries; or it could have fewer
fields than the Impala table, and those missing fields are treated as <codeph>NULL</codeph> values in
queries. You could have fields that were treated as numbers or timestamps in a table, then use <codeph>ALTER
TABLE ... REPLACE COLUMNS</codeph> to switch them to strings, or the reverse.
</p>
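
<p>
For example, the following is a minimal sketch of switching a column between numeric and string
interpretations, assuming a hypothetical table <codeph>t1</codeph>:
</p>

<codeblock>-- Original table stores the third field as a number.
create table t1 (id int, s string, n int);
-- Reinterpret the same data file field as a string.
alter table t1 replace columns (id int, s string, n string);
-- Switch back to treating it as a number.
alter table t1 replace columns (id int, s string, n int);</codeblock>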

<table>
<title>Text Format Support in Impala</title>
<tgroup cols="5">
<colspec colname="1" colwidth="10*"/>
<colspec colname="2" colwidth="10*"/>
<colspec colname="3" colwidth="20*"/>
<colspec colname="4" colwidth="30*"/>
<colspec colname="5" colwidth="30*"/>
<thead>
<row>
<entry>
File Type
</entry>
<entry>
Format
</entry>
<entry>
Compression Codecs
</entry>
<entry>
Impala Can CREATE?
</entry>
<entry>
Impala Can INSERT?
</entry>
</row>
</thead>
<tbody>
<row conref="impala_file_formats.xml#file_formats/txtfile_support">
<entry/>
</row>
</tbody>
</tgroup>
</table>

<p outputclass="toc inpage"/>

</conbody>

<concept id="text_performance">

<title>Query Performance for Impala Text Tables</title>
<prolog>
<metadata>
<data name="Category" value="Performance"/>
</metadata>
</prolog>

<conbody>

<p>
Data stored in text format is relatively bulky, and not as efficient to query as binary formats such as
Parquet. You typically use text tables with Impala if that is the format in which you receive the data and
you do not have control over that process, or if you are a relatively new Hadoop user and not familiar with
techniques to generate files in other formats. (Because the default format for <codeph>CREATE
TABLE</codeph> is text, you might create your first Impala tables as text without giving performance much
thought.) Either way, look for opportunities to use more efficient file formats for the tables used in your
most performance-critical queries.
</p>

<p>
For frequently queried data, you might load the original text data files into one Impala table, then use an
<codeph>INSERT</codeph> statement to transfer the data to another table that uses the Parquet file format;
the data is converted automatically as it is stored in the destination table.
</p>
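
<p>
For example, a minimal sketch of that conversion, using hypothetical table names:
</p>

<codeblock>-- Text table receiving the raw data files.
create table raw_text (id int, s string);
-- Parquet table for the performance-critical queries.
create table compact_parquet (id int, s string) stored as parquet;
-- The data is converted to Parquet as it is stored in the destination table.
insert into compact_parquet select * from raw_text;</codeblock>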

<p>
For more compact data, consider using LZO compression for the text files. LZO is the only splittable
compression codec that Impala supports for text data: the <q>splittable</q> nature of LZO data files lets
different nodes work on different parts of the same file in parallel. See
<xref href="impala_txtfile.xml#lzo"/> for details.
</p>

<p rev="2.0.0">
You can also use text data compressed in the bzip2, deflate, gzip, Snappy, or
zstd formats. Because these compressed formats are not <q>splittable</q> in the way that LZO
is, there is less opportunity for Impala to parallelize queries on them. Therefore, use
these types of compressed data only for convenience if that is the format in which you
receive the data. Prefer to use LZO compression for text data if you have the choice, or
convert the data to Parquet using an <codeph>INSERT ... SELECT</codeph> statement to copy
the original data into a Parquet table. </p>

<note rev="2.2.0">
<p>
Impala supports bzip files created by the <codeph>bzip2</codeph> command, but not bzip files with
multiple streams created by the <codeph>pbzip2</codeph> command. Impala decodes only the data from the
first part of such files, leading to incomplete results.
</p>
</note>

<p>
The maximum size that Impala can accommodate for an individual bzip file is 1 GB (after uncompression).
</p>
<p>
Impala supports zstd files created by the zstd command line tool.
</p>

<p conref="../shared/impala_common.xml#common/s3_block_splitting"/>

</conbody>

</concept>

<concept id="text_ddl">

<title>Creating Text Tables</title>

<conbody>

<p>
<b>To create a table using text data files:</b>
</p>

<p>
If the exact format of the text data files (such as the delimiter character) is not significant, use the
<codeph>CREATE TABLE</codeph> statement with no extra clauses at the end to create a text-format table. For
example:
</p>

<codeblock>create table my_table(id int, s string, n int, t timestamp, b boolean);
</codeblock>

<p>
The data files created by any <codeph>INSERT</codeph> statements will use the Ctrl-A character (hex 01) as
a separator between each column value.
</p>

<p>
A common use case is to import existing text files into an Impala table. The syntax is more verbose; the
significant part is the <codeph>FIELDS TERMINATED BY</codeph> clause, which must be preceded by the
<codeph>ROW FORMAT DELIMITED</codeph> clause. The statement can end with a <codeph>STORED AS
TEXTFILE</codeph> clause, but that clause is optional because text format tables are the default. For
example:
</p>

<codeblock>create table csv(id int, s string, n int, t timestamp, b boolean)
row format delimited
<ph id="csv">fields terminated by ',';</ph>

create table tsv(id int, s string, n int, t timestamp, b boolean)
row format delimited
<ph id="tsv">fields terminated by '\t';</ph>

create table pipe_separated(id int, s string, n int, t timestamp, b boolean)
row format delimited
<ph id="psv">fields terminated by '|'</ph>
stored as textfile;
</codeblock>

<p>
You can create tables with specific separator characters to import text files in familiar formats such as
CSV, TSV, or pipe-separated. You can also use these tables to produce output data files, by copying data
into them through the <codeph>INSERT ... SELECT</codeph> syntax and then extracting the data files from the
Impala data directory.
</p>
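
<p>
For example, a minimal sketch of producing CSV output files, assuming the <codeph>csv</codeph> table above
and a hypothetical source table <codeph>other_table</codeph> with compatible columns:
</p>

<codeblock>-- Copy the data; the nodes writing output produce comma-delimited data files.
insert into csv select id, s, n, t, b from other_table;
-- Find the HDFS location of those data files under "Location:" in the output.
describe formatted csv;</codeblock>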

<p rev="1.3.1">
In Impala 1.3.1 and higher, you can specify a delimiter character <codeph>'\</codeph><codeph>0'</codeph> to
use the ASCII 0 (<codeph>nul</codeph>) character for text tables:
</p>

<codeblock rev="1.3.1">create table nul_separated(id int, s string, n int, t timestamp, b boolean)
row format delimited
fields terminated by '\0'
stored as textfile;
</codeblock>

<note>
<p>
Do not surround string values with quotation marks in text data files that you construct. If you need to
include the separator character inside a field value, for example to put a string value with a comma
inside a CSV-format data file, specify an escape character on the <codeph>CREATE TABLE</codeph> statement
with the <codeph>ESCAPED BY</codeph> clause, and insert that character immediately before any separator
characters that need escaping.
</p>
</note>
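
<p>
For example, a minimal sketch of the <codeph>ESCAPED BY</codeph> clause, with a hypothetical table name;
a backslash before a comma in the data file keeps it from being treated as a field separator:
</p>

<codeblock>create table csv_escaped (id int, s string)
row format delimited fields terminated by ','
escaped by '\\'
stored as textfile;
-- In the data file, a field value written as:  Doe\, John
-- is read as the single value:                 Doe, John</codeblock>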

<p>
Issue a <codeph>DESCRIBE FORMATTED <varname>table_name</varname></codeph> statement to see the details of
how each table is represented internally in Impala.
</p>

<p conref="../shared/impala_common.xml#common/complex_types_unsupported_filetype"/>

</conbody>

</concept>

<concept id="text_data_files">

<title>Data Files for Text Tables</title>

<conbody>

<p>
When Impala queries a table with data in text format, it consults all the data files in the data directory
for that table, with some exceptions:
</p>

<ul rev="2.2.0">
<li>
<p>
Impala ignores any hidden files, that is, files whose names start with a dot or an underscore.
</p>
</li>

<li>
<p conref="../shared/impala_common.xml#common/ignore_file_extensions"/>
</li>

<li>
<p> Impala uses suffixes to recognize when text data files are compressed text. For Impala
to recognize the compressed text files, they must have the appropriate file extension
corresponding to the compression codec, either <codeph>.bz2</codeph>, <codeph>.deflate</codeph>,
<codeph>.gz</codeph>, <codeph>.snappy</codeph>, or <codeph>.zst</codeph>.
The extensions can be in uppercase or lowercase. </p>
</li>

<li> Otherwise, the file names are not significant. When you put files
into an HDFS directory through ETL jobs, or point Impala to an
existing HDFS directory with the <codeph>CREATE EXTERNAL
TABLE</codeph> statement, or move data files under external control
with the <codeph>LOAD DATA</codeph> statement, Impala preserves the
original file names. </li>
</ul>

<p> Data files produced by Impala <codeph>INSERT</codeph>
statements are given unique names to avoid file name conflicts. </p>

<p>
An <codeph>INSERT ... SELECT</codeph> statement produces one data file from each node that processes the
<codeph>SELECT</codeph> part of the statement. An <codeph>INSERT ... VALUES</codeph> statement produces a
separate data file for each statement; because Impala is more efficient at querying a small number of huge
files than a large number of tiny files, the <codeph>INSERT ... VALUES</codeph> syntax is not recommended
for loading a substantial volume of data. If you find yourself with a table that is inefficient due to too
many small data files, reorganize the data into a few large files by doing <codeph>INSERT ...
SELECT</codeph> to transfer the data to a new table.
</p>
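
<p>
A minimal sketch of that compaction technique, assuming a hypothetical fragmented table:
</p>

<codeblock>-- Rewrite the many tiny files into a few large ones in a new table.
create table compacted like fragmented_table;
insert into compacted select * from fragmented_table;
-- Optionally swap the names once the copy is verified.
alter table fragmented_table rename to fragmented_table_old;
alter table compacted rename to fragmented_table;</codeblock>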

<p>
<b>Special values within text data files:</b>
</p>

<ul>
<li rev="1.4.0">
<p>
Impala recognizes the literal strings <codeph>inf</codeph> for infinity and <codeph>nan</codeph> for
<q>Not a Number</q>, for <codeph>FLOAT</codeph> and <codeph>DOUBLE</codeph> columns.
</p>
</li>

<li>
<p> Impala recognizes the literal string <codeph>\N</codeph> to
represent <codeph>NULL</codeph>. When using Sqoop, specify the
options <codeph>--null-non-string</codeph> and
<codeph>--null-string</codeph> to ensure all <codeph>NULL</codeph>
values are represented correctly in the Sqoop output files.
<codeph>\N</codeph> needs to be escaped as in the following example:
<codeblock>--null-string '\\N' --null-non-string '\\N'</codeblock>
</p>
</li>
<li>
<p>By default, Sqoop writes <codeph>NULL</codeph> values using the
string <codeph>null</codeph>, which causes a conversion error when
such rows are evaluated by Impala. A workaround for existing tables
and data files is to change the table properties through
<codeph>ALTER TABLE <varname>name</varname> SET
TBLPROPERTIES("serialization.null.format"="null")</codeph>, as shown
in the sketch after this list.</p>
</li>

<li>
<p conref="../shared/impala_common.xml#common/skip_header_lines"/>
</li>
</ul>
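
<p>
A minimal sketch of that workaround, assuming a hypothetical table loaded from Sqoop output:
</p>

<codeblock>-- Tell the text table that the string "null" in the data files means NULL.
alter table sqoop_imported set tblproperties("serialization.null.format"="null");
-- Rows that previously caused conversion errors now evaluate as NULL.
select count(*) from sqoop_imported where s is null;</codeblock>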

</conbody>

</concept>

<concept id="text_etl">

<title>Loading Data into Impala Text Tables</title>
<prolog>
<metadata>
<data name="Category" value="ETL"/>
<data name="Category" value="Ingest"/>
</metadata>
</prolog>

<conbody>

<p>
To load an existing text file into an Impala text table, use the <codeph>LOAD DATA</codeph> statement and
specify the path of the file in HDFS. That file is moved into the appropriate Impala data directory.
</p>

<p>
To load multiple existing text files into an Impala text table, use the <codeph>LOAD DATA</codeph>
statement and specify the HDFS path of the directory containing the files. All non-hidden files are moved
into the appropriate Impala data directory.
</p>
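
<p>
For example, minimal sketches of both forms, using hypothetical HDFS paths:
</p>

<codeblock>-- Move a single file into the table's data directory.
load data inpath '/user/etl/staging/data1.csv' into table csv;
-- Move all non-hidden files from a staging directory.
load data inpath '/user/etl/staging_dir' into table csv;</codeblock>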

<p>
To convert data to text from any other file format supported by Impala, use a SQL statement such as:
</p>

<codeblock>-- Text table with default delimiter, the hex 01 character.
CREATE TABLE text_table AS SELECT * FROM other_file_format_table;

-- Text table with user-specified delimiter. Currently, you cannot specify
-- the delimiter as part of CREATE TABLE LIKE or CREATE TABLE AS SELECT.
-- But you can change an existing text table to have a different delimiter.
CREATE TABLE csv LIKE other_file_format_table;
ALTER TABLE csv SET SERDEPROPERTIES ('serialization.format'=',', 'field.delim'=',');
INSERT INTO csv SELECT * FROM other_file_format_table;</codeblock>

<p>
This can be a useful technique to see how Impala represents special values within a text-format data file.
Use the <codeph>DESCRIBE FORMATTED</codeph> statement to see the HDFS directory where the data files are
stored, then use Linux commands such as <codeph>hdfs dfs -ls <varname>hdfs_directory</varname></codeph> and
<codeph>hdfs dfs -cat <varname>hdfs_file</varname></codeph> to display the contents of an Impala-created
text file.
</p>
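
<p>
For example, a sketch of that inspection workflow, with hypothetical paths and file names:
</p>

<codeblock>-- In impala-shell: find the "Location:" line in the output.
describe formatted text_table;

$ hdfs dfs -ls /user/hive/warehouse/text_table
$ hdfs dfs -cat /user/hive/warehouse/text_table/data_file_name</codeblock>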

<p>
To create a few rows in a text table for test purposes, you can use the <codeph>INSERT ... VALUES</codeph>
syntax:
</p>

<codeblock>INSERT INTO <varname>text_table</varname> VALUES ('string_literal',100,hex('hello world'));</codeblock>

<note>
Because Impala and the HDFS infrastructure are optimized for multi-megabyte files, avoid the <codeph>INSERT
... VALUES</codeph> notation when you are inserting many rows. Each <codeph>INSERT ... VALUES</codeph>
statement produces a new tiny file, leading to fragmentation and reduced performance. When creating any
substantial volume of new data, use one of the bulk loading techniques such as <codeph>LOAD DATA</codeph>
or <codeph>INSERT ... SELECT</codeph>. Or, <xref href="impala_hbase.xml#impala_hbase">use an HBase
table</xref> for single-row <codeph>INSERT</codeph> operations, because HBase tables are not subject to the
same fragmentation issues as tables stored on HDFS.
</note>

<p> When you create a text file for use with an Impala text table, specify
<codeph>\N</codeph> to represent a <codeph>NULL</codeph> value. For
the differences between <codeph>NULL</codeph> and empty strings, see
<xref href="impala_literals.xml#null"/>. </p>

<p>
If a text file has fewer fields than the columns in the corresponding Impala table, the columns
corresponding to the missing fields are set to <codeph>NULL</codeph> when the data in that file is read by
an Impala query.
</p>

<p>
If a text file has more fields than the columns in the corresponding Impala table, the extra fields are
ignored when the data in that file is read by an Impala query.
</p>

<p>
You can also use manual HDFS operations such as <codeph>hdfs dfs -put</codeph> or <codeph>hdfs dfs
-cp</codeph> to put data files in the data directory for an Impala table. When you copy or move new data
files into the HDFS directory for the Impala table, issue a <codeph>REFRESH
<varname>table_name</varname></codeph> statement in <cmdname>impala-shell</cmdname> before issuing the next
query against that table, to make Impala recognize the newly added files.
</p>
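
<p>
For example, a sketch of that manual workflow, assuming a hypothetical path and the <codeph>csv</codeph>
table created earlier:
</p>

<codeblock>$ hdfs dfs -put new_batch.csv /user/hive/warehouse/csv/

-- Then, in impala-shell, before the next query against the table:
refresh csv;</codeblock>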

</conbody>

</concept>

<concept id="lzo">

<title>Using LZO-Compressed Text Files</title>
<prolog>
<metadata>
<data name="Category" value="LZO"/>
<data name="Category" value="Compression"/>
</metadata>
</prolog>

<conbody>

<p> Impala supports using text data files that employ LZO compression.
Where practical, apply compression to text data files. Impala queries
are usually I/O-bound; reducing the amount of data read from disk
typically speeds up a query, despite the extra CPU work to uncompress
the data in memory. </p>

<p>
Impala can work with LZO-compressed text files. Such files are preferable to files compressed by other
codecs, because LZO-compressed files are <q>splittable</q>, meaning that different portions of a file can
be uncompressed and processed independently by different nodes.
</p>

<p>
Impala does not currently support writing LZO-compressed text files.
</p>

<p>
Because Impala can query LZO-compressed files but currently cannot write them, you use Hive to do the
initial <codeph>CREATE TABLE</codeph> and load the data, then switch back to Impala to run queries. For
instructions on setting up LZO compression for Hive <codeph>CREATE TABLE</codeph> and
<codeph>INSERT</codeph> statements, see
<xref href="https://cwiki.apache.org/confluence/display/Hive/LanguageManual+LZO" scope="external" format="html">the
LZO page on the Hive wiki</xref>. Once you have created an LZO text table, you can also manually add
LZO-compressed text files to it, produced by the
<xref href="http://www.lzop.org/" scope="external" format="html"><cmdname>lzop</cmdname></xref> command
or similar method.
</p>

<section id="lzo_setup">

<title>Preparing to Use LZO-Compressed Text Files</title>

<p>
Before using LZO-compressed tables in Impala, do the following one-time setup for each machine in the
cluster. Install the necessary packages from either a public repository, a private repository you
establish, or manually downloaded package files. You must do these steps manually, whether or not you
are using cluster management software.
</p>

<ol>
<li>
<b>Prepare your systems to work with LZO by downloading and installing the appropriate libraries:</b>

<p>
Download and install the appropriate file to each machine on which you intend to use LZO with Impala.
</p>
</li>

<li>
<b>Configure Impala to use LZO:</b>
<p>
Use <b>one</b> of the following sets of commands to refresh your package management system's
repository information, install the base LZO support for Hadoop, and install the LZO support for
Impala.
</p>

<note rev="1.2.0">
<p rev="1.2.0">
The name of the Hadoop LZO package changed in the distant past.
Currently, the package name is <codeph>hadoop-lzo</codeph>.
</p>
</note>

<p>
<b>For RHEL/CentOS systems:</b>
</p>
<codeblock>$ sudo yum update
$ sudo yum install hadoop-lzo
$ sudo yum install impala-lzo</codeblock>
<p>
<b>For SUSE systems:</b>
</p>
<codeblock rev="1.2">$ sudo zypper refresh
$ sudo zypper install hadoop-lzo
$ sudo zypper install impala-lzo</codeblock>
<p>
<b>For Debian/Ubuntu systems:</b>
</p>
<codeblock>$ sudo apt-get update
$ sudo apt-get install hadoop-lzo
$ sudo apt-get install impala-lzo</codeblock>
<note>
<p>
The level of the <codeph>impala-lzo</codeph> package is closely tied to the version of Impala
you use. Any time you upgrade Impala, re-do the installation command for
<codeph>impala-lzo</codeph> on each applicable machine to make sure you have the appropriate
version of that package.
</p>
</note>
</li>

<li>
For <codeph>core-site.xml</codeph> on the client <b>and</b> server (that is, in the configuration
directories for both Impala and Hadoop), append <codeph>com.hadoop.compression.lzo.LzopCodec</codeph>
to the comma-separated list of codecs. For example:
<codeblock><property>
<name>io.compression.codecs</name>
<value>org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.GzipCodec,
org.apache.hadoop.io.compress.BZip2Codec,org.apache.hadoop.io.compress.DeflateCodec,
org.apache.hadoop.io.compress.SnappyCodec,com.hadoop.compression.lzo.LzopCodec</value>
</property></codeblock>
<note>
<p>
If this is the first time you have edited the Hadoop <filepath>core-site.xml</filepath> file, note
that the <filepath>/etc/hadoop/conf</filepath> directory is typically a symbolic link, so the
canonical <filepath>core-site.xml</filepath> might reside in a different directory:
</p>
<codeblock>$ ls -l /etc/hadoop
total 8
lrwxrwxrwx. 1 root root   29 Feb 26  2013 conf -> /etc/alternatives/hadoop-conf
lrwxrwxrwx. 1 root root   10 Feb 26  2013 conf.dist -> conf.empty
drwxr-xr-x. 2 root root 4096 Feb 26  2013 conf.empty
drwxr-xr-x. 2 root root 4096 Oct 28 15:46 conf.pseudo</codeblock>
<p>
If the <codeph>io.compression.codecs</codeph> property is missing from
<filepath>core-site.xml</filepath>, only add <codeph>com.hadoop.compression.lzo.LzopCodec</codeph>
to the new property value, not all the names from the preceding example.
</p>
</note>
</li>

<li>
Restart the MapReduce and Impala services.
</li>
</ol>

</section>

<section id="lzo_create_table">

<title>Creating LZO Compressed Text Tables</title>

<p>
A table containing LZO-compressed text files must be created in Hive with the following storage clause:
</p>

<codeblock>STORED AS
INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'</codeblock>

<p>
Also, certain Hive settings need to be in effect. For example:
</p>

<codeblock>hive> SET mapreduce.output.fileoutputformat.compress=true;
hive> SET hive.exec.compress.output=true;
hive> SET mapreduce.output.fileoutputformat.compress.codec=com.hadoop.compression.lzo.LzopCodec;
hive> CREATE TABLE lzo_t (s string) STORED AS
  > INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
  > OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat';
hive> INSERT INTO TABLE lzo_t SELECT col1 FROM uncompressed_text_table;</codeblock>

<p>
Once you have created LZO-compressed text tables, you can convert data stored in other tables (regardless
of file format) by using the <codeph>INSERT ... SELECT</codeph> statement in Hive.
</p>

<p>
Files in an LZO-compressed table must use the <codeph>.lzo</codeph> extension. Examine the files in the
HDFS data directory after doing the <codeph>INSERT</codeph> in Hive, to make sure the files have the
right extension. If the required settings are not in place, you end up with regular uncompressed files,
and Impala cannot access the table because it finds data files with the wrong (uncompressed) format.
</p>

<p>
After loading data into an LZO-compressed text table, index the files so that they can be split. You
index the files by running a Java class,
<codeph>com.hadoop.compression.lzo.DistributedLzoIndexer</codeph>, through the Linux command line. This
Java class is included in the <codeph>hadoop-lzo</codeph> package.
</p>

<p>
Run the indexer using a command like the following:
</p>

<codeblock>$ hadoop jar /usr/lib/hadoop/lib/hadoop-lzo-<varname>version</varname>-gplextras.jar
  com.hadoop.compression.lzo.DistributedLzoIndexer /<varname>hdfs_location_of_table</varname>/</codeblock>

<note>
If the path of the JAR file in the preceding example is not recognized, do a <cmdname>find</cmdname>
command to locate <filepath>hadoop-lzo-*-gplextras.jar</filepath> and use that path.
</note>

<p>
Index files have the same name as the file they index, with the <codeph>.index</codeph> extension. If
the data files are not indexed, Impala queries still work, but the queries read the data from remote
DataNodes, which is very inefficient.
</p>

<p>
Once the LZO-compressed tables are created, and data is loaded and indexed, you can query them through
Impala. As always, the first time you start <cmdname>impala-shell</cmdname> after creating a table in
Hive, issue an <codeph>INVALIDATE METADATA</codeph> statement so that Impala recognizes the new table.
(In Impala 1.2 and higher, you only have to run <codeph>INVALIDATE METADATA</codeph> on one node, rather
than on all the Impala nodes.)
</p>

</section>

</conbody>

</concept>

<concept rev="2.0.0" id="gzip">

<title>Using bzip2, deflate, gzip, Snappy, or zstd Text Files</title>
<prolog>
<metadata>
<data name="Category" value="Deflate"/>
<data name="Category" value="Snappy"/>
<data name="Category" value="Gzip"/>
<data name="Category" value="Zstd"/>
<data name="Category" value="Compression"/>
</metadata>
</prolog>

<conbody>

<p> Impala supports using text data files that employ bzip2, deflate, gzip, Snappy, or zstd
compression. These compression types are primarily for convenience within an existing ETL
pipeline rather than maximum performance. Although it requires less I/O to read compressed
text than the equivalent uncompressed text, files compressed by these codecs are not
<q>splittable</q> and therefore cannot take full advantage of the Impala parallel query
capability. Impala can read compressed text files written by Hive.</p>

<p> As each Snappy-compressed file is processed, the node doing the work reads the entire file
into memory and then decompresses it. Therefore, the node must have enough memory to hold
both the compressed and uncompressed data from the text file. The memory required to hold
the uncompressed data is difficult to estimate in advance, potentially causing problems on
systems with low memory limits or with resource management enabled. <ph rev="2.1.0">This
memory overhead is reduced for bzip2-, deflate-, gzip-, and zstd-compressed text files. The
compressed data is decompressed as it is read, rather than all at once.</ph>
</p>

<p> To create a table to hold compressed text, create a text table with no special compression
options. Specify the delimiter and escape character if required, using the <codeph>ROW
FORMAT</codeph> clause. </p>

<p>
Because Impala can query compressed text files but currently cannot write them, produce the compressed text
files outside Impala and use the <codeph>LOAD DATA</codeph> statement or manual HDFS commands to move them
to the appropriate Impala data directory. (Or, you can use <codeph>CREATE EXTERNAL TABLE</codeph> and point
the <codeph>LOCATION</codeph> attribute at a directory containing existing compressed text files.)
</p>

<p>
The following example shows how you can create a regular text table, put different kinds of compressed and
uncompressed files into it, and have Impala automatically recognize and decompress each one based on its
file extension:
</p>

<codeblock>create table csv_compressed (a string, b string, c string)
  row format delimited fields terminated by ",";

insert into csv_compressed values
  ('one - uncompressed', 'two - uncompressed', 'three - uncompressed'),
  ('abc - uncompressed', 'xyz - uncompressed', '123 - uncompressed');
...make equivalent .bz2, .gz, .snappy, and .zst files and load them into same table directory...

select * from csv_compressed;
+--------------------+--------------------+----------------------+
| a                  | b                  | c                    |
+--------------------+--------------------+----------------------+
| one - snappy       | two - snappy       | three - snappy       |
| one - uncompressed | two - uncompressed | three - uncompressed |
| abc - uncompressed | xyz - uncompressed | 123 - uncompressed   |
| one - bz2          | two - bz2          | three - bz2          |
| abc - bz2          | xyz - bz2          | 123 - bz2            |
| one - gzip         | two - gzip         | three - gzip         |
| abc - gzip         | xyz - gzip         | 123 - gzip           |
| one - zstd         | two - zstd         | three - zstd         |
| abc - zstd         | xyz - zstd         | 123 - zstd           |
| one - deflate      | two - deflate      | three - deflate      |
| abc - deflate      | xyz - deflate      | 123 - deflate        |
+--------------------+--------------------+----------------------+

$ hdfs dfs -ls 'hdfs://127.0.0.1:8020/user/hive/warehouse/file_formats.db/csv_compressed/';
...truncated for readability...
 75 hdfs://127.0.0.1:8020/user/hive/warehouse/file_formats.db/csv_compressed/csv_compressed.snappy
 79 hdfs://127.0.0.1:8020/user/hive/warehouse/file_formats.db/csv_compressed/csv_compressed_bz2.csv.bz2
 80 hdfs://127.0.0.1:8020/user/hive/warehouse/file_formats.db/csv_compressed/csv_compressed_gzip.csv.gz
 58 hdfs://127.0.0.1:8020/user/hive/warehouse/file_formats.db/csv_compressed/csv_compressed_zstd.csv.zst
 48 hdfs://127.0.0.1:8020/user/hive/warehouse/file_formats.db/csv_compressed/csv_compressed_deflate.csv.deflate
116 hdfs://127.0.0.1:8020/user/hive/warehouse/file_formats.db/csv_compressed/dd414df64d67d49b_data.0.
</codeblock>

</conbody>

</concept>

</concept>