<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements.  See the NOTICE file
distributed with this work for additional information
regarding copyright ownership.  The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License.  You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied.  See the License for the
specific language governing permissions and limitations
under the License.
-->
<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
<concept id="create_table" outputclass="impala sql_statement">

  <title outputclass="impala_title sql_statement_title">CREATE TABLE Statement</title>

  <titlealts audience="PDF">

    <navtitle>CREATE TABLE</navtitle>

  </titlealts>

  <prolog>
    <metadata>
      <data name="Category" value="Impala"/>
      <data name="Category" value="SQL"/>
      <data name="Category" value="DDL"/>
      <data name="Category" value="Impala Data Types"/>
      <data name="Category" value="Developers"/>
      <data name="Category" value="Data Analysts"/>
      <data name="Category" value="HDFS Caching"/>
      <data name="Category" value="Tables"/>
      <data name="Category" value="Schemas"/>
      <data name="Category" value="S3"/>
      <data name="Category" value="Kudu"/>
    </metadata>
  </prolog>

  <conbody>

    <p>
      Creates a new table and specifies its characteristics. While creating a table, you
      optionally specify aspects such as:
    </p>

    <ul>
      <li>
        Whether the table is internal or external.
      </li>

      <li>
        The columns and associated data types.
      </li>

      <li>
        The columns used for physically partitioning the data.
      </li>

      <li>
        The file format for data files.
      </li>

      <li>
        The HDFS directory where the data files are located.
      </li>
    </ul>

    <p conref="../shared/impala_common.xml#common/syntax_blurb"/>

    <p>
      The general syntax for creating a table and specifying its columns is as follows:
    </p>
    <p>
      <b>Explicit column definitions:</b>
    </p>

<codeblock>CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [<varname>db_name</varname>.]<varname>table_name</varname>
  (<varname>col_name</varname> <varname>data_type</varname>
    [<varname>constraint_specification</varname>]
    [COMMENT '<varname>col_comment</varname>']
    [, ...]
  )
  [PARTITIONED BY (<varname>col_name</varname> <varname>data_type</varname> [COMMENT '<varname>col_comment</varname>'], ...)]
  <ph rev="2.9.0 IMPALA-4166">[SORT BY ([<varname>column</varname> [, <varname>column</varname> ...]])]</ph>
  [COMMENT '<varname>table_comment</varname>']
  [ROW FORMAT <varname>row_format</varname>]
  [WITH SERDEPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)]
  [STORED AS <varname>file_format</varname>]
  [LOCATION '<varname>hdfs_path</varname>']
  <ph rev="1.4.0">[CACHED IN '<varname>pool_name</varname>'</ph> <ph rev="2.2.0">[WITH REPLICATION = <varname>integer</varname>]</ph> | UNCACHED]
  [TBLPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)]
</codeblock>

    <p>
      <b>CREATE TABLE AS SELECT:</b>
    </p>

<codeblock>CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [<varname>db_name</varname>.]<varname>table_name</varname>
  <ph rev="2.5.0">[PARTITIONED BY (<varname>col_name</varname>[, ...])]</ph>
  <ph rev="2.9.0 IMPALA-4166">[SORT BY ([<varname>column</varname> [, <varname>column</varname> ...]])]</ph>
  [COMMENT '<varname>table_comment</varname>']
  [ROW FORMAT <varname>row_format</varname>]
  [WITH SERDEPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)]
  <ph rev="">[STORED AS <varname>ctas_file_format</varname>]</ph>
  [LOCATION '<varname>hdfs_path</varname>']
  <ph rev="1.4.0">[CACHED IN '<varname>pool_name</varname>'</ph> <ph rev="2.2.0">[WITH REPLICATION = <varname>integer</varname>]</ph> | UNCACHED]
  [TBLPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)]
AS
  <varname>select_statement</varname></codeblock>
<codeblock>primitive_type:
    TINYINT
  | SMALLINT
  | INT
  | BIGINT
  | BOOLEAN
  | FLOAT
  | DOUBLE
  <ph rev="1.4.0">| DECIMAL</ph>
  | STRING
  <ph rev="2.0.0">| CHAR</ph>
  <ph rev="2.0.0">| VARCHAR</ph>
  | TIMESTAMP

<ph rev="2.3.0">complex_type:
    struct_type
  | array_type
  | map_type

struct_type: STRUCT < <varname>name</varname> : <varname>primitive_or_complex_type</varname> [COMMENT '<varname>comment_string</varname>'], ... >

array_type: ARRAY < <varname>primitive_or_complex_type</varname> >

map_type: MAP < <varname>primitive_type</varname>, <varname>primitive_or_complex_type</varname> >
</ph>

constraint_specification:
  PRIMARY KEY (<varname>col_name</varname>, ...) [DISABLE] [NOVALIDATE] [RELY], [<varname>foreign_key_specification</varname>, ...]

foreign_key_specification:
  FOREIGN KEY (<varname>col_name</varname>, ...) REFERENCES table_name(<varname>col_name</varname>, ...) [DISABLE] [NOVALIDATE] [RELY]

row_format:
  DELIMITED [FIELDS TERMINATED BY '<varname>char</varname>' [ESCAPED BY '<varname>char</varname>']]
  [LINES TERMINATED BY '<varname>char</varname>']

file_format:
    PARQUET
  | TEXTFILE
  | AVRO
  | SEQUENCEFILE
  | RCFILE

<ph rev="">ctas_file_format:
    PARQUET
  | TEXTFILE</ph>
</codeblock>
    <p>
      <b>Column definitions inferred from data file:</b>
    </p>

<codeblock>CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [<varname>db_name</varname>.]<varname>table_name</varname>
  LIKE PARQUET '<varname>hdfs_path_of_parquet_file</varname>'
  [PARTITIONED BY (<varname>col_name</varname> <varname>data_type</varname> [COMMENT '<varname>col_comment</varname>'], ...)]
  <ph rev="2.9.0 IMPALA-4166">[SORT BY ([<varname>column</varname> [, <varname>column</varname> ...]])]</ph>
  [COMMENT '<varname>table_comment</varname>']
  [ROW FORMAT <varname>row_format</varname>]
  [WITH SERDEPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)]
  [STORED AS <varname>file_format</varname>]
  [LOCATION '<varname>hdfs_path</varname>']
  <ph rev="1.4.0">[CACHED IN '<varname>pool_name</varname>'</ph> <ph rev="2.2.0">[WITH REPLICATION = <varname>integer</varname>]</ph> | UNCACHED]
  [TBLPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)]

data_type:
    <varname>primitive_type</varname>
  | array_type
  | map_type
  | struct_type
</codeblock>
    <p>
      <b>Internal Kudu tables:</b>
    </p>

<codeblock rev="kudu">CREATE TABLE [IF NOT EXISTS] [<varname>db_name</varname>.]<varname>table_name</varname>
  (<varname>col_name</varname> <varname>data_type</varname>
    <ph rev="kudu IMPALA-3719">[<varname>kudu_column_attribute</varname> ...]</ph>
    [COMMENT '<varname>col_comment</varname>']
    [, ...]
    [PRIMARY KEY (<varname>col_name</varname>[, ...])]
  )
  <ph rev="kudu">[PARTITION BY <varname>kudu_partition_clause</varname>]</ph>
  [COMMENT '<varname>table_comment</varname>']
  STORED AS KUDU
  [TBLPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)]
</codeblock>

    <p rev="kudu IMPALA-3719">
      <b>Kudu column attributes:</b>
      <codeblock rev="kudu">
  PRIMARY KEY
| [NOT] NULL
| ENCODING <varname>codec</varname>
| COMPRESSION <varname>algorithm</varname>
| DEFAULT <varname>constant</varname>
| BLOCK_SIZE <varname>number</varname>
</codeblock>
    </p>
<p rev="kudu IMPALA-3719">
|
|
<b>kudu_partition_clause:</b>
|
|
<codeblock rev="kudu">
|
|
kudu_partition_clause ::= [ <varname>hash_clause</varname> [, ...]] [, <varname>range_clause</varname> ]
|
|
|
|
hash_clause ::=
|
|
HASH [ (<varname>pk_col</varname> [, ...]) ]
|
|
PARTITIONS <varname>n</varname>
|
|
|
|
range_clause ::=
|
|
RANGE [ (<varname>pk_col</varname> [, ...]) ]
|
|
(
|
|
{
|
|
PARTITION <varname>constant_expression</varname> <varname>range_comparison_operator</varname> VALUES <varname>range_comparison_operator</varname> <varname>constant_expression</varname>
|
|
| PARTITION VALUE = <varname>constant_expression_or_tuple</varname>
|
|
}
|
|
[, ...]
|
|
)
|
|
|
|
range_comparison_operator ::= { < | <= }
|
|
</codeblock>
|
|
</p>
|
|
|
|
    <p>
      <b>External Kudu tables:</b>
    </p>

    <p>In Impala 3.4 and earlier, you can create an external Kudu table based on a pre-existing Kudu
      schema using the table property <codeph>'kudu.table_name'='internal_kudu_name'</codeph>. </p>

<codeblock>CREATE EXTERNAL TABLE [IF NOT EXISTS] [<varname>db_name</varname>.]<varname>table_name</varname>
  [COMMENT '<varname>table_comment</varname>']
  STORED AS KUDU
  [TBLPROPERTIES ('kudu.table_name'='<varname>internal_kudu_name</varname>', '<varname>key1</varname>'='<varname>value1</varname>',...)]
</codeblock>

    <p>Alternatively, in Impala 3.4 and higher, you can also create an external Kudu table as follows:</p>
<codeblock rev="kudu">CREATE EXTERNAL TABLE [IF NOT EXISTS] [<varname>db_name</varname>.]<varname>table_name</varname>
|
|
(<varname>col_name</varname> <varname>data_type</varname>
|
|
[<varname>kudu_column_attribute</varname> ...]
|
|
[COMMENT '<varname>col_comment</varname>']
|
|
[, ...]
|
|
[PRIMARY KEY (<varname>col_name</varname>[, ...])]
|
|
)
|
|
[PARTITION BY <varname>kudu_partition_clause</varname>]
|
|
[COMMENT '<varname>table_comment</varname>']
|
|
STORED AS KUDU
|
|
[TBLPROPERTIES ('external.table.purge'='true', '<varname>key1</varname>'='<varname>value1</varname>',...)]
|
|
</codeblock>
|
|
|
|
<ul>
|
|
<li>Use a Hive metastore (HMS) 3 or later.</li>
|
|
<li>Provide column specifications to define the schema when you create the table, similar to
|
|
creating an internal table. </li>
|
|
<li>Omit the <codeph>kudu.table_name</codeph> table property as there is no pre-existing
|
|
schema. </li>
|
|
<li>Include the required <codeph>external.table.purge</codeph> property.</li>
|
|
</ul>
|
|
|
|
<p> Only the schema metadata is stored in HMS when you create an external table; however, using
|
|
this create table syntax, drop table on the Kudu external table deletes the data stored
|
|
outside HMS in Kudu as well as the metadata (schema) inside HMS. Likewise, renaming the table
|
|
changes the name of the table in HMS and in Kudu. Kudu synchronizes changes to the actual data
|
|
and metadata; consequently, operations such as dropping a table or altering a table name
|
|
simulate internal table operations. </p>
|
|
|
|
    <p>
      <b>CREATE TABLE AS SELECT for Kudu tables:</b>
    </p>

<codeblock rev="kudu">CREATE TABLE [IF NOT EXISTS] [<varname>db_name</varname>.]<varname>table_name</varname>
  [PRIMARY KEY (<varname>col_name</varname>[, ...])]
  [PARTITION BY <varname>kudu_partition_clause</varname>]
  [COMMENT '<varname>table_comment</varname>']
  STORED AS KUDU
  [TBLPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)]
AS
  <varname>select_statement</varname></codeblock>
    <p conref="../shared/impala_common.xml#common/ddl_blurb"/>

    <!-- Should really have some info up front about all the data types and file formats.
         Consider adding here, or at least making inline links to the relevant keywords
         in the syntax spec above. -->

    <p>
      <b>Column definitions:</b>
    </p>

    <p>
      Depending on the form of the <codeph>CREATE TABLE</codeph> statement, the column
      definitions are either required or not allowed.
    </p>

    <p>
      With the <codeph>CREATE TABLE AS SELECT</codeph> and <codeph>CREATE TABLE LIKE</codeph>
      syntax, you do not specify the columns at all; the column names and types are derived from
      the source table, query, or data file.
    </p>

    <p> With the basic <codeph>CREATE TABLE</codeph> syntax, you must list one or more columns,
      specifying for each column its name, type, optional constraints, and an optional comment,
      in addition to any columns used as partitioning keys. There is one exception where the
      column list is not required: when creating an Avro table with the <codeph>STORED AS
      AVRO</codeph> clause, you can omit the list of columns and specify the same metadata as
      part of the <codeph>TBLPROPERTIES</codeph> clause. </p>
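    <p>
      For illustration, here is a minimal sketch of that Avro exception; the table and field
      names are hypothetical, and the column definitions come entirely from the Avro schema
      supplied through the <codeph>avro.schema.literal</codeph> table property instead of a
      column list:
    </p>

<codeblock>CREATE TABLE avro_from_schema
  STORED AS AVRO
  TBLPROPERTIES ('avro.schema.literal'='{
    "name": "my_record",
    "type": "record",
    "fields": [
      {"name": "id", "type": "int"},
      {"name": "s",  "type": "string"}
    ]}');
</codeblock>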
    <p rev="3.4.0">
      <b>Constraints:</b>
    </p>

    <p>Constraints are advisory and intended for estimating cardinality during query planning in a
      future release; there is no attempt to enforce constraints. Add primary and foreign key
      information after column definitions. Do not include a constraint name; the constraint name is
      generated internally as a UUID. The following constraint states are supported: <ul id="ul_gbz_3kl_4kb">
        <li>DISABLE</li>
        <li>NOVALIDATE</li>
        <li>RELY</li>
      </ul>The ENABLE, VALIDATE, and NORELY options are not supported. The foreign key must be
      defined as the primary key in the referenced table. </p>

    <p> Constraint examples: <codeblock>CREATE TABLE pk(col1 INT, col2 STRING, PRIMARY KEY(col1, col2));</codeblock>
      <codeblock>CREATE TABLE fk(id INT, col1 INT, col2 STRING, PRIMARY KEY(id),
  FOREIGN KEY(col1, col2) REFERENCES pk(col1, col2));</codeblock>
      <codeblock>CREATE TABLE pk(id INT, PRIMARY KEY(id) DISABLE, NOVALIDATE, RELY);</codeblock>
      <codeblock>CREATE TABLE fk(id INT, col1 INT, col2 STRING, PRIMARY KEY(id),
  FOREIGN KEY(col1, col2) REFERENCES pk(col1, col2));</codeblock>
    </p>
    <p conref="../shared/impala_common.xml#common/complex_types_blurb"/>

    <p rev="2.3.0">
      The Impala complex types (<codeph>STRUCT</codeph>, <codeph>ARRAY</codeph>, or
      <codeph>MAP</codeph>) are available in <keyword keyref="impala23_full"/> and higher.
      Because you can nest these types (for example, to make an array of maps or a struct with
      an array field), these types are also sometimes referred to as nested types. See
      <xref href="impala_complex_types.xml#complex_types"/> for usage details.
    </p>

    <!-- This is kind of an obscure and rare usage scenario. Consider moving all the complex type stuff further down
         after some of the more common clauses. -->

    <p rev="2.3.0">
      Impala can create tables containing complex type columns, with any supported file format.
      Because currently Impala can only query complex type columns in Parquet tables, creating
      tables with complex type columns and other file formats such as text is of limited use.
      For example, you might create a text table including some columns with complex types with
      Impala, and use Hive as part of your ETL pipeline to ingest the nested type data and copy it
      to an identical Parquet table. Or you might create a partitioned table containing complex type
      columns using one file format, and use <codeph>ALTER TABLE</codeph> to change the file
      format of individual partitions to Parquet; Impala can then query only the Parquet-format
      partitions in that table.
    </p>
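    <p>
      For illustration, here is a minimal sketch (with hypothetical table and column names) of a
      Parquet table that uses each of the complex types:
    </p>

<codeblock rev="2.3.0">CREATE TABLE complex_demo
(
  id BIGINT,
  tags ARRAY<STRING>,
  metrics MAP<STRING,BIGINT>,
  address STRUCT<street: STRING, city: STRING, zip: INT>
)
STORED AS PARQUET;
</codeblock>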
    <p conref="../shared/impala_common.xml#common/complex_types_partitioning"/>

    <p>
      <b>Internal and external tables (EXTERNAL and LOCATION clauses):</b>
    </p>

    <p> By default, Impala creates an internal table, where Impala manages the underlying data files
      for the table, and physically deletes the data files when you drop the table. If you specify
      the <codeph>EXTERNAL</codeph> clause, Impala treats the table as an <q>external</q> table,
      where the data files are typically produced outside Impala and queried from their original
      locations in HDFS, and Impala leaves the data files in place when you drop the table. For
      details about internal and external tables, see <xref href="impala_tables.xml#tables"/>. </p>

    <p>
      Typically, for an external table you include a <codeph>LOCATION</codeph> clause to specify
      the path to the HDFS directory where Impala reads and writes files for the table. For
      example, if your data pipeline produces Parquet files in the HDFS directory
      <filepath>/user/etl/destination</filepath>, you might create an external table as follows:
    </p>

<codeblock>CREATE EXTERNAL TABLE external_parquet (c1 INT, c2 STRING, c3 TIMESTAMP)
  STORED AS PARQUET LOCATION '/user/etl/destination';
</codeblock>

    <p>
      Although the <codeph>EXTERNAL</codeph> and <codeph>LOCATION</codeph> clauses are often
      specified together, <codeph>LOCATION</codeph> is optional for external tables, and you can
      also specify <codeph>LOCATION</codeph> for internal tables. The difference is all about
      whether Impala <q>takes control</q> of the underlying data files and moves them when you
      rename the table, or deletes them when you drop the table. For more about internal and
      external tables and how they interact with the <codeph>LOCATION</codeph> attribute, see
      <xref href="impala_tables.xml#tables"/>.
    </p>

    <p>
      <b>Partitioned tables (PARTITIONED BY clause):</b>
    </p>

    <p>
      The <codeph>PARTITIONED BY</codeph> clause divides the data files based on the values from
      one or more specified columns. Impala queries can use the partition metadata to minimize
      the amount of data that is read from disk or transmitted across the network, particularly
      during join queries. For details about partitioning, see
      <xref href="impala_partitioning.xml#partitioning"/>.
    </p>
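    <p>
      For illustration, here is a minimal sketch (hypothetical table and column names) of a
      partitioned Parquet table; each distinct combination of year and month values gets its own
      HDFS subdirectory of data files:
    </p>

<codeblock>CREATE TABLE log_events (msg STRING, severity INT)
  PARTITIONED BY (year SMALLINT, month TINYINT)
  STORED AS PARQUET;
</codeblock>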
    <note rev="IMPALA-3719">
      <p>
        All Kudu tables require partitioning, which involves different syntax than non-Kudu
        tables. See the <codeph>PARTITION BY</codeph> clause, rather than <codeph>PARTITIONED
        BY</codeph>, for Kudu tables.
      </p>

      <p rev="IMPALA-5546">
        In <keyword keyref="impala210_full"/> and higher, the <codeph>PARTITION BY</codeph>
        clause is optional for Kudu tables. If the clause is omitted, Impala automatically
        constructs a single partition that is not connected to any column. Because such a table
        cannot take advantage of Kudu features for parallelized queries and query optimizations,
        omitting the <codeph>PARTITION BY</codeph> clause is only appropriate for small lookup
        tables.
      </p>
    </note>

    <p rev="2.5.0">
      Prior to <keyword keyref="impala25_full"/>, you could use a partitioned table as the
      source and copy data from it, but could not specify any partitioning clauses for the new
      table. In <keyword keyref="impala25_full"/> and higher, you can now use the
      <codeph>PARTITIONED BY</codeph> clause with a <codeph>CREATE TABLE AS SELECT</codeph>
      statement. See the examples under the following discussion of the <codeph>CREATE TABLE AS
      SELECT</codeph> syntax variation.
    </p>

    <p rev="2.9.0 IMPALA-4166">
      <b>Sorted tables (SORT BY clause):</b>
    </p>

    <p rev="2.9.0 IMPALA-4166">
      The optional <codeph>SORT BY</codeph> clause lets you specify zero or more columns that
      are sorted in ascending order in the data files created by each Impala <codeph>INSERT</codeph>
      or <codeph>CREATE TABLE AS SELECT</codeph> operation. There are two orderings to choose
      from: <codeph>LEXICAL</codeph> and <codeph>ZORDER</codeph>. The default ordering is
      <codeph>LEXICAL</codeph>, which can be used for any number of sort columns.
      <codeph>ZORDER</codeph> can only be used when sorting by more than one column.
    </p>
    <p rev="2.9.0 IMPALA-4166">
      Creating data files that are sorted is most useful for Parquet tables, where the
      metadata stored inside each file includes the minimum and maximum values for each
      column in the file. (The statistics apply to each row group within the file;
      for simplicity, Impala writes a single row group in each file.)
      Grouping data values together in relatively narrow ranges within each data file makes it
      possible for Impala to quickly skip over data files that do not contain value ranges
      indicated in the <codeph>WHERE</codeph> clause of a query, and can improve the
      effectiveness of Parquet encoding and compression.
    </p>

    <p rev="2.9.0 IMPALA-4166">
      This clause is not applicable for Kudu tables or HBase tables. Although it works for other
      HDFS file formats besides Parquet, the more efficient layout is most evident with Parquet
      tables, because each Parquet data file includes statistics about the data values in that
      file.
    </p>

    <p rev="2.9.0 IMPALA-4166">
      The <codeph>SORT BY</codeph> columns cannot include any partition key columns for a
      partitioned table, because those column values are not represented in the underlying data
      files.
    </p>

    <p rev="2.9.0 IMPALA-4166">
      Because data files can arrive in Impala tables by mechanisms that do not respect the
      <codeph>SORT BY</codeph> clause, such as <codeph>LOAD DATA</codeph> or ETL tools that
      create HDFS files, Impala does not guarantee or rely on the data being sorted. The sorting
      aspect is only used to create a more efficient layout for Parquet files generated by
      Impala, which helps to optimize the processing of those Parquet files during Impala
      queries. During an <codeph>INSERT</codeph> or <codeph>CREATE TABLE AS SELECT</codeph>
      operation, the sorting occurs when the <codeph>SORT BY</codeph> clause applies to the
      destination table for the data, regardless of whether the source table has a <codeph>SORT
      BY</codeph> clause.
    </p>

    <p rev="2.9.0 IMPALA-4166">
      For example, when creating a table intended to contain census data, you might define sort
      columns such as last name and state. If a data file in this table contains a narrow range
      of last names, for example from <codeph>Smith</codeph> to <codeph>Smythe</codeph>, Impala
      can quickly detect that this data file contains no matches for a <codeph>WHERE</codeph>
      clause such as <codeph>WHERE last_name = 'Jones'</codeph> and avoid reading the entire
      file.
    </p>

<codeblock rev="2.9.0 IMPALA-4166">CREATE TABLE census_data (last_name STRING, first_name STRING, state STRING, address STRING)
  SORT BY LEXICAL (last_name, state)
  STORED AS PARQUET;
</codeblock>

    <p rev="2.9.0 IMPALA-4166">
      Likewise, if an existing table contains data without any sort order, you can reorganize
      the data in a more efficient way by using <codeph>INSERT</codeph> or <codeph>CREATE TABLE
      AS SELECT</codeph> to copy that data into a new table with a <codeph>SORT BY</codeph>
      clause:
    </p>

<codeblock rev="2.9.0 IMPALA-4166">CREATE TABLE sorted_census_data
  SORT BY (last_name, state)
  STORED AS PARQUET
  AS SELECT last_name, first_name, state, address
    FROM unsorted_census_data;
</codeblock>
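    <p>
      As a sketch of the alternative sort order, the same reorganization can use
      <codeph>ZORDER</codeph>, which interleaves the value ranges of the sort columns and so can
      help queries that filter on either column:
    </p>

<codeblock>CREATE TABLE zordered_census_data
  SORT BY ZORDER (last_name, state)
  STORED AS PARQUET
  AS SELECT last_name, first_name, state, address
    FROM unsorted_census_data;
</codeblock>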
    <p rev="2.9.0 IMPALA-4166">
      The metadata for the <codeph>SORT BY</codeph> clause is stored in the
      <codeph>TBLPROPERTIES</codeph> fields for the table. Other SQL engines that can
      interoperate with Impala tables, such as Hive and Spark SQL, do not recognize this
      property when inserting into a table that has a <codeph>SORT BY</codeph> clause.
    </p>

    <p>
      <b>Transactional tables:</b>
    </p>

    <p>
      In Impala 3.3 and higher, when integrated with Hive 3, Impala can create, read, and
      insert into transactional tables.
    </p>

    <p>
      To create a table that supports transactions, use the <codeph>TBLPROPERTIES</codeph>
      clause and set the <codeph>'transactional'</codeph> and
      <codeph>'transactional_properties'</codeph> properties as below. Currently, Impala only
      supports insert-only transactional tables.
      <codeblock>TBLPROPERTIES('transactional'='true', 'transactional_properties'='insert_only')</codeblock>
    </p>
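    <p>
      For example, here is a minimal sketch (hypothetical table name) of creating an insert-only
      transactional table:
    </p>

<codeblock>CREATE TABLE ins_only_txn (id BIGINT, s STRING)
  STORED AS PARQUET
  TBLPROPERTIES ('transactional'='true', 'transactional_properties'='insert_only');
</codeblock>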
    <p>
      When Impala is integrated with Hive 3 and the <codeph>DEFAULT_TRANSACTIONAL_TYPE</codeph>
      query option is set to <codeph>INSERT_ONLY</codeph>, tables are created as insert-only
      transactional tables by default.
    </p>

    <p>
      Transactional tables are not supported for Kudu and HBase.
    </p>

    <p rev="kudu" conref="../shared/impala_common.xml#common/kudu_blurb"/>

    <p rev="kudu">
      Because Kudu tables do not support clauses related to HDFS and S3 data files and
      partitioning mechanisms, the syntax associated with the <codeph>STORED AS KUDU</codeph>
      clause is shown separately in the above syntax descriptions. Kudu tables have their own
      syntax for <codeph>CREATE TABLE</codeph>, <codeph>CREATE EXTERNAL TABLE</codeph>, and
      <codeph>CREATE TABLE AS SELECT</codeph>. <ph rev="IMPALA-2256">Prior to
      <keyword keyref="impala210_full"/>, all internal Kudu tables required a <codeph>PARTITION
      BY</codeph> clause, different than the <codeph>PARTITIONED BY</codeph> clause for
      HDFS-backed tables.</ph>
    </p>
    <p>
      Here are some examples of creating empty Kudu tables:
    </p>

<codeblock>
<ph rev="IMPALA-2256">-- Single partition. Only for <keyword keyref="impala210_full"/> and higher.
-- Only suitable for small lookup tables.
CREATE TABLE kudu_no_partition_by_clause
  (
    id bigint PRIMARY KEY, s STRING, b BOOLEAN
  )
  STORED AS KUDU;</ph>

-- Single-column primary key.
CREATE TABLE kudu_t1 (id BIGINT PRIMARY KEY, s STRING, b BOOLEAN)
  PARTITION BY HASH (id) PARTITIONS 20 STORED AS KUDU;

-- Multi-column primary key.
CREATE TABLE kudu_t2 (id BIGINT, s STRING, b BOOLEAN, PRIMARY KEY (id,s))
  PARTITION BY HASH (s) PARTITIONS 30 STORED AS KUDU;

<![CDATA[-- Meaningful primary key column is good for range partitioning.
CREATE TABLE kudu_t3 (id BIGINT, year INT, s STRING,
    b BOOLEAN, PRIMARY KEY (id,year))
  PARTITION BY HASH (id) PARTITIONS 20,
  RANGE (year) (PARTITION 1980 <= VALUES < 1990,
    PARTITION 1990 <= VALUES < 2000,
    PARTITION VALUE = 2001,
    PARTITION 2001 < VALUES)
  STORED AS KUDU;
]]>
</codeblock>
    <p>
      Here is an example of creating an external Kudu table based on a pre-existing table
      identified by the table property:
    </p>

<codeblock><![CDATA[
-- Inherits column definitions from original table.
-- For tables created through Impala, the kudu.table_name property
-- comes from DESCRIBE FORMATTED output from the original table.
CREATE EXTERNAL TABLE external_t1 STORED AS KUDU
  TBLPROPERTIES ('kudu.table_name'='kudu_tbl_created_via_api');
]]>
</codeblock>
    <p>
      In Impala 3.4 and higher, by default HMS implicitly translates internal Kudu tables to
      external Kudu tables with the <codeph>'external.table.purge'</codeph> property set to true.
      You can explicitly create such external Kudu tables similar to the way you create internal
      Kudu tables. You must set the table property <codeph>'external.table.purge'</codeph> to
      true. Here is an example of creating an external Kudu table:
    </p>

<codeblock>CREATE EXTERNAL TABLE myextkudutbl (
  id int PRIMARY KEY,
  name string)
PARTITION BY HASH PARTITIONS 8
STORED AS KUDU
TBLPROPERTIES ('external.table.purge'='true');
</codeblock>

    <p>
      Operations on the resulting external table keep the table metadata in Impala, HMS, and
      Kudu synchronized. HMS-Kudu integration does not need to be enabled for external table
      synchronization. Such synchronized tables behave similarly to internal tables. For example,
      dropping a table removes the underlying Kudu table data as well as the table metadata in HMS.
    </p>

    <p>
      If you want to drop only the HMS metadata and not drop the Kudu table, you
      can set <codeph>external.table.purge</codeph> to false, as shown in the following example:
    </p>

<codeblock>ALTER TABLE myextkudutbl SET TBLPROPERTIES ('external.table.purge'='false');
</codeblock>
    <p>
      Here is an example of <codeph>CREATE TABLE AS SELECT</codeph> syntax for a Kudu table:
    </p>

<codeblock><![CDATA[
-- The CTAS statement defines the primary key and partitioning scheme.
-- The rest of the column definitions are derived from the select list.
CREATE TABLE ctas_t1
  PRIMARY KEY (id) PARTITION BY HASH (id) PARTITIONS 10
  STORED AS KUDU
  AS SELECT id, s FROM kudu_t1;
]]>
</codeblock>

    <p rev="kudu">
      The following <codeph>CREATE TABLE</codeph> clauses are not supported for Kudu tables:
    </p>
<ul rev="kudu">
|
|
<li>
|
|
<codeph>PARTITIONED BY</codeph> (Kudu tables use the clause <codeph>PARTITION
|
|
BY</codeph> instead)
|
|
</li>
|
|
|
|
<li>
|
|
<codeph>LOCATION</codeph>
|
|
</li>
|
|
|
|
<li>
|
|
<codeph>ROWFORMAT</codeph>
|
|
</li>
|
|
|
|
<li>
|
|
<codeph>CACHED IN | UNCACHED</codeph>
|
|
</li>
|
|
|
|
<li>
|
|
<codeph>WITH SERDEPROPERTIES</codeph>
|
|
</li>
|
|
</ul>
|
|
|
|
    <p rev="IMPALA-3719">
      For more on the <codeph>PRIMARY KEY</codeph> clause, see
      <xref href="impala_kudu.xml#kudu_primary_key"/> and
      <xref href="impala_kudu.xml#kudu_primary_key_attribute"/>.
    </p>

    <p>
      For more on creating a Kudu table with a specific replication factor, see
      <xref href="impala_kudu.xml#kudu_replication_factor"/>.
    </p>

    <p rev="IMPALA-3719">
      For more on the <codeph>NULL</codeph> and <codeph>NOT NULL</codeph> attributes, see
      <xref href="impala_kudu.xml#kudu_not_null_attribute"/>.
    </p>

    <p rev="IMPALA-3719">
      For more on the <codeph>ENCODING</codeph> attribute, see
      <xref href="impala_kudu.xml#kudu_encoding_attribute"/>.
    </p>

    <p rev="IMPALA-3719">
      For more on the <codeph>COMPRESSION</codeph> attribute, see
      <xref href="impala_kudu.xml#kudu_compression_attribute"/>.
    </p>

    <p rev="IMPALA-3719">
      For more on the <codeph>DEFAULT</codeph> attribute, see
      <xref href="impala_kudu.xml#kudu_default_attribute"/>.
    </p>

    <p rev="IMPALA-3719">
      For more on the <codeph>BLOCK_SIZE</codeph> attribute, see
      <xref href="impala_kudu.xml#kudu_block_size_attribute"/>.
    </p>
    <p rev="kudu IMPALA-3719">
      <b>Partitioning for Kudu tables (PARTITION BY clause)</b>
    </p>

    <p rev="kudu IMPALA-3719">
      For Kudu tables, you specify logical partitioning across one or more columns using the
      <codeph>PARTITION BY</codeph> clause. In contrast to partitioning for HDFS-based tables,
      multiple values for a partition key column can be located in the same partition. The
      optional <codeph>HASH</codeph> clause lets you divide one or a set of partition key
      columns into a specified number of buckets. You can use more than one
      <codeph>HASH</codeph> clause, specifying a distinct set of partition key columns for each.
      The optional <codeph>RANGE</codeph> clause further subdivides the partitions, based on a
      set of comparison operations for the partition key columns.
    </p>

    <p rev="kudu IMPALA-3719">
      Here are some examples of the <codeph>PARTITION BY HASH</codeph> syntax:
    </p>
<codeblock rev="kudu IMPALA-3719"><![CDATA[
-- Apply hash function to 1 primary key column.
create table hash_t1 (x bigint, y bigint, s string, primary key (x,y))
  partition by hash (x) partitions 10
  stored as kudu;

-- Apply hash function to a different primary key column.
create table hash_t2 (x bigint, y bigint, s string, primary key (x,y))
  partition by hash (y) partitions 10
  stored as kudu;

-- Apply hash function to both primary key columns.
-- In this case, the total number of partitions is 10.
create table hash_t3 (x bigint, y bigint, s string, primary key (x,y))
  partition by hash (x,y) partitions 10
  stored as kudu;

-- When the column list is omitted, apply hash function to all primary key columns.
create table hash_t4 (x bigint, y bigint, s string, primary key (x,y))
  partition by hash partitions 10
  stored as kudu;

-- Hash the X values independently from the Y values.
-- In this case, the total number of partitions is 10 x 20.
create table hash_t5 (x bigint, y bigint, s string, primary key (x,y))
  partition by hash (x) partitions 10, hash (y) partitions 20
  stored as kudu;
]]>
</codeblock>
    <p rev="kudu IMPALA-3719">
      Here are some examples of the <codeph>PARTITION BY RANGE</codeph> syntax:
    </p>

<codeblock rev="kudu IMPALA-3719"><![CDATA[
-- Create partitions that cover every possible value of X.
-- Ranges that span multiple values use the keyword VALUES between
-- a pair of < and <= comparisons.
create table range_t1 (x bigint, s string, s2 string, primary key (x, s))
  partition by range (x)
  (
    partition 0 <= values <= 49, partition 50 <= values <= 100,
    partition values < 0, partition 100 < values
  )
  stored as kudu;

-- Create partitions that cover some possible values of X.
-- Values outside the covered range(s) are rejected.
-- New range partitions can be added through ALTER TABLE.
create table range_t2 (x bigint, s string, s2 string, primary key (x, s))
  partition by range (x)
  (
    partition 0 <= values <= 49, partition 50 <= values <= 100
  )
  stored as kudu;

-- A range can also specify a single specific value, using the keyword VALUE
-- with an = comparison.
create table range_t3 (x bigint, s string, s2 string, primary key (x, s))
  partition by range (s)
  (
    partition value = 'Yes', partition value = 'No', partition value = 'Maybe'
  )
  stored as kudu;

-- Using multiple columns in the RANGE clause and tuples inside the partition spec
-- only works for partitions specified with the VALUE= syntax.
create table range_t4 (x bigint, s string, s2 string, primary key (x, s))
  partition by range (x,s)
  (
    partition value = (0,'zero'), partition value = (1,'one'), partition value = (2,'two')
  )
  stored as kudu;
]]>
</codeblock>
    <p rev="kudu IMPALA-3719">
      Here are some examples combining both <codeph>HASH</codeph> and <codeph>RANGE</codeph>
      syntax for the <codeph>PARTITION BY</codeph> clause:
    </p>

<codeblock rev="kudu IMPALA-3719"><![CDATA[
-- Values from each range partition are hashed into 10 associated buckets.
-- Total number of partitions in this case is 10 x 2.
create table combined_t1 (x bigint, s string, s2 string, primary key (x, s))
  partition by hash (x) partitions 10, range (x)
  (
    partition 0 <= values <= 49, partition 50 <= values <= 100
  )
  stored as kudu;

-- The hash partitioning and range partitioning can apply to different columns.
-- But all the columns used in either partitioning scheme must be from the primary key.
create table combined_t2 (x bigint, s string, s2 string, primary key (x, s))
  partition by hash (s) partitions 10, range (x)
  (
    partition 0 <= values <= 49, partition 50 <= values <= 100
  )
  stored as kudu;
]]>
</codeblock>
    <p rev="kudu IMPALA-3719">
      For more usage details and examples of the Kudu partitioning syntax, see
      <xref keyref="impala_kudu"/>.
    </p>

    <p>
      <b>Specifying file format (STORED AS and ROW FORMAT clauses):</b>
    </p>

    <p rev="DOCS-1523">
      The <codeph>STORED AS</codeph> clause identifies the format of the underlying data files.
      Currently, Impala can query more types of file formats than it can create or insert into.
      Use Hive to perform any create or data load operations that are not currently available in
      Impala. For example, Impala can create an Avro, SequenceFile, or RCFile table but cannot
      insert data into it. There are also Impala-specific procedures for using compression with
      each kind of file format. For details about working with data files of various formats,
      see <xref href="impala_file_formats.xml#file_formats"/>.
    </p>

    <note>
      In Impala 1.4.0 and higher, Impala can create Avro tables, which formerly required doing
      the <codeph>CREATE TABLE</codeph> statement in Hive. See
      <xref href="impala_avro.xml#avro"/> for details and examples.
    </note>
    <p>
      By default (when no <codeph>STORED AS</codeph> clause is specified), data files in Impala
      tables are created as text files with Ctrl-A (hex 01) characters as the delimiter.
      <!-- Verify if ROW FORMAT is entirely ignored outside of text tables, or does it apply somehow to SequenceFile and/or RCFile too? -->
      Specify the <codeph>ROW FORMAT DELIMITED</codeph> clause to produce or ingest data files
      that use a different delimiter character such as tab or <codeph>|</codeph>, or a different
      line end character such as carriage return or newline. When specifying delimiter and line
      end characters with the <codeph>FIELDS TERMINATED BY</codeph> and <codeph>LINES TERMINATED
      BY</codeph> clauses, use <codeph>'\t'</codeph> for tab, <codeph>'\n'</codeph> for newline
      or linefeed, <codeph>'\r'</codeph> for carriage return, and
      <codeph>\</codeph><codeph>0</codeph> for ASCII <codeph>nul</codeph> (hex 00). For more
      examples of text tables, see <xref href="impala_txtfile.xml#txtfile"/>.
    </p>
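    <p>
      For example, here is a minimal sketch (hypothetical table name) of a text table whose data
      files use pipe-delimited fields and newline-terminated lines:
    </p>

<codeblock>CREATE TABLE pipe_delimited (c1 INT, c2 STRING)
  ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
    LINES TERMINATED BY '\n'
  STORED AS TEXTFILE;
</codeblock>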
    <p>
      The <codeph>ESCAPED BY</codeph> clause applies both to text files that you create through
      an <codeph>INSERT</codeph> statement to an Impala <codeph>TEXTFILE</codeph> table, and to
      existing data files that you put into an Impala table directory. (You can ingest existing
      data files either by creating the table with <codeph>CREATE EXTERNAL TABLE ...
      LOCATION</codeph>, the <codeph>LOAD DATA</codeph> statement, or through an HDFS operation
      such as <codeph>hdfs dfs -put <varname>file</varname>
      <varname>hdfs_path</varname></codeph>.) Choose an escape character that is not used
      anywhere else in the file, and put it in front of each instance of the delimiter character
      that occurs within a field value. Surrounding field values with quotation marks does not
      help Impala to parse fields with embedded delimiter characters; the quotation marks are
      considered to be part of the column value. If you want to use <codeph>\</codeph> as the
      escape character, specify the clause in <cmdname>impala-shell</cmdname> as <codeph>ESCAPED
      BY '\\'</codeph>.
    </p>
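    <p>
      Continuing the sketch above, this variation (again with a hypothetical table name) uses
      backslash as the escape character so that a literal <codeph>|</codeph> can appear inside a
      field value:
    </p>

<codeblock>CREATE TABLE pipe_delimited_escaped (c1 STRING, c2 STRING)
  ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' ESCAPED BY '\\'
  STORED AS TEXTFILE;
</codeblock>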
    <note conref="../shared/impala_common.xml#common/thorn"/>

    <p>
      <b>Cloning tables (LIKE clause):</b>
    </p>

    <p>
      To create an empty table with the same columns, comments, and other attributes as another
      table, use the following variation. The <codeph>CREATE TABLE ... LIKE</codeph> form allows
      a restricted set of clauses, currently only the <codeph>LOCATION</codeph>,
      <codeph>COMMENT</codeph>, and <codeph>STORED AS</codeph> clauses.
    </p>

<codeblock>CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [<varname>db_name</varname>.]<varname>table_name</varname>
  <ph rev="1.4.0">LIKE { [<varname>db_name</varname>.]<varname>table_name</varname> | PARQUET '<varname>hdfs_path_of_parquet_file</varname>' }</ph>
  [COMMENT '<varname>table_comment</varname>']
  [STORED AS <varname>file_format</varname>]
  [LOCATION '<varname>hdfs_path</varname>']</codeblock>
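    <p>
      For example, the following sketch clones the layout of the <codeph>census_data</codeph>
      table shown earlier into a new, empty table; because <codeph>LIKE</codeph> preserves the
      original file format, no <codeph>STORED AS</codeph> clause is needed here:
    </p>

<codeblock>CREATE TABLE census_data_copy LIKE census_data;
</codeblock>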
    <note rev="1.2.0">
      <p rev="1.2.0">
        To clone the structure of a table and transfer data into it in a single operation, use
        the <codeph>CREATE TABLE AS SELECT</codeph> syntax described in the next subsection.
      </p>
    </note>

    <p>
      When you clone the structure of an existing table using the <codeph>CREATE TABLE ...
      LIKE</codeph> syntax, the new table keeps the same file format as the original one, so you
      only need to specify the <codeph>STORED AS</codeph> clause if you want to use a different
      file format, or when specifying a view as the original table. (Creating a table
      <q>like</q> a view produces a text table by default.)
    </p>

    <p>
      Although normally Impala cannot create an HBase table directly, Impala can clone the
      structure of an existing HBase table with the <codeph>CREATE TABLE ... LIKE</codeph>
      syntax, preserving the file format and metadata from the original table.
    </p>

    <p>
      There are some exceptions to the ability to use <codeph>CREATE TABLE ... LIKE</codeph>
      with an Avro table. For example, you cannot use this technique for an Avro table that is
      specified with an Avro schema but no columns. When in doubt, check if a <codeph>CREATE
      TABLE ... LIKE</codeph> operation works in Hive; if not, it typically will not work in
      Impala either.
    </p>

    <p>
      If the original table is partitioned, the new table inherits the same partition key
      columns. Because the new table is initially empty, it does not inherit the actual
      partitions that exist in the original one. To create partitions in the new table, insert
      data or issue <codeph>ALTER TABLE ... ADD PARTITION</codeph> statements.
    </p>

    <p conref="../shared/impala_common.xml#common/create_table_like_view"/>

    <p>
      Because <codeph>CREATE TABLE ... LIKE</codeph> only manipulates table metadata, not the
      physical data of the table, issue <codeph>INSERT INTO TABLE</codeph> statements afterward
      to copy any data from the original table into the new one, optionally converting the data
      to a new file format. (For some file formats, Impala can do a <codeph>CREATE TABLE ...
      LIKE</codeph> to create the table, but Impala cannot insert data in that file format; in
      these cases, you must load the data in Hive. See
      <xref href="impala_file_formats.xml#file_formats"/> for details.)
    </p>
<p rev="1.2" id="ctas">
|
|
<b>CREATE TABLE AS SELECT:</b>
|
|
</p>
|
|
|
|
<p>
|
|
The <codeph>CREATE TABLE AS SELECT</codeph> syntax is a shorthand notation to create a
|
|
table based on column definitions from another table, and copy data from the source table
|
|
to the destination table without issuing any separate <codeph>INSERT</codeph> statement.
|
|
This idiom is so popular that it has its own acronym, <q>CTAS</q>.
|
|
</p>
|
|
|
|
<p>
|
|
The following examples show how to copy data from a source table <codeph>T1</codeph> to a
|
|
variety of destinations tables, applying various transformations to the table properties,
|
|
table layout, or the data itself as part of the operation:
|
|
</p>
|
|
|
|
<codeblock>
|
|
-- Sample table to be the source of CTAS operations.
|
|
CREATE TABLE t1 (x INT, y STRING);
|
|
INSERT INTO t1 VALUES (1, 'one'), (2, 'two'), (3, 'three');
|
|
|
|
-- Clone all the columns and data from one table to another.
|
|
CREATE TABLE clone_of_t1 AS SELECT * FROM t1;
|
|
+-------------------+
|
|
| summary |
|
|
+-------------------+
|
|
| Inserted 3 row(s) |
|
|
+-------------------+
|
|
|
|
-- Clone the columns and data, and convert the data to a different file format.
|
|
CREATE TABLE parquet_version_of_t1 STORED AS PARQUET AS SELECT * FROM t1;
|
|
+-------------------+
|
|
| summary |
|
|
+-------------------+
|
|
| Inserted 3 row(s) |
|
|
+-------------------+
|
|
|
|
-- Copy only some rows to the new table.
|
|
CREATE TABLE subset_of_t1 AS SELECT * FROM t1 WHERE x >= 2;
|
|
+-------------------+
|
|
| summary |
|
|
+-------------------+
|
|
| Inserted 2 row(s) |
|
|
+-------------------+
|
|
|
|
-- Same idea as CREATE TABLE LIKE: clone table layout but do not copy any data.
|
|
CREATE TABLE empty_clone_of_t1 AS SELECT * FROM t1 WHERE 1=0;
|
|
+-------------------+
|
|
| summary |
|
|
+-------------------+
|
|
| Inserted 0 row(s) |
|
|
+-------------------+
|
|
|
|
-- Reorder and rename columns and transform the data.
|
|
CREATE TABLE t5 AS SELECT upper(y) AS s, x+1 AS a, 'Entirely new column' AS n FROM t1;
|
|
+-------------------+
|
|
| summary |
|
|
+-------------------+
|
|
| Inserted 3 row(s) |
|
|
+-------------------+
|
|
SELECT * FROM t5;
|
|
+-------+---+---------------------+
|
|
| s | a | n |
|
|
+-------+---+---------------------+
|
|
| ONE | 2 | Entirely new column |
|
|
| TWO | 3 | Entirely new column |
|
|
| THREE | 4 | Entirely new column |
|
|
+-------+---+---------------------+
|
|
</codeblock>
|
|
|
|
    <!-- These are a little heavyweight to get into here. Therefore commenting out.
    Some overlap with the new column-changing examples in the code listing above.
    Create tables with different column order, names, or types than the original.
    CREATE TABLE some_columns_from_t1 AS SELECT c1, c3, c5 FROM t1;
    CREATE TABLE reordered_columns_from_t1 AS SELECT c4, c3, c1, c2 FROM t1;
    CREATE TABLE synthesized_columns AS SELECT upper(c1) AS all_caps, c2+c3 AS total, "California" AS state FROM t1;</codeblock>
    -->

    <!-- CREATE TABLE AS <select> now incorporated up higher in the original syntax diagram. -->

    <p rev="1.2">
      See <xref href="impala_select.xml#select"/> for details about query syntax for the
      <codeph>SELECT</codeph> portion of a <codeph>CREATE TABLE AS SELECT</codeph> statement.
    </p>

    <p rev="1.2">
      The newly created table inherits the column names that you select from the original table,
      which you can override by specifying column aliases in the query. Any column or table
      comments from the original table are not carried over to the new table.
    </p>

    <note rev="DOCS-1523">
      When using the <codeph>STORED AS</codeph> clause with a <codeph>CREATE TABLE AS
      SELECT</codeph> statement, the destination table must be a file format that Impala can
      write to: currently, text or Parquet. You cannot specify an Avro, SequenceFile, or RCFile
      table as the destination table for a CTAS operation.
    </note>
<p rev="2.5.0">
|
|
Prior to <keyword keyref="impala25_full"/> you could use a partitioned table as the source
|
|
and copy data from it, but could not specify any partitioning clauses for the new table.
|
|
In <keyword
|
|
keyref="impala25_full"/> and higher, you can now use the
|
|
<codeph>PARTITIONED BY</codeph> clause with a <codeph>CREATE TABLE AS SELECT</codeph>
|
|
statement. The following example demonstrates how you can copy data from an unpartitioned
|
|
table in a <codeph>CREATE TABLE AS SELECT</codeph> operation, creating a new partitioned
|
|
table in the process. The main syntax consideration is the column order in the
|
|
<codeph>PARTITIONED BY</codeph> clause and the select list: the partition key columns must
|
|
be listed last in the select list, in the same order as in the <codeph>PARTITIONED
|
|
BY</codeph> clause. Therefore, in this case, the column order in the destination table is
|
|
different from the source table. You also only specify the column names in the
|
|
<codeph>PARTITIONED BY</codeph> clause, not the data types or column comments.
|
|
</p>
|
|
|
|
<codeblock rev="2.5.0">
|
|
create table partitions_no (year smallint, month tinyint, s string);
|
|
insert into partitions_no values (2016, 1, 'January 2016'),
|
|
(2016, 2, 'February 2016'), (2016, 3, 'March 2016');
|
|
|
|
-- Prove that the source table is not partitioned.
|
|
show partitions partitions_no;
|
|
ERROR: AnalysisException: Table is not partitioned: ctas_partition_by.partitions_no
|
|
|
|
-- Create new table with partitions based on column values from source table.
|
|
<b>create table partitions_yes partitioned by (year, month)
|
|
as select s, year, month from partitions_no;</b>
|
|
+-------------------+
|
|
| summary |
|
|
+-------------------+
|
|
| Inserted 3 row(s) |
|
|
+-------------------+
|
|
|
|
-- Prove that the destination table is partitioned.
|
|
show partitions partitions_yes;
|
|
+-------+-------+-------+--------+------+...
|
|
| year | month | #Rows | #Files | Size |...
|
|
+-------+-------+-------+--------+------+...
|
|
| 2016 | 1 | -1 | 1 | 13B |...
|
|
| 2016 | 2 | -1 | 1 | 14B |...
|
|
| 2016 | 3 | -1 | 1 | 11B |...
|
|
| Total | | -1 | 3 | 38B |...
|
|
+-------+-------+-------+--------+------+...
|
|
</codeblock>
|
|
|
|
<p rev="2.5.0">
|
|
The most convenient layout for partitioned tables is with all the partition key columns at
|
|
the end. The CTAS <codeph>PARTITIONED BY</codeph> syntax requires that column order in the
|
|
select list, resulting in that same column order in the destination table.
|
|
</p>
|
|
|
|
<codeblock rev="2.5.0">
|
|
describe partitions_no;
|
|
+-------+----------+---------+
|
|
| name | type | comment |
|
|
+-------+----------+---------+
|
|
| year | smallint | |
|
|
| month | tinyint | |
|
|
| s | string | |
|
|
+-------+----------+---------+
|
|
|
|
-- The CTAS operation forced us to put the partition key columns last.
|
|
-- Having those columns last works better with idioms such as SELECT *
|
|
-- for partitioned tables.
|
|
describe partitions_yes;
|
|
+-------+----------+---------+
|
|
| name | type | comment |
|
|
+-------+----------+---------+
|
|
| s | string | |
|
|
| year | smallint | |
|
|
| month | tinyint | |
|
|
+-------+----------+---------+
|
|
</codeblock>
|
|
|
|
<p rev="2.5.0">
|
|
Attempting to use a select list with the partition key columns not at the end results in
|
|
an error due to a column name mismatch:
|
|
</p>
|
|
|
|
<codeblock rev="2.5.0">
|
|
-- We expect this CTAS to fail because non-key column S
|
|
-- comes after key columns YEAR and MONTH in the select list.
|
|
create table partitions_maybe partitioned by (year, month)
|
|
as select year, month, s from partitions_no;
|
|
ERROR: AnalysisException: Partition column name mismatch: year != month
|
|
</codeblock>
|
|
|
|
<p rev="1.2">
|
|
For example, the following statements show how you can clone all the data in a table, or a
|
|
subset of the columns and/or rows, or reorder columns, rename them, or construct them out
|
|
of expressions:
|
|
</p>
|
|
|
|
<p rev="1.2">
|
|
As part of a CTAS operation, you can convert the data to any file format that Impala can
|
|
write (currently, <codeph>TEXTFILE</codeph> and <codeph>PARQUET</codeph>). You cannot
|
|
specify the lower-level properties of a text table, such as the delimiter.
|
|
</p>
|
|
|
|
<p rev="obwl" conref="../shared/impala_common.xml#common/insert_sort_blurb"/>
|
|
|
|
<p rev="1.4.0">
|
|
<b>CREATE TABLE LIKE PARQUET:</b>
|
|
</p>
|
|
|
|
<p rev="1.4.0">
|
|
The variation <codeph>CREATE TABLE ... LIKE PARQUET
|
|
'<varname>hdfs_path_of_parquet_file</varname>'</codeph> lets you skip the column
|
|
definitions of the <codeph>CREATE TABLE</codeph> statement. The column names and data
|
|
types are automatically configured based on the organization of the specified Parquet data
|
|
file, which must already reside in HDFS. You can use a data file located outside the
|
|
Impala database directories, or a file from an existing Impala Parquet table; either way,
|
|
Impala only uses the column definitions from the file and does not use the HDFS location
|
|
for the <codeph>LOCATION</codeph> attribute of the new table. (Although you can also
|
|
specify the enclosing directory with the <codeph>LOCATION</codeph> attribute, to both use
|
|
the same schema as the data file and point the Impala table at the associated directory
|
|
for querying.)
|
|
</p>
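    <p>
      For example, here is a minimal sketch (the file path is hypothetical) that derives the
      column definitions from an existing Parquet data file and stores new data in Parquet
      format as well:
    </p>

<codeblock rev="1.4.0">CREATE TABLE new_from_file
  LIKE PARQUET '/user/etl/destination/sample.parq'
  STORED AS PARQUET;
</codeblock>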
    <p rev="1.4.0">
      The following considerations apply when you use the <codeph>CREATE TABLE LIKE
      PARQUET</codeph> technique:
    </p>

    <ul rev="1.4.0">
      <li>
        Any column comments from the original table are not preserved in the new table. Each
        column in the new table has a comment stating the low-level Parquet field type used to
        deduce the appropriate SQL column type.
      </li>

      <li>
        If you use a data file from a partitioned Impala table, any partition key columns from
        the original table are left out of the new table, because they are represented in HDFS
        directory names rather than stored in the data file. To preserve the partition
        information, repeat the same <codeph>PARTITION</codeph> clause as in the original
        <codeph>CREATE TABLE</codeph> statement.
      </li>

      <li>
        The file format of the new table defaults to text, as with other kinds of <codeph>CREATE
        TABLE</codeph> statements. To make the new table also use Parquet format, include the
        clause <codeph>STORED AS PARQUET</codeph> in the <codeph>CREATE TABLE LIKE
        PARQUET</codeph> statement.
      </li>

      <li>
        If the Parquet data file comes from an existing Impala table, currently, any
        <codeph>TINYINT</codeph> or <codeph>SMALLINT</codeph> columns are turned into
        <codeph>INT</codeph> columns in the new table. Internally, Parquet stores such values as
        32-bit integers.
      </li>

      <li>
        When the destination table uses the Parquet file format, the <codeph>CREATE TABLE AS
        SELECT</codeph> and <codeph>INSERT ... SELECT</codeph> statements always create at least
        one data file, even if the <codeph>SELECT</codeph> part of the statement does not match
        any rows. You can use such an empty Parquet data file as a template for subsequent
        <codeph>CREATE TABLE LIKE PARQUET</codeph> statements.
      </li>
    </ul>

    <p>
      For more details about creating Parquet tables, and examples of the <codeph>CREATE TABLE
      LIKE PARQUET</codeph> syntax, see <xref href="impala_parquet.xml#parquet"/>.
    </p>
    <p>
      <b>Visibility and Metadata (TBLPROPERTIES and WITH SERDEPROPERTIES clauses):</b>
    </p>

    <p rev="1.2">
      You can associate arbitrary items of metadata with a table by specifying the
      <codeph>TBLPROPERTIES</codeph> clause. This clause takes a comma-separated list of
      key-value pairs and stores those items in the metastore database. You can also change the
      table properties later with an <codeph>ALTER TABLE</codeph> statement. You can observe the
      table properties for different delimiter and escape characters using the <codeph>DESCRIBE
      FORMATTED</codeph> command, and change those settings for an existing table with
      <codeph>ALTER TABLE ... SET TBLPROPERTIES</codeph>.
    </p>
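    <p>
      For example, here is a sketch (the property names and values are arbitrary, illustrative
      metadata) of attaching key-value pairs at table creation time and changing one of them
      later:
    </p>

<codeblock>CREATE TABLE tagged (id INT)
  TBLPROPERTIES ('team'='analytics', 'retention_days'='30');

ALTER TABLE tagged SET TBLPROPERTIES ('retention_days'='90');
</codeblock>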
|
|
|
|
<p rev="1.2">
|
|
You can also associate SerDes properties with the table by specifying key-value pairs
|
|
through the <codeph>WITH SERDEPROPERTIES</codeph> clause. This metadata is not used by
|
|
Impala, which has its own built-in serializer and deserializer for the file formats it
|
|
supports. Particular property values might be needed for Hive compatibility with certain
|
|
variations of file formats, particularly Avro.
|
|
</p>
|
|
|
|
<p>
|
|
Some DDL operations that interact with other Hadoop components require specifying
|
|
particular values in the <codeph>SERDEPROPERTIES</codeph> or
|
|
<codeph>TBLPROPERTIES</codeph> fields, such as creating an Avro table or an HBase table.
|
|
(You typically create HBase tables in Hive, because they require additional clauses not
|
|
currently available in Impala.)
|
|
<!-- Haven't got a working example from Lenni, so suppressing this recommendation for now.
|
|
The Avro schema properties can be specified through either
|
|
<codeph>TBLPROPERTIES</codeph> or <codeph>SERDEPROPERTIES</codeph>;
|
|
for best compatibility with future versions of Hive,
|
|
use <codeph>SERDEPROPERTIES</codeph> in this case.
|
|
-->
|
|
</p>

    <p>
      To see the column definitions and column comments for an existing table, for example
      before issuing a <codeph>CREATE TABLE ... LIKE</codeph> or a <codeph>CREATE TABLE ...
      AS SELECT</codeph> statement, issue the statement <codeph>DESCRIBE
      <varname>table_name</varname></codeph>. To see even more detail, such as the location
      of data files and the values for clauses such as <codeph>ROW FORMAT</codeph> and
      <codeph>STORED AS</codeph>, issue the statement <codeph>DESCRIBE FORMATTED
      <varname>table_name</varname></codeph>. <codeph>DESCRIBE FORMATTED</codeph> is also
      needed to see any overall table comment (as opposed to individual column comments).
    </p>
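
    <p>
      For example, assuming a table named <codeph>t1</codeph> already exists:
    </p>

<codeblock>DESCRIBE t1;           -- Column names, types, and column comments.
DESCRIBE FORMATTED t1; -- Also location, ROW FORMAT, STORED AS, table comment, and so on.</codeblock>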

    <p>
      After creating a table, your <cmdname>impala-shell</cmdname> session or another
      <cmdname>impala-shell</cmdname> session connected to the same node can immediately
      query that table. There might be a brief interval (one statestore heartbeat) before
      the table can be queried through a different Impala node. To make the <codeph>CREATE
      TABLE</codeph> statement return only when the table is recognized by all Impala nodes
      in the cluster, enable the <codeph>SYNC_DDL</codeph> query option.
    </p>
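
    <p>
      For example, in <cmdname>impala-shell</cmdname> (the table name is a placeholder):
    </p>

<codeblock>SET SYNC_DDL=1;
-- Returns only after all Impala nodes recognize the new table.
CREATE TABLE new_table (id INT);</codeblock>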

    <p rev="1.4.0">
      <b>HDFS caching (CACHED IN clause):</b>
    </p>

    <p rev="1.4.0">
      If you specify the <codeph>CACHED IN</codeph> clause, any existing or future data
      files in the table directory or the partition subdirectories are designated to be
      loaded into memory with the HDFS caching mechanism. See
      <xref href="impala_perf_hdfs_caching.xml#hdfs_caching"/> for details about using the
      HDFS caching feature.
    </p>
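
    <p>
      A minimal sketch, assuming an HDFS cache pool named <codeph>pool_name</codeph> has
      already been created with <cmdname>hdfs cacheadmin</cmdname>:
    </p>

<codeblock>CREATE TABLE cached_t (id INT, s STRING)
  CACHED IN 'pool_name';</codeblock>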

    <p conref="../shared/impala_common.xml#common/impala_cache_replication_factor"/>

    <!-- Say something in here about the SHOW statement, e.g. SHOW TABLES, SHOW TABLE/COLUMN STATS, SHOW PARTITIONS. -->

    <p>
      <b>Column order</b>:
    </p>

    <p>
      If you intend to use the table to hold data files produced by some external source,
      specify the columns in the same order as they appear in the data files.
    </p>

    <p>
      If you intend to insert or copy data into the table through Impala, or if you have
      control over the way externally produced data files are arranged, use your judgment
      to specify columns in the most convenient order:
    </p>

    <ul>
      <li>
        <p>
          If certain columns are often <codeph>NULL</codeph>, specify those columns last.
          You might produce data files that omit these trailing columns entirely; in that
          case, Impala automatically fills in <codeph>NULL</codeph> for the missing values.
        </p>
      </li>

      <li>
        <p>
          If an unpartitioned table will be used as the source for an <codeph>INSERT ...
          SELECT</codeph> operation into a partitioned table, specify last in the
          unpartitioned table any columns that correspond to partition key columns in the
          partitioned table, and in the same order as the partition key columns are declared
          in the partitioned table. This technique lets you use <codeph>INSERT ... SELECT
          *</codeph> when copying data to the partitioned table, rather than specifying each
          column name individually. (See the sketch after this list.)
        </p>
      </li>

      <li>
        <p>
          If you specify columns in an order that you later discover is suboptimal, you can
          sometimes work around the problem without recreating the table. You can create a
          view that selects columns from the original table in a permuted order, then do a
          <codeph>SELECT *</codeph> from the view. When inserting data into a table, you can
          specify a permuted order for the inserted columns to match the order in the
          destination table.
        </p>
      </li>
    </ul>
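
    <p>
      The following sketch illustrates the second technique; all table and column names are
      hypothetical. Because the partition key column <codeph>year</codeph> comes last in the
      unpartitioned staging table, <codeph>SELECT *</codeph> lines up with the columns of
      the partitioned destination table:
    </p>

<codeblock>CREATE TABLE staging (id INT, amount DOUBLE, year INT);
CREATE TABLE sales (id INT, amount DOUBLE) PARTITIONED BY (year INT);

-- The trailing 'year' column from the staging table feeds the partition key.
INSERT INTO sales PARTITION (year) SELECT * FROM staging;</codeblock>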

    <p conref="../shared/impala_common.xml#common/hive_blurb"/>

    <p>
      Impala queries can make use of metadata about the table and columns, such as the
      number of rows in a table or the number of different values in a column. Prior to
      Impala 1.2.2, to create this metadata, you issued the <codeph>ANALYZE TABLE</codeph>
      statement in Hive to gather this information, after creating the table and loading
      representative data into it. In Impala 1.2.2 and higher, the <codeph>COMPUTE
      STATS</codeph> statement produces these statistics within Impala, without needing to
      use Hive at all.
    </p>
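
    <p>
      For example, after loading representative data into a (hypothetical) table
      <codeph>t1</codeph>:
    </p>

<codeblock>COMPUTE STATS t1;
SHOW TABLE STATS t1;
SHOW COLUMN STATS t1;</codeblock>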

    <p conref="../shared/impala_common.xml#common/hbase_blurb"/>

    <note>
      <p>
        The Impala <codeph>CREATE TABLE</codeph> statement cannot create an HBase table,
        because it currently does not support the <codeph>STORED BY</codeph> clause needed
        for HBase tables. Create such tables in Hive, then query them through Impala. For
        information on using Impala with HBase tables, see
        <xref href="impala_hbase.xml#impala_hbase"/>.
      </p>
    </note>

    <p conref="../shared/impala_common.xml#common/s3_blurb"/>

    <p rev="2.2.0">
      To create a table where the data resides in the Amazon Simple Storage Service (S3),
      specify a <codeph>LOCATION</codeph> attribute with an <codeph>s3a://</codeph> prefix
      pointing to the data files in S3.
    </p>

    <p rev="2.6.0 IMPALA-1878">
      In <keyword keyref="impala26_full"/> and higher, you can use this special
      <codeph>LOCATION</codeph> syntax as part of a <codeph>CREATE TABLE AS SELECT</codeph>
      statement.
    </p>
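
    <p>
      A minimal sketch; the bucket name and paths are hypothetical:
    </p>

<codeblock>CREATE TABLE s3_table (id INT, s STRING)
  LOCATION 's3a://bucket_name/path/to/data/';

CREATE TABLE s3_ctas_table
  LOCATION 's3a://bucket_name/path/to/other_data/'
  AS SELECT id, s FROM s3_table;</codeblock>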

    <p conref="../shared/impala_common.xml#common/s3_ddl"/>

    <p conref="../shared/impala_common.xml#common/insert_sort_blurb"/>

    <p conref="../shared/impala_common.xml#common/hdfs_blurb"/>

    <p>
      The <codeph>CREATE TABLE</codeph> statement for an internal table creates a directory
      in HDFS. The <codeph>CREATE EXTERNAL TABLE</codeph> statement associates the table
      with an existing HDFS directory, and does not create any new directory in HDFS. To
      locate the HDFS data directory for a table, issue a <codeph>DESCRIBE FORMATTED
      <varname>table</varname></codeph> statement. To examine the contents of that HDFS
      directory, use an OS command such as <codeph>hdfs dfs -ls
      hdfs://<varname>path</varname></codeph>, either from the OS command line or through
      the <codeph>shell</codeph> or <codeph>!</codeph> commands in
      <cmdname>impala-shell</cmdname>.
    </p>
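
    <p>
      For example, inside <cmdname>impala-shell</cmdname> (the warehouse path shown is
      hypothetical; copy the actual location from the <codeph>DESCRIBE FORMATTED</codeph>
      output):
    </p>

<codeblock>DESCRIBE FORMATTED t1;
!hdfs dfs -ls hdfs://nn1/user/hive/warehouse/db1.db/t1;</codeblock>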

    <p>
      The <codeph>CREATE TABLE AS SELECT</codeph> syntax creates data files under the table
      data directory to hold any data copied by the <codeph>INSERT</codeph> portion of the
      statement. (Even if no data is copied, Impala might create one or more empty data
      files.)
    </p>
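
    <p>
      For example (table and column names are placeholders), the following statement creates
      the destination table and populates it in a single step:
    </p>

<codeblock>CREATE TABLE summary_2024 STORED AS PARQUET AS
  SELECT id, SUM(amount) AS total
  FROM sales
  WHERE year = 2024
  GROUP BY id;</codeblock>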

    <p conref="../shared/impala_common.xml#common/permissions_blurb"/>

    <p>
      The user ID that the <cmdname>impalad</cmdname> daemon runs under, typically the
      <codeph>impala</codeph> user, must have both execute and write permission for the
      database directory where the table is being created.
    </p>

    <p conref="../shared/impala_common.xml#common/security_blurb"/>

    <p conref="../shared/impala_common.xml#common/redaction_yes"/>

    <p conref="../shared/impala_common.xml#common/cancel_blurb_maybe"/>

    <p conref="../shared/impala_common.xml#common/related_info"/>

    <p>
      <xref href="impala_tables.xml#tables"/>,
      <xref href="impala_alter_table.xml#alter_table"/>,
      <xref href="impala_drop_table.xml#drop_table"/>,
      <xref href="impala_partitioning.xml#partitioning"/>,
      <xref href="impala_tables.xml#internal_tables"/>,
      <xref href="impala_tables.xml#external_tables"/>,
      <xref href="impala_compute_stats.xml#compute_stats"/>,
      <xref href="impala_sync_ddl.xml#sync_ddl"/>,
      <xref href="impala_show.xml#show_tables"/>,
      <xref href="impala_show.xml#show_create_table"/>,
      <xref href="impala_describe.xml#describe"/>
    </p>

  </conbody>

</concept>