mirror of
https://github.com/apache/impala.git
synced 2025-12-30 03:01:44 -05:00
For this change to land in master, the audience="hidden" code review needs to be completed first. Otherwise, the doc build would still work but the audience="hidden" content would be visible rather than hidden as desired. Some work happening in parallel might introduce additional instances of audience="Cloudera". I suggest addressing those in a followup CR so this global change can land quickly. Since the changes apply across so many different files, but are so narrow in scope, I suggest that the way to validate (check that no extraneous changes were introduced accidentally) is to diff just the changed lines: git diff -U0 HEAD^ HEAD In patch set 2, I updated other topics marked audience="Cloudera" by CRs that were pushed in the meantime. Change-Id: Ic93d89da77e1f51bbf548a522d98d0c4e2fb31c8 Reviewed-on: http://gerrit.cloudera.org:8080/5613 Reviewed-by: John Russell <jrussell@cloudera.com> Tested-by: Impala Public Jenkins
851 lines
40 KiB
XML
851 lines
40 KiB
XML
<?xml version="1.0" encoding="UTF-8"?>
|
|
<!--
|
|
Licensed to the Apache Software Foundation (ASF) under one
|
|
or more contributor license agreements. See the NOTICE file
|
|
distributed with this work for additional information
|
|
regarding copyright ownership. The ASF licenses this file
|
|
to you under the Apache License, Version 2.0 (the
|
|
"License"); you may not use this file except in compliance
|
|
with the License. You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing,
|
|
software distributed under the License is distributed on an
|
|
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
KIND, either express or implied. See the License for the
|
|
specific language governing permissions and limitations
|
|
under the License.
|
|
-->
|
|
<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
|
|
<concept id="create_table" outputclass="impala sql_statement">
|
|
|
|
<title outputclass="impala_title sql_statement_title">CREATE TABLE Statement</title>
|
|
<titlealts audience="PDF"><navtitle>CREATE TABLE</navtitle></titlealts>
|
|
<prolog>
|
|
<metadata>
|
|
<data name="Category" value="Impala"/>
|
|
<data name="Category" value="SQL"/>
|
|
<data name="Category" value="DDL"/>
|
|
<data name="Category" value="Impala Data Types"/>
|
|
<data name="Category" value="Developers"/>
|
|
<data name="Category" value="Data Analysts"/>
|
|
<data name="Category" value="HDFS Caching"/>
|
|
<data name="Category" value="Tables"/>
|
|
<data name="Category" value="Schemas"/>
|
|
<data name="Category" value="S3"/>
|
|
<!-- <data name="Category" value="Kudu"/> -->
|
|
</metadata>
|
|
</prolog>
|
|
|
|
<conbody>
|
|
|
|
<p>
|
|
<indexterm audience="hidden">CREATE TABLE statement</indexterm>
|
|
Creates a new table and specifies its characteristics. While creating a table, you optionally specify aspects
|
|
such as:
|
|
</p>
|
|
|
|
<ul>
|
|
<li>
|
|
Whether the table is internal or external.
|
|
</li>
|
|
|
|
<li>
|
|
The columns and associated data types.
|
|
</li>
|
|
|
|
<li>
|
|
The columns used for physically partitioning the data.
|
|
</li>
|
|
|
|
<li>
|
|
The file format for data files.
|
|
</li>
|
|
|
|
<li>
|
|
The HDFS directory where the data files are located.
|
|
</li>
|
|
</ul>
|
|
|
|
<p conref="../shared/impala_common.xml#common/syntax_blurb"/>
|
|
|
|
<p>
|
|
The general syntax for creating a table and specifying its columns is as follows:
|
|
</p>
|
|
|
|
<p>
|
|
<b>Explicit column definitions:</b>
|
|
</p>
|
|
|
|
<codeblock>CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [<varname>db_name</varname>.]<varname>table_name</varname>
|
|
(<varname>col_name</varname> <varname>data_type</varname> [COMMENT '<varname>col_comment</varname>'], ...)
|
|
[PARTITIONED BY (<varname>col_name</varname> <varname>data_type</varname> [COMMENT '<varname>col_comment</varname>'], ...)]
|
|
[COMMENT '<varname>table_comment</varname>']
|
|
[WITH SERDEPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)]
|
|
[
|
|
[ROW FORMAT <varname>row_format</varname>] [STORED AS <varname>file_format</varname>]
|
|
]
|
|
[LOCATION '<varname>hdfs_path</varname>']
|
|
[TBLPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)]
|
|
<ph rev="1.4.0"> [CACHED IN '<varname>pool_name</varname>'</ph> <ph rev="2.2.0">[WITH REPLICATION = <varname>integer</varname>]</ph> | UNCACHED]
|
|
</codeblock>
|
|
|
|
<p>
|
|
<b>Column definitions inferred from data file:</b>
|
|
</p>
|
|
|
|
<codeblock>CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [<varname>db_name</varname>.]<varname>table_name</varname>
|
|
LIKE PARQUET '<varname>hdfs_path_of_parquet_file</varname>'
|
|
[COMMENT '<varname>table_comment</varname>']
|
|
[PARTITIONED BY (<varname>col_name</varname> <varname>data_type</varname> [COMMENT '<varname>col_comment</varname>'], ...)]
|
|
[WITH SERDEPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)]
|
|
[
|
|
[ROW FORMAT <varname>row_format</varname>] [STORED AS <varname>file_format</varname>]
|
|
]
|
|
[LOCATION '<varname>hdfs_path</varname>']
|
|
[TBLPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)]
|
|
<ph rev="1.4.0"> [CACHED IN '<varname>pool_name</varname>'</ph> <ph rev="2.2.0">[WITH REPLICATION = <varname>integer</varname>]</ph> | UNCACHED]
|
|
data_type:
|
|
<varname>primitive_type</varname>
|
|
| array_type
|
|
| map_type
|
|
| struct_type
|
|
</codeblock>
|
|
|
|
<p>
|
|
<b>CREATE TABLE AS SELECT:</b>
|
|
</p>
|
|
|
|
<codeblock>CREATE [EXTERNAL] TABLE [IF NOT EXISTS] <varname>db_name</varname>.]<varname>table_name</varname>
|
|
<ph rev="2.5.0">[PARTITIONED BY (<varname>col_name</varname>[, ...])]</ph>
|
|
[COMMENT '<varname>table_comment</varname>']
|
|
[WITH SERDEPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)]
|
|
[
|
|
[ROW FORMAT <varname>row_format</varname>] <ph rev="CDH-41501">[STORED AS <varname>ctas_file_format</varname>]</ph>
|
|
]
|
|
[LOCATION '<varname>hdfs_path</varname>']
|
|
[TBLPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)]
|
|
<ph rev="1.4.0"> [CACHED IN '<varname>pool_name</varname>'</ph> <ph rev="2.2.0">[WITH REPLICATION = <varname>integer</varname>]</ph> | UNCACHED]
|
|
AS
|
|
<varname>select_statement</varname></codeblock>
|
|
|
|
<codeblock>primitive_type:
|
|
TINYINT
|
|
| SMALLINT
|
|
| INT
|
|
| BIGINT
|
|
| BOOLEAN
|
|
| FLOAT
|
|
| DOUBLE
|
|
<ph rev="1.4.0">| DECIMAL</ph>
|
|
| STRING
|
|
<ph rev="2.0.0">| CHAR</ph>
|
|
<ph rev="2.0.0">| VARCHAR</ph>
|
|
| TIMESTAMP
|
|
|
|
<ph rev="2.3.0">complex_type:
|
|
struct_type
|
|
| array_type
|
|
| map_type
|
|
|
|
struct_type: STRUCT &lt; <varname>name</varname> : <varname>primitive_or_complex_type</varname> [COMMENT '<varname>comment_string</varname>'], ... >
|
|
|
|
array_type: ARRAY &lt; <varname>primitive_or_complex_type</varname> >
|
|
|
|
map_type: MAP &lt; <varname>primitive_type</varname>, <varname>primitive_or_complex_type</varname> >
|
|
</ph>
|
|
row_format:
|
|
DELIMITED [FIELDS TERMINATED BY '<varname>char</varname>' [ESCAPED BY '<varname>char</varname>']]
|
|
[LINES TERMINATED BY '<varname>char</varname>']
|
|
|
|
file_format:
|
|
PARQUET
|
|
| TEXTFILE
|
|
| AVRO
|
|
| SEQUENCEFILE
|
|
| RCFILE
|
|
|
|
<ph rev="CDH-41501">ctas_file_format:
|
|
PARQUET
|
|
| TEXTFILE</ph>
|
|
</codeblock>
|
|
|
|
<p conref="../shared/impala_common.xml#common/ddl_blurb"/>
|
|
|
|
<!-- Should really have some info up front about all the data types and file formats.
|
|
Consider adding here, or at least making inline links to the relevant keywords
|
|
in the syntax spec above. -->
|
|
|
|
<p>
|
|
<b>Column definitions:</b>
|
|
</p>
|
|
|
|
<p>
|
|
Depending on the form of the <codeph>CREATE TABLE</codeph> statement, the column definitions are
|
|
required or not allowed.
|
|
</p>
|
|
|
|
<p>
|
|
With the <codeph>CREATE TABLE AS SELECT</codeph> and <codeph>CREATE TABLE LIKE</codeph>
|
|
syntax, you do not specify the columns at all; the column names and types are derived from the source table, query,
|
|
or data file.
|
|
</p>
|
|
|
|
<p>
|
|
With the basic <codeph>CREATE TABLE</codeph> syntax, you must list one or more columns,
|
|
each with its name, type, and optionally a comment, in addition to any columns used as partitioning keys.
|
|
There is one exception where the column list is not required: when creating an Avro table with the
|
|
<codeph>STORED AS AVRO</codeph> clause, you can omit the list of columns and specify the same metadata
|
|
as part of the <codeph>TBLPROPERTIES</codeph> clause.
|
|
</p>
|
|
|
|
<p conref="../shared/impala_common.xml#common/complex_types_blurb"/>
|
|
|
|
<p rev="2.3.0">
|
|
The Impala complex types (<codeph>STRUCT</codeph>, <codeph>ARRAY</codeph>, or <codeph>MAP</codeph>)
|
|
are available in <keyword keyref="impala23_full"/> and higher.
|
|
Because you can nest these types (for example, to make an array of maps or a struct
|
|
with an array field), these types are also sometimes referred to as nested types.
|
|
See <xref href="impala_complex_types.xml#complex_types"/> for usage details.
|
|
</p>
|
|
|
|
<!-- This is kind of an obscure and rare usage scenario. Consider moving all the complex type stuff further down
|
|
after some of the more common clauses. -->
|
|
<p rev="2.3.0">
|
|
Impala can create tables containing complex type columns, with any supported file format.
|
|
Because currently Impala can only query complex type columns in Parquet tables, creating
|
|
tables with complex type columns and other file formats such as text is of limited use.
|
|
For example, you might create a text table including some columns with complex types with Impala, and use Hive
|
|
as part of your ETL pipeline to ingest the nested type data and copy it to an identical Parquet table.
|
|
Or you might create a partitioned table containing complex type columns using one file format, and
|
|
use <codeph>ALTER TABLE</codeph> to change the file format of individual partitions to Parquet; Impala
|
|
can then query only the Parquet-format partitions in that table.
|
|
</p>
|
|
|
|
<p conref="../shared/impala_common.xml#common/complex_types_partitioning"/>
|
|
|
|
<p>
|
|
<b>Internal and external tables (EXTERNAL and LOCATION clauses):</b>
|
|
</p>
|
|
|
|
<p>
|
|
By default, Impala creates an <q>internal</q> table, where Impala manages the underlying data files for the
|
|
table, and physically deletes the data files when you drop the table. If you specify the
|
|
<codeph>EXTERNAL</codeph> clause, Impala treats the table as an <q>external</q> table, where the data files
|
|
are typically produced outside Impala and queried from their original locations in HDFS, and Impala leaves
|
|
the data files in place when you drop the table. For details about internal and external tables, see
|
|
<xref href="impala_tables.xml#tables"/>.
|
|
</p>
|
|
|
|
<p>
|
|
Typically, for an external table you include a <codeph>LOCATION</codeph> clause to specify the path to the
|
|
HDFS directory where Impala reads and writes files for the table. For example, if your data pipeline produces
|
|
Parquet files in the HDFS directory <filepath>/user/etl/destination</filepath>, you might create an external
|
|
table as follows:
|
|
</p>
|
|
|
|
<codeblock>CREATE EXTERNAL TABLE external_parquet (c1 INT, c2 STRING, c3 TIMESTAMP)
|
|
STORED AS PARQUET LOCATION '/user/etl/destination';
|
|
</codeblock>
|
|
|
|
<p>
|
|
Although the <codeph>EXTERNAL</codeph> and <codeph>LOCATION</codeph> clauses are often specified together,
|
|
<codeph>LOCATION</codeph> is optional for external tables, and you can also specify <codeph>LOCATION</codeph>
|
|
for internal tables. The difference is all about whether Impala <q>takes control</q> of the underlying data
|
|
files and moves them when you rename the table, or deletes them when you drop the table. For more about
|
|
internal and external tables and how they interact with the <codeph>LOCATION</codeph> attribute, see
|
|
<xref href="impala_tables.xml#tables"/>.
|
|
</p>
|
|
|
|
<p>
|
|
<b>Partitioned tables (PARTITIONED BY clause):</b>
|
|
</p>
|
|
|
|
<p>
|
|
The <codeph>PARTITIONED BY</codeph> clause divides the data files based on the values from one or more
|
|
specified columns. Impala queries can use the partition metadata to minimize the amount of data that is read
|
|
from disk or transmitted across the network, particularly during join queries. For details about
|
|
partitioning, see <xref href="impala_partitioning.xml#partitioning"/>.
|
|
</p>
|
|
|
|
<p rev="2.5.0">
|
|
Prior to <keyword keyref="impala25_full"/> you could use a partitioned table
|
|
as the source and copy data from it, but could not specify any partitioning clauses for the new table.
|
|
In <keyword keyref="impala25_full"/> and higher, you can now use the <codeph>PARTITIONED BY</codeph> clause with a
|
|
<codeph>CREATE TABLE AS SELECT</codeph> statement. See the examples under the following discussion of
|
|
the <codeph>CREATE TABLE AS SELECT</codeph> syntax variation.
|
|
</p>
|
|
|
|
<!--
|
|
<p rev="kudu">
|
|
<b>Partitioning for Kudu tables (PARTITION BY clause)</b>
|
|
</p>
|
|
|
|
<p rev="kudu">
|
|
For Kudu tables, you specify logical partitioning across one or more columns using the
|
|
<codeph>PARTITION BY</codeph> clause. In contrast to partitioning for HDFS-based tables,
|
|
multiple values for a partition key column can be located in the same partition.
|
|
The optional <codeph>HASH</codeph> clause lets you divide one or a set of partition key columns
|
|
into a specified number of partitions; you can use more than one <codeph>HASH</codeph>
|
|
clause, specifying a distinct set of partition key columns for each.
|
|
The optional <codeph>RANGE</codeph> clause further subdivides the partitions, based on
|
|
a range of values for the partition key columns.
|
|
</p>
|
|
-->
|
|
|
|
<p>
|
|
<b>Specifying file format (STORED AS and ROW FORMAT clauses):</b>
|
|
</p>
|
|
|
|
<p rev="DOCS-1523">
|
|
The <codeph>STORED AS</codeph> clause identifies the format of the underlying data files. Currently, Impala
|
|
can query more types of file formats than it can create or insert into. Use Hive to perform any create or
|
|
data load operations that are not currently available in Impala. For example, Impala can create an Avro,
|
|
SequenceFile, or RCFile table but cannot insert data into it. There are also Impala-specific procedures for using
|
|
compression with each kind of file format. For details about working with data files of various formats, see
|
|
<xref href="impala_file_formats.xml#file_formats"/>.
|
|
</p>
|
|
|
|
<note>
|
|
In Impala 1.4.0 and higher, Impala can create Avro tables, which formerly required doing the <codeph>CREATE
|
|
TABLE</codeph> statement in Hive. See <xref href="impala_avro.xml#avro"/> for details and examples.
|
|
</note>
|
|
|
|
<p>
|
|
By default (when no <codeph>STORED AS</codeph> clause is specified), data files in Impala tables are created
|
|
as text files with Ctrl-A (hex 01) characters as the delimiter.
|
|
<!-- Verify if ROW FORMAT is entirely ignored outside of text tables, or does it apply somehow to SequenceFile and/or RCFile too? -->
|
|
Specify the <codeph>ROW FORMAT DELIMITED</codeph> clause to produce or ingest data files that use a different
|
|
delimiter character such as tab or <codeph>|</codeph>, or a different line end character such as carriage
|
|
return or newline. When specifying delimiter and line end characters with the <codeph>FIELDS TERMINATED
|
|
BY</codeph> and <codeph>LINES TERMINATED BY</codeph> clauses, use <codeph>'\t'</codeph> for tab,
|
|
<codeph>'\n'</codeph> for newline or linefeed, <codeph>'\r'</codeph> for carriage return, and
|
|
<codeph>\</codeph><codeph>0</codeph> for ASCII <codeph>nul</codeph> (hex 00). For more examples of text
|
|
tables, see <xref href="impala_txtfile.xml#txtfile"/>.
|
|
</p>
|
|
|
|
<p>
|
|
The <codeph>ESCAPED BY</codeph> clause applies both to text files that you create through an
|
|
<codeph>INSERT</codeph> statement to an Impala <codeph>TEXTFILE</codeph> table, and to existing data files
|
|
that you put into an Impala table directory. (You can ingest existing data files either by creating the table
|
|
with <codeph>CREATE EXTERNAL TABLE ... LOCATION</codeph>, the <codeph>LOAD DATA</codeph> statement, or
|
|
through an HDFS operation such as <codeph>hdfs dfs -put <varname>file</varname>
|
|
<varname>hdfs_path</varname></codeph>.) Choose an escape character that is not used anywhere else in the
|
|
file, and put it in front of each instance of the delimiter character that occurs within a field value.
|
|
Surrounding field values with quotation marks does not help Impala to parse fields with embedded delimiter
|
|
characters; the quotation marks are considered to be part of the column value. If you want to use
|
|
<codeph>\</codeph> as the escape character, specify the clause in <cmdname>impala-shell</cmdname> as
|
|
<codeph>ESCAPED BY '\\'</codeph>.
|
|
</p>
|
|
|
|
<note conref="../shared/impala_common.xml#common/thorn"/>
|
|
|
|
<p>
|
|
<b>Cloning tables (LIKE clause):</b>
|
|
</p>
|
|
|
|
<p>
|
|
To create an empty table with the same columns, comments, and other attributes as another table, use the
|
|
following variation. The <codeph>CREATE TABLE ... LIKE</codeph> form allows a restricted set of clauses,
|
|
currently only the <codeph>LOCATION</codeph>, <codeph>COMMENT</codeph>, and <codeph>STORED AS</codeph>
|
|
clauses.
|
|
</p>
|
|
|
|
<codeblock>CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [<varname>db_name</varname>.]<varname>table_name</varname>
|
|
<ph rev="1.4.0">LIKE { [<varname>db_name</varname>.]<varname>table_name</varname> | PARQUET '<varname>hdfs_path_of_parquet_file</varname>' }</ph>
|
|
[COMMENT '<varname>table_comment</varname>']
|
|
[STORED AS <varname>file_format</varname>]
|
|
[LOCATION '<varname>hdfs_path</varname>']</codeblock>
|
|
|
|
<note rev="1.2.0">
|
|
<p rev="1.2.0">
|
|
To clone the structure of a table and transfer data into it in a single operation, use the <codeph>CREATE
|
|
TABLE AS SELECT</codeph> syntax described in the next subsection.
|
|
</p>
|
|
</note>
|
|
|
|
<p>
|
|
When you clone the structure of an existing table using the <codeph>CREATE TABLE ... LIKE</codeph> syntax,
|
|
the new table keeps the same file format as the original one, so you only need to specify the <codeph>STORED
|
|
AS</codeph> clause if you want to use a different file format, or when specifying a view as the original
|
|
table. (Creating a table <q>like</q> a view produces a text table by default.)
|
|
</p>
|
|
|
|
<p>
|
|
Although normally Impala cannot create an HBase table directly, Impala can clone the structure of an existing
|
|
HBase table with the <codeph>CREATE TABLE ... LIKE</codeph> syntax, preserving the file format and metadata
|
|
from the original table.
|
|
</p>
|
|
|
|
<p>
|
|
There are some exceptions to the ability to use <codeph>CREATE TABLE ... LIKE</codeph> with an Avro table.
|
|
For example, you cannot use this technique for an Avro table that is specified with an Avro schema but no
|
|
columns. When in doubt, check if a <codeph>CREATE TABLE ... LIKE</codeph> operation works in Hive; if not, it
|
|
typically will not work in Impala either.
|
|
</p>
|
|
|
|
<p>
|
|
If the original table is partitioned, the new table inherits the same partition key columns. Because the new
|
|
table is initially empty, it does not inherit the actual partitions that exist in the original one. To create
|
|
partitions in the new table, insert data or issue <codeph>ALTER TABLE ... ADD PARTITION</codeph> statements.
|
|
</p>
|
|
|
|
<p conref="../shared/impala_common.xml#common/create_table_like_view"/>
|
|
|
|
<p>
|
|
Because <codeph>CREATE TABLE ... LIKE</codeph> only manipulates table metadata, not the physical data of the
|
|
table, issue <codeph>INSERT INTO TABLE</codeph> statements afterward to copy any data from the original table
|
|
into the new one, optionally converting the data to a new file format. (For some file formats, Impala can do
|
|
a <codeph>CREATE TABLE ... LIKE</codeph> to create the table, but Impala cannot insert data in that file
|
|
format; in these cases, you must load the data in Hive. See
|
|
<xref href="impala_file_formats.xml#file_formats"/> for details.)
|
|
</p>
|
|
|
|
<p rev="1.2" id="ctas">
|
|
<b>CREATE TABLE AS SELECT:</b>
|
|
</p>
|
|
|
|
<p>
|
|
The <codeph>CREATE TABLE AS SELECT</codeph> syntax is a shorthand notation to create a table based on column
|
|
definitions from another table, and copy data from the source table to the destination table without issuing
|
|
any separate <codeph>INSERT</codeph> statement. This idiom is so popular that it has its own acronym,
|
|
<q>CTAS</q>.
|
|
</p>
|
|
|
|
<p>
|
|
The following examples show how to copy data from a source table <codeph>T1</codeph>
|
|
to a variety of destination tables, applying various transformations to the table
|
|
properties, table layout, or the data itself as part of the operation:
|
|
</p>
|
|
|
|
<codeblock>
|
|
-- Sample table to be the source of CTAS operations.
|
|
CREATE TABLE t1 (x INT, y STRING);
|
|
INSERT INTO t1 VALUES (1, 'one'), (2, 'two'), (3, 'three');
|
|
|
|
-- Clone all the columns and data from one table to another.
|
|
CREATE TABLE clone_of_t1 AS SELECT * FROM t1;
|
|
+-------------------+
|
|
| summary |
|
|
+-------------------+
|
|
| Inserted 3 row(s) |
|
|
+-------------------+
|
|
|
|
-- Clone the columns and data, and convert the data to a different file format.
|
|
CREATE TABLE parquet_version_of_t1 STORED AS PARQUET AS SELECT * FROM t1;
|
|
+-------------------+
|
|
| summary |
|
|
+-------------------+
|
|
| Inserted 3 row(s) |
|
|
+-------------------+
|
|
|
|
-- Copy only some rows to the new table.
|
|
CREATE TABLE subset_of_t1 AS SELECT * FROM t1 WHERE x >= 2;
|
|
+-------------------+
|
|
| summary |
|
|
+-------------------+
|
|
| Inserted 2 row(s) |
|
|
+-------------------+
|
|
|
|
-- Same idea as CREATE TABLE LIKE: clone table layout but do not copy any data.
|
|
CREATE TABLE empty_clone_of_t1 AS SELECT * FROM t1 WHERE 1=0;
|
|
+-------------------+
|
|
| summary |
|
|
+-------------------+
|
|
| Inserted 0 row(s) |
|
|
+-------------------+
|
|
|
|
-- Reorder and rename columns and transform the data.
|
|
CREATE TABLE t5 AS SELECT upper(y) AS s, x+1 AS a, 'Entirely new column' AS n FROM t1;
|
|
+-------------------+
|
|
| summary |
|
|
+-------------------+
|
|
| Inserted 3 row(s) |
|
|
+-------------------+
|
|
SELECT * FROM t5;
|
|
+-------+---+---------------------+
|
|
| s | a | n |
|
|
+-------+---+---------------------+
|
|
| ONE | 2 | Entirely new column |
|
|
| TWO | 3 | Entirely new column |
|
|
| THREE | 4 | Entirely new column |
|
|
+-------+---+---------------------+
|
|
</codeblock>
|
|
|
|
<!-- These are a little heavyweight to get into here. Therefore commenting out.
|
|
Some overlap with the new column-changing examples in the code listing above.
|
|
Create tables with different column order, names, or types than the original.
|
|
CREATE TABLE some_columns_from_t1 AS SELECT c1, c3, c5 FROM t1;
|
|
CREATE TABLE reordered_columns_from_t1 AS SELECT c4, c3, c1, c2 FROM t1;
|
|
CREATE TABLE synthesized_columns AS SELECT upper(c1) AS all_caps, c2+c3 AS total, "California" AS state FROM t1;</codeblock>
|
|
-->
|
|
|
|
<!-- CREATE TABLE AS <select> now incorporated up higher in the original syntax diagram. -->
|
|
|
|
<p rev="1.2">
|
|
See <xref href="impala_select.xml#select"/> for details about query syntax for the <codeph>SELECT</codeph>
|
|
portion of a <codeph>CREATE TABLE AS SELECT</codeph> statement.
|
|
</p>
|
|
|
|
<p rev="1.2">
|
|
The newly created table inherits the column names that you select from the original table, which you can
|
|
override by specifying column aliases in the query. Any column or table comments from the original table are
|
|
not carried over to the new table.
|
|
</p>
|
|
|
|
<note rev="DOCS-1523">
|
|
When using the <codeph>STORED AS</codeph> clause with a <codeph>CREATE TABLE AS SELECT</codeph>
|
|
statement, the destination table must be a file format that Impala can write to: currently,
|
|
text or Parquet. You cannot specify an Avro, SequenceFile, or RCFile table as the destination
|
|
table for a CTAS operation.
|
|
</note>
|
|
|
|
<p rev="2.5.0">
|
|
Prior to <keyword keyref="impala25_full"/> you could use a partitioned table
|
|
as the source and copy data from it, but could not specify any partitioning clauses for the new table.
|
|
In <keyword keyref="impala25_full"/> and higher, you can now use the <codeph>PARTITIONED BY</codeph> clause with a
|
|
<codeph>CREATE TABLE AS SELECT</codeph> statement. The following example demonstrates how you can copy
|
|
data from an unpartitioned table in a <codeph>CREATE TABLE AS SELECT</codeph> operation, creating a new
|
|
partitioned table in the process. The main syntax consideration is the column order in the <codeph>PARTITIONED BY</codeph>
|
|
clause and the select list: the partition key columns must be listed last in the select list, in the same
|
|
order as in the <codeph>PARTITIONED BY</codeph> clause. Therefore, in this case, the column order in the
|
|
destination table is different from the source table. You also only specify the column names in the
|
|
<codeph>PARTITIONED BY</codeph> clause, not the data types or column comments.
|
|
</p>
|
|
|
|
<codeblock rev="2.5.0">
|
|
create table partitions_no (year smallint, month tinyint, s string);
|
|
insert into partitions_no values (2016, 1, 'January 2016'),
|
|
(2016, 2, 'February 2016'), (2016, 3, 'March 2016');
|
|
|
|
-- Prove that the source table is not partitioned.
|
|
show partitions partitions_no;
|
|
ERROR: AnalysisException: Table is not partitioned: ctas_partition_by.partitions_no
|
|
|
|
-- Create new table with partitions based on column values from source table.
|
|
<b>create table partitions_yes partitioned by (year, month)
|
|
as select s, year, month from partitions_no;</b>
|
|
+-------------------+
|
|
| summary |
|
|
+-------------------+
|
|
| Inserted 3 row(s) |
|
|
+-------------------+
|
|
|
|
-- Prove that the destination table is partitioned.
|
|
show partitions partitions_yes;
|
|
+-------+-------+-------+--------+------+...
|
|
| year | month | #Rows | #Files | Size |...
|
|
+-------+-------+-------+--------+------+...
|
|
| 2016 | 1 | -1 | 1 | 13B |...
|
|
| 2016 | 2 | -1 | 1 | 14B |...
|
|
| 2016 | 3 | -1 | 1 | 11B |...
|
|
| Total | | -1 | 3 | 38B |...
|
|
+-------+-------+-------+--------+------+...
|
|
</codeblock>
|
|
|
|
<p rev="2.5.0">
|
|
The most convenient layout for partitioned tables is with all the
|
|
partition key columns at the end. The CTAS <codeph>PARTITIONED BY</codeph> syntax
|
|
requires that column order in the select list, resulting in that same
|
|
column order in the destination table.
|
|
</p>
|
|
|
|
<codeblock rev="2.5.0">
|
|
describe partitions_no;
|
|
+-------+----------+---------+
|
|
| name | type | comment |
|
|
+-------+----------+---------+
|
|
| year | smallint | |
|
|
| month | tinyint | |
|
|
| s | string | |
|
|
+-------+----------+---------+
|
|
|
|
-- The CTAS operation forced us to put the partition key columns last.
|
|
-- Having those columns last works better with idioms such as SELECT *
|
|
-- for partitioned tables.
|
|
describe partitions_yes;
|
|
+-------+----------+---------+
|
|
| name | type | comment |
|
|
+-------+----------+---------+
|
|
| s | string | |
|
|
| year | smallint | |
|
|
| month | tinyint | |
|
|
+-------+----------+---------+
|
|
</codeblock>
|
|
|
|
<p rev="2.5.0">
|
|
Attempting to use a select list with the partition key columns
|
|
not at the end results in an error due to a column name mismatch:
|
|
</p>
|
|
|
|
<codeblock rev="2.5.0">
|
|
-- We expect this CTAS to fail because non-key column S
|
|
-- comes after key columns YEAR and MONTH in the select list.
|
|
create table partitions_maybe partitioned by (year, month)
|
|
as select year, month, s from partitions_no;
|
|
ERROR: AnalysisException: Partition column name mismatch: year != month
|
|
</codeblock>
|
|
|
|
<p rev="1.2">
|
|
For example, the following statements show how you can clone all the data in a table, or a subset of the
|
|
columns and/or rows, or reorder columns, rename them, or construct them out of expressions:
|
|
</p>
|
|
|
|
<p rev="1.2">
|
|
As part of a CTAS operation, you can convert the data to any file format that Impala can write (currently,
|
|
<codeph>TEXTFILE</codeph> and <codeph>PARQUET</codeph>). You cannot specify the lower-level properties of a
|
|
text table, such as the delimiter.
|
|
</p>
|
|
|
|
<p rev="obwl" conref="../shared/impala_common.xml#common/insert_sort_blurb"/>
|
|
|
|
<p rev="1.4.0">
|
|
<b>CREATE TABLE LIKE PARQUET:</b>
|
|
</p>
|
|
|
|
<p rev="1.4.0">
|
|
The variation <codeph>CREATE TABLE ... LIKE PARQUET '<varname>hdfs_path_of_parquet_file</varname>'</codeph>
|
|
lets you skip the column definitions of the <codeph>CREATE TABLE</codeph> statement. The column names and
|
|
data types are automatically configured based on the organization of the specified Parquet data file, which
|
|
must already reside in HDFS. You can use a data file located outside the Impala database directories, or a
|
|
file from an existing Impala Parquet table; either way, Impala only uses the column definitions from the file
|
|
and does not use the HDFS location for the <codeph>LOCATION</codeph> attribute of the new table. (Although
|
|
you can also specify the enclosing directory with the <codeph>LOCATION</codeph> attribute, to both use the
|
|
same schema as the data file and point the Impala table at the associated directory for querying.)
|
|
</p>
|
|
|
|
<p rev="1.4.0">
|
|
The following considerations apply when you use the <codeph>CREATE TABLE LIKE PARQUET</codeph> technique:
|
|
</p>
|
|
|
|
<ul rev="1.4.0">
|
|
<li>
|
|
Any column comments from the original table are not preserved in the new table. Each column in the new
|
|
table has a comment stating the low-level Parquet field type used to deduce the appropriate SQL column
|
|
type.
|
|
</li>
|
|
|
|
<li>
|
|
If you use a data file from a partitioned Impala table, any partition key columns from the original table
|
|
are left out of the new table, because they are represented in HDFS directory names rather than stored in
|
|
the data file. To preserve the partition information, repeat the same <codeph>PARTITION</codeph> clause as
|
|
in the original <codeph>CREATE TABLE</codeph> statement.
|
|
</li>
|
|
|
|
<li>
|
|
The file format of the new table defaults to text, as with other kinds of <codeph>CREATE TABLE</codeph>
|
|
statements. To make the new table also use Parquet format, include the clause <codeph>STORED AS
|
|
PARQUET</codeph> in the <codeph>CREATE TABLE LIKE PARQUET</codeph> statement.
|
|
</li>
|
|
|
|
<li>
|
|
If the Parquet data file comes from an existing Impala table, currently, any <codeph>TINYINT</codeph> or
|
|
<codeph>SMALLINT</codeph> columns are turned into <codeph>INT</codeph> columns in the new table.
|
|
Internally, Parquet stores such values as 32-bit integers.
|
|
</li>
|
|
|
|
<li>
|
|
When the destination table uses the Parquet file format, the <codeph>CREATE TABLE AS SELECT</codeph> and
|
|
<codeph>INSERT ... SELECT</codeph> statements always create at least one data file, even if the
|
|
<codeph>SELECT</codeph> part of the statement does not match any rows. You can use such an empty Parquet
|
|
data file as a template for subsequent <codeph>CREATE TABLE LIKE PARQUET</codeph> statements.
|
|
</li>
|
|
</ul>
|
|
|
|
<p>
|
|
For more details about creating Parquet tables, and examples of the <codeph>CREATE TABLE LIKE
|
|
PARQUET</codeph> syntax, see <xref href="impala_parquet.xml#parquet"/>.
|
|
</p>
|
|
|
|
<p>
|
|
<b>Visibility and Metadata (TBLPROPERTIES and WITH SERDEPROPERTIES clauses):</b>
|
|
</p>
|
|
|
|
<p rev="1.2">
|
|
You can associate arbitrary items of metadata with a table by specifying the <codeph>TBLPROPERTIES</codeph>
|
|
clause. This clause takes a comma-separated list of key-value pairs and stores those items in the metastore
|
|
database. You can also change the table properties later with an <codeph>ALTER TABLE</codeph> statement. You
|
|
can observe the table properties for different delimiter and escape characters using the <codeph>DESCRIBE
|
|
FORMATTED</codeph> command, and change those settings for an existing table with <codeph>ALTER TABLE ... SET
|
|
TBLPROPERTIES</codeph>.
|
|
</p>
|
|
|
|
<p rev="1.2">
|
|
You can also associate SerDes properties with the table by specifying key-value pairs through the
|
|
<codeph>WITH SERDEPROPERTIES</codeph> clause. This metadata is not used by Impala, which has its own built-in
|
|
serializer and deserializer for the file formats it supports. Particular property values might be needed for
|
|
Hive compatibility with certain variations of file formats, particularly Avro.
|
|
</p>
|
|
|
|
<p>
|
|
Some DDL operations that interact with other Hadoop components require specifying particular values in the
|
|
<codeph>SERDEPROPERTIES</codeph> or <codeph>TBLPROPERTIES</codeph> fields, such as creating an Avro table or
|
|
an HBase table. (You typically create HBase tables in Hive, because they require additional clauses not
|
|
currently available in Impala.)
|
|
<!-- Haven't got a working example from Lenni, so suppressing this recommendation for now.
|
|
The Avro schema properties can be specified through either
|
|
<codeph>TBLPROPERTIES</codeph> or <codeph>SERDEPROPERTIES</codeph>;
|
|
for best compatibility with future versions of Hive,
|
|
use <codeph>SERDEPROPERTIES</codeph> in this case.
|
|
-->
|
|
</p>
|
|
|
|
<p>
|
|
To see the column definitions and column comments for an existing table, for example before issuing a
|
|
<codeph>CREATE TABLE ... LIKE</codeph> or a <codeph>CREATE TABLE ... AS SELECT</codeph> statement, issue the
|
|
statement <codeph>DESCRIBE <varname>table_name</varname></codeph>. To see even more detail, such as the
|
|
location of data files and the values for clauses such as <codeph>ROW FORMAT</codeph> and <codeph>STORED
|
|
AS</codeph>, issue the statement <codeph>DESCRIBE FORMATTED <varname>table_name</varname></codeph>.
|
|
<codeph>DESCRIBE FORMATTED</codeph> is also needed to see any overall table comment (as opposed to individual
|
|
column comments).
|
|
</p>
|
|
|
|
<p>
|
|
After creating a table, your <cmdname>impala-shell</cmdname> session or another
|
|
<cmdname>impala-shell</cmdname> session connected to the same node can immediately query that table. There might be a
|
|
brief interval (one statestore heartbeat) before the table can be queried through a different Impala node. To
|
|
make the <codeph>CREATE TABLE</codeph> statement return only when the table is recognized by all Impala nodes
|
|
in the cluster, enable the <codeph>SYNC_DDL</codeph> query option.
|
|
</p>
|
|
|
|
<p rev="1.4.0">
|
|
<b>HDFS caching (CACHED IN clause):</b>
|
|
</p>
|
|
|
|
<p rev="1.4.0">
|
|
If you specify the <codeph>CACHED IN</codeph> clause, any existing or future data files in the table
|
|
directory or the partition subdirectories are designated to be loaded into memory with the HDFS caching
|
|
mechanism. See <xref href="impala_perf_hdfs_caching.xml#hdfs_caching"/> for details about using the HDFS
|
|
caching feature.
|
|
</p>
|
|
|
|
<p conref="../shared/impala_common.xml#common/impala_cache_replication_factor"/>
|
|
|
|
<!-- Say something in here about the SHOW statement, e.g. SHOW TABLES, SHOW TABLE/COLUMN STATS, SHOW PARTITIONS. -->
|
|
|
|
<p>
|
|
<b>Column order</b>:
|
|
</p>
|
|
|
|
<p>
|
|
If you intend to use the table to hold data files produced by some external source, specify the columns in
|
|
the same order as they appear in the data files.
|
|
</p>
|
|
|
|
<p>
|
|
If you intend to insert or copy data into the table through Impala, or if you have control over the way
|
|
externally produced data files are arranged, use your judgment to specify columns in the most convenient
|
|
order:
|
|
</p>
|
|
|
|
<ul>
|
|
<li>
|
|
<p>
|
|
If certain columns are often <codeph>NULL</codeph>, specify those columns last. You might produce data
|
|
files that omit these trailing columns entirely. Impala automatically fills in the <codeph>NULL</codeph>
|
|
values if so.
|
|
</p>
|
|
</li>
|
|
|
|
<li>
|
|
<p>
|
|
If an unpartitioned table will be used as the source for an <codeph>INSERT ... SELECT</codeph> operation
|
|
into a partitioned table, specify last in the unpartitioned table any columns that correspond to
|
|
partition key columns in the partitioned table, and in the same order as the partition key columns are
|
|
declared in the partitioned table. This technique lets you use <codeph>INSERT ... SELECT *</codeph> when
|
|
copying data to the partitioned table, rather than specifying each column name individually.
|
|
</p>
|
|
</li>
|
|
|
|
<li>
|
|
<p>
|
|
If you specify columns in an order that you later discover is suboptimal, you can sometimes work around
|
|
the problem without recreating the table. You can create a view that selects columns from the original
|
|
table in a permuted order, then do a <codeph>SELECT *</codeph> from the view. When inserting data into a
|
|
table, you can specify a permuted order for the inserted columns to match the order in the destination
|
|
table.
|
|
</p>
|
|
</li>
|
|
</ul>
|
|
|
|
<p conref="../shared/impala_common.xml#common/hive_blurb"/>
|
|
|
|
<p>
|
|
Impala queries can make use of metadata about the table and columns, such as the number of rows in a table or
|
|
the number of different values in a column. Prior to Impala 1.2.2, to create this metadata, you issued the
|
|
<codeph>ANALYZE TABLE</codeph> statement in Hive to gather this information, after creating the table and
|
|
loading representative data into it. In Impala 1.2.2 and higher, the <codeph>COMPUTE STATS</codeph> statement
|
|
produces these statistics within Impala, without needing to use Hive at all.
|
|
</p>
|
|
|
|
<p conref="../shared/impala_common.xml#common/hbase_blurb"/>
|
|
|
|
<note>
|
|
<p>
|
|
The Impala <codeph>CREATE TABLE</codeph> statement cannot create an HBase table, because it currently does
|
|
not support the <codeph>STORED BY</codeph> clause needed for HBase tables. Create such tables in Hive, then
|
|
query them through Impala. For information on using Impala with HBase tables, see
|
|
<xref href="impala_hbase.xml#impala_hbase"/>.
|
|
</p>
|
|
</note>
|
|
|
|
<p conref="../shared/impala_common.xml#common/s3_blurb"/>
|
|
<p rev="2.2.0">
|
|
To create a table where the data resides in the Amazon Simple Storage Service (S3),
|
|
specify a <codeph>LOCATION</codeph> attribute with an <codeph>s3a://</codeph> prefix, pointing to the data files in S3.
|
|
</p>
|
|
|
|
<p rev="2.6.0 CDH-39913 IMPALA-1878">
|
|
In <keyword keyref="impala26_full"/> and higher, you can
|
|
use this special <codeph>LOCATION</codeph> syntax
|
|
as part of a <codeph>CREATE TABLE AS SELECT</codeph> statement.
|
|
</p>
|
|
|
|
<p conref="../shared/impala_common.xml#common/s3_ddl"/>
|
|
|
|
<p conref="../shared/impala_common.xml#common/insert_sort_blurb"/>
|
|
|
|
<p conref="../shared/impala_common.xml#common/hdfs_blurb"/>
|
|
|
|
<p>
|
|
The <codeph>CREATE TABLE</codeph> statement for an internal table creates a directory in HDFS. The
|
|
<codeph>CREATE EXTERNAL TABLE</codeph> statement associates the table with an existing HDFS directory, and
|
|
does not create any new directory in HDFS. To locate the HDFS data directory for a table, issue a
|
|
<codeph>DESCRIBE FORMATTED <varname>table</varname></codeph> statement. To examine the contents of that HDFS
|
|
directory, use an OS command such as <codeph>hdfs dfs -ls hdfs://<varname>path</varname></codeph>, either
|
|
from the OS command line or through the <codeph>shell</codeph> or <codeph>!</codeph> commands in
|
|
<cmdname>impala-shell</cmdname>.
|
|
</p>
|
|
|
|
<p>
|
|
The <codeph>CREATE TABLE AS SELECT</codeph> syntax creates data files under the table data directory to hold
|
|
any data copied by the <codeph>INSERT</codeph> portion of the statement. (Even if no data is copied, Impala
|
|
might create one or more empty data files.)
|
|
</p>
|
|
|
|
<p conref="../shared/impala_common.xml#common/permissions_blurb"/>
|
|
<p rev="CDH-19187">
|
|
The user ID that the <cmdname>impalad</cmdname> daemon runs under,
|
|
typically the <codeph>impala</codeph> user, must have both execute and write
|
|
permission for the database directory where the table is being created.
|
|
</p>
|
|
|
|
<p conref="../shared/impala_common.xml#common/security_blurb"/>
|
|
<p conref="../shared/impala_common.xml#common/redaction_yes"/>
|
|
|
|
<p conref="../shared/impala_common.xml#common/cancel_blurb_maybe"/>
|
|
|
|
<p conref="../shared/impala_common.xml#common/related_info"/>
|
|
|
|
<p>
|
|
<xref href="impala_tables.xml#tables"/>,
|
|
<xref href="impala_alter_table.xml#alter_table"/>, <xref href="impala_drop_table.xml#drop_table"/>,
|
|
<xref href="impala_partitioning.xml#partitioning"/>, <xref href="impala_tables.xml#internal_tables"/>,
|
|
<xref href="impala_tables.xml#external_tables"/>, <xref href="impala_compute_stats.xml#compute_stats"/>,
|
|
<xref href="impala_sync_ddl.xml#sync_ddl"/>, <xref href="impala_show.xml#show_tables"/>,
|
|
<xref href="impala_show.xml#show_create_table"/>, <xref href="impala_describe.xml#describe"/>
|
|
</p>
|
|
</conbody>
|
|
</concept>
|