mirror of
https://github.com/apache/impala.git
synced 2025-12-30 03:01:44 -05:00
For this change to land in master, the audience="hidden" code review needs to be completed first. Otherwise, the doc build would still work but the audience="hidden" content would be visible rather than hidden as desired. Some work happening in parallel might introduce additional instances of audience="Cloudera". I suggest addressing those in a followup CR so this global change can land quickly. Since the changes apply across so many different files, but are so narrow in scope, I suggest that the way to validate (check that no extraneous changes were introduced accidentally) is to diff just the changed lines: git diff -U0 HEAD^ HEAD In patch set 2, I updated other topics marked audience="Cloudera" by CRs that were pushed in the meantime. Change-Id: Ic93d89da77e1f51bbf548a522d98d0c4e2fb31c8 Reviewed-on: http://gerrit.cloudera.org:8080/5613 Reviewed-by: John Russell <jrussell@cloudera.com> Tested-by: Impala Public Jenkins
851 lines
40 KiB
XML
851 lines
40 KiB
XML
<?xml version="1.0" encoding="UTF-8"?>
|
|
<!--
|
|
Licensed to the Apache Software Foundation (ASF) under one
|
|
or more contributor license agreements. See the NOTICE file
|
|
distributed with this work for additional information
|
|
regarding copyright ownership. The ASF licenses this file
|
|
to you under the Apache License, Version 2.0 (the
|
|
"License"); you may not use this file except in compliance
|
|
with the License. You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing,
|
|
software distributed under the License is distributed on an
|
|
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
KIND, either express or implied. See the License for the
|
|
specific language governing permissions and limitations
|
|
under the License.
|
|
-->
|
|
<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
|
|
<concept id="create_table" outputclass="impala sql_statement">
|
|
|
|
<title outputclass="impala_title sql_statement_title">CREATE TABLE Statement</title>
|
|
<titlealts audience="PDF"><navtitle>CREATE TABLE</navtitle></titlealts>
|
|
<prolog>
|
|
<metadata>
|
|
<data name="Category" value="Impala"/>
|
|
<data name="Category" value="SQL"/>
|
|
<data name="Category" value="DDL"/>
|
|
<data name="Category" value="Impala Data Types"/>
|
|
<data name="Category" value="Developers"/>
|
|
<data name="Category" value="Data Analysts"/>
|
|
<data name="Category" value="HDFS Caching"/>
|
|
<data name="Category" value="Tables"/>
|
|
<data name="Category" value="Schemas"/>
|
|
<data name="Category" value="S3"/>
|
|
<!-- <data name="Category" value="Kudu"/> -->
|
|
</metadata>
|
|
</prolog>
|
|
|
|
<conbody>
|
|
|
|
<p>
|
|
<indexterm audience="hidden">CREATE TABLE statement</indexterm>
|
|
Creates a new table and specifies its characteristics. While creating a table, you optionally specify aspects
|
|
such as:
|
|
</p>
|
|
|
|
<ul>
|
|
<li>
|
|
Whether the table is internal or external.
|
|
</li>
|
|
|
|
<li>
|
|
The columns and associated data types.
|
|
</li>
|
|
|
|
<li>
|
|
The columns used for physically partitioning the data.
|
|
</li>
|
|
|
|
<li>
|
|
The file format for data files.
|
|
</li>
|
|
|
|
<li>
|
|
The HDFS directory where the data files are located.
|
|
</li>
|
|
</ul>
|
|
|
|
<p conref="../shared/impala_common.xml#common/syntax_blurb"/>
|
|
|
|
<p>
|
|
The general syntax for creating a table and specifying its columns is as follows:
|
|
</p>
|
|
|
|
<p>
|
|
<b>Explicit column definitions:</b>
|
|
</p>
|
|
|
|
<codeblock>CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [<varname>db_name</varname>.]<varname>table_name</varname>
|
|
(<varname>col_name</varname> <varname>data_type</varname> [COMMENT '<varname>col_comment</varname>'], ...)
|
|
[PARTITIONED BY (<varname>col_name</varname> <varname>data_type</varname> [COMMENT '<varname>col_comment</varname>'], ...)]
|
|
[COMMENT '<varname>table_comment</varname>']
|
|
[WITH SERDEPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)]
|
|
[
|
|
[ROW FORMAT <varname>row_format</varname>] [STORED AS <varname>file_format</varname>]
|
|
]
|
|
[LOCATION '<varname>hdfs_path</varname>']
|
|
[TBLPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)]
|
|
<ph rev="1.4.0"> [CACHED IN '<varname>pool_name</varname>'</ph> <ph rev="2.2.0">[WITH REPLICATION = <varname>integer</varname>]</ph> | UNCACHED]
|
|
</codeblock>
|
|
|
|
<p>
|
|
<b>Column definitions inferred from data file:</b>
|
|
</p>
|
|
|
|
<codeblock>CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [<varname>db_name</varname>.]<varname>table_name</varname>
|
|
LIKE PARQUET '<varname>hdfs_path_of_parquet_file</varname>'
|
|
[COMMENT '<varname>table_comment</varname>']
|
|
[PARTITIONED BY (<varname>col_name</varname> <varname>data_type</varname> [COMMENT '<varname>col_comment</varname>'], ...)]
|
|
[WITH SERDEPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)]
|
|
[
|
|
[ROW FORMAT <varname>row_format</varname>] [STORED AS <varname>file_format</varname>]
|
|
]
|
|
[LOCATION '<varname>hdfs_path</varname>']
|
|
[TBLPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)]
|
|
<ph rev="1.4.0"> [CACHED IN '<varname>pool_name</varname>'</ph> <ph rev="2.2.0">[WITH REPLICATION = <varname>integer</varname>]</ph> | UNCACHED]
|
|
data_type:
|
|
<varname>primitive_type</varname>
|
|
| array_type
|
|
| map_type
|
|
| struct_type
|
|
</codeblock>
|
|
|
|
<p>
|
|
<b>CREATE TABLE AS SELECT:</b>
|
|
</p>
|
|
|
|
<codeblock>CREATE [EXTERNAL] TABLE [IF NOT EXISTS] <varname>db_name</varname>.]<varname>table_name</varname>
|
|
<ph rev="2.5.0">[PARTITIONED BY (<varname>col_name</varname>[, ...])]</ph>
|
|
[COMMENT '<varname>table_comment</varname>']
|
|
[WITH SERDEPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)]
|
|
[
|
|
[ROW FORMAT <varname>row_format</varname>] <ph rev="CDH-41501">[STORED AS <varname>ctas_file_format</varname>]</ph>
|
|
]
|
|
[LOCATION '<varname>hdfs_path</varname>']
|
|
[TBLPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)]
|
|
<ph rev="1.4.0"> [CACHED IN '<varname>pool_name</varname>'</ph> <ph rev="2.2.0">[WITH REPLICATION = <varname>integer</varname>]</ph> | UNCACHED]
|
|
AS
|
|
<varname>select_statement</varname></codeblock>
|
|
|
|
<codeblock>primitive_type:
|
|
TINYINT
|
|
| SMALLINT
|
|
| INT
|
|
| BIGINT
|
|
| BOOLEAN
|
|
| FLOAT
|
|
| DOUBLE
|
|
<ph rev="1.4.0">| DECIMAL</ph>
|
|
| STRING
|
|
<ph rev="2.0.0">| CHAR</ph>
|
|
<ph rev="2.0.0">| VARCHAR</ph>
|
|
| TIMESTAMP
|
|
|
|
<ph rev="2.3.0">complex_type:
|
|
struct_type
|
|
| array_type
|
|
| map_type
|
|
|
|
struct_type: STRUCT &lt; <varname>name</varname> : <varname>primitive_or_complex_type</varname> [COMMENT '<varname>comment_string</varname>'], ... >
|
|
|
|
array_type: ARRAY &lt; <varname>primitive_or_complex_type</varname> >
|
|
|
|
map_type: MAP &lt; <varname>primitive_type</varname>, <varname>primitive_or_complex_type</varname> >
|
|
</ph>
|
|
row_format:
|
|
DELIMITED [FIELDS TERMINATED BY '<varname>char</varname>' [ESCAPED BY '<varname>char</varname>']]
|
|
[LINES TERMINATED BY '<varname>char</varname>']
|
|
|
|
file_format:
|
|
PARQUET
|
|
| TEXTFILE
|
|
| AVRO
|
|
| SEQUENCEFILE
|
|
| RCFILE
|
|
|
|
<ph rev="CDH-41501">ctas_file_format:
|
|
PARQUET
|
|
| TEXTFILE</ph>
|
|
</codeblock>
|
|
|
|
<p conref="../shared/impala_common.xml#common/ddl_blurb"/>
|
|
|
|
<!-- Should really have some info up front about all the data types and file formats.
|
|
Consider adding here, or at least making inline links to the relevant keywords
|
|
in the syntax spec above. -->
|
|
|
|
<p>
|
|
<b>Column definitions:</b>
|
|
</p>
|
|
|
|
<p>
|
|
Depending on the form of the <codeph>CREATE TABLE</codeph> statement, the column definitions are
|
|
required or not allowed.
|
|
</p>
|
|
|
|
<p>
|
|
With the <codeph>CREATE TABLE AS SELECT</codeph> and <codeph>CREATE TABLE LIKE</codeph>
|
|
syntax, you do not specify the columns at all; the column names and types are derived from the source table, query,
|
|
or data file.
|
|
</p>
|
|
|
|
<p>
|
|
With the basic <codeph>CREATE TABLE</codeph> syntax, you must list one or more columns,
|
|
each with its name, type, and optionally a comment, in addition to any columns used as partitioning keys.
|
|
There is one exception where the column list is not required: when creating an Avro table with the
|
|
<codeph>STORED AS AVRO</codeph> clause, you can omit the list of columns and specify the same metadata
|
|
as part of the <codeph>TBLPROPERTIES</codeph> clause.
|
|
</p>
|
|
|
|
<p conref="../shared/impala_common.xml#common/complex_types_blurb"/>
|
|
|
|
<p rev="2.3.0">
|
|
The Impala complex types (<codeph>STRUCT</codeph>, <codeph>ARRAY</codeph>, or <codeph>MAP</codeph>)
|
|
are available in <keyword keyref="impala23_full"/> and higher.
|
|
Because you can nest these types (for example, to make an array of maps or a struct
|
|
with an array field), these types are also sometimes referred to as nested types.
|
|
See <xref href="impala_complex_types.xml#complex_types"/> for usage details.
|
|
</p>
|
|
|
|
<!-- This is kind of an obscure and rare usage scenario. Consider moving all the complex type stuff further down
|
|
after some of the more common clauses. -->
|
|
<p rev="2.3.0">
|
|
Impala can create tables containing complex type columns, with any supported file format.
|
|
Because currently Impala can only query complex type columns in Parquet tables, creating
|
|
tables with complex type columns and other file formats such as text is of limited use.
|
|
For example, you might create a text table including some columns with complex types with Impala, and use Hive
|
|
as part of your ETL pipeline to ingest the nested type data and copy it to an identical Parquet table.
|
|
Or you might create a partitioned table containing complex type columns using one file format, and
|
|
use <codeph>ALTER TABLE</codeph> to change the file format of individual partitions to Parquet; Impala
|
|
can then query only the Parquet-format partitions in that table.
|
|
</p>
|
|
|
|
<p conref="../shared/impala_common.xml#common/complex_types_partitioning"/>
|
|
|
|
<p>
|
|
<b>Internal and external tables (EXTERNAL and LOCATION clauses):</b>
|
|
</p>
|
|
|
|
<p>
|
|
By default, Impala creates an <q>internal</q> table, where Impala manages the underlying data files for the
|
|
table, and physically deletes the data files when you drop the table. If you specify the
|
|
<codeph>EXTERNAL</codeph> clause, Impala treats the table as an <q>external</q> table, where the data files
|
|
are typically produced outside Impala and queried from their original locations in HDFS, and Impala leaves
|
|
the data files in place when you drop the table. For details about internal and external tables, see
|
|
<xref href="impala_tables.xml#tables"/>.
|
|
</p>
|
|
|
|
<p>
|
|
Typically, for an external table you include a <codeph>LOCATION</codeph> clause to specify the path to the
|
|
HDFS directory where Impala reads and writes files for the table. For example, if your data pipeline produces
|
|
Parquet files in the HDFS directory <filepath>/user/etl/destination</filepath>, you might create an external
|
|
table as follows:
|
|
</p>
|
|
|
|
<codeblock>CREATE EXTERNAL TABLE external_parquet (c1 INT, c2 STRING, c3 TIMESTAMP)
|
|
STORED AS PARQUET LOCATION '/user/etl/destination';
|
|
</codeblock>
|
|
|
|
<p>
|
|
Although the <codeph>EXTERNAL</codeph> and <codeph>LOCATION</codeph> clauses are often specified together,
|
|
<codeph>LOCATION</codeph> is optional for external tables, and you can also specify <codeph>LOCATION</codeph>
|
|
for internal tables. The difference is all about whether Impala <q>takes control</q> of the underlying data
|
|
files and moves them when you rename the table, or deletes them when you drop the table. For more about
|
|
internal and external tables and how they interact with the <codeph>LOCATION</codeph> attribute, see
|
|
<xref href="impala_tables.xml#tables"/>.
|
|
</p>
|
|
|
|
<p>
|
|
<b>Partitioned tables (PARTITIONED BY clause):</b>
|
|
</p>
|
|
|
|
<p>
|
|
The <codeph>PARTITIONED BY</codeph> clause divides the data files based on the values from one or more
|
|
specified columns. Impala queries can use the partition metadata to minimize the amount of data that is read
|
|
from disk or transmitted across the network, particularly during join queries. For details about
|
|
partitioning, see <xref href="impala_partitioning.xml#partitioning"/>.
|
|
</p>
|
|
|
|
<p rev="2.5.0">
|
|
Prior to <keyword keyref="impala25_full"/> you could use a partitioned table
|
|
as the source and copy data from it, but could not specify any partitioning clauses for the new table.
|
|
In <keyword keyref="impala25_full"/> and higher, you can now use the <codeph>PARTITIONED BY</codeph> clause with a
|
|
<codeph>CREATE TABLE AS SELECT</codeph> statement. See the examples under the following discussion of
|
|
the <codeph>CREATE TABLE AS SELECT</codeph> syntax variation.
|
|
</p>
|
|
|
|
<!--
|
|
<p rev="kudu">
|
|
<b>Partitioning for Kudu tables (PARTITION BY clause)</b>
|
|
</p>
|
|
|
|
<p rev="kudu">
|
|
For Kudu tables, you specify logical partitioning across one or more columns using the
|
|
<codeph>PARTITION BY</codeph> clause. In contrast to partitioning for HDFS-based tables,
|
|
multiple values for a partition key column can be located in the same partition.
|
|
The optional <codeph>HASH</codeph> clause lets you divide one or a set of partition key columns
|
|
into a specified number of partitions; you can use more than one <codeph>HASH</codeph>
|
|
clause, specifying a distinct set of partition key columns for each.
|
|
The optional <codeph>RANGE</codeph> clause further subdivides the partitions, based on
|
|
a range of values for the partition key columns.
|
|
</p>
|
|
-->
|
|
|
|
<p>
|
|
<b>Specifying file format (STORED AS and ROW FORMAT clauses):</b>
|
|
</p>
|
|
|
|
<p rev="DOCS-1523">
|
|
The <codeph>STORED AS</codeph> clause identifies the format of the underlying data files. Currently, Impala
|
|
can query more types of file formats than it can create or insert into. Use Hive to perform any create or
|
|
data load operations that are not currently available in Impala. For example, Impala can create an Avro,
|
|
SequenceFile, or RCFile table but cannot insert data into it. There are also Impala-specific procedures for using
|
|
compression with each kind of file format. For details about working with data files of various formats, see
|
|
<xref href="impala_file_formats.xml#file_formats"/>.
|
|
</p>
|
|
|
|
<note>
|
|
In Impala 1.4.0 and higher, Impala can create Avro tables, which formerly required doing the <codeph>CREATE
|
|
TABLE</codeph> statement in Hive. See <xref href="impala_avro.xml#avro"/> for details and examples.
|
|
</note>
|
|
|
|
<p>
|
|
By default (when no <codeph>STORED AS</codeph> clause is specified), data files in Impala tables are created
|
|
as text files with Ctrl-A (hex 01) characters as the delimiter.
|
|
<!-- Verify if ROW FORMAT is entirely ignored outside of text tables, or does it apply somehow to SequenceFile and/or RCFile too? -->
|
|
Specify the <codeph>ROW FORMAT DELIMITED</codeph> clause to produce or ingest data files that use a different
|
|
delimiter character such as tab or <codeph>|</codeph>, or a different line end character such as carriage
|
|
return or newline. When specifying delimiter and line end characters with the <codeph>FIELDS TERMINATED
|
|
BY</codeph> and <codeph>LINES TERMINATED BY</codeph> clauses, use <codeph>'\t'</codeph> for tab,
|
|
<codeph>'\n'</codeph> for newline or linefeed, <codeph>'\r'</codeph> for carriage return, and
|
|
<codeph>\</codeph><codeph>0</codeph> for ASCII <codeph>nul</codeph> (hex 00). For more examples of text
|
|
tables, see <xref href="impala_txtfile.xml#txtfile"/>.
|
|
</p>
|
|
|
|
<p>
|
|
The <codeph>ESCAPED BY</codeph> clause applies both to text files that you create through an
|
|
<codeph>INSERT</codeph> statement to an Impala <codeph>TEXTFILE</codeph> table, and to existing data files
|
|
that you put into an Impala table directory. (You can ingest existing data files either by creating the table
|
|
with <codeph>CREATE EXTERNAL TABLE ... LOCATION</codeph>, the <codeph>LOAD DATA</codeph> statement, or
|
|
through an HDFS operation such as <codeph>hdfs dfs -put <varname>file</varname>
|
|
<varname>hdfs_path</varname></codeph>.) Choose an escape character that is not used anywhere else in the
|
|
file, and put it in front of each instance of the delimiter character that occurs within a field value.
|
|
Surrounding field values with quotation marks does not help Impala to parse fields with embedded delimiter
|
|
characters; the quotation marks are considered to be part of the column value. If you want to use
|
|
<codeph>\</codeph> as the escape character, specify the clause in <cmdname>impala-shell</cmdname> as
|
|
<codeph>ESCAPED BY '\\'</codeph>.
|
|
</p>
|
|
|
|
<note conref="../shared/impala_common.xml#common/thorn"/>
|
|
|
|
<p>
|
|
<b>Cloning tables (LIKE clause):</b>
|
|
</p>
|
|
|
|
<p>
|
|
To create an empty table with the same columns, comments, and other attributes as another table, use the
|
|
following variation. The <codeph>CREATE TABLE ... LIKE</codeph> form allows a restricted set of clauses,
|
|
currently only the <codeph>LOCATION</codeph>, <codeph>COMMENT</codeph>, and <codeph>STORED AS</codeph>
|
|
clauses.
|
|
</p>
|
|
|
|
<codeblock>CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [<varname>db_name</varname>.]<varname>table_name</varname>
|
|
<ph rev="1.4.0">LIKE { [<varname>db_name</varname>.]<varname>table_name</varname> | PARQUET '<varname>hdfs_path_of_parquet_file</varname>' }</ph>
|
|
[COMMENT '<varname>table_comment</varname>']
|
|
[STORED AS <varname>file_format</varname>]
|
|
[LOCATION '<varname>hdfs_path</varname>']</codeblock>
|
|
|
|
<note rev="1.2.0">
|
|
<p rev="1.2.0">
|
|
To clone the structure of a table and transfer data into it in a single operation, use the <codeph>CREATE
|
|
TABLE AS SELECT</codeph> syntax described in the next subsection.
|
|
</p>
|
|
</note>
|
|
|
|
<p>
|
|
When you clone the structure of an existing table using the <codeph>CREATE TABLE ... LIKE</codeph> syntax,
|
|
the new table keeps the same file format as the original one, so you only need to specify the <codeph>STORED
|
|
AS</codeph> clause if you want to use a different file format, or when specifying a view as the original
|
|
table. (Creating a table <q>like</q> a view produces a text table by default.)
|
|
</p>
|
|
|
|
<p>
|
|
Although normally Impala cannot create an HBase table directly, Impala can clone the structure of an existing
|
|
HBase table with the <codeph>CREATE TABLE ... LIKE</codeph> syntax, preserving the file format and metadata
|
|
from the original table.
|
|
</p>
|
|
|
|
<p>
|
|
There are some exceptions to the ability to use <codeph>CREATE TABLE ... LIKE</codeph> with an Avro table.
|
|
For example, you cannot use this technique for an Avro table that is specified with an Avro schema but no
|
|
columns. When in doubt, check if a <codeph>CREATE TABLE ... LIKE</codeph> operation works in Hive; if not, it
|
|
typically will not work in Impala either.
|
|
</p>
|
|
|
|
<p>
|
|
If the original table is partitioned, the new table inherits the same partition key columns. Because the new
|
|
table is initially empty, it does not inherit the actual partitions that exist in the original one. To create
|
|
partitions in the new table, insert data or issue <codeph>ALTER TABLE ... ADD PARTITION</codeph> statements.
|
|
</p>
|
|
|
|
<p conref="../shared/impala_common.xml#common/create_table_like_view"/>
|
|
|
|
<p>
|
|
Because <codeph>CREATE TABLE ... LIKE</codeph> only manipulates table metadata, not the physical data of the
|
|
table, issue <codeph>INSERT INTO TABLE</codeph> statements afterward to copy any data from the original table
|
|
into the new one, optionally converting the data to a new file format. (For some file formats, Impala can do
|
|
a <codeph>CREATE TABLE ... LIKE</codeph> to create the table, but Impala cannot insert data in that file
|
|
format; in these cases, you must load the data in Hive. See
|
|
<xref href="impala_file_formats.xml#file_formats"/> for details.)
|
|
</p>
|
|
|
|
<p rev="1.2" id="ctas">
|
|
<b>CREATE TABLE AS SELECT:</b>
|
|
</p>
|
|
|
|
<p>
|
|
The <codeph>CREATE TABLE AS SELECT</codeph> syntax is a shorthand notation to create a table based on column
|
|
definitions from another table, and copy data from the source table to the destination table without issuing
|
|
any separate <codeph>INSERT</codeph> statement. This idiom is so popular that it has its own acronym,
|
|
<q>CTAS</q>.
|
|
</p>
|
|
|
|
<p>
|
|
The following examples show how to copy data from a source table <codeph>T1</codeph>
|
|
to a variety of destination tables, applying various transformations to the table
|
|
properties, table layout, or the data itself as part of the operation:
|
|
</p>
|
|
|
|
<codeblock>
|
|
-- Sample table to be the source of CTAS operations.
|
|
CREATE TABLE t1 (x INT, y STRING);
|
|
INSERT INTO t1 VALUES (1, 'one'), (2, 'two'), (3, 'three');
|
|
|
|
-- Clone all the columns and data from one table to another.
|
|
CREATE TABLE clone_of_t1 AS SELECT * FROM t1;
|
|
+-------------------+
|
|
| summary |
|
|
+-------------------+
|
|
| Inserted 3 row(s) |
|
|
+-------------------+
|
|
|
|
-- Clone the columns and data, and convert the data to a different file format.
|
|
CREATE TABLE parquet_version_of_t1 STORED AS PARQUET AS SELECT * FROM t1;
|
|
+-------------------+
|
|
| summary |
|
|
+-------------------+
|
|
| Inserted 3 row(s) |
|
|
+-------------------+
|
|
|
|
-- Copy only some rows to the new table.
|
|
CREATE TABLE subset_of_t1 AS SELECT * FROM t1 WHERE x >= 2;
|
|
+-------------------+
|
|
| summary |
|
|
+-------------------+
|
|
| Inserted 2 row(s) |
|
|
+-------------------+
|
|
|
|
-- Same idea as CREATE TABLE LIKE: clone table layout but do not copy any data.
|
|
CREATE TABLE empty_clone_of_t1 AS SELECT * FROM t1 WHERE 1=0;
|
|
+-------------------+
|
|
| summary |
|
|
+-------------------+
|
|
| Inserted 0 row(s) |
|
|
+-------------------+
|
|
|
|
-- Reorder and rename columns and transform the data.
|
|
CREATE TABLE t5 AS SELECT upper(y) AS s, x+1 AS a, 'Entirely new column' AS n FROM t1;
|
|
+-------------------+
|
|
| summary |
|
|
+-------------------+
|
|
| Inserted 3 row(s) |
|
|
+-------------------+
|
|
SELECT * FROM t5;
|
|
+-------+---+---------------------+
|
|
| s | a | n |
|
|
+-------+---+---------------------+
|
|
| ONE | 2 | Entirely new column |
|
|
| TWO | 3 | Entirely new column |
|
|
| THREE | 4 | Entirely new column |
|
|
+-------+---+---------------------+
|
|
</codeblock>
|
|
|
|
<!-- These are a little heavyweight to get into here. Therefore commenting out.
|
|
Some overlap with the new column-changing examples in the code listing above.
|
|
Create tables with different column order, names, or types than the original.
|
|
CREATE TABLE some_columns_from_t1 AS SELECT c1, c3, c5 FROM t1;
|
|
CREATE TABLE reordered_columns_from_t1 AS SELECT c4, c3, c1, c2 FROM t1;
|
|
CREATE TABLE synthesized_columns AS SELECT upper(c1) AS all_caps, c2+c3 AS total, "California" AS state FROM t1;</codeblock>
|
|
-->
|
|
|
|
<!-- CREATE TABLE AS <select> now incorporated up higher in the original syntax diagram. -->
|
|
|
|
<p rev="1.2">
|
|
See <xref href="impala_select.xml#select"/> for details about query syntax for the <codeph>SELECT</codeph>
|
|
portion of a <codeph>CREATE TABLE AS SELECT</codeph> statement.
|
|
</p>
|
|
|
|
<p rev="1.2">
|
|
The newly created table inherits the column names that you select from the original table, which you can
|
|
override by specifying column aliases in the query. Any column or table comments from the original table are
|
|
not carried over to the new table.
|
|
</p>
|
|
|
|
<note rev="DOCS-1523">
|
|
When using the <codeph>STORED AS</codeph> clause with a <codeph>CREATE TABLE AS SELECT</codeph>
|
|
statement, the destination table must be a file format that Impala can write to: currently,
|
|
text or Parquet. You cannot specify an Avro, SequenceFile, or RCFile table as the destination
|
|
table for a CTAS operation.
|
|
</note>
|
|
|
|
<p rev="2.5.0">
|
|
Prior to <keyword keyref="impala25_full"/> you could use a partitioned table
|
|
as the source and copy data from it, but could not specify any partitioning clauses for the new table.
|
|
In <keyword keyref="impala25_full"/> and higher, you can now use the <codeph>PARTITIONED BY</codeph> clause with a
|
|
<codeph>CREATE TABLE AS SELECT</codeph> statement. The following example demonstrates how you can copy
|
|
data from an unpartitioned table in a <codeph>CREATE TABLE AS SELECT</codeph> operation, creating a new
|
|
partitioned table in the process. The main syntax consideration is the column order in the <codeph>PARTITIONED BY</codeph>
|
|
clause and the select list: the partition key columns must be listed last in the select list, in the same
|
|
order as in the <codeph>PARTITIONED BY</codeph> clause. Therefore, in this case, the column order in the
|
|
destination table is different from the source table. You also only specify the column names in the
|
|
<codeph>PARTITIONED BY</codeph> clause, not the data types or column comments.
|
|
</p>
|
|
|
|
<codeblock rev="2.5.0">
|
|
create table partitions_no (year smallint, month tinyint, s string);
|
|
insert into partitions_no values (2016, 1, 'January 2016'),
|
|
(2016, 2, 'February 2016'), (2016, 3, 'March 2016');
|
|
|
|
-- Prove that the source table is not partitioned.
|
|
show partitions partitions_no;
|
|
ERROR: AnalysisException: Table is not partitioned: ctas_partition_by.partitions_no
|
|
|
|
-- Create new table with partitions based on column values from source table.
|
|
<b>create table partitions_yes partitioned by (year, month)
|
|
as select s, year, month from partitions_no;</b>
|
|
+-------------------+
|
|
| summary |
|
|
+-------------------+
|
|
| Inserted 3 row(s) |
|
|
+-------------------+
|
|
|
|
-- Prove that the destination table is partitioned.
|
|
show partitions partitions_yes;
|
|
+-------+-------+-------+--------+------+...
|
|
| year | month | #Rows | #Files | Size |...
|
|
+-------+-------+-------+--------+------+...
|
|
| 2016 | 1 | -1 | 1 | 13B |...
|
|
| 2016 | 2 | -1 | 1 | 14B |...
|
|
| 2016 | 3 | -1 | 1 | 11B |...
|
|
| Total | | -1 | 3 | 38B |...
|
|
+-------+-------+-------+--------+------+...
|
|
</codeblock>
|
|
|
|
<p rev="2.5.0">
|
|
The most convenient layout for partitioned tables is with all the
|
|
partition key columns at the end. The CTAS <codeph>PARTITIONED BY</codeph> syntax
|
|
requires that column order in the select list, resulting in that same
|
|
column order in the destination table.
|
|
</p>
|
|
|
|
<codeblock rev="2.5.0">
|
|
describe partitions_no;
|
|
+-------+----------+---------+
|
|
| name | type | comment |
|
|
+-------+----------+---------+
|
|
| year | smallint | |
|
|
| month | tinyint | |
|
|
| s | string | |
|
|
+-------+----------+---------+
|
|
|
|
-- The CTAS operation forced us to put the partition key columns last.
|
|
-- Having those columns last works better with idioms such as SELECT *
|
|
-- for partitioned tables.
|
|
describe partitions_yes;
|
|
+-------+----------+---------+
|
|
| name | type | comment |
|
|
+-------+----------+---------+
|
|
| s | string | |
|
|
| year | smallint | |
|
|
| month | tinyint | |
|
|
+-------+----------+---------+
|
|
</codeblock>
|
|
|
|
<p rev="2.5.0">
|
|
Attempting to use a select list with the partition key columns
|
|
not at the end results in an error due to a column name mismatch:
|
|
</p>
|
|
|
|
<codeblock rev="2.5.0">
|
|
-- We expect this CTAS to fail because non-key column S
|
|
-- comes after key columns YEAR and MONTH in the select list.
|
|
create table partitions_maybe partitioned by (year, month)
|
|
as select year, month, s from partitions_no;
|
|
ERROR: AnalysisException: Partition column name mismatch: year != month
|
|
</codeblock>
|
|
|
|
<p rev="1.2">
|
|
For example, the following statements show how you can clone all the data in a table, or a subset of the
|
|
columns and/or rows, or reorder columns, rename them, or construct them out of expressions:
|
|
</p>
|
|
|
|
<p rev="1.2">
|
|
As part of a CTAS operation, you can convert the data to any file format that Impala can write (currently,
|
|
<codeph>TEXTFILE</codeph> and <codeph>PARQUET</codeph>). You cannot specify the lower-level properties of a
|
|
text table, such as the delimiter.
|
|
</p>
|
|
|
|
<p rev="obwl" conref="../shared/impala_common.xml#common/insert_sort_blurb"/>
|
|
|
|
<p rev="1.4.0">
|
|
<b>CREATE TABLE LIKE PARQUET:</b>
|
|
</p>
|
|
|
|
<p rev="1.4.0">
|
|
The variation <codeph>CREATE TABLE ... LIKE PARQUET '<varname>hdfs_path_of_parquet_file</varname>'</codeph>
|
|
lets you skip the column definitions of the <codeph>CREATE TABLE</codeph> statement. The column names and
|
|
data types are automatically configured based on the organization of the specified Parquet data file, which
|
|
must already reside in HDFS. You can use a data file located outside the Impala database directories, or a
|
|
file from an existing Impala Parquet table; either way, Impala only uses the column definitions from the file
|
|
and does not use the HDFS location for the <codeph>LOCATION</codeph> attribute of the new table. (Although
|
|
you can also specify the enclosing directory with the <codeph>LOCATION</codeph> attribute, to both use the
|
|
same schema as the data file and point the Impala table at the associated directory for querying.)
|
|
</p>
|
|
|
|
<p rev="1.4.0">
|
|
The following considerations apply when you use the <codeph>CREATE TABLE LIKE PARQUET</codeph> technique:
|
|
</p>
|
|
|
|
<ul rev="1.4.0">
|
|
<li>
|
|
Any column comments from the original table are not preserved in the new table. Each column in the new
|
|
table has a comment stating the low-level Parquet field type used to deduce the appropriate SQL column
|
|
type.
|
|
</li>
|
|
|
|
<li>
|
|
If you use a data file from a partitioned Impala table, any partition key columns from the original table
|
|
are left out of the new table, because they are represented in HDFS directory names rather than stored in
|
|
the data file. To preserve the partition information, repeat the same <codeph>PARTITION</codeph> clause as
|
|
in the original <codeph>CREATE TABLE</codeph> statement.
|
|
</li>
|
|
|
|
<li>
|
|
The file format of the new table defaults to text, as with other kinds of <codeph>CREATE TABLE</codeph>
|
|
statements. To make the new table also use Parquet format, include the clause <codeph>STORED AS
|
|
PARQUET</codeph> in the <codeph>CREATE TABLE LIKE PARQUET</codeph> statement.
|
|
</li>
|
|
|
|
<li>
|
|
If the Parquet data file comes from an existing Impala table, currently, any <codeph>TINYINT</codeph> or
|
|
<codeph>SMALLINT</codeph> columns are turned into <codeph>INT</codeph> columns in the new table.
|
|
Internally, Parquet stores such values as 32-bit integers.
|
|
</li>
|
|
|
|
<li>
|
|
When the destination table uses the Parquet file format, the <codeph>CREATE TABLE AS SELECT</codeph> and
|
|
<codeph>INSERT ... SELECT</codeph> statements always create at least one data file, even if the
|
|
<codeph>SELECT</codeph> part of the statement does not match any rows. You can use such an empty Parquet
|
|
data file as a template for subsequent <codeph>CREATE TABLE LIKE PARQUET</codeph> statements.
|
|
</li>
|
|
</ul>
|
|
|
|
<p>
|
|
For more details about creating Parquet tables, and examples of the <codeph>CREATE TABLE LIKE
|
|
PARQUET</codeph> syntax, see <xref href="impala_parquet.xml#parquet"/>.
|
|
</p>
|
|
|
|
<p>
|
|
<b>Visibility and Metadata (TBLPROPERTIES and WITH SERDEPROPERTIES clauses):</b>
|
|
</p>
|
|
|
|
<p rev="1.2">
|
|
You can associate arbitrary items of metadata with a table by specifying the <codeph>TBLPROPERTIES</codeph>
|
|
clause. This clause takes a comma-separated list of key-value pairs and stores those items in the metastore
|
|
database. You can also change the table properties later with an <codeph>ALTER TABLE</codeph> statement. You
|
|
can observe the table properties for different delimiter and escape characters using the <codeph>DESCRIBE
|
|
FORMATTED</codeph> command, and change those settings for an existing table with <codeph>ALTER TABLE ... SET
|
|
TBLPROPERTIES</codeph>.
|
|
</p>
|
|
|
|
<p rev="1.2">
|
|
You can also associate SerDes properties with the table by specifying key-value pairs through the
|
|
<codeph>WITH SERDEPROPERTIES</codeph> clause. This metadata is not used by Impala, which has its own built-in
|
|
serializer and deserializer for the file formats it supports. Particular property values might be needed for
|
|
Hive compatibility with certain variations of file formats, particularly Avro.
|
|
</p>
|
|
|
|
<p>
|
|
Some DDL operations that interact with other Hadoop components require specifying particular values in the
|
|
<codeph>SERDEPROPERTIES</codeph> or <codeph>TBLPROPERTIES</codeph> fields, such as creating an Avro table or
|
|
an HBase table. (You typically create HBase tables in Hive, because they require additional clauses not
|
|
currently available in Impala.)
|
|
<!-- Haven't got a working example from Lenni, so suppressing this recommendation for now.
|
|
The Avro schema properties can be specified through either
|
|
<codeph>TBLPROPERTIES</codeph> or <codeph>SERDEPROPERTIES</codeph>;
|
|
for best compatibility with future versions of Hive,
|
|
use <codeph>SERDEPROPERTIES</codeph> in this case.
|
|
-->
|
|
</p>
|
|
|
|
<p>
|
|
To see the column definitions and column comments for an existing table, for example before issuing a
|
|
<codeph>CREATE TABLE ... LIKE</codeph> or a <codeph>CREATE TABLE ... AS SELECT</codeph> statement, issue the
|
|
statement <codeph>DESCRIBE <varname>table_name</varname></codeph>. To see even more detail, such as the
|
|
location of data files and the values for clauses such as <codeph>ROW FORMAT</codeph> and <codeph>STORED
|
|
AS</codeph>, issue the statement <codeph>DESCRIBE FORMATTED <varname>table_name</varname></codeph>.
|
|
<codeph>DESCRIBE FORMATTED</codeph> is also needed to see any overall table comment (as opposed to individual
|
|
column comments).
|
|
</p>
|
|
|
|
<p>
|
|
After creating a table, your <cmdname>impala-shell</cmdname> session or another
|
|
<cmdname>impala-shell</cmdname> session connected to the same node can immediately query that table. There might be a
|
|
brief interval (one statestore heartbeat) before the table can be queried through a different Impala node. To
|
|
make the <codeph>CREATE TABLE</codeph> statement return only when the table is recognized by all Impala nodes
|
|
in the cluster, enable the <codeph>SYNC_DDL</codeph> query option.
|
|
</p>
|
|
|
|
<p rev="1.4.0">
|
|
<b>HDFS caching (CACHED IN clause):</b>
|
|
</p>
|
|
|
|
<p rev="1.4.0">
|
|
If you specify the <codeph>CACHED IN</codeph> clause, any existing or future data files in the table
|
|
directory or the partition subdirectories are designated to be loaded into memory with the HDFS caching
|
|
mechanism. See <xref href="impala_perf_hdfs_caching.xml#hdfs_caching"/> for details about using the HDFS
|
|
caching feature.
|
|
</p>
|
|
|
|
<p conref="../shared/impala_common.xml#common/impala_cache_replication_factor"/>
|
|
|
|
<!-- Say something in here about the SHOW statement, e.g. SHOW TABLES, SHOW TABLE/COLUMN STATS, SHOW PARTITIONS. -->
|
|
|
|
<p>
|
|
<b>Column order</b>:
|
|
</p>
|
|
|
|
<p>
|
|
If you intend to use the table to hold data files produced by some external source, specify the columns in
|
|
the same order as they appear in the data files.
|
|
</p>
|
|
|
|
<p>
|
|
If you intend to insert or copy data into the table through Impala, or if you have control over the way
|
|
externally produced data files are arranged, use your judgment to specify columns in the most convenient
|
|
order:
|
|
</p>
|
|
|
|
<ul>
|
|
<li>
|
|
<p>
|
|
If certain columns are often <codeph>NULL</codeph>, specify those columns last. You might produce data
|
|
files that omit these trailing columns entirely. Impala automatically fills in the <codeph>NULL</codeph>
|
|
values if so.
|
|
</p>
|
|
</li>
|
|
|
|
<li>
|
|
<p>
|
|
If an unpartitioned table will be used as the source for an <codeph>INSERT ... SELECT</codeph> operation
|
|
into a partitioned table, specify last in the unpartitioned table any columns that correspond to
|
|
partition key columns in the partitioned table, and in the same order as the partition key columns are
|
|
declared in the partitioned table. This technique lets you use <codeph>INSERT ... SELECT *</codeph> when
|
|
copying data to the partitioned table, rather than specifying each column name individually.
|
|
</p>
|
|
</li>
|
|
|
|
<li>
|
|
<p>
|
|
If you specify columns in an order that you later discover is suboptimal, you can sometimes work around
|
|
the problem without recreating the table. You can create a view that selects columns from the original
|
|
table in a permuted order, then do a <codeph>SELECT *</codeph> from the view. When inserting data into a
|
|
table, you can specify a permuted order for the inserted columns to match the order in the destination
|
|
table.
|
|
</p>
|
|
</li>
|
|
</ul>
|
|
|
|
<p conref="../shared/impala_common.xml#common/hive_blurb"/>
|
|
|
|
<p>
|
|
Impala queries can make use of metadata about the table and columns, such as the number of rows in a table or
|
|
the number of different values in a column. Prior to Impala 1.2.2, to create this metadata, you issued the
|
|
<codeph>ANALYZE TABLE</codeph> statement in Hive to gather this information, after creating the table and
|
|
loading representative data into it. In Impala 1.2.2 and higher, the <codeph>COMPUTE STATS</codeph> statement
|
|
produces these statistics within Impala, without needing to use Hive at all.
|
|
</p>
|
|
|
|
<p conref="../shared/impala_common.xml#common/hbase_blurb"/>
|
|
|
|
<note>
|
|
<p>
|
|
The Impala <codeph>CREATE TABLE</codeph> statement cannot create an HBase table, because it currently does
|
|
not support the <codeph>STORED BY</codeph> clause needed for HBase tables. Create such tables in Hive, then
|
|
query them through Impala. For information on using Impala with HBase tables, see
|
|
<xref href="impala_hbase.xml#impala_hbase"/>.
|
|
</p>
|
|
</note>
|
|
|
|
<p conref="../shared/impala_common.xml#common/s3_blurb"/>
|
|
<p rev="2.2.0">
|
|
To create a table where the data resides in the Amazon Simple Storage Service (S3),
|
|
specify a <codeph>LOCATION</codeph> attribute with an <codeph>s3a://</codeph> prefix, pointing to the data files in S3.
|
|
</p>
|
|
|
|
<p rev="2.6.0 CDH-39913 IMPALA-1878">
|
|
In <keyword keyref="impala26_full"/> and higher, you can
|
|
use this special <codeph>LOCATION</codeph> syntax
|
|
as part of a <codeph>CREATE TABLE AS SELECT</codeph> statement.
|
|
</p>
|
|
|
|
<p conref="../shared/impala_common.xml#common/s3_ddl"/>
|
|
|
|
<p conref="../shared/impala_common.xml#common/insert_sort_blurb"/>
|
|
|
|
<p conref="../shared/impala_common.xml#common/hdfs_blurb"/>
|
|
|
|
<p>
|
|
The <codeph>CREATE TABLE</codeph> statement for an internal table creates a directory in HDFS. The
|
|
<codeph>CREATE EXTERNAL TABLE</codeph> statement associates the table with an existing HDFS directory, and
|
|
does not create any new directory in HDFS. To locate the HDFS data directory for a table, issue a
|
|
<codeph>DESCRIBE FORMATTED <varname>table</varname></codeph> statement. To examine the contents of that HDFS
|
|
directory, use an OS command such as <codeph>hdfs dfs -ls hdfs://<varname>path</varname></codeph>, either
|
|
from the OS command line or through the <codeph>shell</codeph> or <codeph>!</codeph> commands in
|
|
<cmdname>impala-shell</cmdname>.
|
|
</p>
|
|
|
|
<p>
|
|
The <codeph>CREATE TABLE AS SELECT</codeph> syntax creates data files under the table data directory to hold
|
|
any data copied by the <codeph>INSERT</codeph> portion of the statement. (Even if no data is copied, Impala
|
|
might create one or more empty data files.)
|
|
</p>
|
|
|
|
<p conref="../shared/impala_common.xml#common/permissions_blurb"/>
|
|
<p rev="CDH-19187">
|
|
The user ID that the <cmdname>impalad</cmdname> daemon runs under,
|
|
typically the <codeph>impala</codeph> user, must have both execute and write
|
|
permission for the database directory where the table is being created.
|
|
</p>
|
|
|
|
<p conref="../shared/impala_common.xml#common/security_blurb"/>
|
|
<p conref="../shared/impala_common.xml#common/redaction_yes"/>
|
|
|
|
<p conref="../shared/impala_common.xml#common/cancel_blurb_maybe"/>
|
|
|
|
<p conref="../shared/impala_common.xml#common/related_info"/>
|
|
|
|
<p>
|
|
<xref href="impala_tables.xml#tables"/>,
|
|
<xref href="impala_alter_table.xml#alter_table"/>, <xref href="impala_drop_table.xml#drop_table"/>,
|
|
<xref href="impala_partitioning.xml#partitioning"/>, <xref href="impala_tables.xml#internal_tables"/>,
|
|
<xref href="impala_tables.xml#external_tables"/>, <xref href="impala_compute_stats.xml#compute_stats"/>,
|
|
<xref href="impala_sync_ddl.xml#sync_ddl"/>, <xref href="impala_show.xml#show_tables"/>,
|
|
<xref href="impala_show.xml#show_create_table"/>, <xref href="impala_describe.xml#describe"/>
|
|
</p>
|
|
</conbody>
|
|
</concept>
|