mirror of
https://github.com/apache/impala.git
synced 2025-12-19 09:58:28 -05:00
Extended the ALTER TABLE documentation with the SORT BY clause. Also added more information about the available and the default sort orders to the CREATE TABLE description. Testing: Built docs locally. Change-Id: Ieb348d8395a6140f0be200d73e2f22fded9a5116 Reviewed-on: http://gerrit.cloudera.org:8080/21083 Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com> Reviewed-by: Daniel Becker <daniel.becker@cloudera.com>
1218 lines
54 KiB
XML
1218 lines
54 KiB
XML
<?xml version="1.0" encoding="UTF-8"?>
|
|
<!--
|
|
Licensed to the Apache Software Foundation (ASF) under one
|
|
or more contributor license agreements. See the NOTICE file
|
|
distributed with this work for additional information
|
|
regarding copyright ownership. The ASF licenses this file
|
|
to you under the Apache License, Version 2.0 (the
|
|
"License"); you may not use this file except in compliance
|
|
with the License. You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing,
|
|
software distributed under the License is distributed on an
|
|
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
KIND, either express or implied. See the License for the
|
|
specific language governing permissions and limitations
|
|
under the License.
|
|
-->
|
|
<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
|
|
<concept id="alter_table">
|
|
|
|
<title>ALTER TABLE Statement</title>
|
|
|
|
<titlealts audience="PDF">
|
|
|
|
<navtitle>ALTER TABLE</navtitle>
|
|
|
|
</titlealts>
|
|
|
|
<prolog>
|
|
<metadata>
|
|
<data name="Category" value="Impala"/>
|
|
<data name="Category" value="Impala Data Types"/>
|
|
<data name="Category" value="SQL"/>
|
|
<data name="Category" value="DDL"/>
|
|
<data name="Category" value="HDFS Caching"/>
|
|
<data name="Category" value="Tables"/>
|
|
<data name="Category" value="Schemas"/>
|
|
<data name="Category" value="S3"/>
|
|
<data name="Category" value="Developers"/>
|
|
<data name="Category" value="Data Analysts"/>
|
|
<data name="Category" value="Kudu"/>
|
|
</metadata>
|
|
</prolog>
|
|
|
|
<conbody>
|
|
|
|
<p> The <codeph>ALTER TABLE</codeph> statement changes the structure or
|
|
properties of an existing Impala table. </p>
|
|
|
|
<p>
|
|
In Impala, this is primarily a logical operation that updates the table metadata in the
|
|
metastore database that Impala shares with Hive. Most <codeph>ALTER TABLE</codeph>
|
|
operations do not actually rewrite, move, and so on the actual data files. (The
|
|
<codeph>RENAME TO</codeph> clause is the one exception; it can cause HDFS files to be
|
|
moved to different paths.) When you do an <codeph>ALTER TABLE</codeph> operation, you
|
|
typically need to perform corresponding physical filesystem operations, such as rewriting
|
|
the data files to include extra fields, or converting them to a different file format.
|
|
</p>
|
|
|
|
<p conref="../shared/impala_common.xml#common/syntax_blurb"/>
|
|
|
|
<codeblock>ALTER TABLE [<varname>old_db_name</varname>.]<varname>old_table_name</varname> RENAME TO [<varname>new_db_name</varname>.]<varname>new_table_name</varname>
|
|
|
|
ALTER TABLE <varname>name</varname> ADD [IF NOT EXISTS] COLUMNS (<varname>col_spec</varname>[, <varname>col_spec</varname> ...])
|
|
ALTER TABLE <varname>name</varname> REPLACE COLUMNS (<varname>col_spec</varname>[, <varname>col_spec</varname> ...])
|
|
|
|
ALTER TABLE <varname>name</varname> ADD COLUMN [IF NOT EXISTS] <varname>col_spec</varname>
|
|
ALTER TABLE <varname>name</varname> DROP [COLUMN] <varname>column_name</varname>
|
|
ALTER TABLE <varname>name</varname> CHANGE <varname>column_name</varname> <varname>col_spec</varname>
|
|
|
|
<ph rev="3.1 IMPALA-6988">ALTER TABLE <varname>name</varname> SET OWNER USER <varname>user_name</varname>
|
|
</ph>
|
|
<ph rev="2.10.0 IMPALA-4622">-- Kudu tables only.
|
|
ALTER TABLE <varname>name</varname> ALTER [COLUMN] <varname>column_name</varname>
|
|
{ SET <varname>kudu_storage_attr</varname> <varname>attr_value</varname>
|
|
| DROP DEFAULT }
|
|
|
|
kudu_storage_attr ::= { DEFAULT | BLOCK_SIZE | ENCODING | COMPRESSION }</ph>
|
|
|
|
<ph rev="2.10.0 IMPALA-4622">-- Non-Kudu tables only.
|
|
ALTER TABLE <varname>name</varname> ALTER [COLUMN] <varname>column_name</varname>
|
|
SET COMMENT '<varname>comment_text</varname>'</ph>
|
|
|
|
ALTER TABLE <varname>name</varname> ADD [IF NOT EXISTS] PARTITION (<varname>partition_spec</varname>)
|
|
<ph rev="IMPALA-4390">[<varname>location_spec</varname>]</ph>
|
|
<ph rev="IMPALA-4390">[<varname>cache_spec</varname>]</ph>
|
|
<ph rev="kudu">ALTER TABLE <varname>name</varname> ADD [IF NOT EXISTS] RANGE PARTITION <varname>kudu_partition_spec</varname></ph>
|
|
|
|
ALTER TABLE <varname>name</varname> DROP [IF EXISTS] PARTITION (<varname>partition_spec</varname>)
|
|
<ph rev="2.3.0">[PURGE]</ph>
|
|
<ph rev="kudu">ALTER TABLE <varname>name</varname> DROP [IF EXISTS] RANGE PARTITION <varname>kudu_partition_spec</varname></ph>
|
|
|
|
<ph rev="2.3.0 IMPALA-1568">ALTER TABLE <varname>name</varname> RECOVER PARTITIONS</ph>
|
|
|
|
ALTER TABLE <varname>name</varname> [PARTITION (<varname>partition_spec</varname>)]
|
|
SET { FILEFORMAT <varname>file_format</varname>
|
|
| ROW FORMAT <varname>row_format</varname>
|
|
| LOCATION '<varname>hdfs_path_of_directory</varname>'
|
|
| TBLPROPERTIES (<varname>table_properties</varname>)
|
|
| SERDEPROPERTIES (<varname>serde_properties</varname>) }
|
|
|
|
<ph rev="2.6.0 IMPALA-3369">ALTER TABLE <varname>name</varname> SET COLUMN STATS <varname>colname</varname>
|
|
('<varname>statsKey</varname>'='<varname>val</varname>', ...)
|
|
|
|
statsKey ::= numDVs | numNulls | avgSize | maxSize</ph>
|
|
|
|
<ph rev="1.4.0">ALTER TABLE <varname>name</varname> [PARTITION (<varname>partition_spec</varname>)] SET { CACHED IN '<varname>pool_name</varname>' <ph rev="2.2.0">[WITH REPLICATION = <varname>integer</varname>]</ph> | UNCACHED }</ph>
|
|
|
|
<varname>new_name</varname> ::= [<varname>new_database</varname>.]<varname>new_table_name</varname>
|
|
|
|
<varname>col_spec</varname> ::= <varname>col_name</varname> <varname>type_name</varname> <ph rev="2.9.0 IMPALA-4616"> COMMENT '<varname>column-comment</varname>' [<varname>kudu_attributes</varname>]</ph>
|
|
|
|
<ph rev="2.9.0 IMPALA-4616"><varname>kudu_attributes</varname> ::= { [NOT] NULL | ENCODING <varname>codec</varname> | COMPRESSION <varname>algorithm</varname> |
|
|
DEFAULT <varname>constant</varname> | BLOCK_SIZE <varname>number</varname> }</ph>
|
|
|
|
<varname>partition_spec</varname> ::= <varname>simple_partition_spec</varname> | <ph rev="IMPALA-1654"><varname>complex_partition_spec</varname></ph>
|
|
|
|
<varname>simple_partition_spec</varname> ::= <varname>partition_col</varname>=<varname>constant_value</varname>
|
|
|
|
<ph rev="IMPALA-1654"><varname>complex_partition_spec</varname> ::= <varname>comparison_expression_on_partition_col</varname></ph>
|
|
|
|
<ph rev="kudu"><varname>kudu_partition_spec</varname> ::= <varname>constant</varname> <varname>range_operator</varname> VALUES <varname>range_operator</varname> <varname>constant</varname> | VALUE = <varname>constant</varname></ph>
|
|
|
|
<ph rev="IMPALA-4390">cache_spec ::= CACHED IN '<varname>pool_name</varname>' [WITH REPLICATION = <varname>integer</varname>] | UNCACHED</ph>
|
|
|
|
<ph rev="IMPALA-4390">location_spec ::= LOCATION '<varname>hdfs_path_of_directory</varname>'</ph>
|
|
|
|
<varname>table_properties</varname> ::= '<varname>name</varname>'='<varname>value</varname>'[, '<varname>name</varname>'='<varname>value</varname>' ...]
|
|
|
|
<varname>serde_properties</varname> ::= '<varname>name</varname>'='<varname>value</varname>'[, '<varname>name</varname>'='<varname>value</varname>' ...]
|
|
|
|
<varname>file_format</varname> ::= { PARQUET | TEXTFILE | RCFILE | SEQUENCEFILE | AVRO }
|
|
|
|
<varname>row_format</varname> ::= DELIMITED
|
|
[FIELDS TERMINATED BY '<varname>char</varname>' [ESCAPED BY '<varname>char</varname>']]
|
|
[LINES TERMINATED BY '<varname>char</varname>']</codeblock>
|
|
|
|
<p conref="../shared/impala_common.xml#common/ddl_blurb"/>
|
|
|
|
<p conref="../shared/impala_common.xml#common/complex_types_blurb"/>
|
|
|
|
<p rev="2.3.0">
|
|
In <keyword keyref="impala23_full"/> and higher, the <codeph>ALTER TABLE</codeph>
|
|
statement can change the metadata for tables containing complex types
|
|
(<codeph>ARRAY</codeph>, <codeph>STRUCT</codeph>, and <codeph>MAP</codeph>). For example,
|
|
you can use an <codeph>ADD COLUMNS</codeph>, <codeph>DROP COLUMN</codeph>, or
|
|
<codeph>CHANGE</codeph> clause to modify the table layout for complex type columns.
|
|
Although Impala queries only work for complex type columns in Parquet tables, the complex
|
|
type support in the <codeph>ALTER TABLE</codeph> statement applies to all file formats.
|
|
For example, you can use Impala to update metadata for a staging table in a non-Parquet
|
|
file format where the data is populated by Hive. Or you can use <codeph>ALTER TABLE SET
|
|
FILEFORMAT</codeph> to change the format of an existing table to Parquet so that Impala
|
|
can query it. Remember that changing the file format for a table does not convert the data
|
|
files within the table; you must prepare any Parquet data files containing complex types
|
|
outside Impala, and bring them into the table using <codeph>LOAD DATA</codeph> or updating
|
|
the table's <codeph>LOCATION</codeph> property. See
|
|
<xref
|
|
href="impala_complex_types.xml#complex_types"/> for details about using
|
|
complex types.
|
|
</p>
|
|
|
|
<p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
|
|
|
|
<p>
|
|
Whenever you specify partitions in an <codeph>ALTER TABLE</codeph> statement, through the
|
|
<codeph>PARTITION (<varname>partition_spec</varname>)</codeph> clause, you must include
|
|
all the partitioning columns in the specification.
|
|
</p>
|
|
|
|
<p>
|
|
Most of the <codeph>ALTER TABLE</codeph> operations work the same for internal tables
|
|
(managed by Impala) as for external tables (with data files located in arbitrary
|
|
locations). The exception is renaming a table; for an external table, the underlying data
|
|
directory is not renamed or moved.
|
|
</p>
|
|
|
|
<p>
|
|
<b>To drop or alter multiple partitions:</b>
|
|
</p>
|
|
|
|
<p rev="IMPALA-1654">
|
|
In <keyword keyref="impala28_full"/> and higher, the expression for the partition clause
|
|
with a <codeph>DROP</codeph> or <codeph>SET</codeph> operation can include comparison
|
|
operators such as <codeph><</codeph>, <codeph>IN</codeph>, or <codeph>BETWEEN</codeph>,
|
|
and Boolean operators such as <codeph>AND</codeph> and <codeph>OR</codeph>.
|
|
</p>
|
|
|
|
<p rev="IMPALA-1654">
|
|
For example, you might drop a group of partitions corresponding to a particular date range
|
|
after the data <q>ages out</q>:
|
|
</p>
|
|
|
|
<codeblock><![CDATA[
|
|
alter table historical_data drop partition (year < 1995);
|
|
alter table historical_data drop partition (year = 1996 and month between 1 and 6);
|
|
]]>
|
|
</codeblock>
|
|
|
|
<p rev="IMPALA-1654">
|
|
For tables with multiple partition key columns, you can specify multiple conditions
|
|
separated by commas, and the operation only applies to the partitions that match all the
|
|
conditions (similar to using an <codeph>AND</codeph> clause):
|
|
</p>
|
|
|
|
<codeblock><![CDATA[
|
|
alter table historical_data drop partition (year < 1995, last_name like 'A%');
|
|
]]>
|
|
</codeblock>
|
|
|
|
<p rev="IMPALA-1654">
|
|
This technique can also be used to change the file format of groups of partitions, as part
|
|
of an ETL pipeline that periodically consolidates and rewrites the underlying data files
|
|
in a different file format:
|
|
</p>
|
|
|
|
<codeblock><![CDATA[
|
|
alter table fast_growing_data partition (year = 2016, month in (10,11,12)) set fileformat parquet;
|
|
]]>
|
|
</codeblock>
|
|
|
|
<note>
|
|
<p rev="IMPALA-1654">
|
|
The extended syntax involving comparison operators and multiple partitions applies to
|
|
the <codeph>SET FILEFORMAT</codeph>, <codeph>SET TBLPROPERTIES</codeph>, <codeph>SET
|
|
SERDEPROPERTIES</codeph>, and <codeph>SET [UN]CACHED</codeph> clauses. You can also use
|
|
this syntax with the <codeph>PARTITION</codeph> clause in the <codeph>COMPUTE
|
|
INCREMENTAL STATS</codeph> statement, and with the <codeph>PARTITION</codeph> clause of
|
|
the <codeph>SHOW FILES</codeph> statement. Some forms of <codeph>ALTER TABLE</codeph>
|
|
still only apply to one partition at a time: the <codeph>SET LOCATION</codeph> and
|
|
<codeph>ADD PARTITION</codeph> clauses. The <codeph>PARTITION</codeph> clauses in the
|
|
<codeph>LOAD DATA</codeph> and <codeph>INSERT</codeph> statements also only apply to one
|
|
partition at a time.
|
|
</p>
|
|
|
|
<p>
|
|
A DDL statement that applies to multiple partitions is considered successful (resulting
|
|
in no changes) even if no partitions match the conditions. The results are the same as
|
|
if the <codeph>IF EXISTS</codeph> clause was specified.
|
|
</p>
|
|
|
|
<p>
|
|
The performance and scalability of this technique is similar to issuing a sequence of
|
|
single-partition <codeph>ALTER TABLE</codeph> statements in quick succession. To
|
|
minimize bottlenecks due to communication with the metastore database, or causing other
|
|
DDL operations on the same table to wait, test the effects of performing <codeph>ALTER
|
|
TABLE</codeph> statements that affect large numbers of partitions.
|
|
</p>
|
|
</note>
|
|
|
|
<p conref="../shared/impala_common.xml#common/s3_blurb"/>
|
|
|
|
<p rev="2.6.0 IMPALA-1878">
|
|
You can specify an <codeph>s3a://</codeph> prefix on the <codeph>LOCATION</codeph>
|
|
attribute of a table or partition to make Impala query data from the Amazon S3 filesystem.
|
|
In <keyword
|
|
keyref="impala26_full"/> and higher, Impala automatically handles
|
|
creating or removing the associated folders when you issue <codeph>ALTER TABLE</codeph>
|
|
statements with the <codeph>ADD PARTITION</codeph> or <codeph>DROP PARTITION</codeph>
|
|
clauses.
|
|
</p>
|
|
|
|
<p conref="../shared/impala_common.xml#common/s3_ddl"/>
|
|
|
|
<p rev="1.4.0">
|
|
<b>HDFS caching (CACHED IN clause):</b>
|
|
</p>
|
|
|
|
<p rev="1.4.0">
|
|
If you specify the <codeph>CACHED IN</codeph> clause, any existing or future data files in
|
|
the table directory or the partition subdirectories are designated to be loaded into
|
|
memory with the HDFS caching mechanism. See
|
|
<xref
|
|
href="impala_perf_hdfs_caching.xml#hdfs_caching"/> for details about using
|
|
the HDFS caching feature.
|
|
</p>
|
|
|
|
<p
|
|
conref="../shared/impala_common.xml#common/impala_cache_replication_factor"/>
|
|
|
|
<p conref="../shared/impala_common.xml#common/sync_ddl_blurb"/>
|
|
|
|
<p>
|
|
<b>To rename a table (RENAME TO clause):</b>
|
|
</p>
|
|
|
|
<!-- Beefing up the syntax in its original location up to, don't need to repeat it here.
|
|
<codeblock>ALTER TABLE <varname>old_name</varname> RENAME TO <varname>new_name</varname>;</codeblock>
|
|
-->
|
|
|
|
<p>
|
|
The <codeph>RENAME TO</codeph> clause lets you change the name of an existing table, and
|
|
optionally which database it is located in.
|
|
</p>
|
|
|
|
<p>
|
|
For internal tables, this operation physically renames the directory within HDFS that
|
|
contains the data files; the original directory name no longer exists. By qualifying the
|
|
table names with database names, you can use this technique to move an internal table (and
|
|
its associated data directory) from one database to another. For example:
|
|
</p>
|
|
|
|
<codeblock>create database d1;
|
|
create database d2;
|
|
create database d3;
|
|
use d1;
|
|
create table mobile (x int);
|
|
use d2;
|
|
-- Move table from another database to the current one.
|
|
alter table d1.mobile rename to mobile;
|
|
use d1;
|
|
-- Move table from one database to another.
|
|
alter table d2.mobile rename to d3.mobile;</codeblock>
|
|
|
|
<p rev="3.1 IMPALA-6988">
|
|
<b>To change the owner of a table:</b>
|
|
</p>
|
|
|
|
<codeblock>ALTER TABLE <varname>name</varname> SET OWNER USER <varname>user_name</varname>;
|
|
</codeblock>
|
|
|
|
<p>
|
|
The table owner is originally set to the user who creates the table. The term
|
|
<codeph>OWNER</codeph> is used to differentiate between the <codeph>ALL</codeph>
|
|
privilege that is explicitly granted via the <codeph>GRANT</codeph> statement and a
|
|
privilege that is implicitly granted by the <codeph>CREATE TABLE</codeph> statement.
|
|
</p>
|
|
|
|
<p>
|
|
Use the <codeph>ALTER TABLE SET OWNER</codeph> to transfer the ownership from the current
|
|
owner to another user.
|
|
</p>
|
|
|
|
<p>
|
|
<b>To change the physical location where Impala looks for data files associated with a
|
|
table or partition:</b>
|
|
</p>
|
|
|
|
<codeblock>ALTER TABLE <varname>table_name</varname> [PARTITION (<varname>partition_spec</varname>)] SET LOCATION '<varname>hdfs_path_of_directory</varname>';</codeblock>
|
|
|
|
<p>
|
|
The path you specify is the full HDFS path where the data files reside, or will be
|
|
created. Impala does not create any additional subdirectory named after the table. Impala
|
|
does not move any data files to this new location or change any data files that might
|
|
already exist in that directory.
|
|
</p>
|
|
|
|
<p>
|
|
To set the location for a single partition, include the <codeph>PARTITION</codeph> clause.
|
|
Specify all the same partitioning columns for the table, with a constant value for each,
|
|
to precisely identify the single partition affected by the statement:
|
|
</p>
|
|
|
|
<codeblock>create table p1 (s string) partitioned by (month int, day int);
|
|
-- Each ADD PARTITION clause creates a subdirectory in HDFS.
|
|
alter table p1 add partition (month=1, day=1);
|
|
alter table p1 add partition (month=1, day=2);
|
|
alter table p1 add partition (month=2, day=1);
|
|
alter table p1 add partition (month=2, day=2);
|
|
-- Redirect queries, INSERT, and LOAD DATA for one partition
|
|
-- to a specific different directory.
|
|
alter table p1 partition (month=1, day=1) set location '/usr/external_data/new_years_day';
|
|
</codeblock>
|
|
|
|
<note conref="../shared/impala_common.xml#common/add_partition_set_location"/>
|
|
|
|
<p rev="2.3.0 IMPALA-1568">
|
|
<b>To automatically detect new partition directories added through Hive or HDFS
|
|
operations:</b>
|
|
</p>
|
|
|
|
<p rev="2.3.0 IMPALA-1568">
|
|
In <keyword keyref="impala23_full"/> and higher, the <codeph>RECOVER PARTITIONS</codeph>
|
|
clause scans a partitioned table to detect if any new partition directories were added
|
|
outside of Impala, such as by Hive <codeph>ALTER TABLE</codeph> statements or by
|
|
<cmdname>hdfs dfs</cmdname> or <cmdname>hadoop fs</cmdname> commands. The <codeph>RECOVER
|
|
PARTITIONS</codeph> clause automatically recognizes any data files present in these new
|
|
directories, the same as the <codeph>REFRESH</codeph> statement does.
|
|
</p>
|
|
|
|
<p rev="2.3.0 IMPALA-1568">
|
|
For example, here is a sequence of examples showing how you might create a partitioned
|
|
table in Impala, create new partitions through Hive, copy data files into the new
|
|
partitions with the <cmdname>hdfs</cmdname> command, and have Impala recognize the new
|
|
partitions and new data:
|
|
</p>
|
|
|
|
<p rev="2.3.0 IMPALA-1568">
|
|
In Impala, create the table, and a single partition for demonstration purposes:
|
|
</p>
|
|
|
|
<codeblock rev="2.3.0 IMPALA-1568">
|
|
<![CDATA[
|
|
create database recover_partitions;
|
|
use recover_partitions;
|
|
create table t1 (s string) partitioned by (yy int, mm int);
|
|
insert into t1 partition (yy = 2016, mm = 1) values ('Partition exists');
|
|
show files in t1;
|
|
+---------------------------------------------------------------------+------+--------------+
|
|
| Path | Size | Partition |
|
|
+---------------------------------------------------------------------+------+--------------+
|
|
| /user/hive/warehouse/recover_partitions.db/t1/yy=2016/mm=1/data.txt | 17B | yy=2016/mm=1 |
|
|
+---------------------------------------------------------------------+------+--------------+
|
|
quit;
|
|
]]>
|
|
</codeblock>
|
|
|
|
<p rev="2.3.0 IMPALA-1568">
|
|
In Hive, create some new partitions. In a real use case, you might create the partitions
|
|
and populate them with data as the final stages of an ETL pipeline.
|
|
</p>
|
|
|
|
<codeblock rev="2.3.0 IMPALA-1568">
|
|
<![CDATA[
|
|
hive> use recover_partitions;
|
|
OK
|
|
hive> alter table t1 add partition (yy = 2016, mm = 2);
|
|
OK
|
|
hive> alter table t1 add partition (yy = 2016, mm = 3);
|
|
OK
|
|
hive> quit;
|
|
]]>
|
|
</codeblock>
|
|
|
|
<p rev="2.3.0 IMPALA-1568">
|
|
For demonstration purposes, manually copy data (a single row) into these new partitions,
|
|
using manual HDFS operations:
|
|
</p>
|
|
|
|
<codeblock rev="2.3.0 IMPALA-1568">
|
|
<![CDATA[
|
|
$ hdfs dfs -ls /user/hive/warehouse/recover_partitions.db/t1/yy=2016/
|
|
Found 3 items
|
|
drwxr-xr-x - impala hive 0 2016-05-09 16:06 /user/hive/warehouse/recover_partitions.db/t1/yy=2016/mm=1
|
|
drwxr-xr-x - jrussell hive 0 2016-05-09 16:14 /user/hive/warehouse/recover_partitions.db/t1/yy=2016/mm=2
|
|
drwxr-xr-x - jrussell hive 0 2016-05-09 16:13 /user/hive/warehouse/recover_partitions.db/t1/yy=2016/mm=3
|
|
|
|
$ hdfs dfs -cp /user/hive/warehouse/recover_partitions.db/t1/yy=2016/mm=1/data.txt \
|
|
/user/hive/warehouse/recover_partitions.db/t1/yy=2016/mm=2/data.txt
|
|
$ hdfs dfs -cp /user/hive/warehouse/recover_partitions.db/t1/yy=2016/mm=1/data.txt \
|
|
/user/hive/warehouse/recover_partitions.db/t1/yy=2016/mm=3/data.txt
|
|
]]>
|
|
</codeblock>
|
|
|
|
<codeblock rev="2.3.0 IMPALA-1568">
|
|
<![CDATA[
|
|
hive> select * from t1;
|
|
OK
|
|
Partition exists 2016 1
|
|
Partition exists 2016 2
|
|
Partition exists 2016 3
|
|
hive> quit;
|
|
]]>
|
|
</codeblock>
|
|
|
|
<p rev="2.3.0 IMPALA-1568">
|
|
In Impala, initially the partitions and data are not visible. Running <codeph>ALTER
|
|
TABLE</codeph> with the <codeph>RECOVER PARTITIONS</codeph> clause scans the table data
|
|
directory to find any new partition directories, and the data files inside them:
|
|
</p>
|
|
|
|
<codeblock rev="2.3.0 IMPALA-1568">
|
|
<![CDATA[
|
|
select * from t1;
|
|
+------------------+------+----+
|
|
| s | yy | mm |
|
|
+------------------+------+----+
|
|
| Partition exists | 2016 | 1 |
|
|
+------------------+------+----+
|
|
|
|
alter table t1 recover partitions;
|
|
select * from t1;
|
|
+------------------+------+----+
|
|
| s | yy | mm |
|
|
+------------------+------+----+
|
|
| Partition exists | 2016 | 1 |
|
|
| Partition exists | 2016 | 3 |
|
|
| Partition exists | 2016 | 2 |
|
|
+------------------+------+----+
|
|
]]>
|
|
</codeblock>
|
|
|
|
<p rev="1.2">
|
|
<b>To change the key-value pairs of the TBLPROPERTIES and SERDEPROPERTIES fields:</b>
|
|
</p>
|
|
|
|
<codeblock>ALTER TABLE <varname>table_name</varname> SET TBLPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>'[, ...]);
|
|
ALTER TABLE <varname>table_name</varname> SET SERDEPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>'[, ...]);</codeblock>
|
|
|
|
<p>
|
|
The <codeph>TBLPROPERTIES</codeph> clause is primarily a way to associate arbitrary
|
|
user-specified data items with a particular table.
|
|
</p>
|
|
|
|
<p>
|
|
The <codeph>SERDEPROPERTIES</codeph> clause sets up metadata defining how tables are read
|
|
or written, needed in some cases by Hive but not used extensively by Impala. You would use
|
|
this clause primarily to change the delimiter in an existing text table or partition, by
|
|
setting the <codeph>'serialization.format'</codeph> and <codeph>'field.delim'</codeph>
|
|
property values to the new delimiter character: The <codeph>SERDEPROPERTIES</codeph>
|
|
clause does not change the existing data in the table. The change only affects future
|


|


inserts into the table.
|
|
</p>
|
|
|
|
<p>
|
|
Use the <codeph>DESCRIBE FORMATTED</codeph> statement to see the current values of these
|
|
properties for an existing table. See
|
|
<xref
|
|
href="impala_create_table.xml#create_table"/> for more details about these
|
|
clauses.
|
|
</p>
|
|
|
|
<p>
|
|
<b>To specify a sort order for new records that are added to the table:</b>
|
|
</p>
|
|
|
|
<codeblock>ALTER TABLE <varname>table_name</varname> SORT BY [LEXICAL|ZORDER](<varname>column_name1</varname>[, <varname>column_name2</varname> ...]);</codeblock>
|
|
|
|
<p>
|
|
Specifying the sort order is optional. The default sort order is <codeph>LEXICAL</codeph>.
|
|
Setting the <codeph>SORT BY</codeph> property will not rewrite existing data files,
|
|
but subsequent inserts will be ordered. Sorting is always ascending.
|
|
</p>
|
|
|
|
<p>
|
|
Use the <codeph>DESCRIBE FORMATTED</codeph> statement to see the current sort
|
|
properties ('<codeph>sort.columns</codeph>' and '<codeph>sort.order</codeph>')
|
|
for an existing table. They can also be set using <codeph>SET TBLPROPERTIES</codeph>.
|
|
</p>
|
|
<p>
|
|
For details about sort order see <xref href="impala_create_table.xml#create_table"/>.
|
|
</p>
|
|
|
|
<p>
|
|
<b>To manually set or update table or column statistics:</b>
|
|
</p>
|
|
|
|
<p>
|
|
Although for most tables the <codeph>COMPUTE STATS</codeph> or <codeph>COMPUTE INCREMENTAL
|
|
STATS</codeph> statement is all you need to keep table and column statistics up to date
|
|
for a table, sometimes for a very large table or one that is updated frequently, the
|
|
length of time to recompute all the statistics might make it impractical to run those
|
|
statements as often as needed. As a workaround, you can use the <codeph>ALTER
|
|
TABLE</codeph> statement to set table statistics at the level of the entire table or a
|
|
single partition, or column statistics at the level of the entire table.
|
|
</p>
|
|
|
|
<p> You can set the <codeph>numrows</codeph> value for table statistics by
|
|
changing the <codeph>TBLPROPERTIES</codeph> setting for a table or
|
|
partition. For example: <codeblock conref="../shared/impala_common.xml#common/set_numrows_example"/>
|
|
<codeblock conref="../shared/impala_common.xml#common/set_numrows_partitioned_example"/></p>
|
|
|
|
<p rev="2.6.0 IMPALA-3369">
|
|
In <keyword keyref="impala26_full"/> and higher, you can use the <codeph>SET COLUMN
|
|
STATS</codeph> clause to set a specific stats value for a particular column.
|
|
</p>
|
|
|
|
<p conref="../shared/impala_common.xml#common/set_column_stats_example"/>
|
|
|
|
<p>
|
|
<b>To reorganize columns for a table:</b>
|
|
</p>
|
|
<p>You can add multiple columns at a time using the <codeph>ALTER
|
|
TABLE</codeph> statement. If you specify the <codeph>IF NOT
|
|
EXISTS</codeph> clause, Impala silently ignores the <codeph>ADD</codeph>
|
|
request and does not return an error if a column with the same name exists
|
|
in the table.</p>
|
|
<p>When you replace columns, all the original column definitions are
|
|
discarded. </p>
|
|
<p>You might use these statements if you receive a new set of data files
|
|
with different data types or columns in a different order. The data files
|
|
are retained, so if the new columns are incompatible with the old ones,
|
|
use <codeph>INSERT OVERWRITE</codeph> or <codeph>LOAD DATA
|
|
OVERWRITE</codeph> to replace all the data before issuing any further
|
|
queries.</p>
|
|
|
|
<p rev="">
|
|
For example, here is how you might add columns to an existing table. The first
|
|
<codeph>ALTER TABLE</codeph> adds two new columns, and the second <codeph>ALTER
|
|
TABLE</codeph> adds one new column. A single Impala query reads both the old and new data
|
|
files, containing different numbers of columns. For any columns not present in a
|
|
particular data file, all the column values are considered to be <codeph>NULL</codeph>.
|
|
</p>
|
|
|
|
<codeblock rev="">
|
|
create table t1 (x int);
|
|
insert into t1 values (1), (2);
|
|
|
|
alter table t1 add columns (s string, t timestamp);
|
|
insert into t1 values (3, 'three', now());
|
|
|
|
alter table t1 add columns (b boolean);
|
|
insert into t1 values (4, 'four', now(), true);
|
|
|
|
select * from t1 order by x;
|
|
+---+-------+-------------------------------+------+
|
|
| x | s | t | b |
|
|
+---+-------+-------------------------------+------+
|
|
| 1 | NULL | NULL | NULL |
|
|
| 2 | NULL | NULL | NULL |
|
|
| 3 | three | 2016-05-11 11:19:45.054457000 | NULL |
|
|
| 4 | four | 2016-05-11 11:20:20.260733000 | true |
|
|
+---+-------+-------------------------------+------+
|
|
</codeblock>
|
|
|
|
<p>
|
|
You might use the <codeph>CHANGE</codeph> clause to rename a single column, or to treat an
|
|
existing column as a different type than before, such as to switch between treating a
|
|
column as <codeph>STRING</codeph> and <codeph>TIMESTAMP</codeph>, or between
|
|
<codeph>INT</codeph> and <codeph>BIGINT</codeph>. You can only drop a single column at a
|
|
time; to drop multiple columns, issue multiple <codeph>ALTER TABLE</codeph> statements, or
|
|
define the new set of columns with a single <codeph>ALTER TABLE ... REPLACE
|
|
COLUMNS</codeph> statement.
|
|
</p>
|
|
|
|
<p rev="">
|
|
The following examples show some safe operations to drop or change columns. Dropping the
|
|
final column in a table lets Impala ignore the data without causing any disruption to existing
|
|
data files. Changing the type of a column works if existing data values can be safely
|
|
converted to the new type. The type conversion rules depend on the file format of the
|
|
underlying table. For example, in a text table, the same value can be interpreted as a
|
|
<codeph>STRING</codeph> or a numeric value, while in a binary format such as Parquet, the
|
|
rules are stricter and type conversions only work between certain sizes of integers.
|
|
</p>
|
|
|
|
<codeblock rev="">
|
|
create table optional_columns (x int, y int, z int, a1 int, a2 int);
|
|
insert into optional_columns values (1,2,3,0,0), (2,3,4,100,100);
|
|
|
|
-- When the last column in the table is dropped, Impala ignores the
|
|
-- values that are no longer needed. (Dropping A1 but leaving A2
|
|
-- would cause problems, as we will see in a subsequent example.)
|
|
alter table optional_columns drop column a2;
|
|
alter table optional_columns drop column a1;
|
|
|
|
select * from optional_columns;
|
|
+---+---+---+
|
|
| x | y | z |
|
|
+---+---+---+
|
|
| 1 | 2 | 3 |
|
|
| 2 | 3 | 4 |
|
|
+---+---+---+
|
|
</codeblock>
|
|
|
|
<codeblock rev="">
|
|
create table int_to_string (s string, x int);
|
|
insert into int_to_string values ('one', 1), ('two', 2);
|
|
|
|
-- What was an INT column will now be interpreted as STRING.
|
|
-- This technique works for text tables but not other file formats.
|
|
-- The second X represents the new name of the column, which we keep the same.
|
|
alter table int_to_string change x x string;
|
|
|
|
-- Once the type is changed, we can insert non-integer values into the X column
|
|
-- and treat that column as a string, for example by uppercasing or concatenating.
|
|
insert into int_to_string values ('three', 'trois');
|
|
select s, upper(x) from int_to_string;
|
|
+-------+----------+
|
|
| s | upper(x) |
|
|
+-------+----------+
|
|
| one | 1 |
|
|
| two | 2 |
|
|
| three | TROIS |
|
|
+-------+----------+
|
|
</codeblock>
|
|
|
|
<p rev="">
|
|
Remember that Impala does not actually do any conversion for the underlying data files as
|
|
a result of <codeph>ALTER TABLE</codeph> statements. If you use <codeph>ALTER
|
|
TABLE</codeph> to create a table layout that does not agree with the contents of the
|
|
underlying files, you must replace the files yourself, such as using <codeph>LOAD
|
|
DATA</codeph> to load a new set of data files, or <codeph>INSERT OVERWRITE</codeph> to
|
|
copy from another table and replace the original data.
|
|
</p>
|
|
|
|
<p rev="">
|
|
The following example shows what happens if you delete the middle column from a Parquet
|
|
table containing three columns. The underlying data files still contain three columns of
|
|
data. Because the columns are interpreted based on their positions in the data file
|
|
instead of the specific column names, a <codeph>SELECT *</codeph> query now reads the
|
|
first and second columns from the data file, potentially leading to unexpected results or
|
|
conversion errors. For this reason, if you expect to someday drop a column, declare it as
|
|
the last column in the table, where its data can be ignored by queries after the column is
|
|
dropped. Or, re-run your ETL process and create new data files if you drop or change the
|
|
type of a column in a way that causes problems with existing data files.
|
|
</p>
|
|
|
|
<codeblock rev="">
|
|
-- Parquet table showing how dropping a column can produce unexpected results.
|
|
create table p1 (s1 string, s2 string, s3 string) stored as parquet;
|
|
|
|
insert into p1 values ('one', 'un', 'uno'), ('two', 'deux', 'dos'),
|
|
('three', 'trois', 'tres');
|
|
select * from p1;
|
|
+-------+-------+------+
|
|
| s1 | s2 | s3 |
|
|
+-------+-------+------+
|
|
| one | un | uno |
|
|
| two | deux | dos |
|
|
| three | trois | tres |
|
|
+-------+-------+------+
|
|
|
|
alter table p1 drop column s2;
|
|
-- The S3 column contains unexpected results.
|
|
-- Because S2 and S3 have compatible types, the query reads
|
|
-- values from the dropped S2, because the existing data files
|
|
-- still contain those values as the second column.
|
|
select * from p1;
|
|
+-------+-------+
|
|
| s1 | s3 |
|
|
+-------+-------+
|
|
| one | un |
|
|
| two | deux |
|
|
| three | trois |
|
|
+-------+-------+
|
|
</codeblock>
|
|
|
|
<codeblock rev="">
|
|
-- Parquet table showing how dropping a column can produce conversion errors.
|
|
create table p2 (s1 string, x int, s3 string) stored as parquet;
|
|
|
|
insert into p2 values ('one', 1, 'uno'), ('two', 2, 'dos'), ('three', 3, 'tres');
|
|
select * from p2;
|
|
+-------+---+------+
|
|
| s1 | x | s3 |
|
|
+-------+---+------+
|
|
| one | 1 | uno |
|
|
| two | 2 | dos |
|
|
| three | 3 | tres |
|
|
+-------+---+------+
|
|
|
|
alter table p2 drop column x;
|
|
select * from p2;
|
|
WARNINGS:
|
|
File '<varname>hdfs_filename</varname>' has an incompatible Parquet schema for column 'add_columns.p2.s3'.
|
|
Column type: STRING, Parquet schema:
|
|
optional int32 x [i:1 d:1 r:0]
|
|
|
|
File '<varname>hdfs_filename</varname>' has an incompatible Parquet schema for column 'add_columns.p2.s3'.
|
|
Column type: STRING, Parquet schema:
|
|
optional int32 x [i:1 d:1 r:0]
|
|
</codeblock>
|
|
|
|
<p rev="IMPALA-3092">
|
|
In <keyword keyref="impala26_full"/> and higher, if an Avro table is created without
|
|
column definitions in the <codeph>CREATE TABLE</codeph> statement, and columns are later
|
|
added through <codeph>ALTER TABLE</codeph>, the resulting table is now queryable. Missing
|
|
values from the newly added columns now default to <codeph>NULL</codeph>.
|
|
</p>
|
|
|
|
<p>
|
|
<b>To change the file format that Impala expects data to be in, for a table or
|
|
partition:</b>
|
|
</p>
|
|
|
|
<p>
|
|
Use an <codeph>ALTER TABLE ... SET FILEFORMAT</codeph> clause. You can include an optional
|
|
<codeph>PARTITION (<varname>col1</varname>=<varname>val1</varname>,
|
|
<varname>col2</varname>=<varname>val2</varname>, ...</codeph>) clause so that the file
|
|
format is changed for a specific partition rather than the entire table.
|
|
</p>
|
|
|
|
<p>
|
|
Because this operation only changes the table metadata, you must do any conversion of
|
|
existing data using regular Hadoop techniques outside of Impala. Any new data created by
|
|
the Impala <codeph>INSERT</codeph> statement will be in the new format. You cannot specify
|
|
the delimiter for Text files; the data files must be comma-delimited.
|
|
<!-- Although Impala can read Avro tables
|
|
created through Hive, you cannot specify the Avro file format in an Impala
|
|
<codeph>ALTER TABLE</codeph> statement. -->
|
|
</p>
|
|
|
|
<p>
|
|
To set the file format for a single partition, include the <codeph>PARTITION</codeph>
|
|
clause. Specify all the same partitioning columns for the table, with a constant value for
|
|
each, to precisely identify the single partition affected by the statement:
|
|
</p>
|
|
|
|
<codeblock>create table p1 (s string) partitioned by (month int, day int);
|
|
-- Each ADD PARTITION clause creates a subdirectory in HDFS.
|
|
alter table p1 add partition (month=1, day=1);
|
|
alter table p1 add partition (month=1, day=2);
|
|
alter table p1 add partition (month=2, day=1);
|
|
alter table p1 add partition (month=2, day=2);
|
|
-- Queries and INSERT statements will read and write files
|
|
-- in this format for this specific partition.
|
|
alter table p1 partition (month=2, day=2) set fileformat parquet;
|
|
</codeblock>
|
|
|
|
<p>
|
|
<b>To change the row format with different delimiter characters:</b>
|
|
</p>
|
|
|
|
<p>
|
|
Use the <codeph>SET ROW FORMAT DELIMITED</codeph> clause to ingest data files that use a
|
|
different delimiter character or a different line end character. When specifying delimiter
|
|
and line end characters with the <codeph>FIELDS TERMINATED BY</codeph>, <codeph>ESCAPED
|
|
BY</codeph>, and <codeph>LINES TERMINATED BY</codeph> clauses, you can use the following:
|
|
<ul>
|
|
<li>
|
|
A regular ASCII character surrounded by single or double quotation marks.
|
|
</li>
|
|
|
|
<li>
|
|
An octal sequence, such as <codeph>'\054'</codeph> representing a comma or
|
|
<codeph>'\0'</codeph> for ASCII null (hex 00).
|
|
</li>
|
|
|
|
<li>
|
|
Special characters, such as:
|
|
<ul>
|
|
<li>
|
|
<codeph>'\t'</codeph> for tab
|
|
</li>
|
|
|
|
<li>
|
|
<codeph>'\n'</codeph> for newline or linefeed
|
|
</li>
|
|
|
|
<li>
|
|
<codeph>'\r'</codeph> for carriage return
|
|
</li>
|
|
</ul>
|
|
</li>
|
|
|
|
<li>
|
|
An integer in the range '-127'..'128' (with quotation marks but no backslash)
|
|
<p>
|
|
Negative values are subtracted from 256. For example, <codeph>FIELDS TERMINATED BY
|
|
'-2'</codeph> sets the field delimiter to ASCII code 254.
|
|
</p>
|
|
</li>
|
|
</ul>
|
|
</p>
|
|
|
|
<p>
|
|
For more examples of text tables, see <xref
|
|
href="impala_txtfile.xml#txtfile"/>.
|
|
</p>
|
|
|
|
<p>
|
|
For the <codeph>ESCAPED BY</codeph> clause, choose an escape character that is not used
|
|
anywhere else in the file. The character following the escape character is taken literally
|
|
as part of a field value.
|
|
</p>
|
|
|
|
<p>
|
|
Surrounding field values with quotation marks does not help Impala to parse fields with
|
|
embedded delimiter characters as the quotation marks are considered to be part of the
|
|
column value.
|
|
</p>
|
|
|
|
<p>
|
|
If you want to use <codeph>\</codeph> as the escape character, specify the clause in
|
|
<cmdname>impala-shell</cmdname> as <codeph>ESCAPED BY '\\'</codeph>.
|
|
</p>
|
|
|
|
<p>
|
|
<b>To add or drop partitions for a table</b>, the table must already be partitioned (that
|
|
is, created with a <codeph>PARTITIONED BY</codeph> clause). The partition is a physical
|
|
directory in HDFS, with a name that encodes a particular column value (the <b>partition
|
|
key</b>). The Impala <codeph>INSERT</codeph> statement already creates the partition if
|
|
necessary, so the <codeph>ALTER TABLE ... ADD PARTITION</codeph> is primarily useful for
|
|
importing data by moving or copying existing data files into the HDFS directory
|
|
corresponding to a partition. (You can use the <codeph>LOAD DATA</codeph> statement to
|
|
move files into the partition directory, or <codeph>ALTER TABLE ... PARTITION (...) SET
|
|
LOCATION</codeph> to point a partition at a directory that already contains data files.)
|
|
</p>
|
|
|
|
<p>
|
|
The <codeph>DROP PARTITION</codeph> clause is used to remove the HDFS directory and
|
|
associated data files for a particular set of partition key values; for example, if you
|
|
always analyze the last 3 months worth of data, at the beginning of each month you might
|
|
drop the oldest partition that is no longer needed. Removing partitions reduces the amount
|
|
of metadata associated with the table and the complexity of calculating the optimal query
|
|
plan, which can simplify and speed up queries on partitioned tables, particularly join
|
|
queries. Here is an example showing the <codeph>ADD PARTITION</codeph> and <codeph>DROP
|
|
PARTITION</codeph> clauses.
|
|
</p>
|
|
|
|
<p>
|
|
To avoid errors while adding or dropping partitions whose existence is not certain, add
|
|
the optional <codeph>IF [NOT] EXISTS</codeph> clause between the <codeph>ADD</codeph> or
|
|
<codeph>DROP</codeph> keyword and the <codeph>PARTITION</codeph> keyword. That is, the
|
|
entire clause becomes <codeph>ADD IF NOT EXISTS PARTITION</codeph> or <codeph>DROP IF
|
|
EXISTS PARTITION</codeph>. The following example shows how partitions can be created
|
|
automatically through <codeph>INSERT</codeph> statements, or manually through
|
|
<codeph>ALTER TABLE</codeph> statements. The <codeph>IF [NOT] EXISTS</codeph> clauses let
|
|
the <codeph>ALTER TABLE</codeph> statements succeed even if a new requested partition
|
|
already exists, or a partition to be dropped does not exist.
|
|
</p>
|
|
|
|
<p>
|
|
Inserting 2 year values creates 2 partitions:
|
|
</p>
|
|
|
|
<codeblock>
|
|
create table partition_t (s string) partitioned by (y int);
|
|
insert into partition_t (s,y) values ('two thousand',2000), ('nineteen ninety',1990);
|
|
show partitions partition_t;
|
|
+-------+-------+--------+------+--------------+-------------------+--------+-------------------+
|
|
| y | #Rows | #Files | Size | Bytes Cached | Cache Replication | Format | Incremental stats |
|
|
+-------+-------+--------+------+--------------+-------------------+--------+-------------------+
|
|
| 1990  | -1    | 1      | 16B  | NOT CACHED   | NOT CACHED        | TEXT   | false             |
|
|
| 2000  | -1    | 1      | 13B  | NOT CACHED   | NOT CACHED        | TEXT   | false             |
|
|
| Total | -1    | 2      | 29B  | 0B           |                   |        |                   |
|
|
+-------+-------+--------+------+--------------+-------------------+--------+-------------------+
|
|
</codeblock>
|
|
|
|
<p>
|
|
Without the <codeph>IF NOT EXISTS</codeph> clause, an attempt to add a new partition might
|
|
fail:
|
|
</p>
|
|
|
|
<codeblock>
|
|
alter table partition_t add partition (y=2000);
|
|
ERROR: AnalysisException: Partition spec already exists: (y=2000).
|
|
</codeblock>
|
|
|
|
<p>
|
|
The <codeph>IF NOT EXISTS</codeph> clause makes the statement succeed whether or not there
|
|
was already a partition with the specified key value:
|
|
</p>
|
|
|
|
<codeblock>
|
|
alter table partition_t add if not exists partition (y=2000);
|
|
alter table partition_t add if not exists partition (y=2010);
|
|
show partitions partition_t;
|
|
+-------+-------+--------+------+--------------+-------------------+--------+-------------------+
|
|
| y | #Rows | #Files | Size | Bytes Cached | Cache Replication | Format | Incremental stats |
|
|
+-------+-------+--------+------+--------------+-------------------+--------+-------------------+
|
|
| 1990  | -1    | 1      | 16B  | NOT CACHED   | NOT CACHED        | TEXT   | false             |
|
|
| 2000  | -1    | 1      | 13B  | NOT CACHED   | NOT CACHED        | TEXT   | false             |
|
|
| 2010  | -1    | 0      | 0B   | NOT CACHED   | NOT CACHED        | TEXT   | false             |
|
|
| Total | -1    | 2      | 29B  | 0B           |                   |        |                   |
|
|
+-------+-------+--------+------+--------------+-------------------+--------+-------------------+
|
|
</codeblock>
|
|
|
|
<p>
|
|
Likewise, the <codeph>IF EXISTS</codeph> clause lets <codeph>DROP PARTITION</codeph>
|
|
succeed whether or not the partition is already in the table:
|
|
</p>
|
|
|
|
<codeblock>
|
|
alter table partition_t drop if exists partition (y=2000);
|
|
alter table partition_t drop if exists partition (y=1950);
|
|
show partitions partition_t;
|
|
+-------+-------+--------+------+--------------+-------------------+--------+-------------------+
|
|
| y | #Rows | #Files | Size | Bytes Cached | Cache Replication | Format | Incremental stats |
|
|
+-------+-------+--------+------+--------------+-------------------+--------+-------------------+
|
|
| 1990  | -1    | 1      | 16B  | NOT CACHED   | NOT CACHED        | TEXT   | false             |
|
|
| 2010  | -1    | 0      | 0B   | NOT CACHED   | NOT CACHED        | TEXT   | false             |
|
|
| Total | -1    | 1      | 16B  | 0B           |                   |        |                   |
|
|
+-------+-------+--------+------+--------------+-------------------+--------+-------------------+
|
|
</codeblock>
|
|
|
|
<p rev="2.3.0">
|
|
The optional <codeph>PURGE</codeph> keyword, available in
|
|
<keyword keyref="impala23_full"/> and higher, is used with the <codeph>DROP
|
|
PARTITION</codeph> clause to remove associated HDFS data files immediately rather than
|
|
going through the HDFS trashcan mechanism. Use this keyword when dropping a partition if
|
|
it is crucial to remove the data as quickly as possible to free up space, or if there is a
|
|
problem with the trashcan, such as the trashcan not being configured or being in a
|
|
different HDFS encryption zone than the data files.
|
|
</p>
|
|
|
|
<!--
|
|
To do: Make example more general by partitioning by year/month/day.
|
|
Then could show inserting into fixed year, variable month and day;
|
|
dropping particular year/month/day partition.
|
|
-->
|
|
|
|
<codeblock>-- Create an empty table and define the partitioning scheme.
|
|
create table part_t (x int) partitioned by (month int);
|
|
-- Create an empty partition into which you could copy data files from some other source.
|
|
alter table part_t add partition (month=1);
|
|
-- After changing the underlying data, issue a REFRESH statement to make the data visible in Impala.
|
|
refresh part_t;
|
|
-- Later, do the same for the next month.
|
|
alter table part_t add partition (month=2);
|
|
|
|
-- Now you no longer need the older data.
|
|
alter table part_t drop partition (month=1);
|
|
-- If the table was partitioned by month and year, you would issue a statement like:
|
|
-- alter table part_t drop partition (year=2003,month=1);
|
|
-- which would require 12 ALTER TABLE statements to remove a year's worth of data.
|
|
|
|
-- If the data files for subsequent months were in a different file format,
|
|
-- you could set a different file format for the new partition as you create it.
|
|
alter table part_t add partition (month=3);
alter table part_t partition (month=3) set fileformat parquet;
|
|
</codeblock>
|
|
|
|
<p>
|
|
The value specified for a partition key can be an arbitrary constant expression, without
|
|
any references to columns. For example:
|
|
</p>
|
|
|
|
<codeblock>alter table time_data add partition (month=concat('Decem','ber'));
|
|
alter table sales_data add partition (zipcode = cast(9021 * 10 as string));</codeblock>
|
|
|
|
<note>
|
|
<p>
|
|
An alternative way to reorganize a table and its associated data files is to use
|
|
<codeph>CREATE TABLE</codeph> to create a variation of the original table, then use
|
|
<codeph>INSERT</codeph> to copy the transformed or reordered data to the new table. The
|
|
advantage of <codeph>ALTER TABLE</codeph> is that it avoids making a duplicate copy of
|
|
the data files, allowing you to reorganize huge volumes of data in a space-efficient way
|
|
using familiar Hadoop techniques.
|
|
</p>
|
|
</note>
|
|
|
|
<p>
|
|
<b>To switch a table between internal and external:</b>
|
|
</p>
|
|
|
|
<p
|
|
conref="../shared/impala_common.xml#common/switch_internal_external_table"/>
|
|
|
|
<p conref="../shared/impala_common.xml#common/cancel_blurb_no"/>
|
|
|
|
<p conref="../shared/impala_common.xml#common/permissions_blurb"/>
|
|
|
|
<p rev="">
|
|
Most <codeph>ALTER TABLE</codeph> clauses do not actually read or write any HDFS files,
|
|
and so do not depend on specific HDFS permissions. For example, the <codeph>SET
|
|
FILEFORMAT</codeph> clause does not actually check the file format of existing data files or
|
|
convert them to the new format, and the <codeph>SET LOCATION</codeph> clause does not
|
|
require any special permissions on the new location. (Any permission-related failures
|
|
would come later, when you actually query or insert into the table.)
|
|
</p>
|
|
|
|
<!-- Haven't rigorously tested all the assertions in the following paragraph. -->
|
|
|
|
<!-- Most testing so far has been around RENAME TO clause. -->
|
|
|
|
<p>
|
|
In general, <codeph>ALTER TABLE</codeph> clauses that do touch HDFS files and directories
|
|
require the same HDFS permissions as corresponding <codeph>CREATE</codeph>,
|
|
<codeph>INSERT</codeph>, or <codeph>SELECT</codeph> statements. The permissions allow the
|
|
user ID that the <cmdname>impalad</cmdname> daemon runs under, typically the
|
|
<codeph>impala</codeph> user, to read or write files or directories, or (in the case of
|
|
the execute bit) descend into a directory. The <codeph>RENAME TO</codeph> clause requires
|
|
read, write, and execute permission in the source and destination database directories and
|
|
in the table data directory, and read and write permission for the data files within the
|
|
table. The <codeph>ADD PARTITION</codeph> and <codeph>DROP PARTITION</codeph> clauses
|
|
require write and execute permissions for the associated partition directory.
|
|
</p>
|
|
|
|
<p conref="../shared/impala_common.xml#common/kudu_blurb"/>
|
|
|
|
<p rev="kudu IMPALA-2890">
|
|
Because of the extra constraints and features of Kudu tables, such as the <codeph>NOT
|
|
NULL</codeph> and <codeph>DEFAULT</codeph> attributes for columns, <codeph>ALTER
|
|
TABLE</codeph> has specific requirements related to Kudu tables:
|
|
<ul>
|
|
<li>
|
|
<p>
|
|
In an <codeph>ADD COLUMNS</codeph> operation, you can specify the
|
|
<codeph>NULL</codeph>, <codeph>NOT NULL</codeph>, and <codeph>DEFAULT
|
|
<varname>default_value</varname></codeph> column attributes.
|
|
</p>
|
|
</li>
|
|
|
|
<li>
|
|
<p rev="2.9.0 IMPALA-4616">
|
|
In <keyword keyref="impala29_full"/> and higher, you can also specify the
|
|
<codeph>ENCODING</codeph>, <codeph>COMPRESSION</codeph>, and
|
|
<codeph>BLOCK_SIZE</codeph> attributes when adding a column.
|
|
</p>
|
|
</li>
|
|
|
|
<li>
|
|
<p>
|
|
If you add a column with a <codeph>NOT NULL</codeph> attribute, it must also have a
|
|
<codeph>DEFAULT</codeph> attribute, so the default value can be assigned to that
|
|
column for all existing rows.
|
|
</p>
|
|
</li>
|
|
|
|
<li>
|
|
<p>
|
|
The <codeph>DROP COLUMN</codeph> clause works the same for a Kudu table as for other
|
|
kinds of tables.
|
|
</p>
|
|
</li>
|
|
|
|
<li>
|
|
<p>
|
|
Although you can change the name of a column with the <codeph>CHANGE</codeph>
|
|
clause, you cannot change the type of a column in a Kudu table.
|
|
</p>
|
|
</li>
|
|
|
|
<li>
|
|
<p>
|
|
You cannot change the nullability of existing columns in a Kudu table.
|
|
</p>
|
|
</li>
|
|
|
|
<li>
|
|
<p rev="2.10.0 IMPALA-4622">
|
|
In <keyword keyref="impala210_full"/>, you can change the default value, encoding,
|
|
compression, or block size of existing columns in a Kudu table by using the
|
|
<codeph>SET</codeph> clause.
|
|
</p>
|
|
</li>
|
|
|
|
<li>
|
|
<p>
|
|
You cannot use the <codeph>REPLACE COLUMNS</codeph> clause with a Kudu table.
|
|
</p>
|
|
</li>
|
|
|
|
<li>
|
|
The <codeph>RENAME TO</codeph> clause for a Kudu table only affects the name stored in
|
|
the metastore database that Impala uses to refer to the table. To change which
|
|
underlying Kudu table is associated with an Impala table name, you must change the
|
|
<codeph>TBLPROPERTIES</codeph> property of the table: <codeph>SET
|
|
TBLPROPERTIES('kudu.table_name'='<varname>kudu_tbl_name</varname>')</codeph>. You can
|
|
only change the underlying Kudu table for external tables.
|
|
</li>
|
|
</ul>
|
|
</p>
|
|
|
|
<p>
|
|
The following are some examples of using the <codeph>ADD COLUMNS</codeph> clause for a
|
|
Kudu table:
|
|
</p>
|
|
|
|
<codeblock rev="2.9.0 IMPALA-4616">
|
|
CREATE TABLE t1 ( x INT, PRIMARY KEY (x) )
|
|
PARTITION BY HASH (x) PARTITIONS 16
|
|
STORED AS KUDU;
|
|
|
|
ALTER TABLE t1 ADD COLUMNS (y STRING ENCODING prefix_encoding);
|
|
ALTER TABLE t1 ADD COLUMNS (z INT DEFAULT 10);
|
|
ALTER TABLE t1 ADD COLUMNS (a STRING NOT NULL DEFAULT '', t TIMESTAMP COMPRESSION default_compression);
|
|
</codeblock>
|
|
|
|
<p rev="2.10.0 IMPALA-4622">
|
|
The following are some examples of modifying column defaults and storage attributes for a
|
|
Kudu table:
|
|
</p>
|
|
|
|
<codeblock rev="2.10.0 IMPALA-4622">
|
|
create table kt (x bigint primary key, s string default 'yes', t timestamp)
|
|
stored as kudu;
|
|
|
|
-- You can change the default value for a column, which affects any rows
|
|
-- inserted after this change is made.
|
|
alter table kt alter column s set default 'no';
|
|
|
|
-- You can remove the default value for a column, which affects any rows
|
|
-- inserted after this change is made. If the column is nullable, any
|
|
-- future inserts default to NULL for this column. If the column is marked
|
|
-- NOT NULL, any future inserts must specify a value for the column.
|
|
alter table kt alter column s drop default;
|
|
|
|
insert into kt values (1, 'foo', now());
|
|
-- Because of the DROP DEFAULT above, omitting S from the insert
|
|
-- gives it a value of NULL.
|
|
insert into kt (x, t) values (2, now());
|
|
|
|
select * from kt;
|
|
+---+------+-------------------------------+
|
|
| x | s | t |
|
|
+---+------+-------------------------------+
|
|
| 2 | NULL | 2017-10-02 00:03:40.652156000 |
|
|
| 1 | foo | 2017-10-02 00:03:04.346185000 |
|
|
+---+------+-------------------------------+
|
|
|
|
-- Other storage-related attributes can also be changed for columns.
|
|
-- These changes take effect for any newly inserted rows, or rows
|
|
-- rearranged due to compaction after deletes or updates.
|
|
alter table kt alter column s set encoding prefix_encoding;
|
|
-- The COLUMN keyword is optional in the syntax.
|
|
alter table kt alter x set block_size 2048;
|
|
alter table kt alter column t set compression zlib;
|
|
|
|
desc kt;
|
|
+------+-----------+---------+-------------+----------+---------------+-----------------+---------------------+------------+
|
|
| name | type | comment | primary_key | nullable | default_value | encoding | compression | block_size |
|
|
+------+-----------+---------+-------------+----------+---------------+-----------------+---------------------+------------+
|
|
| x | bigint | | true | false | | AUTO_ENCODING | DEFAULT_COMPRESSION | 2048 |
|
|
| s | string | | false | true | | PREFIX_ENCODING | DEFAULT_COMPRESSION | 0 |
|
|
| t | timestamp | | false | true | | AUTO_ENCODING | ZLIB | 0 |
|
|
+------+-----------+---------+-------------+----------+---------------+-----------------+---------------------+------------+
|
|
</codeblock>
|
|
|
|
<p rev="kudu">
|
|
Kudu tables all use an underlying partitioning mechanism. The partition syntax is
|
|
different than for non-Kudu tables. You can use the <codeph>ALTER TABLE</codeph> statement
|
|
to add and drop <term>range partitions</term> from a Kudu table. Any new range must not
|
|
overlap with any existing ranges. Dropping a range removes all the associated rows from
|
|
the table. See <xref href="impala_kudu.xml#kudu_partitioning"/> for details.
|
|
</p>
|
|
|
|
<p conref="../shared/impala_common.xml#common/related_info"/>
|
|
|
|
<p>
|
|
<xref href="impala_tables.xml#tables"/>,
|
|
<xref
|
|
href="impala_create_table.xml#create_table"/>,
|
|
<xref
|
|
href="impala_drop_table.xml#drop_table"/>,
|
|
<xref
|
|
href="impala_partitioning.xml#partitioning"/>,
|
|
<xref
|
|
href="impala_tables.xml#internal_tables"/>,
|
|
<xref
|
|
href="impala_tables.xml#external_tables"/>
|
|
</p>
|
|
|
|
</conbody>
|
|
|
|
</concept>
|