mirror of
https://github.com/apache/impala.git
synced 2026-01-03 15:00:52 -05:00
HIVE-5795 introduced the table property skip.header.line.count to skip header
lines from input files. This change adds the capability to skip
an arbitrary number of header lines from CSV input files on HDFS. The
total size of a file's header must be smaller than
max_scan_range_length; otherwise an error is reported. This restriction is
necessary because scan ranges are not read in disk order, so there is
no way of identifying header lines except by counting from the start
of the first scan range.
[localhost:21000] > alter table t1 set
tblproperties('skip.header.line.count'='1');
Query: alter table t1 set tblproperties('skip.header.line.count'='1')
[localhost:21000] > select * from t1;
Query: select * from t1
+----+----+
| c1 | c2 |
+----+----+
| 1 | 1 |
| 2 | 2 |
| 3 | 3 |
+----+----+
Fetched 3 row(s) in 0.32s
[localhost:21000] > alter table t1 set
tblproperties('skip.header.line.count'='0');
Query: alter table t1 set tblproperties('skip.header.line.count'='0')
[localhost:21000] > select * from t1;
Query: select * from t1
+------+------+
| c1 | c2 |
+------+------+
| NULL | NULL |
| 1 | 1 |
| 2 | 2 |
| 3 | 3 |
+------+------+
WARNINGS: Error converting column: 0 TO INT (Data is: num1)
Error converting column: 1 TO DOUBLE (Data is: num2)
file: hdfs://localhost:20500/test-warehouse/t1/test.txt
record: num1,num2
Fetched 4 row(s) in 0.41s
Change-Id: I595f01a165d41499ca1956fe748ba3840a6eb543
Reviewed-on: http://gerrit.cloudera.org:8080/2110
Reviewed-by: Lars Volker <lv@cloudera.com>
Tested-by: Internal Jenkins
133 lines
2.5 KiB
Plaintext
133 lines
2.5 KiB
Plaintext
====
|
|
---- QUERY
# Selects both columns from functional.table_with_header with
# max_scan_range_length=0 and expects exactly three rows: (1,2), (3,4), (5,6).
# NOTE(review): presumably the table's data file starts with a header line
# that must not appear in the results; confirm against the table definition.
# NOTE(review): presumably 0 means "no scan range length limit"; confirm.
|
|
set max_scan_range_length=0;
|
|
select c1, c2 from functional.table_with_header
|
|
---- RESULTS
|
|
1,2
|
|
3,4
|
|
5,6
|
|
---- TYPES
|
|
INT,DOUBLE
|
|
====
|
|
---- QUERY
# Counts the rows of functional.table_with_header with
# max_scan_range_length=0 and expects a count of 3.
# NOTE(review): presumably this checks that header lines are not counted
# when no columns are materialized; confirm.
|
|
set max_scan_range_length=0;
|
|
select count(*) from functional.table_with_header
|
|
---- RESULTS
|
|
3
|
|
---- TYPES
|
|
BIGINT
|
|
====
|
|
---- QUERY
# Same two-column select from functional.table_with_header, now with
# max_scan_range_length=2; the same three rows are expected.
# NOTE(review): presumably the tiny value forces the file to be split into
# many scan ranges; confirm.
|
|
set max_scan_range_length=2;
|
|
select c1, c2 from functional.table_with_header
|
|
---- RESULTS
|
|
1,2
|
|
3,4
|
|
5,6
|
|
---- TYPES
|
|
INT,DOUBLE
|
|
====
|
|
---- QUERY
# Counts the rows of functional.table_with_header with
# max_scan_range_length=2 and expects a count of 3.
|
|
set max_scan_range_length=2;
|
|
select count(*) from functional.table_with_header
|
|
---- RESULTS
|
|
3
|
|
---- TYPES
|
|
BIGINT
|
|
====
|
|
---- QUERY
# Same two-column select from functional.table_with_header, now with
# max_scan_range_length=30; the same three rows are expected.
|
|
set max_scan_range_length=30;
|
|
select c1, c2 from functional.table_with_header
|
|
---- RESULTS
|
|
1,2
|
|
3,4
|
|
5,6
|
|
---- TYPES
|
|
INT,DOUBLE
|
|
====
|
|
---- QUERY
# Counts the rows of functional.table_with_header with
# max_scan_range_length=30 and expects a count of 3.
|
|
set max_scan_range_length=30;
|
|
select count(*) from functional.table_with_header
|
|
---- RESULTS
|
|
3
|
|
---- TYPES
|
|
BIGINT
|
|
====
|
|
---- QUERY
# Selects both columns from functional.table_with_header_2 with
# max_scan_range_length=0 and expects exactly three rows: (1,2), (3,4), (5,6).
# NOTE(review): presumably this table differs from table_with_header by
# having a longer (multi-line) header; confirm against the table definition.
|
|
set max_scan_range_length=0;
|
|
select c1, c2 from functional.table_with_header_2
|
|
---- RESULTS
|
|
1,2
|
|
3,4
|
|
5,6
|
|
---- TYPES
|
|
INT,DOUBLE
|
|
====
|
|
---- QUERY
# Counts the rows of functional.table_with_header_2 with
# max_scan_range_length=0 and expects a count of 3.
|
|
set max_scan_range_length=0;
|
|
select count(*) from functional.table_with_header_2
|
|
---- RESULTS
|
|
3
|
|
---- TYPES
|
|
BIGINT
|
|
====
|
|
---- QUERY
# Negative case: with max_scan_range_length=2 and abort_on_error=1, the
# select from functional.table_with_header_2 is expected to fail. The CATCH
# section holds the expected error text, which advises increasing
# max_scan_range_length beyond the size of the file's header.
# NOTE(review): presumably this table's header does not fit within a 2-byte
# scan range; confirm against the table definition.
# NOTE(review): the TYPES section looks unused when an error is expected;
# confirm whether the harness reads it here.
|
|
set max_scan_range_length=2;
|
|
set abort_on_error=1;
|
|
select c1, c2 from functional.table_with_header_2
|
|
---- CATCH
|
|
increasing max_scan_range_length to a value larger than the size of the file's header.
|
|
---- TYPES
|
|
INT,DOUBLE
|
|
====
|
|
---- QUERY
# Negative case repeated with abort_on_error=0: the same error text is
# expected in CATCH, so the query is expected to fail even when
# abort_on_error is disabled.
# NOTE(review): the TYPES section looks unused when an error is expected;
# confirm whether the harness reads it here.
|
|
set max_scan_range_length=2;
|
|
set abort_on_error=0;
|
|
select c1, c2 from functional.table_with_header_2
|
|
---- CATCH
|
|
increasing max_scan_range_length to a value larger than the size of the file's header.
|
|
---- TYPES
|
|
INT,DOUBLE
|
|
====
|
|
---- QUERY
# Selects both columns from functional.table_with_header_2 with
# max_scan_range_length=30; unlike the 2-byte cases, the query is expected to
# succeed and return (1,2), (3,4), (5,6).
# NOTE(review): presumably 30 bytes is large enough to contain the whole
# header of this table's file; confirm.
|
|
set max_scan_range_length=30;
|
|
select c1, c2 from functional.table_with_header_2
|
|
---- RESULTS
|
|
1,2
|
|
3,4
|
|
5,6
|
|
---- TYPES
|
|
INT,DOUBLE
|
|
====
|
|
---- QUERY
# Counts the rows of functional.table_with_header_2 with
# max_scan_range_length=30 and expects a count of 3.
|
|
set max_scan_range_length=30;
|
|
select count(*) from functional.table_with_header_2
|
|
---- RESULTS
|
|
3
|
|
---- TYPES
|
|
BIGINT
|
|
====
|
|
---- QUERY
# Builds a partitioned table "mixed" declared as textfile, switches partition
# year=2013 to Parquet before loading, inserts (1),(2),(3) into year=2012 and
# (4),(5),(6) into year=2013, then sets skip.header.line.count at table level
# (first to 1, finally to 2) with a table-level switch to Parquet in between.
# Expected rows: only 3 remains for year=2012 (values 1 and 2 are absent),
# while all of 4, 5 and 6 are returned for year=2013.
# NOTE(review): presumably this shows the property only affects the
# text-format partition and that the last value set (2) is the one applied;
# confirm intent with the feature author.
# NOTE(review): presumably year=2012 keeps its text format after the
# table-level "set fileformat parquet"; confirm.
|
|
drop table if exists mixed;
|
|
create table mixed (kf smallint) partitioned by (year smallint) stored as textfile;
|
|
alter table mixed add partition (year=2012);
|
|
alter table mixed add partition (year=2013);
|
|
alter table mixed partition (year=2013) set fileformat parquet;
|
|
insert into mixed partition (year=2012) values (1),(2),(3);
|
|
insert into mixed partition (year=2013) values (4),(5),(6);
|
|
alter table mixed set tblproperties("skip.header.line.count"="1");
|
|
alter table mixed set fileformat parquet;
|
|
alter table mixed set tblproperties("skip.header.line.count"="2");
|
|
select * from mixed;
|
|
---- RESULTS
|
|
3,2012
|
|
4,2013
|
|
5,2013
|
|
6,2013
|
|
---- TYPES
|
|
SMALLINT,SMALLINT
|
|
====
|
|
---- QUERY
# Cleanup: drops the "mixed" table created by the preceding test case.
# No RESULTS section follows, so no output is compared here.
|
|
drop table mixed;
|
|
====
|